date:20231214

In preparation for adding sysfs ABI to toggle memmap_on_memory semantics
for drivers adding memory, export the mhp_supports_memmap_on_memory()
helper. This allows drivers to check if memmap_on_memory support is
available before trying to request it, and display an appropriate
message if it isn't available. As part of this, remove the size argument
to this - with recent updates to allow memmap_on_memory for larger
ranges, and the internal splitting of altmaps into respective memory
blocks, the size argument is meaningless.

Cc: Andrew Morton 
Cc: David Hildenbrand 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: Dan Williams 
Cc: Dave Jiang 
Cc: Dave Hansen 
Cc: Huang Ying 
Suggested-by: David Hildenbrand 
Acked-by: David Hildenbrand 
Signed-off-by: Vishal Verma 
---
 include/linux/memory_hotplug.h |  6 ++
 mm/memory_hotplug.c| 17 ++---
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7d2076583494..ebc9d528f00c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -121,6 +121,7 @@ struct mhp_params {
 
 bool mhp_range_allowed(u64 start, u64 size, bool need_mapping);
 struct range mhp_get_pluggable_range(bool need_mapping);
+bool mhp_supports_memmap_on_memory(void);
 
 /*
  * Zone resizing functions
@@ -262,6 +263,11 @@ static inline bool movable_node_is_enabled(void)
return false;
 }
 
+static bool mhp_supports_memmap_on_memory(void)
+{
+   return false;
+}
+
 static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {}
 static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
 static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 926e1cfb10e9..751664c519f7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1325,7 +1325,7 @@ static inline bool 
arch_supports_memmap_on_memory(unsigned long vmemmap_size)
 }
 #endif
 
-static bool mhp_supports_memmap_on_memory(unsigned long size)
+bool mhp_supports_memmap_on_memory(void)
 {
unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
@@ -1334,17 +1334,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 * Besides having arch support and the feature enabled at runtime, we
 * need a few more assumptions to hold true:
 *
-* a) We span a single memory block: memory onlining/offlinin;g happens
-*in memory block granularity. We don't want the vmemmap of online
-*memory blocks to reside on offline memory blocks. In the future,
-*we might want to support variable-sized memory blocks to make the
-*feature more versatile.
-*
-* b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+* a) The vmemmap pages span complete PMDs: We don't want vmemmap code
 *to populate memory from the altmap for unrelated parts (i.e.,
 *other memory blocks)
 *
-* c) The vmemmap pages (and thereby the pages that will be exposed to
+* b) The vmemmap pages (and thereby the pages that will be exposed to
 *the buddy) have to cover full pageblocks: memory 
onlining/offlining
 *code requires applicable ranges to be page-aligned, for example, 
to
 *set the migratetypes properly.
@@ -1356,7 +1350,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 *   altmap as an alternative source of memory, and we do not 
exactly
 *   populate a single PMD.
 */
-   if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+   if (!mhp_memmap_on_memory())
return false;
 
/*
@@ -1379,6 +1373,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 
return arch_supports_memmap_on_memory(vmemmap_size);
 }
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
 
 static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
 {
@@ -1512,7 +1507,7 @@ int __ref add_memory_resource(int nid, struct resource 
*res, mhp_t mhp_flags)
 * Self hosted memmap array
 */
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
-   mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
+   mhp_supports_memmap_on_memory()) {
ret = create_altmaps_and_memory_blocks(nid, group, start, size);
if (ret)
goto error;

-- 
2.41.0

[PATCH v6 4/4] dax: add a sysfs knob to control memmap_on_memory behavior

Add a sysfs knob for dax devices to control the memmap_on_memory setting
if the dax device were to be hotplugged as system memory.

The default memmap_on_memory setting for dax devices originating via
pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to
preserve legacy behavior. For dax devices via CXL, the default is on.
The sysfs control allows the administrator to override the above
defaults if needed.

Cc: David Hildenbrand 
Cc: Dan Williams 
Cc: Dave Jiang 
Cc: Dave Hansen 
Cc: Huang Ying 
Tested-by: Li Zhijian 
Reviewed-by: Jonathan Cameron 
Reviewed-by: David Hildenbrand 
Signed-off-by: Vishal Verma 
---
 drivers/dax/bus.c   | 36 +
 Documentation/ABI/testing/sysfs-bus-dax | 17 
 2 files changed, 53 insertions(+)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 6226de131d17..3622b3d1c0de 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1245,6 +1245,41 @@ static ssize_t numa_node_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(numa_node);
 
+static ssize_t memmap_on_memory_show(struct device *dev,
+struct device_attribute *attr, char *buf)
+{
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+
+   return sysfs_emit(buf, "%d\n", dev_dax->memmap_on_memory);
+}
+
+static ssize_t memmap_on_memory_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+   struct dev_dax *dev_dax = to_dev_dax(dev);
+   ssize_t rc;
+   bool val;
+
+   rc = kstrtobool(buf, );
+   if (rc)
+   return rc;
+
+   if (val == true && !mhp_supports_memmap_on_memory()) {
+   dev_dbg(dev, "memmap_on_memory is not available\n");
+   return -EOPNOTSUPP;
+   }
+
+   guard(device)(dev);
+   if (dev_dax->memmap_on_memory != val && dev->driver &&
+   to_dax_drv(dev->driver)->type == DAXDRV_KMEM_TYPE)
+   return -EBUSY;
+   dev_dax->memmap_on_memory = val;
+
+   return len;
+}
+static DEVICE_ATTR_RW(memmap_on_memory);
+
 static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int 
n)
 {
struct device *dev = container_of(kobj, struct device, kobj);
@@ -1271,6 +1306,7 @@ static struct attribute *dev_dax_attributes[] = {
_attr_align.attr,
_attr_resource.attr,
_attr_numa_node.attr,
+   _attr_memmap_on_memory.attr,
NULL,
 };
 
diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
b/Documentation/ABI/testing/sysfs-bus-dax
index 6359f7bc9bf4..b34266bfae49 100644
--- a/Documentation/ABI/testing/sysfs-bus-dax
+++ b/Documentation/ABI/testing/sysfs-bus-dax
@@ -134,3 +134,20 @@ KernelVersion: v5.1
 Contact:   nvd...@lists.linux.dev
 Description:
(RO) The id attribute indicates the region id of a dax region.
+
+What:  /sys/bus/dax/devices/daxX.Y/memmap_on_memory
+Date:  January, 2024
+KernelVersion: v6.8
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RW) Control the memmap_on_memory setting if the dax device
+   were to be hotplugged as system memory. This determines whether
+   the 'altmap' for the hotplugged memory will be placed on the
+   device being hotplugged (memmap_on_memory=1) or if it will be
+   placed on regular memory (memmap_on_memory=0). This attribute
+   must be set before the device is handed over to the 'kmem'
+   driver (i.e.  hotplugged into system-ram). Additionally, this
+   depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
+   memmap_on_memory parameter for memory_hotplug. This is
+   typically set on the kernel command line -
+   memory_hotplug.memmap_on_memory set to 'true' or 'force'."

-- 
2.41.0

[PATCH v6 1/4] Documentatiion/ABI: Add ABI documentation for sys-bus-dax

Add the missing sysfs ABI documentation for the device DAX subsystem.
Various ABI attributes under this have been present since v5.1, and more
have been added over time. In preparation for adding a new attribute,
add this file with the historical details.

Cc: Dan Williams 
Signed-off-by: Vishal Verma 
---
 Documentation/ABI/testing/sysfs-bus-dax | 136 
 1 file changed, 136 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
b/Documentation/ABI/testing/sysfs-bus-dax
new file mode 100644
index ..6359f7bc9bf4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-dax
@@ -0,0 +1,136 @@
+What:  /sys/bus/dax/devices/daxX.Y/align
+Date:  October, 2020
+KernelVersion: v5.10
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RW) Provides a way to specify an alignment for a dax device.
+   Values allowed are constrained by the physical address ranges
+   that back the dax device, and also by arch requirements.
+
+What:  /sys/bus/dax/devices/daxX.Y/mapping
+Date:  October, 2020
+KernelVersion: v5.10
+Contact:   nvd...@lists.linux.dev
+Description:
+   (WO) Provides a way to allocate a mapping range under a dax
+   device. Specified in the format -.
+
+What:  /sys/bus/dax/devices/daxX.Y/mapping[0..N]/start
+What:  /sys/bus/dax/devices/daxX.Y/mapping[0..N]/end
+What:  /sys/bus/dax/devices/daxX.Y/mapping[0..N]/page_offset
+Date:  October, 2020
+KernelVersion: v5.10
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RO) A dax device may have multiple constituent discontiguous
+   address ranges. These are represented by the different
+   'mappingX' subdirectories. The 'start' attribute indicates the
+   start physical address for the given range. The 'end' attribute
+   indicates the end physical address for the given range. The
+   'page_offset' attribute indicates the offset of the current
+   range in the dax device.
+
+What:  /sys/bus/dax/devices/daxX.Y/resource
+Date:  June, 2019
+KernelVersion: v5.3
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RO) The resource attribute indicates the starting physical
+   address of a dax device. In case of a device with multiple
+   constituent ranges, it indicates the starting address of the
+   first range.
+
+What:  /sys/bus/dax/devices/daxX.Y/size
+Date:  October, 2020
+KernelVersion: v5.10
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RW) The size attribute indicates the total size of a dax
+   device. For creating subdivided dax devices, or for resizing
+   an existing device, the new size can be written to this as
+   part of the reconfiguration process.
+
+What:  /sys/bus/dax/devices/daxX.Y/numa_node
+Date:  November, 2019
+KernelVersion: v5.5
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RO) If NUMA is enabled and the platform has affinitized the
+   backing device for this dax device, emit the CPU node
+   affinity for this device.
+
+What:  /sys/bus/dax/devices/daxX.Y/target_node
+Date:  February, 2019
+KernelVersion: v5.1
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RO) The target-node attribute is the Linux numa-node that a
+   device-dax instance may create when it is online. Prior to
+   being online the device's 'numa_node' property reflects the
+   closest online cpu node which is the typical expectation of a
+   device 'numa_node'. Once it is online it becomes its own
+   distinct numa node.
+
+What:  $(readlink -f 
/sys/bus/dax/devices/daxX.Y)/../dax_region/available_size
+Date:  October, 2020
+KernelVersion: v5.10
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RO) The available_size attribute tracks available dax region
+   capacity. This only applies to volatile hmem devices, not pmem
+   devices, since pmem devices are defined by nvdimm namespace
+   boundaries.
+
+What:  $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/size
+Date:  July, 2017
+KernelVersion: v5.1
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RO) The size attribute indicates the size of a given dax region
+   in bytes.
+
+What:  $(readlink -f /sys/bus/dax/devices/daxX.Y)/../dax_region/align
+Date:  October, 2020
+KernelVersion: v5.10
+Contact:   nvd...@lists.linux.dev
+Description:
+   (RO) The align attribute indicates alignment of the dax region.
+   Changes on align may not always be valid, when say certain
+

[PATCH v6 2/4] dax/bus: Use guard(device) in sysfs attribute helpers

Use the guard(device) macro to lock a 'struct device', and unlock it
automatically when going out of scope using Scope Based Resource
Management semantics. A lot of the sysfs attribute writes in
drivers/dax/bus.c benefit from a cleanup using these, so change these
where applicable.

Cc: Joao Martins 
Cc: Dan Williams 
Signed-off-by: Vishal Verma 
---
 drivers/dax/bus.c | 143 ++
 1 file changed, 59 insertions(+), 84 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 1ff1ab5fa105..6226de131d17 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -294,13 +294,10 @@ static ssize_t available_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
 {
struct dax_region *dax_region = dev_get_drvdata(dev);
-   unsigned long long size;
 
-   device_lock(dev);
-   size = dax_region_avail_size(dax_region);
-   device_unlock(dev);
+   guard(device)(dev);
 
-   return sprintf(buf, "%llu\n", size);
+   return sprintf(buf, "%llu\n", dax_region_avail_size(dax_region));
 }
 static DEVICE_ATTR_RO(available_size);
 
@@ -309,17 +306,14 @@ static ssize_t seed_show(struct device *dev,
 {
struct dax_region *dax_region = dev_get_drvdata(dev);
struct device *seed;
-   ssize_t rc;
 
if (is_static(dax_region))
return -EINVAL;
 
-   device_lock(dev);
+   guard(device)(dev);
seed = dax_region->seed;
-   rc = sprintf(buf, "%s\n", seed ? dev_name(seed) : "");
-   device_unlock(dev);
 
-   return rc;
+   return sprintf(buf, "%s\n", seed ? dev_name(seed) : "");
 }
 static DEVICE_ATTR_RO(seed);
 
@@ -328,24 +322,28 @@ static ssize_t create_show(struct device *dev,
 {
struct dax_region *dax_region = dev_get_drvdata(dev);
struct device *youngest;
-   ssize_t rc;
 
if (is_static(dax_region))
return -EINVAL;
 
-   device_lock(dev);
+   guard(device)(dev);
youngest = dax_region->youngest;
-   rc = sprintf(buf, "%s\n", youngest ? dev_name(youngest) : "");
-   device_unlock(dev);
 
-   return rc;
+   return sprintf(buf, "%s\n", youngest ? dev_name(youngest) : "");
 }
 
 static ssize_t create_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
 {
struct dax_region *dax_region = dev_get_drvdata(dev);
+   struct dev_dax_data data = {
+   .dax_region = dax_region,
+   .size = 0,
+   .id = -1,
+   .memmap_on_memory = false,
+   };
unsigned long long avail;
+   struct dev_dax *dev_dax;
ssize_t rc;
int val;
 
@@ -358,38 +356,25 @@ static ssize_t create_store(struct device *dev, struct 
device_attribute *attr,
if (val != 1)
return -EINVAL;
 
-   device_lock(dev);
+   guard(device)(dev);
avail = dax_region_avail_size(dax_region);
if (avail == 0)
-   rc = -ENOSPC;
-   else {
-   struct dev_dax_data data = {
-   .dax_region = dax_region,
-   .size = 0,
-   .id = -1,
-   .memmap_on_memory = false,
-   };
-   struct dev_dax *dev_dax = devm_create_dev_dax();
+   return -ENOSPC;
 
-   if (IS_ERR(dev_dax))
-   rc = PTR_ERR(dev_dax);
-   else {
-   /*
-* In support of crafting multiple new devices
-* simultaneously multiple seeds can be created,
-* but only the first one that has not been
-* successfully bound is tracked as the region
-* seed.
-*/
-   if (!dax_region->seed)
-   dax_region->seed = _dax->dev;
-   dax_region->youngest = _dax->dev;
-   rc = len;
-   }
-   }
-   device_unlock(dev);
+   dev_dax = devm_create_dev_dax();
+   if (IS_ERR(dev_dax))
+   return PTR_ERR(dev_dax);
 
-   return rc;
+   /*
+* In support of crafting multiple new devices simultaneously multiple
+* seeds can be created, but only the first one that has not been
+* successfully bound is tracked as the region seed.
+*/
+   if (!dax_region->seed)
+   dax_region->seed = _dax->dev;
+   dax_region->youngest = _dax->dev;
+
+   return len;
 }
 static DEVICE_ATTR_RW(create);
 
@@ -481,12 +466,9 @@ static int __free_dev_dax_id(struct dev_dax *dev_dax)
 static int free_dev_dax_id(struct dev_dax *dev_dax)
 {
struct device *dev = _dax->dev;
-   int rc;
 
-   device_lock(dev);
-   rc = __free_dev_dax_id(dev_dax);
-   device_unlock(dev);
-   return rc;
+

[PATCH v6 0/4] Add DAX ABI for memmap_on_memory

The DAX drivers were missing sysfs ABI documentation entirely.  Add this
missing documentation for the sysfs ABI for DAX regions and Dax devices
in patch 1. Switch to guard(device) semantics for Scope Based Resource
Management for device_{lock,unlock} flows in drivers/dax/bus.c in patch
2. Export mhp_supports_memmap_on_memory() in patch 3. Add a new ABI for
toggling memmap_on_memory semantics in patch 4.

The missing ABI was spotted in [1], this series is a split of the new
ABI additions behind the initial documentation creation.

[1]: 
https://lore.kernel.org/linux-cxl/651f27b728fef_ae7e729...@dwillia2-xfh.jf.intel.com.notmuch/

---
This series depends on [2] which adds the definition for guard(device).
[2]: 
https://lore.kernel.org/r/170250854466.1522182.17555361077409628655.st...@dwillia2-xfh.jf.intel.com

---

Other Logistics -

Andrew, would you prefer patch 3 to go through mm? Or through the dax
tree with an mm ack? The remaining patches are all contained to dax, but
do depend on the memmap_on_memory set that is currently in mm-stable.

---
Changes in v6:
- Use sysfs_emit() in memmap_on_memory_show() (Greg)
- Change the ABI documentation date for memmap_on_memory to January 2024
  as that's likely when the 6.8 merge window will fall (Greg)
- Fix dev->driver check (Ying)
- Link to v5: 
https://lore.kernel.org/r/20231214-vv-dax_abi-v5-0-3f7b00696...@intel.com

Changes in v5:
- Export and check mhp_supports_memmap_on_memory() in the DAX sysfs ABI
  (David)
- Obtain dax_drv under the device lock (Ying)
- Check dax_drv for NULL before dereferencing it (Ying)
- Clean up some repetition in sysfs-bus-dax documentation entries
  (Jonathan)
- A few additional cleanups enabled by guard(device) (Jonathan)
- Drop the DEFINE_GUARD() part of patch 2, add dependency on Dan's patch
  above so it can be backported / applied separately (Jonathan, Dan)
- Link to v4: 
https://lore.kernel.org/r/20231212-vv-dax_abi-v4-0-1351758f0...@intel.com

Changes in v4:
- Hold the device lock when checking if the dax_dev is bound to kmem
  (Ying, Dan)
- Remove dax region checks (and locks) as they were unnecessary.
- Introduce guard(device) for device lock/unlock (Dan)
- Convert the rest of drivers/dax/bus.c to guard(device)
- Link to v3: 
https://lore.kernel.org/r/20231211-vv-dax_abi-v3-0-acf6cc1bd...@intel.com

Changes in v3:
- Fix typo in ABI docs (Zhijian Li)
- Add kernel config and module parameter dependencies to the ABI docs
  entry (David Hildenbrand)
- Ensure kmem isn't active when setting the sysfs attribute (Ying
  Huang)
- Simplify returning from memmap_on_memory_store()
- Link to v2: 
https://lore.kernel.org/r/20231206-vv-dax_abi-v2-0-f4f4f2336...@intel.com

Changes in v2:
- Fix CC lists, patch 1/2 didn't get sent correctly in v1
- Link to v1: 
https://lore.kernel.org/r/20231206-vv-dax_abi-v1-0-474eb88e2...@intel.com

Cc: 
Cc: 
Cc: 
Cc: David Hildenbrand 
Cc: Dave Hansen 
Cc: Huang Ying 
Cc: Greg Kroah-Hartman 
Cc: 
To: Dan Williams 
To: Vishal Verma 
To: Dave Jiang 
To: Andrew Morton 
To: Oscar Salvador 

---
Vishal Verma (4):
  Documentatiion/ABI: Add ABI documentation for sys-bus-dax
  dax/bus: Use guard(device) in sysfs attribute helpers
  mm/memory_hotplug: export mhp_supports_memmap_on_memory()
  dax: add a sysfs knob to control memmap_on_memory behavior

 include/linux/memory_hotplug.h  |   6 ++
 drivers/dax/bus.c   | 179 +---
 mm/memory_hotplug.c |  17 ++-
 Documentation/ABI/testing/sysfs-bus-dax | 153 +++
 4 files changed, 260 insertions(+), 95 deletions(-)
---
base-commit: a6e0c2ca980d75d5ac6b2902c5c0028eaf094db3
change-id: 20231025-vv-dax_abi-17a219c46076

Best regards,
-- 
Vishal Verma

Re: [PATCH v5 4/4] dax: add a sysfs knob to control memmap_on_memory behavior

2023-12-14 Thread Huang, Ying

Vishal Verma  writes:

> Add a sysfs knob for dax devices to control the memmap_on_memory setting
> if the dax device were to be hotplugged as system memory.
>
> The default memmap_on_memory setting for dax devices originating via
> pmem or hmem is set to 'false' - i.e. no memmap_on_memory semantics, to
> preserve legacy behavior. For dax devices via CXL, the default is on.
> The sysfs control allows the administrator to override the above
> defaults if needed.
>
> Cc: David Hildenbrand 
> Cc: Dan Williams 
> Cc: Dave Jiang 
> Cc: Dave Hansen 
> Cc: Huang Ying 
> Tested-by: Li Zhijian 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: David Hildenbrand 
> Signed-off-by: Vishal Verma 
> ---
>  drivers/dax/bus.c   | 38 
> +
>  Documentation/ABI/testing/sysfs-bus-dax | 17 +++
>  2 files changed, 55 insertions(+)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index 6226de131d17..f4d3beec507c 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -1245,6 +1245,43 @@ static ssize_t numa_node_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(numa_node);
>  
> +static ssize_t memmap_on_memory_show(struct device *dev,
> +  struct device_attribute *attr, char *buf)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> +
> + return sprintf(buf, "%d\n", dev_dax->memmap_on_memory);
> +}
> +
> +static ssize_t memmap_on_memory_store(struct device *dev,
> +   struct device_attribute *attr,
> +   const char *buf, size_t len)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> + struct dax_device_driver *dax_drv;
> + ssize_t rc;
> + bool val;
> +
> + rc = kstrtobool(buf, );
> + if (rc)
> + return rc;
> +
> + if (val == true && !mhp_supports_memmap_on_memory()) {
> + dev_dbg(dev, "memmap_on_memory is not available\n");
> + return -EOPNOTSUPP;
> + }
> +
> + guard(device)(dev);
> + dax_drv = to_dax_drv(dev->driver);

Although "struct driver" is the first member of "struct
dax_device_driver", I feel the code is fragile to depends on that.  Can
we check dev->driver directly instead?

--
Best Regards,
Huang, Ying

> + if (dax_drv && dev_dax->memmap_on_memory != val &&
> + dax_drv->type == DAXDRV_KMEM_TYPE)
> + return -EBUSY;
> + dev_dax->memmap_on_memory = val;
> +
> + return len;
> +}
> +static DEVICE_ATTR_RW(memmap_on_memory);
> +
>  static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, 
> int n)
>  {
>   struct device *dev = container_of(kobj, struct device, kobj);
> @@ -1271,6 +1308,7 @@ static struct attribute *dev_dax_attributes[] = {
>   _attr_align.attr,
>   _attr_resource.attr,
>   _attr_numa_node.attr,
> + _attr_memmap_on_memory.attr,
>   NULL,
>  };
>  
> diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
> b/Documentation/ABI/testing/sysfs-bus-dax
> index 6359f7bc9bf4..40d9965733b2 100644
> --- a/Documentation/ABI/testing/sysfs-bus-dax
> +++ b/Documentation/ABI/testing/sysfs-bus-dax
> @@ -134,3 +134,20 @@ KernelVersion:   v5.1
>  Contact: nvd...@lists.linux.dev
>  Description:
>   (RO) The id attribute indicates the region id of a dax region.
> +
> +What:/sys/bus/dax/devices/daxX.Y/memmap_on_memory
> +Date:October, 2023
> +KernelVersion:   v6.8
> +Contact: nvd...@lists.linux.dev
> +Description:
> + (RW) Control the memmap_on_memory setting if the dax device
> + were to be hotplugged as system memory. This determines whether
> + the 'altmap' for the hotplugged memory will be placed on the
> + device being hotplugged (memmap_on_memory=1) or if it will be
> + placed on regular memory (memmap_on_memory=0). This attribute
> + must be set before the device is handed over to the 'kmem'
> + driver (i.e.  hotplugged into system-ram). Additionally, this
> + depends on CONFIG_MHP_MEMMAP_ON_MEMORY, and a globally enabled
> + memmap_on_memory parameter for memory_hotplug. This is
> + typically set on the kernel command line -
> + memory_hotplug.memmap_on_memory set to 'true' or 'force'."

Re: [PATCH] ring-buffer: Do not try to put back write_stamp

On Thu, 14 Dec 2023 22:18:03 -0500
Steven Rostedt  wrote:

> From: "Steven Rostedt (Google)" 
> 
> If an update to an event is interrupted by another event between the time
> the initial event allocated its buffer and where it wrote to the
> write_stamp, the code try to reset the write stamp back to the what it had
> just overwritten. It knows that it was overwritten via checking the
> before_stamp, and if it didn't match what it wrote to the before_stamp
> before it allocated its space, it knows it was overwritten.
> 
> To put back the write_stamp, it uses the before_stamp it read. The problem
> here is that by writing the before_stamp to the write_stamp it makes the
> two equal again, which means that the write_stamp can be considered valid
> as the last timestamp written to the ring buffer. But this is not
> necessarily true. The event that interrupted the event could have been
> interrupted in a way that it was interrupted as well, and can end up
> leaving with an invalid write_stamp. But if this happens and returns to
> this context that uses the before_stamp to update the write_stamp again,
> it can possibly incorrectly make it valid, causing later events to have in
> correct time stamps.
> 
> As it is OK to leave this function with an invalid write_stamp (one that
> doesn't match the before_stamp), there's no reason to try to make it valid
> again in this case. If this race happens, then just leave with the invalid
> write_stamp and the next event to come along will just add a absolute
> timestamp and validate everything again.
> 
> Bonus points: This gets rid of another cmpxchg64!
> 

With this patch and 
https://lore.kernel.org/linux-trace-kernel/2023124420.36dde...@gandalf.local.home

I remove two critical uses of cmpxchg64, leaving only two other use cases.
Both of which are pretty much optional!

One is used in the discard path, which I plan on getting rid of anyway, and
if the cmpxchg64 fails there, it just doesn't discard the event but
converts it to padding.

The other location is in another slow path were it tries to get a more
accurate timestamp. If that cmpxchg64 fails, it just uses the timestamp of
the previous event that interrupted it. Which is what the code use to do
before all this timestamp magic was done.

That is, I can still get rid of the 32-bit cmpxchg64 logic and instead just
replace it with:

#ifdef CONFIG_ARCH_DOES_NOT_HAVE_CMPXCHG64 /* I made this up, but you get the 
gist */
static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
return false;
}
#else
static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
return local64_try_cmpxchg(>time, , set);
}
#endif

And that makes things so much easier!

Again, after this and the other patch applied, the cmpxcg64 is just a
nice-to-have and not required for the ring buffer to work.

-- Steve

[PATCH v2] ring-buffer: Do not try to put back write_stamp

From: "Steven Rostedt (Google)" 

If an update to an event is interrupted by another event between the time
the initial event allocated its buffer and where it wrote to the
write_stamp, the code try to reset the write stamp back to the what it had
just overwritten. It knows that it was overwritten via checking the
before_stamp, and if it didn't match what it wrote to the before_stamp
before it allocated its space, it knows it was overwritten.

To put back the write_stamp, it uses the before_stamp it read. The problem
here is that by writing the before_stamp to the write_stamp it makes the
two equal again, which means that the write_stamp can be considered valid
as the last timestamp written to the ring buffer. But this is not
necessarily true. The event that interrupted the event could have been
interrupted in a way that it was interrupted as well, and can end up
leaving with an invalid write_stamp. But if this happens and returns to
this context that uses the before_stamp to update the write_stamp again,
it can possibly incorrectly make it valid, causing later events to have in
correct time stamps.

As it is OK to leave this function with an invalid write_stamp (one that
doesn't match the before_stamp), there's no reason to try to make it valid
again in this case. If this race happens, then just leave with the invalid
write_stamp and the next event to come along will just add a absolute
timestamp and validate everything again.

Cc: sta...@vger.kernel.org
Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running 
time stamp")
Signed-off-by: Steven Rostedt (Google) 
---
Changes since v1: 
https://lore.kernel.org/all/20231214221803.1a923...@gandalf.local.home/

- I forgot to remove the commit that deletes the rb_time_* logic.
  Had to rebase the patch.

 kernel/trace/ring_buffer.c | 29 ++---
 1 file changed, 6 insertions(+), 23 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d9caee7f542..2668dde23343 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3612,14 +3612,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu 
*cpu_buffer,
}
 
if (likely(tail == w)) {
-   u64 save_before;
-   bool s_ok;
-
/* Nothing interrupted us between A and C */
  /*D*/ rb_time_set(_buffer->write_stamp, info->ts);
-   barrier();
- /*E*/ s_ok = rb_time_read(_buffer->before_stamp, _before);
-   RB_WARN_ON(cpu_buffer, !s_ok);
+   /*
+* If something came in between C and D, the write stamp
+* may now not be in sync. But that's fine as the before_stamp
+* will be different and then next event will just be forced
+* to use an absolute timestamp.
+*/
if (likely(!(info->add_timestamp &
 (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE
/* This did not interrupt any time update */
@@ -3627,24 +3627,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
else
/* Just use full timestamp for interrupting event */
info->delta = info->ts;
-   barrier();
check_buffer(cpu_buffer, info, tail);
-   if (unlikely(info->ts != save_before)) {
-   /* SLOW PATH - Interrupted between C and E */
-
-   a_ok = rb_time_read(_buffer->write_stamp, 
>after);
-   RB_WARN_ON(cpu_buffer, !a_ok);
-
-   /* Write stamp must only go forward */
-   if (save_before > info->after) {
-   /*
-* We do not care about the result, only that
-* it gets updated atomically.
-*/
-   (void)rb_time_cmpxchg(_buffer->write_stamp,
- info->after, save_before);
-   }
-   }
} else {
u64 ts;
/* SLOW PATH - Interrupted between A and C */
-- 
2.42.0

[PATCH] ring-buffer: Do not try to put back write_stamp

From: "Steven Rostedt (Google)" 

If an update to an event is interrupted by another event between the time
the initial event allocated its buffer and where it wrote to the
write_stamp, the code try to reset the write stamp back to the what it had
just overwritten. It knows that it was overwritten via checking the
before_stamp, and if it didn't match what it wrote to the before_stamp
before it allocated its space, it knows it was overwritten.

To put back the write_stamp, it uses the before_stamp it read. The problem
here is that by writing the before_stamp to the write_stamp it makes the
two equal again, which means that the write_stamp can be considered valid
as the last timestamp written to the ring buffer. But this is not
necessarily true. The event that interrupted the event could have been
interrupted in a way that it was interrupted as well, and can end up
leaving with an invalid write_stamp. But if this happens and returns to
this context that uses the before_stamp to update the write_stamp again,
it can possibly incorrectly make it valid, causing later events to have in
correct time stamps.

As it is OK to leave this function with an invalid write_stamp (one that
doesn't match the before_stamp), there's no reason to try to make it valid
again in this case. If this race happens, then just leave with the invalid
write_stamp and the next event to come along will just add a absolute
timestamp and validate everything again.

Bonus points: This gets rid of another cmpxchg64!

Cc: sta...@vger.kernel.org
Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running 
time stamp")
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 26 ++
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9fdbd08af72f..378ff9e53fe6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3418,12 +3418,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu 
*cpu_buffer,
}
 
if (likely(tail == w)) {
-   u64 save_before;
-
/* Nothing interrupted us between A and C */
  /*D*/ rb_time_set(_buffer->write_stamp, info->ts);
-   barrier();
- /*E*/ rb_time_read(_buffer->before_stamp, _before);
+   /*
+* If something came in between C and D, the write stamp
+* may now not be in sync. But that's fine as the before_stamp
+* will be different and then next event will just be forced
+* to use an absolute timestamp.
+*/
if (likely(!(info->add_timestamp &
 (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE
/* This did not interrupt any time update */
@@ -3431,23 +3433,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
else
/* Just use full timestamp for interrupting event */
info->delta = info->ts;
-   barrier();
check_buffer(cpu_buffer, info, tail);
-   if (unlikely(info->ts != save_before)) {
-   /* SLOW PATH - Interrupted between C and E */
-
-   rb_time_read(_buffer->write_stamp, >after);
-
-   /* Write stamp must only go forward */
-   if (save_before > info->after) {
-   /*
-* We do not care about the result, only that
-* it gets updated atomically.
-*/
-   (void)rb_time_cmpxchg(_buffer->write_stamp,
- info->after, save_before);
-   }
-   }
} else {
u64 ts;
/* SLOW PATH - Interrupted between A and C */
-- 
2.42.0

Re: [PATCH RESEND] mac802154: Fix uninit-value access in ieee802154_hdr_push_sechdr

2023-12-14 Thread Alexander Aring

Hi,

On Thu, Dec 14, 2023 at 10:01 PM Alexander Aring  wrote:
>
> Hi,
>
> On Mon, Dec 4, 2023 at 3:57 AM Miquel Raynal  
> wrote:
> >
> > Hi Zhang,
> >
> > zhang_shur...@foxmail.com wrote on Sat,  2 Dec 2023 22:58:52 +0800:
> >
> > > The syzkaller reported an issue:
> >
> > Subject should start with [PATCH wpan]
> >
> > >
> > > BUG: KMSAN: uninit-value in ieee802154_hdr_push_sechdr 
> > > net/ieee802154/header_ops.c:54 [inline]
> > > BUG: KMSAN: uninit-value in ieee802154_hdr_push+0x971/0xb90 
> > > net/ieee802154/header_ops.c:108
> > >  ieee802154_hdr_push_sechdr net/ieee802154/header_ops.c:54 [inline]
> > >  ieee802154_hdr_push+0x971/0xb90 net/ieee802154/header_ops.c:108
> > >  ieee802154_header_create+0x9c0/0xc00 net/mac802154/iface.c:396
> > >  wpan_dev_hard_header include/net/cfg802154.h:494 [inline]
> > >  dgram_sendmsg+0xd1d/0x1500 net/ieee802154/socket.c:677
> > >  ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96
> > >  sock_sendmsg_nosec net/socket.c:725 [inline]
> > >  sock_sendmsg net/socket.c:748 [inline]
> > >  sys_sendmsg+0x9c2/0xd60 net/socket.c:2494
> > >  ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2548
> > >  __sys_sendmsg+0x225/0x3c0 net/socket.c:2577
> > >  __compat_sys_sendmsg net/compat.c:346 [inline]
> > >  __do_compat_sys_sendmsg net/compat.c:353 [inline]
> > >  __se_compat_sys_sendmsg net/compat.c:350 [inline]
> > >
> > > We found hdr->key_id_mode is uninitialized in 
> > > mac802154_set_header_security()
> > > which indicates hdr.fc.security_enabled should be 0. However, it is set 
> > > to be cb->secen before.
> > > Later, ieee802154_hdr_push_sechdr is invoked, causing KMSAN complains 
> > > uninit-value issue.
> >
> > I am not too deeply involved in the security header but for me it feels
> > like your patch does the opposite of what's needed. We should maybe
> > initialize hdr->key_id_mode based on the value in cb->secen, no? (maybe
> > Alexander will have a better understanding than I have).
>
> I can't help yet with a better answer why syzkaller reports it but it
> will break things as we using skb->cb to pass additional parameters
> through header_ops->create()... in this case it is some sockopts of
> af802154, I guess.
>

Maybe we just need to init some "more" defaults in [0]

- Alex

[0] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/net/ieee802154/socket.c?h=v6.7-rc5#n474

Re: [PATCH RESEND] mac802154: Fix uninit-value access in ieee802154_hdr_push_sechdr

2023-12-14 Thread Alexander Aring

Hi,

On Mon, Dec 4, 2023 at 3:57 AM Miquel Raynal  wrote:
>
> Hi Zhang,
>
> zhang_shur...@foxmail.com wrote on Sat,  2 Dec 2023 22:58:52 +0800:
>
> > The syzkaller reported an issue:
>
> Subject should start with [PATCH wpan]
>
> >
> > BUG: KMSAN: uninit-value in ieee802154_hdr_push_sechdr 
> > net/ieee802154/header_ops.c:54 [inline]
> > BUG: KMSAN: uninit-value in ieee802154_hdr_push+0x971/0xb90 
> > net/ieee802154/header_ops.c:108
> >  ieee802154_hdr_push_sechdr net/ieee802154/header_ops.c:54 [inline]
> >  ieee802154_hdr_push+0x971/0xb90 net/ieee802154/header_ops.c:108
> >  ieee802154_header_create+0x9c0/0xc00 net/mac802154/iface.c:396
> >  wpan_dev_hard_header include/net/cfg802154.h:494 [inline]
> >  dgram_sendmsg+0xd1d/0x1500 net/ieee802154/socket.c:677
> >  ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96
> >  sock_sendmsg_nosec net/socket.c:725 [inline]
> >  sock_sendmsg net/socket.c:748 [inline]
> >  sys_sendmsg+0x9c2/0xd60 net/socket.c:2494
> >  ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2548
> >  __sys_sendmsg+0x225/0x3c0 net/socket.c:2577
> >  __compat_sys_sendmsg net/compat.c:346 [inline]
> >  __do_compat_sys_sendmsg net/compat.c:353 [inline]
> >  __se_compat_sys_sendmsg net/compat.c:350 [inline]
> >
> > We found hdr->key_id_mode is uninitialized in 
> > mac802154_set_header_security()
> > which indicates hdr.fc.security_enabled should be 0. However, it is set to 
> > be cb->secen before.
> > Later, ieee802154_hdr_push_sechdr is invoked, causing KMSAN complains 
> > uninit-value issue.
>
> I am not too deeply involved in the security header but for me it feels
> like your patch does the opposite of what's needed. We should maybe
> initialize hdr->key_id_mode based on the value in cb->secen, no? (maybe
> Alexander will have a better understanding than I have).

I can't help yet with a better answer why syzkaller reports it but it
will break things as we using skb->cb to pass additional parameters
through header_ops->create()... in this case it is some sockopts of
af802154, I guess.

side note: we should stop doing that with skb->cb and introduce some
802.15.4 specific header_ops callback structure and not depend on such
generic callback which does not fit here (and maybe somebody does a
wrapper around that and reuse skb->cb for their needs, etc.)

- Alex

[PATCH] rpmsg: virtio: free driver_override when rpmsg_remove()

2023-12-14 Thread Xiaolei Wang

free driver_override when rpmsg_remove(), otherwise
the following memory leak will occur:

unreferenced object 0xd55d7080 (size 128):
  comm "kworker/u8:2", pid 56, jiffies 4294893188 (age 214.272s)
  hex dump (first 32 bytes):
72 70 6d 73 67 5f 6e 73 00 00 00 00 00 00 00 00  rpmsg_ns
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
  backtrace:
[<9c94c9c1>] __kmem_cache_alloc_node+0x1f8/0x320
[<2300d89b>] __kmalloc_node_track_caller+0x44/0x70
[<228a60c3>] kstrndup+0x4c/0x90
[<77158695>] driver_set_override+0xd0/0x164
[<3e9c4ea5>] rpmsg_register_device_override+0x98/0x170
[<1c0c89a8>] rpmsg_ns_register_device+0x24/0x30
[<8bbf8fa2>] rpmsg_probe+0x2e0/0x3ec
[] virtio_dev_probe+0x1c0/0x280
[<443331cc>] really_probe+0xbc/0x2dc
[<391064b1>] __driver_probe_device+0x78/0xe0
[] driver_probe_device+0xd8/0x160
[<9c3bd5df>] __device_attach_driver+0xb8/0x140
[<43cd7614>] bus_for_each_drv+0x7c/0xd4
[<3b929a36>] __device_attach+0x9c/0x19c
[] device_initial_probe+0x14/0x20
[<3c999637>] bus_probe_device+0xa0/0xac

Signed-off-by: Xiaolei Wang 
---
 drivers/rpmsg/virtio_rpmsg_bus.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index dc87965f8164..1062939c3264 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -378,6 +378,7 @@ static void virtio_rpmsg_release_device(struct device *dev)
struct rpmsg_device *rpdev = to_rpmsg_device(dev);
struct virtio_rpmsg_channel *vch = to_virtio_rpmsg_channel(rpdev);
 
+   kfree(rpdev->driver_override);
kfree(vch);
 }
 
-- 
2.25.1

Re: [PATCH v2] virtio: Add support for no-reset virtio PCI PM

2023-12-14 Thread David Stevens

On Thu, Dec 14, 2023 at 6:47 PM David Hildenbrand  wrote:
>
> On 08.12.23 08:07, David Stevens wrote:
> > If a virtio_pci_device supports native PCI power management and has the
> > No_Soft_Reset bit set, then skip resetting and reinitializing the device
> > when suspending and restoring the device. This allows system-wide low
> > power states like s2idle to be used in systems with stateful virtio
> > devices that can't simply be re-initialized (e.g. virtio-fs).
> >
> > Signed-off-by: David Stevens 
> > ---
> > v1 -> v2:
> >   - Check the No_Soft_Reset bit
> >
> >   drivers/virtio/virtio_pci_common.c | 34 +-
> >   1 file changed, 33 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/virtio/virtio_pci_common.c 
> > b/drivers/virtio/virtio_pci_common.c
> > index c2524a7207cf..3a95ecaf12dc 100644
> > --- a/drivers/virtio/virtio_pci_common.c
> > +++ b/drivers/virtio/virtio_pci_common.c
> > @@ -492,8 +492,40 @@ static int virtio_pci_restore(struct device *dev)
> >   return virtio_device_restore(_dev->vdev);
> >   }
> >
> > +static bool vp_supports_pm_no_reset(struct device *dev)
> > +{
> > + struct pci_dev *pci_dev = to_pci_dev(dev);
> > + u16 pmcsr;
> > +
> > + if (!pci_dev->pm_cap)
> > + return false;
> > +
> > + pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, );
> > + if (PCI_POSSIBLE_ERROR(pmcsr)) {
> > + dev_err(dev, "Unable to query pmcsr");
> > + return false;
> > + }
> > +
> > + return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET;
> > +}
> > +
> > +static int virtio_pci_suspend(struct device *dev)
> > +{
> > + return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev);
> > +}
> > +
> > +static int virtio_pci_resume(struct device *dev)
> > +{
> > + return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev);
> > +}
> > +
> >   static const struct dev_pm_ops virtio_pci_pm_ops = {
> > - SET_SYSTEM_SLEEP_PM_OPS(virtio_pci_freeze, virtio_pci_restore)
> > + .suspend = virtio_pci_suspend,
> > + .resume = virtio_pci_resume,
> > + .freeze = virtio_pci_freeze,
> > + .thaw = virtio_pci_restore,
> > + .poweroff = virtio_pci_freeze,
> > + .restore = virtio_pci_restore,
> >   };
> >   #endif
> >
>
> Am I correct with my assumption that this will make s2idle work with 
> virtio-mem-pci as well?
>
> Right now, all suspending is disabled, but really only s4/hibernate is 
> problematic.
>
> [root@vm-0 ~]# echo "s2idle" > /sys/power/mem_sleep
> [root@vm-0 ~]# echo "mem" > /sys/power/state
> [  101.822991] PM: suspend entry (s2idle)
> [  101.828978] Filesystems sync: 0.004 seconds
> [  101.831618] Freezing user space processes
> [  101.834569] Freezing user space processes completed (elapsed 0.001 seconds)
> [  101.836915] OOM killer disabled.
> [  101.838072] Freezing remaining freezable tasks
> [  101.841054] Freezing remaining freezable tasks completed (elapsed 0.001 
> seconds)
> [  101.843538] printk: Suspending console(s) (use no_console_suspend to debug)
> [  101.957676] virtio_mem virtio0: save/restore not supported.
> [  101.957683] virtio-pci :00:02.0: PM: pci_pm_suspend(): 
> virtio_pci_freeze+0x0/0x50 returns -1
> [  101.957702] virtio-pci :00:02.0: PM: dpm_run_callback(): 
> pci_pm_suspend+0x0/0x170 returns -1
> [  101.957718] virtio-pci :00:02.0: PM: failed to suspend async: error -1

QEMU's virtio-pci devices don't advertise no_soft_reset, so this patch
won't affect vanilla QEMU. But if you add PCI_PM_CTRL_NO_SOFT_RESET to
the capability, then it should work. I'm working with crosvm, which
doesn't have virtio-mem implemented, but this patch makes virtio-fs
work with no extra kernel changes.

-David

Re: [PATCH] [v2] nvdimm-btt: fix several memleaks

2023-12-14 Thread Ira Weiny

Dinghao Liu wrote:
> Resources allocated by kcalloc() in btt_freelist_init(),
> btt_rtt_init(), and btt_maplocks_init() are not correctly
> released in their callers when an error happens. For
> example, when an error happens in btt_freelist_init(), its
> caller discover_arenas() will directly free arena, which makes
> arena->freelist a leaked memory. Fix these memleaks by using
> devm_kcalloc() to make the memory auto-freed on driver detach.

Thanks for this work!

> 
> Fixes: 5212e11fde4d ("nd_btt: atomic sector updates")
> Signed-off-by: Dinghao Liu 
> ---
> 
> Changelog:
> 
> v2: -Use devm_kcalloc() to fix the memleaks.
> -Fix the potential leaked memory in btt_rtt_init()
>  and btt_maplocks_init().
> ---
>  drivers/nvdimm/btt.c | 35 ---
>  1 file changed, 16 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
> index d5593b0dc700..c55231f42617 100644
> --- a/drivers/nvdimm/btt.c
> +++ b/drivers/nvdimm/btt.c
> @@ -531,13 +531,13 @@ static int arena_clear_freelist_error(struct arena_info 
> *arena, u32 lane)
>   return ret;
>  }
>  
> -static int btt_freelist_init(struct arena_info *arena)
> +static int btt_freelist_init(struct device *dev, struct arena_info *arena)

Both struct arena_info and struct btt contain references to struct nd_btt
which is the device you are passing down this call stack.

Just use the device in the arena/btt rather than passing a device
structure.  That makes the code easier to read and ensures that the device
associated with this arena or btt is used.

>  {
>   int new, ret;
>   struct log_entry log_new;
>   u32 i, map_entry, log_oldmap, log_newmap;
>  
> - arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
> + arena->freelist = devm_kcalloc(dev, arena->nfree, sizeof(struct 
> free_entry),

... devm_kcalloc(>nd_btt.dev, ...)

>   GFP_KERNEL);
>   if (!arena->freelist)
>   return -ENOMEM;
> @@ -718,20 +718,20 @@ static int log_set_indices(struct arena_info *arena)
>   return 0;
>  }
>  
> -static int btt_rtt_init(struct arena_info *arena)
> +static int btt_rtt_init(struct device *dev, struct arena_info *arena)
>  {
> - arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
> + arena->rtt = devm_kcalloc(dev, arena->nfree, sizeof(u32), GFP_KERNEL);
>   if (arena->rtt == NULL)
>   return -ENOMEM;
>  
>   return 0;
>  }
>  
> -static int btt_maplocks_init(struct arena_info *arena)
> +static int btt_maplocks_init(struct device *dev, struct arena_info *arena)
>  {
>   u32 i;
>  
> - arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock),
> + arena->map_locks = devm_kcalloc(dev, arena->nfree, sizeof(struct 
> aligned_lock),
>   GFP_KERNEL);
>   if (!arena->map_locks)
>   return -ENOMEM;
> @@ -805,9 +805,6 @@ static void free_arenas(struct btt *btt)
>  
>   list_for_each_entry_safe(arena, next, >arena_list, list) {
>   list_del(>list);
> - kfree(arena->rtt);
> - kfree(arena->map_locks);
> - kfree(arena->freelist);

This does not quite work.

free_arenas() is used in the error paths of create_arenas() and
discover_arenas().  In those cases devm_kfree() is probably a better way
to clean up this.

However...

>   debugfs_remove_recursive(arena->debugfs_dir);
>   kfree(arena);

Why can't arena be allocated with devm_*?

We need to change this up a bit more to handle the error path vs regular
device shutdown free (automatic devm frees).

Ira

Re: [PATCH v2 2/2] ARM: dts: qcom: msm8926-motorola-peregrine: Add initial device tree

2023-12-14 Thread Konrad Dybcio





On 12/14/23 21:59, André Apitzsch wrote:

This dts adds support for Motorola Moto G 4G released in 2013.

Add a device tree with initial support for:

- GPIO keys
- Hall sensor
- SDHCI
- Vibrator

Signed-off-by: André Apitzsch 
---

Excellent, thanks!

Reviewed-by: Konrad Dybcio 

[...]


+   pm8226_lvs1: lvs1 {
+   /* Pull-up for I2C lines */
+   regulator-always-on;
+   };

just one q: is this intended to stay a-on, or will it be bound
to some i2c host (presumably camera)?

Konrad

[PATCH v2 1/2] dt-bindings: arm: qcom: Add Motorola Moto G 4G (2013)

2023-12-14 Thread André Apitzsch

Document the compatible for the MSM8926-based Motorola Moto G 4G
smartphone released in 2013.

Acked-by: Krzysztof Kozlowski 
Signed-off-by: André Apitzsch 
---
 Documentation/devicetree/bindings/arm/qcom.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml 
b/Documentation/devicetree/bindings/arm/qcom.yaml
index c968412d86b8..91dc3205d09e 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -185,6 +185,7 @@ properties:
   - enum:
   - microsoft,superman-lte
   - microsoft,tesla
+  - motorola,peregrine
   - const: qcom,msm8926
   - const: qcom,msm8226
 

-- 
2.43.0

[PATCH v2 0/2] ARM: dts: qcom: msm8926-motorola-peregrine: Add initial device tree

2023-12-14 Thread André Apitzsch

This dts adds support for Motorola Moto G 4G released in 2013.

Add a device tree with initial support for:

- GPIO keys
- Hall sensor
- SDHCI
- Vibrator

Signed-off-by: André Apitzsch 
---
Changes in v2:
- Make clear it's about Moto G 4G released in 2013.
- Add a-b to patch 1/2.
- Combine nodes for gpio-keys.
- Link to v1: 
https://lore.kernel.org/r/20231213-peregrine-v1-0-5229e21bc...@apitzsch.eu

---
André Apitzsch (2):
  dt-bindings: arm: qcom: Add Motorola Moto G 4G (2013)
  ARM: dts: qcom: msm8926-motorola-peregrine: Add initial device tree

 Documentation/devicetree/bindings/arm/qcom.yaml|   1 +
 arch/arm/boot/dts/qcom/Makefile|   1 +
 .../dts/qcom/qcom-msm8926-motorola-peregrine.dts   | 291 +
 3 files changed, 293 insertions(+)
---
base-commit: 11651f8cb2e88372d4ed523d909514dc9a613ea3
change-id: 20231208-peregrine-902ab2fb2cf5

Best regards,
-- 
André Apitzsch

Re: [PATCH] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 12:50:29 -0800
Linus Torvalds  wrote:

> > But do all archs have an implementation of cmpxchg64, even if it requires
> > disabling interrupts? If not, then I definitely cannot remove this code.  
> 
> We have a generic header file, so anybody who uses that would get the
> fallback version, ie
> 
> arch_cmpxchg64 -> generic_cmpxchg64_local -> __generic_cmpxchg64_local
> 
> which does that irq disabling thing.
> 
> But no, not everybody is guaranteed to use that fallback. From a quick
> look, ARC, hexagon and CSky don't do this, for example.
> 
> And then I got bored and stopped looking.
> 
> My guess is that *most* 32-bit architectures do not have a 64-bit
> cmpxchg - not even the irq-safe one.

OK, that means I have to completely abandon this change, even for the next
merge window.

I may add a check if cmpxchg64 exists before falling back to the 32bit
cmpxchg version. But even that will have to wait till the next merge 
window, with a fixes tag (but not Cc'd stable) in case anyone would like to
backport it.

Thanks for the advice!

-- Steve

Re: [PATCH] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 at 12:35, Steven Rostedt  wrote:
>
> On Thu, 14 Dec 2023 11:44:55 -0800
> Linus Torvalds  wrote:
>
> > On Thu, 14 Dec 2023 at 08:55, Steven Rostedt  wrote:
> > >
> > > And yes, this does get called in NMI context.
> >
> > Not on an i486-class machine they won't. You don't have a local apic
> > on those, and they won't have any NMI sources under our control (ie
> > NMI does exist, but we're talking purely legacy NMI for "motherboard
> > problems" like RAM parity errors etc)
>
> Ah, so we should not worry about being in NMI context without a 64bit cmpxchg?

.. on x86.

Elsewhere, who knows?

It is *probably* true in most situations. '32-bit' => 'legacy' =>
'less likely to have fancy profiling / irq setups'.

But I really don't know.

> > So no. You need to forget about the whole "do a 64-bit cmpxchg on
> > 32-bit architectures" as being some kind of solution in the short
> > term.
>
> But do all archs have an implementation of cmpxchg64, even if it requires
> disabling interrupts? If not, then I definitely cannot remove this code.

We have a generic header file, so anybody who uses that would get the
fallback version, ie

arch_cmpxchg64 -> generic_cmpxchg64_local -> __generic_cmpxchg64_local

which does that irq disabling thing.

But no, not everybody is guaranteed to use that fallback. From a quick
look, ARC, hexagon and CSky don't do this, for example.

And then I got bored and stopped looking.

My guess is that *most* 32-bit architectures do not have a 64-bit
cmpxchg - not even the irq-safe one.

For the UP case you can do your own, of course.

Linus

ARM64 Livepatch based on SFrame

2023-12-14 Thread Madhavan T. Venkataraman

Hi Mark,

I attended your presentation in the LPC. You mentioned that you could use some 
help with some pre-requisites for the Livepatch feature.
I would like to lend a hand.

What would you like me to implement?

I would also like to implement Unwind Hints for the feature. If you want a 
specific format for the hints, let me know.

Looking forward to help out with the feature.

Madhavan

Re: [PATCH v3] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 12:32:38 -0800
Linus Torvalds  wrote:

> On Thu, 14 Dec 2023 at 12:30, Linus Torvalds
>  wrote:
> >
> > Read my email. Don't do random x86-centric things. We have that
> >
> >   #ifndef system_has_cmpxchg64
> >   #define system_has_cmpxchg64() false
> >   #endif
> >
> > which should work.  
> 
> And again, by "should work" I mean that it would disable this entirely
> on things like arm32 until the arm people decide they care. But at
> least it won't use an unsafe non-working 64-bit cmpxchg.

If archs have no implementation of cmpxchg64 then the code cannot be
removed. If it's just slower and emulated, then it shouldn't be a big deal.
The only thing it may lose is tracing in NMI context, which I'm not sure
that really matters.

> 
> And no, for 6.7, only fix reported bugs. No big reorgs at all,
> particularly for something that likely has never been hit by any user
> and sounds like this all just came out of discussion.

The discussion came out of adding new tests to cover new changes I'm making
in the ring buffer code that happened to trigger subtle bugs in the 32-bit
version of reading the 64-bit timestamps. The reason for that code, is
because of the 64-bit cmpcxhg that is required to implement it. If we are
keeping this code, then there's 2 or 3 fixes to it that I need to send to
you, and also one outstanding one that in theory can be a bug, but in
practice is highly unlikely to ever be hit. The fix for that one is a bit
more involved, and will have to come later.

When I was discussing these fixes and other changes with Mathieu, we
started thinking "is this complexity worth it?" and "does anything actually
need this?".

That's where this patch originated from.

Now, I could also make this special code only compile for the
"!system_has_cmpxchg64" case as well, which shouldn't punish the Atom
processor to do 3 cmpxchg's instead of one cmpxchg8b.

-- Steve

Re: [PATCH] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 11:44:55 -0800
Linus Torvalds  wrote:

> On Thu, 14 Dec 2023 at 08:55, Steven Rostedt  wrote:
> >
> > And yes, this does get called in NMI context.  
> 
> Not on an i486-class machine they won't. You don't have a local apic
> on those, and they won't have any NMI sources under our control (ie
> NMI does exist, but we're talking purely legacy NMI for "motherboard
> problems" like RAM parity errors etc)

Ah, so we should not worry about being in NMI context without a 64bit cmpxchg?

> 
> > I had a patch that added:
> >
> > +   /* ring buffer does cmpxchg, make sure it is safe in NMI context */
> > +   if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
> > +   (unlikely(in_nmi( {
> > +   return NULL;
> > +   }  
> 
> CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG doesn't work on x86 in this context,
> because the issue is that yes, there's a safe 'cmpxchg', but that's
> not what you want.

Sorry, that's from another patch that I combined into this one that I added
in case there's architectures that have NMIs but need to avoid cmpxchg, as
this code does normal long word cmpxchg too. And that change *should* go to
you and stable. It's either just luck that things didn't crash on those
systems today. Or it happens so infrequently, nobody reported it.

> 
> You want the save cmpxchg64, which is an entirely different beast.

Understood, but this code has both. It's just that the "special" code I'm
removing does the 64-bit cmpxchg.

> 
> And honestly, I don't think that NMI_SAFE_CMPXCHG covers the
> double-word case anywhere else either, except purely by luck.

I still need to cover the normal cmpxchg. I'll keep that a separate patch.

> 
> In mm/slab.c, we also use a double-wide cmpxchg, and there the rule
> has literally been that it's conditional on
> 
>  (a) system_has_cmpxchg64() existing as a macro
> 
>  (b) using that macro to then gate - at runtime - whether it actually
> works or not
> 
> I think - but didn't check - that we essentially only enable the
> two-word case on x86 as a result, and fall back to the slow case on
> all other architectures - and on the i486 case.
> 
> That said, other architectures *do* have a working double-word
> cmpxchg, but I wouldn't guarantee it. For example, 32-bit arm does
> have one using ldrexd/strexd, but that only exists on arm v6+.
> 
> And guess what? You'll silently get a "disable interrupts, do it as a
> non-atomic load-store" on arm too for that case. And again, pre-v6 arm
> is about as relevant as i486 is, but my point is, that double-word
> cmpxchg you rely on simply DOES NOT EXIST on 32-bit platforms except
> under special circumstances.
> 
> So this isn't a "x86 is the odd man out". This is literally generic.
> 
> > Now back to my original question. Are you OK with me sending this to you
> > now, or should I send you just the subtle fixes to the 32-bit rb_time_*
> > code and keep this patch for the merge window?  
> 
> I'm absolutely not taking some untested random "let's do 64-bit
> cmpxchg that we know is broken on 32-bit using broken conditionals"
> shit.
> 
> What *would* work is that slab approach, which is essentially
> 
>   #ifndef system_has_cmpxchg64
>   #define system_has_cmpxchg64() false
>   #endif
> 
> ...
> if (!system_has_cmpxchg64())
> return error or slow case
> 
> do_64bit_cmpxchg_case();
> 
> (although the slub case is much more indirect, and uses a
> __CMPXCHG_DOUBLE flag that only gets set when that define exists etc).
> 
> But that would literally cut off support for all non-x86 32-bit architectures.
> 
> So no. You need to forget about the whole "do a 64-bit cmpxchg on
> 32-bit architectures" as being some kind of solution in the short
> term.

But do all archs have an implementation of cmpxchg64, even if it requires
disabling interrupts? If not, then I definitely cannot remove this code.

If they all have an implementation, where some is just very slow, then that
is also perfectly fine. The only time cmpxchg64 gets called is on the slow
path, which is if an event is interrupted by an interrupt, and that
interrupt writes an event to the same buffer.

This doesn't happen often, and if it requires disabling interrupts, then
it shouldn't be much notice.

I just want to avoid the case that it will simply break, which is the NMI
case. In which case, would:

if (!system_has_cmpxchg64() && in_nmi())
return NULL;

Be OK?

-- Steve

Re: [PATCH v3] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 at 12:30, Linus Torvalds
 wrote:
>
> Read my email. Don't do random x86-centric things. We have that
>
>   #ifndef system_has_cmpxchg64
>   #define system_has_cmpxchg64() false
>   #endif
>
> which should work.

And again, by "should work" I mean that it would disable this entirely
on things like arm32 until the arm people decide they care. But at
least it won't use an unsafe non-working 64-bit cmpxchg.

And no, for 6.7, only fix reported bugs. No big reorgs at all,
particularly for something that likely has never been hit by any user
and sounds like this all just came out of discussion.

  Linus

Re: [PATCH v3] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 at 12:18, Steven Rostedt  wrote:
>
> For this issue of the 64bit cmpxchg, is there any config that works for any
> arch that do not have a safe 64-bit cmpxchg? At least for 486, is the
> second half of the if condition reasonable?
>
> if (IS_ENABLED(CONFIG_X86_32) && !IS_ENABLED(CONFIG_X86_CMPXCHG64)) {
> if (unlikely(in_nmi()))
> return NULL;
> }

No.

Read my email. Don't do random x86-centric things. We have that

  #ifndef system_has_cmpxchg64
  #define system_has_cmpxchg64() false
  #endif

which should work.

NOTE! The above is for 32-bit architectures only! For 64-bit ones
either just use cmpxchg directly. And if you need a 128-bit one,
there's system_has_cmpxchg128...

 Linus

Re: [PATCH v3] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 15:19:11 -0500
Steven Rostedt  wrote:

> For this issue of the 64bit cmpxchg, is there any config that works for any
> arch that do not have a safe 64-bit cmpxchg? At least for 486, is the
> second half of the if condition reasonable?
> 
>   if (IS_ENABLED(CONFIG_X86_32) && !IS_ENABLED(CONFIG_X86_CMPXCHG64)) {
>   if (unlikely(in_nmi()))
>   return NULL;
>   }

Ignore this, I'm now reading your other email. I'll have more questions there.

-- Steve

Re: [PATCH RERESEND 00/11] splice(file<>pipe) I/O on file as-if O_NONBLOCK

2023-12-14 Thread Ahelenia Ziemiańska

On Thu, Dec 14, 2023 at 12:06:57PM -0700, Jens Axboe wrote:
> On 12/14/23 11:44 AM, Ahelenia Ziemiańska wrote:
> > This does that, effectively making splice(file -> pipe)
> > request (and require) O_NONBLOCK on reads fron the file:
> > this doesn't affect splicing from regular files and blockdevs,
> > since they're always non-blocking
> > (and requesting the stronger "no kernel sleep" IOCB_NOWAIT is non-sensical),
> Not sure how you got the idea that regular files or block devices is
> always non-blocking, this is certainly not true without IOCB_NOWAIT.
> Without IOCB_NOWAIT, you can certainly be waiting for previous IO to
> complete.
Maybe "always non-blocking" is an abuse of the term, but the terminology
is lost on me. By this I mean that O_NONBLOCK files/blockdevs have the
same semantics as non-O_NONBLOCK files/blockdevs ‒ they may block for a
bit while the I/O queue drains, but are guaranteed to complete within
a relatively narrow bounded time; any contending writer/opener
will be blocked for a short bit but will always wake up.

This is in contrast to pipes/sockets/ttys/, which wait for a peer to
send some data, and block until there is some; any contending writer/opener
will be blocked potentially ad infinitum.

Or, the way I see it, splice(socket -> pipe) can trivially be used to
lock the pipe forever, whereas I don't think splice(regfile -> pipe) can,
regardless of IOCB_NOWAIT, so the specific semantic IOCB_NOWAIT provides
is immaterial here, so not specifying IOCB_NOWAIT in filemap_splice_read()
provides semantics consistent to "file is read as-if it had O_NONBLOCK set".

> > but always returns -EINVAL for ttys.
> > Sockets behave as expected from O_NONBLOCK reads:
> > splice if there's data available else -EAGAIN.
> > 
> > This should all pretty much behave as-expected.
> Should it? Seems like there's a very high risk of breaking existing use
> cases here.
If something wants to splice from a socket to a pipe and doesn't
degrade to read/write if it gets EAGAIN then it will either retry and
hotloop in the splice or error out, yeah.

I don't think this is surmountable.

> Have you at all looked into the approach of enabling splice to/from
> _without_ holding the pipe lock? That, to me, would seem like a much
> saner approach, with the caveat that I have not looked into that at all
> so there may indeed be reasons why this is not feasible.
IIUC Linus prepared a patch on security@ in
  
(you're in To:) and an evolution of this is in
  
https://lore.kernel.org/lkml/CAHk-=wgmld78uslu9a9nspxytm9s6c23ovdin2yja-d8_s0...@mail.gmail.com/t/#u
(you're in Cc:) that does this.

He summarises it below as 
> So while fixing your NULL pointer check should be trivial, I think
> that first patch is actually fundamentally broken wrt pipe resizing,
> and I see no really sane way to fix it. We could add a new lock just
> for that, but I don't think it's worth it.
and
> But it is possible that we need to just bite the bullet and say
> "copy_splice_read() needs to use a non-blocking kiocb for the IO".
so that's what I did.

If Linus, who drew up and maintained this code for ~30 years,
didn't arrive at a satisfactory approach, I, after ~30 minutes,
won't either.

It would be very sane to both not change the semantic and fix the lock
by just not locking but at the top of that thread Christian said
> Splice would have to be
> refactored to not rely on pipe_lock(). That's likely major work with a
> good portion of regressions if the past is any indication.
and clearly this is true, so lacking the ability and capability
to do that and any reasonable other ideas
(You could either limit splices to a proportion of the pipe size,
 but then just doing five splices gets you where we are rn
 (or, as Linus construed it, into "write() returns -EBUSY" territory,
  which is just as bad but at least the writers aren't unkillable),
 and it reduces the I/O per splice significantly.

 You could limit each pipe to one outstanding splice and always leave
 Some space for normal I/O. This falls into "another lock just for this"
 territory I think, and it also sucks for the 99% of normal users.)
I did this because Linus vaguely sanxioned it.

It's probably feasible, but alas it isn't feasible for me.


signature.asc
Description: PGP signature

Re: [PATCH v3] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 11:46:55 -0800
Linus Torvalds  wrote:

> On Thu, 14 Dec 2023 at 09:53, Steven Rostedt  wrote:
> >
> > +   /*
> > +* For architectures that can not do cmpxchg() in NMI, or require
> > +* disabling interrupts to do 64-bit cmpxchg(), do not allow them
> > +* to record in NMI context.
> > +*/
> > +   if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
> > +(IS_ENABLED(CONFIG_X86_32) && 
> > !IS_ENABLED(CONFIG_X86_CMPXCHG64))) &&
> > +   unlikely(in_nmi())) {
> > +   return NULL;
> > +   }  
> 
> Again, this is COMPLETE GARBAGE.
> 
> You're using "ARCH_HAVE_NMI_SAFE_CMPXCHG" to test something that just
> isn't what it's about.

For the 64-bit cmpxchg, on it isn't useful, but this code also calls normal
cmpxchg too. I had that part of the if condition as a separate patch, but
not for this purpose, but for actual architectures that do not support
cmpxchg in NMI. Those are broken here too, and that check fixes it.

> 
> Having a NMI-safe cmpxchg does *not* mean that you actualyl have a
> NMI-safe 64-bit version.

Understood, but we also have cmpxchg in this path as well.

> 
> You can't test it that way.
> 
> Stop making random changes that just happen to work on the one machine
> you tested it on.

They are not random, but they are two different reasons. I still have the
standalone patch that adds the ARCH_HAVE_NMI_SAFE_CMPXCHG for the purpose
of not calling this code on architectures that do not support cmpxchg in
NMI, because if cmpxchg is not supported in NMI, then this should bail.

My mistake was to combine that change into this one which made it looks like
they are related, when in actuality they are not. Which is why there's a
"||" and not an "&&"

For this issue of the 64bit cmpxchg, is there any config that works for any
arch that do not have a safe 64-bit cmpxchg? At least for 486, is the
second half of the if condition reasonable?

if (IS_ENABLED(CONFIG_X86_32) && !IS_ENABLED(CONFIG_X86_CMPXCHG64)) {
if (unlikely(in_nmi()))
return NULL;
}

?

-- Steve

Re: [PATCH v3] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 at 09:53, Steven Rostedt  wrote:
>
> +   /*
> +* For architectures that can not do cmpxchg() in NMI, or require
> +* disabling interrupts to do 64-bit cmpxchg(), do not allow them
> +* to record in NMI context.
> +*/
> +   if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
> +(IS_ENABLED(CONFIG_X86_32) && 
> !IS_ENABLED(CONFIG_X86_CMPXCHG64))) &&
> +   unlikely(in_nmi())) {
> +   return NULL;
> +   }

Again, this is COMPLETE GARBAGE.

You're using "ARCH_HAVE_NMI_SAFE_CMPXCHG" to test something that just
isn't what it's about.

Having a NMI-safe cmpxchg does *not* mean that you actualyl have a
NMI-safe 64-bit version.

You can't test it that way.

Stop making random changes that just happen to work on the one machine
you tested it on.

   Linus

Re: [PATCH] ring-buffer: Remove 32bit timestamp logic

On Thu, 14 Dec 2023 at 08:55, Steven Rostedt  wrote:
>
> And yes, this does get called in NMI context.

Not on an i486-class machine they won't. You don't have a local apic
on those, and they won't have any NMI sources under our control (ie
NMI does exist, but we're talking purely legacy NMI for "motherboard
problems" like RAM parity errors etc)

> I had a patch that added:
>
> +   /* ring buffer does cmpxchg, make sure it is safe in NMI context */
> +   if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
> +   (unlikely(in_nmi( {
> +   return NULL;
> +   }

CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG doesn't work on x86 in this context,
because the issue is that yes, there's a safe 'cmpxchg', but that's
not what you want.

You want the save cmpxchg64, which is an entirely different beast.

And honestly, I don't think that NMI_SAFE_CMPXCHG covers the
double-word case anywhere else either, except purely by luck.

In mm/slab.c, we also use a double-wide cmpxchg, and there the rule
has literally been that it's conditional on

 (a) system_has_cmpxchg64() existing as a macro

 (b) using that macro to then gate - at runtime - whether it actually
works or not

I think - but didn't check - that we essentially only enable the
two-word case on x86 as a result, and fall back to the slow case on
all other architectures - and on the i486 case.

That said, other architectures *do* have a working double-word
cmpxchg, but I wouldn't guarantee it. For example, 32-bit arm does
have one using ldrexd/strexd, but that only exists on arm v6+.

And guess what? You'll silently get a "disable interrupts, do it as a
non-atomic load-store" on arm too for that case. And again, pre-v6 arm
is about as relevant as i486 is, but my point is, that double-word
cmpxchg you rely on simply DOES NOT EXIST on 32-bit platforms except
under special circumstances.

So this isn't a "x86 is the odd man out". This is literally generic.

> Now back to my original question. Are you OK with me sending this to you
> now, or should I send you just the subtle fixes to the 32-bit rb_time_*
> code and keep this patch for the merge window?

I'm absolutely not taking some untested random "let's do 64-bit
cmpxchg that we know is broken on 32-bit using broken conditionals"
shit.

What *would* work is that slab approach, which is essentially

  #ifndef system_has_cmpxchg64
  #define system_has_cmpxchg64() false
  #endif

...
if (!system_has_cmpxchg64())
return error or slow case

do_64bit_cmpxchg_case();

(although the slub case is much more indirect, and uses a
__CMPXCHG_DOUBLE flag that only gets set when that define exists etc).

But that would literally cut off support for all non-x86 32-bit architectures.

So no. You need to forget about the whole "do a 64-bit cmpxchg on
32-bit architectures" as being some kind of solution in the short
term.

   Linus

Re: [PATCH RERESEND 00/11] splice(file<>pipe) I/O on file as-if O_NONBLOCK

2023-12-14 Thread Jens Axboe

On 12/14/23 11:44 AM, Ahelenia Ziemia?ska wrote:
> First:  
> https://lore.kernel.org/lkml/cover.1697486714.git.nabijaczlew...@nabijaczleweli.xyz/t/#u
> Resend: 
> https://lore.kernel.org/lkml/1cover.1697486714.git.nabijaczlew...@nabijaczleweli.xyz/t/#u
> Resending again per 
> https://lore.kernel.org/lkml/20231214093859.01f6e...@kernel.org/t/#u
> 
> Hi!
> 
> As it stands, splice(file -> pipe):
> 1. locks the pipe,
> 2. does a read from the file,
> 3. unlocks the pipe.
> 
> For reading from regular files and blcokdevs this makes no difference.
> But if the file is a tty or a socket, for example, this means that until
> data appears, which it may never do, every process trying to read from
> or open the pipe enters an uninterruptible sleep,
> and will only exit it if the splicing process is killed.
> 
> This trivially denies service to:
> * any hypothetical pipe-based log collexion system
> * all nullmailer installations
> * me, personally, when I'm pasting stuff into qemu -serial chardev:pipe
> 
> This follows:
> 1. 
> https://lore.kernel.org/linux-fsdevel/qk6hjuam54khlaikf2ssom6custxf5is2ekkaequf4hvode3ls@zgf7j5j4ubvw/t/#u
> 2. a security@ thread rooted in
>
> 3. https://nabijaczleweli.xyz/content/blogn_t/011-linux-splice-exclusion.html
> 
> Patches were posted and then discarded on principle or funxionality,
> all in all terminating in Linus posting
>> But it is possible that we need to just bite the bullet and say
>> "copy_splice_read() needs to use a non-blocking kiocb for the IO".
> 
> This does that, effectively making splice(file -> pipe)
> request (and require) O_NONBLOCK on reads fron the file:
> this doesn't affect splicing from regular files and blockdevs,
> since they're always non-blocking
> (and requesting the stronger "no kernel sleep" IOCB_NOWAIT is non-sensical),

Not sure how you got the idea that regular files or block devices is
always non-blocking, this is certainly not true without IOCB_NOWAIT.
Without IOCB_NOWAIT, you can certainly be waiting for previous IO to
complete.

> but always returns -EINVAL for ttys.
> Sockets behave as expected from O_NONBLOCK reads:
> splice if there's data available else -EAGAIN.
> 
> This should all pretty much behave as-expected.

Should it? Seems like there's a very high risk of breaking existing use
cases here.

Have you at all looked into the approach of enabling splice to/from
_without_ holding the pipe lock? That, to me, would seem like a much
saner approach, with the caveat that I have not looked into that at all
so there may indeed be reasons why this is not feasible.

-- 
Jens Axboe

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-14 Thread Rob Herring

On Thu, Dec 14, 2023 at 9:45 AM Alexandru Elisei
 wrote:
>
> Hi,
>
> On Wed, Dec 13, 2023 at 02:30:42PM -0600, Rob Herring wrote:
> > On Wed, Dec 13, 2023 at 11:44 AM Alexandru Elisei
> >  wrote:
> > >
> > > On Wed, Dec 13, 2023 at 11:22:17AM -0600, Rob Herring wrote:
> > > > On Wed, Dec 13, 2023 at 8:51 AM Alexandru Elisei
> > > >  wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > On Wed, Dec 13, 2023 at 08:06:44AM -0600, Rob Herring wrote:
> > > > > > On Wed, Dec 13, 2023 at 7:05 AM Alexandru Elisei
> > > > > >  wrote:
> > > > > > >
> > > > > > > Hi Rob,
> > > > > > >
> > > > > > > On Tue, Dec 12, 2023 at 12:44:06PM -0600, Rob Herring wrote:
> > > > > > > > On Tue, Dec 12, 2023 at 10:38 AM Alexandru Elisei
> > > > > > > >  wrote:
> > > > > > > > >
> > > > > > > > > Hi Rob,
> > > > > > > > >
> > > > > > > > > Thank you so much for the feedback, I'm not very familiar 
> > > > > > > > > with device tree,
> > > > > > > > > and any comments are very useful.
> > > > > > > > >
> > > > > > > > > On Mon, Dec 11, 2023 at 11:29:40AM -0600, Rob Herring wrote:
> > > > > > > > > > On Sun, Nov 19, 2023 at 10:59 AM Alexandru Elisei
> > > > > > > > > >  wrote:
> > > > > > > > > > >
> > > > > > > > > > > Allow the kernel to get the size and location of the MTE 
> > > > > > > > > > > tag storage
> > > > > > > > > > > regions from the DTB. This memory is marked as reserved 
> > > > > > > > > > > for now.
> > > > > > > > > > >
> > > > > > > > > > > The DTB node for the tag storage region is defined as:
> > > > > > > > > > >
> > > > > > > > > > > tags0: tag-storage@8f800 {
> > > > > > > > > > > compatible = "arm,mte-tag-storage";
> > > > > > > > > > > reg = <0x08 0xf800 0x00 0x400>;
> > > > > > > > > > > block-size = <0x1000>;
> > > > > > > > > > > memory = <>;// Associated 
> > > > > > > > > > > tagged memory node
> > > > > > > > > > > };
> > > > > > > > > >
> > > > > > > > > > I skimmed thru the discussion some. If this memory range is 
> > > > > > > > > > within
> > > > > > > > > > main RAM, then it definitely belongs in /reserved-memory.
> > > > > > > > >
> > > > > > > > > Ok, will do that.
> > > > > > > > >
> > > > > > > > > If you don't mind, why do you say that it definitely belongs 
> > > > > > > > > in
> > > > > > > > > reserved-memory? I'm not trying to argue otherwise, I'm 
> > > > > > > > > curious about the
> > > > > > > > > motivation.
> > > > > > > >
> > > > > > > > Simply so that /memory nodes describe all possible memory and
> > > > > > > > /reserved-memory is just adding restrictions. It's also because
> > > > > > > > /reserved-memory is what gets handled early, and we don't need
> > > > > > > > multiple things to handle early.
> > > > > > > >
> > > > > > > > > Tag storage is not DMA and can live anywhere in memory.
> > > > > > > >
> > > > > > > > Then why put it in DT at all? The only reason CMA is there is 
> > > > > > > > to set
> > > > > > > > the size. It's not even clear to me we need CMA in DT either. 
> > > > > > > > The
> > > > > > > > reasoning long ago was the kernel didn't do a good job of 
> > > > > > > > moving and
> > > > > > > > reclaiming contiguous space, but that's supposed to be better 
> > > > > > > > now (and
> > > > > > > > most h/w figured out they need IOMMUs).
> > > > > > > >
> > > > > > > > But for tag storage you know the size as it is a function of the
> > > > > > > > memory size, right? After all, you are validating the size is 
> > > > > > > > correct.
> > > > > > > > I guess there is still the aspect of whether you want enable 
> > > > > > > > MTE or
> > > > > > > > not which could be done in a variety of ways.
> > > > > > >
> > > > > > > Oh, sorry, my bad, I should have been clearer about this. I don't 
> > > > > > > want to
> > > > > > > put it in the DT as a "linux,cma" node. But I want it to be 
> > > > > > > managed by CMA.
> > > > > >
> > > > > > Yes, I understand, but my point remains. Why do you need this in DT?
> > > > > > If the location doesn't matter and you can calculate the size from 
> > > > > > the
> > > > > > memory size, what else is there to add to the DT?
> > > > >
> > > > > I am afraid there has been a misunderstanding. What do you mean by
> > > > > "location doesn't matter"?
> > > >
> > > > You said:
> > > > > Tag storage is not DMA and can live anywhere in memory.
> > > >
> > > > Which I took as the kernel can figure out where to put it. But maybe
> > > > you meant the h/w platform can hard code it to be anywhere in memory?
> > > > If so, then yes, DT is needed.
> > >
> > > Ah, I see, sorry for not being clear enough, you are correct: tag storage
> > > is a hardware property, and software needs a mechanism (in this case, the
> > > dt) to discover its properties.
> > >
> > > >
> > > > > At the very least, Linux needs to know the address and size of a 
> > > > > memory
> > > > > region to use it. The series is about using the tag storage memory for
> > > > >

[PATCH RERESEND 04/11] tracing: tracing_buffers_splice_read: behave as-if non-blocking I/O

2023-12-14 Thread Ahelenia Ziemiańska

Otherwise we risk sleeping with the pipe locked for indeterminate
lengths of time.

Link: 
https://lore.kernel.org/linux-fsdevel/qk6hjuam54khlaikf2ssom6custxf5is2ekkaequf4hvode3ls@zgf7j5j4ubvw/t/#u
Signed-off-by: Ahelenia Ziemiańska 
---
 kernel/trace/trace.c | 32 
 1 file changed, 4 insertions(+), 28 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abaaf516fcae..9be7a4c0a3ca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8477,7 +8477,6 @@ tracing_buffers_splice_read(struct file *file, loff_t 
*ppos,
if (splice_grow_spd(pipe, ))
return -ENOMEM;
 
- again:
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, 
iter->cpu_file);
 
@@ -8528,35 +8527,12 @@ tracing_buffers_splice_read(struct file *file, loff_t 
*ppos,
 
/* did we read anything? */
if (!spd.nr_pages) {
-   long wait_index;
-
-   if (ret)
-   goto out;
-
-   ret = -EAGAIN;
-   if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
-   goto out;
-
-   wait_index = READ_ONCE(iter->wait_index);
-
-   ret = wait_on_pipe(iter, iter->tr->buffer_percent);
-   if (ret)
-   goto out;
-
-   /* No need to wait after waking up when tracing is off */
-   if (!tracer_tracing_is_on(iter->tr))
-   goto out;
-
-   /* Make sure we see the new wait_index */
-   smp_rmb();
-   if (wait_index != iter->wait_index)
-   goto out;
-
-   goto again;
+   if (!ret)
+   ret = -EAGAIN;
+   } else {
+   ret = splice_to_pipe(pipe, );
}
 
-   ret = splice_to_pipe(pipe, );
-out:
splice_shrink_spd();
 
return ret;
-- 
2.39.2


signature.asc
Description: PGP signature

[PATCH RERESEND 00/11] splice(file<>pipe) I/O on file as-if O_NONBLOCK

2023-12-14 Thread Ahelenia Ziemiańska

First:  
https://lore.kernel.org/lkml/cover.1697486714.git.nabijaczlew...@nabijaczleweli.xyz/t/#u
Resend: 
https://lore.kernel.org/lkml/1cover.1697486714.git.nabijaczlew...@nabijaczleweli.xyz/t/#u
Resending again per 
https://lore.kernel.org/lkml/20231214093859.01f6e...@kernel.org/t/#u

Hi!

As it stands, splice(file -> pipe):
1. locks the pipe,
2. does a read from the file,
3. unlocks the pipe.

For reading from regular files and blcokdevs this makes no difference.
But if the file is a tty or a socket, for example, this means that until
data appears, which it may never do, every process trying to read from
or open the pipe enters an uninterruptible sleep,
and will only exit it if the splicing process is killed.

This trivially denies service to:
* any hypothetical pipe-based log collexion system
* all nullmailer installations
* me, personally, when I'm pasting stuff into qemu -serial chardev:pipe

This follows:
1. 
https://lore.kernel.org/linux-fsdevel/qk6hjuam54khlaikf2ssom6custxf5is2ekkaequf4hvode3ls@zgf7j5j4ubvw/t/#u
2. a security@ thread rooted in
   
3. https://nabijaczleweli.xyz/content/blogn_t/011-linux-splice-exclusion.html

Patches were posted and then discarded on principle or funxionality,
all in all terminating in Linus posting
> But it is possible that we need to just bite the bullet and say
> "copy_splice_read() needs to use a non-blocking kiocb for the IO".

This does that, effectively making splice(file -> pipe)
request (and require) O_NONBLOCK on reads fron the file:
this doesn't affect splicing from regular files and blockdevs,
since they're always non-blocking
(and requesting the stronger "no kernel sleep" IOCB_NOWAIT is non-sensical),
but always returns -EINVAL for ttys.
Sockets behave as expected from O_NONBLOCK reads:
splice if there's data available else -EAGAIN.

This should all pretty much behave as-expected.

Mostly a re-based version of the summary diff from
.

Bisexion yields commit 8924feff66f35fe22ce77aafe3f21eb8e5cff881
("splice: lift pipe_lock out of splice_to_pipe()") as first bad.


The patchset is made quite wide due to the many implementations
of the splice_read callback, and was based entirely on results from
  $ git grep '\.splice_read.*=' | cut -d= -f2 |
  tr -s ',;[:space:]' '\n' | sort -u

I'm assuming this is exhaustive, but it's 27 distinct implementations.
Of these, I've classified these as trivial delegating wrappers:
  nfs_file_splice_read   filemap_splice_read
  afs_file_splice_read   filemap_splice_read
  ceph_splice_read   filemap_splice_read
  ecryptfs_splice_read_update_atime  filemap_splice_read
  ext4_file_splice_read  filemap_splice_read
  f2fs_file_splice_read  filemap_splice_read
  ntfs_file_splice_read  filemap_splice_read
  ocfs2_file_splice_read filemap_splice_read
  orangefs_file_splice_read  filemap_splice_read
  v9fs_file_splice_read  filemap_splice_read
  xfs_file_splice_read   filemap_splice_read
  zonefs_file_splice_readfilemap_splice_read
  sock_splice_read   copy_splice_read or a socket-specific one
  coda_file_splice_read  vfs_splice_read
  ovl_splice_readvfs_splice_read


filemap_splice_read() is used for regular files and blockdevs,
and thus needs no changes, and is thus unchanged.

vfs_splice_read() delegates to copy_splice_read() or f_op->splice_read().


The rest are fixed, in patch order:
01. copy_splice_read() by simply doing the I/O with IOCB_NOWAIT;
diff from Linus:
  
https://lore.kernel.org/lkml/5osglsw36dla3mubtpsmdwdid4fsdacplyd6acx2igo4atogdg@yur3idyim3cc/t/#ee67de5a9ec18886c434113637d7eff6cd7acac4b
02. unix_stream_splice_read() by unconditionally passing MSG_DONTWAIT
03. fuse_dev_splice_read() by behaving as-if O_NONBLOCK
04. tracing_buffers_splice_read() by behaving as-if O_NONBLOCK
(this also removes the retry loop)
05. relay_file_splice_read() by behaving as-if SPLICE_F_NONBLOCK
(this just means EAGAINing unconditionally for an empty transfer)
06. smc_splice_read() by unconditionally passing MSG_DONTWAIT
07. kcm_splice_read() by unconditionally passing MSG_DONTWAIT
08. tls_sw_splice_read() by behaving as-if SPLICE_F_NONBLOCK
09. tcp_splice_read() by behaving as-if O_NONBLOCK
(this also removes the retry loop)

10. EINVALs on files that neither have FMODE_NOWAIT nor are S_ISREG

We don't want this to be just FMODE_NOWAIT since most regular files
don't have it set and that's not the right semantic anyway,
as noted at the top of this mail,

But this allows blockdevs "by accident", effectively,
since they have FMODE_NOWAIT (at least the ones I tried),
even though they're just like regular files:
handled by filemap_splice_read(),
thus not dispatched with IOCB_NOWAIT. since always non-blocking.

Should this be a check for FMODE_NOWAIT && (S_ISREG || S_ISBLK)?
Should it remain

Re: [PATCH vhost v2 4/8] vdpa/mlx5: Mark vq addrs for modification in hw vq

2023-12-14 Thread Eugenio Perez Martin

On Thu, Dec 14, 2023 at 4:51 PM Dragos Tatulea  wrote:
>
> On Thu, 2023-12-14 at 08:45 -0500, Michael S. Tsirkin wrote:
> > On Thu, Dec 14, 2023 at 01:39:55PM +, Dragos Tatulea wrote:
> > > On Tue, 2023-12-12 at 15:44 -0800, Si-Wei Liu wrote:
> > > >
> > > > On 12/12/2023 11:21 AM, Eugenio Perez Martin wrote:
> > > > > On Tue, Dec 5, 2023 at 11:46 AM Dragos Tatulea  
> > > > > wrote:
> > > > > > Addresses get set by .set_vq_address. hw vq addresses will be 
> > > > > > updated on
> > > > > > next modify_virtqueue.
> > > > > >
> > > > > > Signed-off-by: Dragos Tatulea 
> > > > > > Reviewed-by: Gal Pressman 
> > > > > > Acked-by: Eugenio Pérez 
> > > > > I'm kind of ok with this patch and the next one about state, but I
> > > > > didn't ack them in the previous series.
> > > > >
> > > > > My main concern is that it is not valid to change the vq address after
> > > > > DRIVER_OK in VirtIO, which vDPA follows. Only memory maps are ok to
> > > > > change at this moment. I'm not sure about vq state in vDPA, but vhost
> > > > > forbids changing it with an active backend.
> > > > >
> > > > > Suspend is not defined in VirtIO at this moment though, so maybe it is
> > > > > ok to decide that all of these parameters may change during suspend.
> > > > > Maybe the best thing is to protect this with a vDPA feature flag.
> > > > I think protect with vDPA feature flag could work, while on the other
> > > > hand vDPA means vendor specific optimization is possible around suspend
> > > > and resume (in case it helps performance), which doesn't have to be
> > > > backed by virtio spec. Same applies to vhost user backend features,
> > > > variations there were not backed by spec either. Of course, we should
> > > > try best to make the default behavior backward compatible with
> > > > virtio-based backend, but that circles back to no suspend definition in
> > > > the current virtio spec, for which I hope we don't cease development on
> > > > vDPA indefinitely. After all, the virtio based vdap backend can well
> > > > define its own feature flag to describe (minor difference in) the
> > > > suspend behavior based on the later spec once it is formed in future.
> > > >
> > > So what is the way forward here? From what I understand the options are:
> > >
> > > 1) Add a vdpa feature flag for changing device properties while suspended.
> > >
> > > 2) Drop these 2 patches from the series for now. Not sure if this makes 
> > > sense as
> > > this. But then Si-Wei's qemu device suspend/resume poc [0] that exercises 
> > > this
> > > code won't work anymore. This means the series would be less well tested.
> > >
> > > Are there other possible options? What do you think?
> > >
> > > [0] https://github.com/siwliu-kernel/qemu/tree/svq-resume-wip
> >
> > I am fine with either of these.
> >
> How about allowing the change only under the following conditions:
>   vhost_vdpa_can_suspend && vhost_vdpa_can_resume &&
> VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is set
>
> ?

I think the best option by far is 1, as there is no hint in the
combination of these 3 indicating that you can change device
properties in the suspended state.

>
> Thanks,
> Dragos
>
> > > Thanks,
> > > Dragos
> > >
> > > > Regards,
> > > > -Siwei
> > > >
> > > >
> > > >
> > > > >
> > > > > Jason, what do you think?
> > > > >
> > > > > Thanks!
> > > > >
> > > > > > ---
> > > > > >   drivers/vdpa/mlx5/net/mlx5_vnet.c  | 9 +
> > > > > >   include/linux/mlx5/mlx5_ifc_vdpa.h | 1 +
> > > > > >   2 files changed, 10 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > > > > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > > > index f8f088cced50..80e066de0866 100644
> > > > > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > > > @@ -1209,6 +1209,7 @@ static int modify_virtqueue(struct 
> > > > > > mlx5_vdpa_net *ndev,
> > > > > >  bool state_change = false;
> > > > > >  void *obj_context;
> > > > > >  void *cmd_hdr;
> > > > > > +   void *vq_ctx;
> > > > > >  void *in;
> > > > > >  int err;
> > > > > >
> > > > > > @@ -1230,6 +1231,7 @@ static int modify_virtqueue(struct 
> > > > > > mlx5_vdpa_net *ndev,
> > > > > >  MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, 
> > > > > > ndev->mvdev.res.uid);
> > > > > >
> > > > > >  obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, 
> > > > > > obj_context);
> > > > > > +   vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
> > > > > > virtio_q_context);
> > > > > >
> > > > > >  if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
> > > > > >  if (!is_valid_state_change(mvq->fw_state, state, 
> > > > > > is_resumable(ndev))) {
> > > > > > @@ -1241,6 +1243,12 @@ static int modify_virtqueue(struct 
> > > > > > mlx5_vdpa_net *ndev,
> > > > > >  state_change = true;
> > > > > >  }
> > > > > >
> > > > > > +   if

Re: [PATCH 0/2] ARM: dts: qcom: msm8926-motorola-peregrine: Add initial device tree

2023-12-14 Thread Konrad Dybcio





On 12/14/23 09:18, André Apitzsch wrote:

Am Mittwoch, dem 13.12.2023 um 21:44 +0100 schrieb Konrad Dybcio:



On 12/13/23 21:33, André Apitzsch wrote:

This dts adds support for Motorola Moto G 4G released in 2013.

I have a similar one in my drawer.. not the 4g kind, titan IIRC?
Wasn't this one codenamed thea?

Konrad


Yes, thea is the 2nd generation of Moto G 4G, released in 2014.
pregrine is the first generation, from 2013.

Should

model = "Motorola Moto G 4G";

be updated, to reflect that it is 1st gen or should only "thea" (if it
is added at all) have an addition in the model name?

I *think* it'd be good to make it

"Motorola Moto G 4G (2014)"

as I think it's what it was called anyway

Konrad

[PATCH v3] ring-buffer: Remove 32bit timestamp logic

From: "Steven Rostedt (Google)" 

Each event has a 27 bit timestamp delta that is used to hold the delta
from the last event. If the time between events is greater than 2^27, then
a timestamp is added that holds a 59 bit absolute timestamp.

Until a389d86f7fd09 ("ring-buffer: Have nested events still record running
time stamp"), if an interrupt interrupted an event in progress, all the
events delta would be zero to not deal with the races that need to be
handled. The commit a389d86f7fd09 changed that to handle the races giving
all events, even those that preempt other events, still have an accurate
timestamp.

To handle those races requires performing 64-bit cmpxchg on the
timestamps. But doing 64-bit cmpxchg on 32-bit architectures is considered
very slow. To try to deal with this the timestamp logic was broken into
two and then three 32-bit cmpxchgs, with the thought that two (or three)
32-bit cmpxchgs are still faster than a single 64-bit cmpxchg on 32-bit
architectures.

Part of the problem with this is that I didn't have any 32-bit
architectures to test on. After hitting several subtle bugs in this code,
an effort was made to try and see if three 32-bit cmpxchgs are indeed
faster than a single 64-bit. After a few people brushed off the dust of
their old 32-bit machines, tests were done, and even though 32-bit cmpxchg
was faster than a single 64-bit, it was in the order of 50% at best, not
300%.

This means that this complex code is not only complex but also not even
faster than just using 64-bit cmpxchg.

Nuke it!

This is basically a revert of 10464b4aa605e ("ring-buffer: Add rb_time_t
64 bit operations for speeding up 32 bit").

Cc: sta...@vger.kernel.org
Fixes: 10464b4aa605e ("ring-buffer: Add rb_time_t 64 bit operations for 
speeding up 32 bit")
Acked-by: Mathieu Desnoyers 
Signed-off-by: Steven Rostedt (Google) 
---
Changes since v2: 
https://lore.kernel.org/linux-trace-kernel/20231213232957.498cd...@gandalf.local.home

-- Added check for architectures that can not handle cmpxchg in NMI or
   x86 architectures that do not support true 64bit cmpxchg, and for
   them, to bail out of tracing if in NMI context.

 kernel/trace/ring_buffer.c | 232 -
 1 file changed, 22 insertions(+), 210 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d9caee7f542..9fdbd08af72f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 /*
@@ -463,27 +464,9 @@ enum {
RB_CTX_MAX
 };
 
-#if BITS_PER_LONG == 32
-#define RB_TIME_32
-#endif
-
-/* To test on 64 bit machines */
-//#define RB_TIME_32
-
-#ifdef RB_TIME_32
-
-struct rb_time_struct {
-   local_t cnt;
-   local_t top;
-   local_t bottom;
-   local_t msb;
-};
-#else
-#include 
 struct rb_time_struct {
local64_t   time;
 };
-#endif
 typedef struct rb_time_struct rb_time_t;
 
 #define MAX_NEST   5
@@ -573,179 +556,9 @@ struct ring_buffer_iter {
int missed_events;
 };
 
-#ifdef RB_TIME_32
-
-/*
- * On 32 bit machines, local64_t is very expensive. As the ring
- * buffer doesn't need all the features of a true 64 bit atomic,
- * on 32 bit, it uses these functions (64 still uses local64_t).
- *
- * For the ring buffer, 64 bit required operations for the time is
- * the following:
- *
- *  - Reads may fail if it interrupted a modification of the time stamp.
- *  It will succeed if it did not interrupt another write even if
- *  the read itself is interrupted by a write.
- *  It returns whether it was successful or not.
- *
- *  - Writes always succeed and will overwrite other writes and writes
- *  that were done by events interrupting the current write.
- *
- *  - A write followed by a read of the same time stamp will always succeed,
- *  but may not contain the same value.
- *
- *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
- *  Other than that, it acts like a normal cmpxchg.
- *
- * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
- *  (bottom being the least significant 30 bits of the 60 bit time stamp).
- *
- * The two most significant bits of each half holds a 2 bit counter (0-3).
- * Each update will increment this counter by one.
- * When reading the top and bottom, if the two counter bits match then the
- *  top and bottom together make a valid 60 bit number.
- */
-#define RB_TIME_SHIFT  30
-#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
-#define RB_TIME_MSB_SHIFT   60
-
-static inline int rb_time_cnt(unsigned long val)
-{
-   return (val >> RB_TIME_SHIFT) & 3;
-}
-
-static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
-{
-   u64 val;
-
-   val = top & RB_TIME_VAL_MASK;
-   val <<= RB_TIME_SHIFT;
-   val |= bottom & RB_TIME_VAL_MASK;
-
-   return val;
-}
-
-static inline bool

Re: [PATCH -next 2/2] mm: vmscan: add new event to trace shrink lru

2023-12-14 Thread kernel test robot

Hi Bixuan,

kernel test robot noticed the following build errors:

[auto build test ERROR on next-20231211]

url:
https://github.com/intel-lab-lkp/linux/commits/Bixuan-Cui/mm-shrinker-add-new-event-to-trace-shrink-count/20231212-112824
base:   next-20231211
patch link:
https://lore.kernel.org/r/20231212032640.6968-3-cuibixuan%40vivo.com
patch subject: [PATCH -next 2/2] mm: vmscan: add new event to trace shrink lru
config: i386-randconfig-014-20231214 
(https://download.01.org/0day-ci/archive/20231215/202312150018.eie4fkef-...@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git 
ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): 
(https://download.01.org/0day-ci/archive/20231215/202312150018.eie4fkef-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202312150018.eie4fkef-...@intel.com/

All errors (new ones prefixed by >>):

>> mm/vmscan.c:4533:2: error: call to undeclared function 
>> 'trace_mm_vmscan_lru_shrink_inactive'; ISO C99 and later do not support 
>> implicit function declarations [-Wimplicit-function-declaration]
   trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
   ^
   mm/vmscan.c:4533:2: note: did you mean 
'trace_mm_vmscan_lru_shrink_inactive_end'?
   include/trace/events/vmscan.h:415:1: note: 
'trace_mm_vmscan_lru_shrink_inactive_end' declared here
   TRACE_EVENT(mm_vmscan_lru_shrink_inactive_end,
   ^
   include/linux/tracepoint.h:566:2: note: expanded from macro 'TRACE_EVENT'
   DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
   ^
   include/linux/tracepoint.h:432:2: note: expanded from macro 'DECLARE_TRACE'
   __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),  \
   ^
   include/linux/tracepoint.h:255:21: note: expanded from macro 
'__DECLARE_TRACE'
   static inline void trace_##name(proto)  \
  ^
   :60:1: note: expanded from here
   trace_mm_vmscan_lru_shrink_inactive_end
   ^
   1 error generated.


vim +/trace_mm_vmscan_lru_shrink_inactive +4533 mm/vmscan.c

ac35a490237446 Yu Zhao 2022-09-18  4500  
a579086c99ed70 Yu Zhao 2022-12-21  4501  static int 
evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
ac35a490237446 Yu Zhao 2022-09-18  4502  {
ac35a490237446 Yu Zhao 2022-09-18  4503 int type;
ac35a490237446 Yu Zhao 2022-09-18  4504 int scanned;
ac35a490237446 Yu Zhao 2022-09-18  4505 int reclaimed;
ac35a490237446 Yu Zhao 2022-09-18  4506 LIST_HEAD(list);
359a5e1416caaf Yu Zhao 2022-11-15  4507 
LIST_HEAD(clean);
ac35a490237446 Yu Zhao 2022-09-18  4508 struct folio 
*folio;
359a5e1416caaf Yu Zhao 2022-11-15  4509 struct folio 
*next;
ac35a490237446 Yu Zhao 2022-09-18  4510 enum 
vm_event_item item;
ac35a490237446 Yu Zhao 2022-09-18  4511 struct 
reclaim_stat stat;
bd74fdaea14602 Yu Zhao 2022-09-18  4512 struct 
lru_gen_mm_walk *walk;
359a5e1416caaf Yu Zhao 2022-11-15  4513 bool skip_retry 
= false;
ac35a490237446 Yu Zhao 2022-09-18  4514 struct 
mem_cgroup *memcg = lruvec_memcg(lruvec);
ac35a490237446 Yu Zhao 2022-09-18  4515 struct 
pglist_data *pgdat = lruvec_pgdat(lruvec);
ac35a490237446 Yu Zhao 2022-09-18  4516  
ac35a490237446 Yu Zhao 2022-09-18  4517 
spin_lock_irq(>lru_lock);
ac35a490237446 Yu Zhao 2022-09-18  4518  
ac35a490237446 Yu Zhao 2022-09-18  4519 scanned = 
isolate_folios(lruvec, sc, swappiness, , );
ac35a490237446 Yu Zhao 2022-09-18  4520  
ac35a490237446 Yu Zhao 2022-09-18  4521 scanned += 
try_to_inc_min_seq(lruvec, swappiness);
ac35a490237446 Yu Zhao 2022-09-18  4522  
ac35a490237446 Yu Zhao 2022-09-18  4523 if 
(get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
ac35a490237446 Yu Zhao 2022-09-18  4524 scanned 
= 0;
ac35a490237446 Yu Zhao 2022-09-18  4525  
ac35a490237446 Yu Zhao 2022-09-18  4526 
spin_unlock_irq(>lru_lock);
ac35a490237446 Yu Zhao 2022-09-18  4527  
ac35a490237446 Yu Zhao 2022-09-18  4528 if 
(list_empty())
ac35a490237446 Yu Zhao 2022-09-18  4529 return 
scanned;
359a5e1416caaf Yu Zhao 2022-11-15  4530  retry:
49fd9b6df54e61 Matthew

Re: [PATCH] ring-buffer: Remove 32bit timestamp logic

On Wed, 13 Dec 2023 22:53:19 -0800
Linus Torvalds  wrote:

> On Wed, 13 Dec 2023 at 18:45, Steven Rostedt  wrote:
> >
> > tl;dr;  The ring-buffer timestamp requires a 64-bit cmpxchg to keep the
> > timestamps in sync (only in the slow paths). I was told that 64-bit cmpxchg
> > can be extremely slow on 32-bit architectures. So I created a rb_time_t
> > that on 64-bit was a normal local64_t type, and on 32-bit it's represented
> > by 3 32-bit words and a counter for synchronization. But this now requires
> > three 32-bit cmpxchgs for where one simple 64-bit cmpxchg would do.  
> 
> It's not that a 64-bit cmpxchg is even slow. It doesn't EXIST AT ALL
> on older 32-bit x86 machines.
> 
> Which is why we have
> 
> arch/x86/lib/cmpxchg8b_emu.S
> 
> which emulates it on machines that don't have the CX8 capability
> ("CX8" being the x86 capability flag name for the cmpxchg8b
> instruction, aka 64-bit cmpxchg).
> 
> Which only works because those older 32-bit cpu's also don't do SMP,
> so there are no SMP cache coherency issues, only interrupt atomicity
> issues.
> 
> IOW, the way to do an atomic 64-bit cmpxchg on the affected hardware
> is to simply disable interrupts.
> 
> In other words - it's not just slow.  It's *really* slow. As in 10x
> slower, not "slightly slower".

Ah, I'm starting to remember this for the rationale in doing it.

I should have read up on the LWN article I even wrote about it!

  https://lwn.net/Articles/831892/

  "I mentioned that I used the local64 variants of operations like
   local_read/cmpxchg/etc. operations. Desnoyers went on to argue that the
   local64 operations on 32-bit machines were horrible in performance, and
   worse, some require that interrupts be disabled, meaning that they could
   not be used in NMI context."

And yes, this does get called in NMI context.

> 
> > We started discussing how much time this is actually saving to be worth the
> > complexity, and actually found some hardware to test. One Atom processor.  
> 
> That atom processor won't actually show the issue. It's much too
> recent. So your "test" is actually worthless.
> 
> And you probably did this all with a kernel config that had
> CONFIG_X86_CMPXCHG64 set anyway, which wouldn't even boot on a i486
> machine.
> 
> So in fact your test was probably doubly broken, in that not only
> didn't you test the slow case, you tested something that wouldn't even
> have worked in the environment where the slow case happened.
> 
> Now, the real question is if anybody cares about CPUs that don't have
> cmpxchg8b support.
> 
> IBecause in practice, it's really just old 486-class machines (and a
> couple of clone manufacturers who _claimed_ to be Pentium class, but
> weren't - there was also some odd thing with Windows breaking if you
> had CPUID claiming to support CX8
> 
> We dropped support for the original 80386 some time ago. I'd actually
> be willing to drop support for ll pre-cmpxchg8b machines, and get rid
> of the emulation.
> 
> I also suspect that from a perf angle, none of this matters. The
> emulation being slow probably is a non-issue, simply because even if
> you run on an old i486 machine, you probably won't be doing perf or
> tracing on it.

Thanks for the background.

I had a patch that added:

+   /* ring buffer does cmpxchg, make sure it is safe in NMI context */
+   if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+   (unlikely(in_nmi( {
+   return NULL;
+   }

But for ripping out this code, I should probably change that to:

   if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
(IS_ENABLED(X86_32) && !IS_ENABLED(X86_CMPXCHG64))) &&
   unlikely(in_nmi())) {
   return NULL;
   }

Not sure if there's other architectures that are affected by this (hence
why I Cc'd linux-arch).

I don't think anyone actually cares about the performance overhead of 486
doing 64-bit cmpxchg by disabling interrupts. Especially since this only
happens in the slow path (if an event interrupts the processing of another
event). If someone complains, we can always add back this code.

Now back to my original question. Are you OK with me sending this to you
now, or should I send you just the subtle fixes to the 32-bit rb_time_*
code and keep this patch for the merge window?

My preference is to get it in now and label it as stable, but I'm fine
either way.

-- Steve

Re: [PATCH vhost v2 4/8] vdpa/mlx5: Mark vq addrs for modification in hw vq

2023-12-14 Thread Dragos Tatulea

On Thu, 2023-12-14 at 08:45 -0500, Michael S. Tsirkin wrote:
> On Thu, Dec 14, 2023 at 01:39:55PM +, Dragos Tatulea wrote:
> > On Tue, 2023-12-12 at 15:44 -0800, Si-Wei Liu wrote:
> > > 
> > > On 12/12/2023 11:21 AM, Eugenio Perez Martin wrote:
> > > > On Tue, Dec 5, 2023 at 11:46 AM Dragos Tatulea  
> > > > wrote:
> > > > > Addresses get set by .set_vq_address. hw vq addresses will be updated 
> > > > > on
> > > > > next modify_virtqueue.
> > > > > 
> > > > > Signed-off-by: Dragos Tatulea 
> > > > > Reviewed-by: Gal Pressman 
> > > > > Acked-by: Eugenio Pérez 
> > > > I'm kind of ok with this patch and the next one about state, but I
> > > > didn't ack them in the previous series.
> > > > 
> > > > My main concern is that it is not valid to change the vq address after
> > > > DRIVER_OK in VirtIO, which vDPA follows. Only memory maps are ok to
> > > > change at this moment. I'm not sure about vq state in vDPA, but vhost
> > > > forbids changing it with an active backend.
> > > > 
> > > > Suspend is not defined in VirtIO at this moment though, so maybe it is
> > > > ok to decide that all of these parameters may change during suspend.
> > > > Maybe the best thing is to protect this with a vDPA feature flag.
> > > I think protect with vDPA feature flag could work, while on the other 
> > > hand vDPA means vendor specific optimization is possible around suspend 
> > > and resume (in case it helps performance), which doesn't have to be 
> > > backed by virtio spec. Same applies to vhost user backend features, 
> > > variations there were not backed by spec either. Of course, we should 
> > > try best to make the default behavior backward compatible with 
> > > virtio-based backend, but that circles back to no suspend definition in 
> > > the current virtio spec, for which I hope we don't cease development on 
> > > vDPA indefinitely. After all, the virtio based vdap backend can well 
> > > define its own feature flag to describe (minor difference in) the 
> > > suspend behavior based on the later spec once it is formed in future.
> > > 
> > So what is the way forward here? From what I understand the options are:
> > 
> > 1) Add a vdpa feature flag for changing device properties while suspended.
> > 
> > 2) Drop these 2 patches from the series for now. Not sure if this makes 
> > sense as
> > this. But then Si-Wei's qemu device suspend/resume poc [0] that exercises 
> > this
> > code won't work anymore. This means the series would be less well tested.
> > 
> > Are there other possible options? What do you think?
> > 
> > [0] https://github.com/siwliu-kernel/qemu/tree/svq-resume-wip
> 
> I am fine with either of these.
> 
How about allowing the change only under the following conditions:
  vhost_vdpa_can_suspend && vhost_vdpa_can_resume &&  
VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is set

?

Thanks,
Dragos

> > Thanks,
> > Dragos
> > 
> > > Regards,
> > > -Siwei
> > > 
> > > 
> > > 
> > > > 
> > > > Jason, what do you think?
> > > > 
> > > > Thanks!
> > > > 
> > > > > ---
> > > > >   drivers/vdpa/mlx5/net/mlx5_vnet.c  | 9 +
> > > > >   include/linux/mlx5/mlx5_ifc_vdpa.h | 1 +
> > > > >   2 files changed, 10 insertions(+)
> > > > > 
> > > > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > > > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > > index f8f088cced50..80e066de0866 100644
> > > > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > > @@ -1209,6 +1209,7 @@ static int modify_virtqueue(struct 
> > > > > mlx5_vdpa_net *ndev,
> > > > >  bool state_change = false;
> > > > >  void *obj_context;
> > > > >  void *cmd_hdr;
> > > > > +   void *vq_ctx;
> > > > >  void *in;
> > > > >  int err;
> > > > > 
> > > > > @@ -1230,6 +1231,7 @@ static int modify_virtqueue(struct 
> > > > > mlx5_vdpa_net *ndev,
> > > > >  MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, 
> > > > > ndev->mvdev.res.uid);
> > > > > 
> > > > >  obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, 
> > > > > obj_context);
> > > > > +   vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
> > > > > virtio_q_context);
> > > > > 
> > > > >  if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
> > > > >  if (!is_valid_state_change(mvq->fw_state, state, 
> > > > > is_resumable(ndev))) {
> > > > > @@ -1241,6 +1243,12 @@ static int modify_virtqueue(struct 
> > > > > mlx5_vdpa_net *ndev,
> > > > >  state_change = true;
> > > > >  }
> > > > > 
> > > > > +   if (mvq->modified_fields & 
> > > > > MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
> > > > > +   MLX5_SET64(virtio_q, vq_ctx, desc_addr, 
> > > > > mvq->desc_addr);
> > > > > +   MLX5_SET64(virtio_q, vq_ctx, used_addr, 
> > > > > mvq->device_addr);
> > > > > +   MLX5_SET64(virtio_q, vq_ctx, available_addr, 
> > > > > mvq->driver_addr);
> > > > > +   }
> > > > > +
> > > > >

Re: [PATCH] driver core: Add a guard() definition for the device_lock()

2023-12-14 Thread Dave Jiang




On 12/13/23 16:02, Dan Williams wrote:
> At present there are ~200 usages of device_lock() in the kernel. Some of
> those usages lead to "goto unlock;" patterns which have proven to be
> error prone. Define a "device" guard() definition to allow for those to
> be cleaned up and prevent new ones from appearing.
> 
> Link: 
> http://lore.kernel.org/r/657897453dda8_269bd29...@dwillia2-mobl3.amr.corp.intel.com.notmuch
> Link: 
> http://lore.kernel.org/r/6577b0c2a02df_a04c529...@dwillia2-xfh.jf.intel.com.notmuch
> Cc: Vishal Verma 
> Cc: Ira Weiny 
> Cc: Peter Zijlstra 
> Cc: Greg Kroah-Hartman 
> Cc: Andrew Morton 
> Signed-off-by: Dan Williams 

Reviewed-by: Dave Jiang 
> ---
> Hi Greg,
> 
> I wonder if you might include this change in v6.7-rc to ease some patch
> sets alternately going through my tree and Andrew's tree. Those
> discussions are linked above. Alternately I can can just take it through
> my tree with your ack and the other use case can circle back to it in
> the v6.9 cycle.
> 
> I considered also defining a __free() helper similar to __free(mutex),
> but I think "__free(device)" would be a surprising name for something
> that drops a lock. Also, I like the syntax of guard(device) over
> something like guard(device_lock) since a 'struct device *' is the
> argument, not a lock type, but I'm open to your or Peter's thoughts on
> the naming.
> 
>  include/linux/device.h |2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/include/linux/device.h b/include/linux/device.h
> index d7a72a8749ea..6c83294395ac 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -1007,6 +1007,8 @@ static inline void device_unlock(struct device *dev)
>   mutex_unlock(>mutex);
>  }
>  
> +DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T))
> +
>  static inline void device_lock_assert(struct device *dev)
>  {
>   lockdep_assert_held(>mutex);
> 
>

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-14 Thread Alexandru Elisei

Hi,

On Wed, Dec 13, 2023 at 02:30:42PM -0600, Rob Herring wrote:
> On Wed, Dec 13, 2023 at 11:44 AM Alexandru Elisei
>  wrote:
> >
> > On Wed, Dec 13, 2023 at 11:22:17AM -0600, Rob Herring wrote:
> > > On Wed, Dec 13, 2023 at 8:51 AM Alexandru Elisei
> > >  wrote:
> > > >
> > > > Hi,
> > > >
> > > > On Wed, Dec 13, 2023 at 08:06:44AM -0600, Rob Herring wrote:
> > > > > On Wed, Dec 13, 2023 at 7:05 AM Alexandru Elisei
> > > > >  wrote:
> > > > > >
> > > > > > Hi Rob,
> > > > > >
> > > > > > On Tue, Dec 12, 2023 at 12:44:06PM -0600, Rob Herring wrote:
> > > > > > > On Tue, Dec 12, 2023 at 10:38 AM Alexandru Elisei
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > Hi Rob,
> > > > > > > >
> > > > > > > > Thank you so much for the feedback, I'm not very familiar with 
> > > > > > > > device tree,
> > > > > > > > and any comments are very useful.
> > > > > > > >
> > > > > > > > On Mon, Dec 11, 2023 at 11:29:40AM -0600, Rob Herring wrote:
> > > > > > > > > On Sun, Nov 19, 2023 at 10:59 AM Alexandru Elisei
> > > > > > > > >  wrote:
> > > > > > > > > >
> > > > > > > > > > Allow the kernel to get the size and location of the MTE 
> > > > > > > > > > tag storage
> > > > > > > > > > regions from the DTB. This memory is marked as reserved for 
> > > > > > > > > > now.
> > > > > > > > > >
> > > > > > > > > > The DTB node for the tag storage region is defined as:
> > > > > > > > > >
> > > > > > > > > > tags0: tag-storage@8f800 {
> > > > > > > > > > compatible = "arm,mte-tag-storage";
> > > > > > > > > > reg = <0x08 0xf800 0x00 0x400>;
> > > > > > > > > > block-size = <0x1000>;
> > > > > > > > > > memory = <>;// Associated 
> > > > > > > > > > tagged memory node
> > > > > > > > > > };
> > > > > > > > >
> > > > > > > > > I skimmed thru the discussion some. If this memory range is 
> > > > > > > > > within
> > > > > > > > > main RAM, then it definitely belongs in /reserved-memory.
> > > > > > > >
> > > > > > > > Ok, will do that.
> > > > > > > >
> > > > > > > > If you don't mind, why do you say that it definitely belongs in
> > > > > > > > reserved-memory? I'm not trying to argue otherwise, I'm curious 
> > > > > > > > about the
> > > > > > > > motivation.
> > > > > > >
> > > > > > > Simply so that /memory nodes describe all possible memory and
> > > > > > > /reserved-memory is just adding restrictions. It's also because
> > > > > > > /reserved-memory is what gets handled early, and we don't need
> > > > > > > multiple things to handle early.
> > > > > > >
> > > > > > > > Tag storage is not DMA and can live anywhere in memory.
> > > > > > >
> > > > > > > Then why put it in DT at all? The only reason CMA is there is to 
> > > > > > > set
> > > > > > > the size. It's not even clear to me we need CMA in DT either. The
> > > > > > > reasoning long ago was the kernel didn't do a good job of moving 
> > > > > > > and
> > > > > > > reclaiming contiguous space, but that's supposed to be better now 
> > > > > > > (and
> > > > > > > most h/w figured out they need IOMMUs).
> > > > > > >
> > > > > > > But for tag storage you know the size as it is a function of the
> > > > > > > memory size, right? After all, you are validating the size is 
> > > > > > > correct.
> > > > > > > I guess there is still the aspect of whether you want enable MTE 
> > > > > > > or
> > > > > > > not which could be done in a variety of ways.
> > > > > >
> > > > > > Oh, sorry, my bad, I should have been clearer about this. I don't 
> > > > > > want to
> > > > > > put it in the DT as a "linux,cma" node. But I want it to be managed 
> > > > > > by CMA.
> > > > >
> > > > > Yes, I understand, but my point remains. Why do you need this in DT?
> > > > > If the location doesn't matter and you can calculate the size from the
> > > > > memory size, what else is there to add to the DT?
> > > >
> > > > I am afraid there has been a misunderstanding. What do you mean by
> > > > "location doesn't matter"?
> > >
> > > You said:
> > > > Tag storage is not DMA and can live anywhere in memory.
> > >
> > > Which I took as the kernel can figure out where to put it. But maybe
> > > you meant the h/w platform can hard code it to be anywhere in memory?
> > > If so, then yes, DT is needed.
> >
> > Ah, I see, sorry for not being clear enough, you are correct: tag storage
> > is a hardware property, and software needs a mechanism (in this case, the
> > dt) to discover its properties.
> >
> > >
> > > > At the very least, Linux needs to know the address and size of a memory
> > > > region to use it. The series is about using the tag storage memory for
> > > > data. Tag storage cannot be described as a regular memory node because 
> > > > it
> > > > cannot be tagged (and normal memory can).
> > >
> > > If the tag storage lives in the middle of memory, then it would be
> > > described in the memory node, but removed by being in reserved-memory
> > > node.
> >
> > I don't follow. Would you

Re: [PATCH] [v2] nvdimm-btt: simplify code with the scope based resource management

2023-12-14 Thread Dave Jiang




On 12/14/23 01:39, Dinghao Liu wrote:
> Use the scope based resource management (defined in
> linux/cleanup.h) to automate resource lifetime
> control on struct btt_sb *super in discover_arenas().
> 
> Signed-off-by: Dinghao Liu 
Reviewed-by: Dave Jiang 
> ---
> 
> Changelog:
> 
> v2: Set the __free attribute before kzalloc.
> ---
>  drivers/nvdimm/btt.c | 13 -
>  1 file changed, 4 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
> index d5593b0dc700..32a9e2f543c5 100644
> --- a/drivers/nvdimm/btt.c
> +++ b/drivers/nvdimm/btt.c
> @@ -16,6 +16,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "btt.h"
>  #include "nd.h"
>  
> @@ -847,23 +848,20 @@ static int discover_arenas(struct btt *btt)
>  {
>   int ret = 0;
>   struct arena_info *arena;
> - struct btt_sb *super;
>   size_t remaining = btt->rawsize;
>   u64 cur_nlba = 0;
>   size_t cur_off = 0;
>   int num_arenas = 0;
>  
> - super = kzalloc(sizeof(*super), GFP_KERNEL);
> + struct btt_sb *super __free(kfree) = kzalloc(sizeof(*super), 
> GFP_KERNEL);
>   if (!super)
>   return -ENOMEM;
>  
>   while (remaining) {
>   /* Alloc memory for arena */
>   arena = alloc_arena(btt, 0, 0, 0);
> - if (!arena) {
> - ret = -ENOMEM;
> - goto out_super;
> - }
> + if (!arena)
> + return -ENOMEM;
>  
>   arena->infooff = cur_off;
>   ret = btt_info_read(arena, super);
> @@ -919,14 +917,11 @@ static int discover_arenas(struct btt *btt)
>   btt->nlba = cur_nlba;
>   btt->init_state = INIT_READY;
>  
> - kfree(super);
>   return ret;
>  
>   out:
>   kfree(arena);
>   free_arenas(btt);
> - out_super:
> - kfree(super);
>   return ret;
>  }
>

Re: [PATCH] driver core: Add a guard() definition for the device_lock()

2023-12-14 Thread Greg KH

On Wed, Dec 13, 2023 at 03:02:35PM -0800, Dan Williams wrote:
> At present there are ~200 usages of device_lock() in the kernel. Some of
> those usages lead to "goto unlock;" patterns which have proven to be
> error prone. Define a "device" guard() definition to allow for those to
> be cleaned up and prevent new ones from appearing.
> 
> Link: 
> http://lore.kernel.org/r/657897453dda8_269bd29...@dwillia2-mobl3.amr.corp.intel.com.notmuch
> Link: 
> http://lore.kernel.org/r/6577b0c2a02df_a04c529...@dwillia2-xfh.jf.intel.com.notmuch
> Cc: Vishal Verma 
> Cc: Ira Weiny 
> Cc: Peter Zijlstra 
> Cc: Greg Kroah-Hartman 
> Cc: Andrew Morton 
> Signed-off-by: Dan Williams 
> ---
> Hi Greg,
> 
> I wonder if you might include this change in v6.7-rc to ease some patch
> sets alternately going through my tree and Andrew's tree. Those
> discussions are linked above. Alternately I can can just take it through
> my tree with your ack and the other use case can circle back to it in
> the v6.9 cycle.

Sure, I'll queue it up now for 6.7-final, makes sense to have it now for
others to build off of, and for me to fix up some places in the driver
core to use it as well.

> I considered also defining a __free() helper similar to __free(mutex),
> but I think "__free(device)" would be a surprising name for something
> that drops a lock. Also, I like the syntax of guard(device) over
> something like guard(device_lock) since a 'struct device *' is the
> argument, not a lock type, but I'm open to your or Peter's thoughts on
> the naming.

guard(device); makes sense to me, as that's what you are doing here, so
I'm good with it.

thanks,

greg k-h

Re: [PATCH -next 2/2] mm: vmscan: add new event to trace shrink lru

2023-12-14 Thread kernel test robot

Hi Bixuan,

kernel test robot noticed the following build errors:

[auto build test ERROR on next-20231211]

url:
https://github.com/intel-lab-lkp/linux/commits/Bixuan-Cui/mm-shrinker-add-new-event-to-trace-shrink-count/20231212-112824
base:   next-20231211
patch link:
https://lore.kernel.org/r/20231212032640.6968-3-cuibixuan%40vivo.com
patch subject: [PATCH -next 2/2] mm: vmscan: add new event to trace shrink lru
config: i386-buildonly-randconfig-003-20231214 
(https://download.01.org/0day-ci/archive/20231214/202312142212.vbse7cms-...@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): 
(https://download.01.org/0day-ci/archive/20231214/202312142212.vbse7cms-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202312142212.vbse7cms-...@intel.com/

All errors (new ones prefixed by >>):

   mm/vmscan.c: In function 'evict_folios':
>> mm/vmscan.c:4533:9: error: implicit declaration of function 
>> 'trace_mm_vmscan_lru_shrink_inactive'; did you mean 
>> 'trace_mm_vmscan_lru_shrink_inactive_end'? 
>> [-Werror=implicit-function-declaration]
4533 | trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
 | ^~~
 | trace_mm_vmscan_lru_shrink_inactive_end
   cc1: some warnings being treated as errors


vim +4533 mm/vmscan.c

ac35a490237446 Yu Zhao 2022-09-18  4500  
a579086c99ed70 Yu Zhao 2022-12-21  4501  static int 
evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
ac35a490237446 Yu Zhao 2022-09-18  4502  {
ac35a490237446 Yu Zhao 2022-09-18  4503 int type;
ac35a490237446 Yu Zhao 2022-09-18  4504 int scanned;
ac35a490237446 Yu Zhao 2022-09-18  4505 int reclaimed;
ac35a490237446 Yu Zhao 2022-09-18  4506 LIST_HEAD(list);
359a5e1416caaf Yu Zhao 2022-11-15  4507 
LIST_HEAD(clean);
ac35a490237446 Yu Zhao 2022-09-18  4508 struct folio 
*folio;
359a5e1416caaf Yu Zhao 2022-11-15  4509 struct folio 
*next;
ac35a490237446 Yu Zhao 2022-09-18  4510 enum 
vm_event_item item;
ac35a490237446 Yu Zhao 2022-09-18  4511 struct 
reclaim_stat stat;
bd74fdaea14602 Yu Zhao 2022-09-18  4512 struct 
lru_gen_mm_walk *walk;
359a5e1416caaf Yu Zhao 2022-11-15  4513 bool skip_retry 
= false;
ac35a490237446 Yu Zhao 2022-09-18  4514 struct 
mem_cgroup *memcg = lruvec_memcg(lruvec);
ac35a490237446 Yu Zhao 2022-09-18  4515 struct 
pglist_data *pgdat = lruvec_pgdat(lruvec);
ac35a490237446 Yu Zhao 2022-09-18  4516  
ac35a490237446 Yu Zhao 2022-09-18  4517 
spin_lock_irq(>lru_lock);
ac35a490237446 Yu Zhao 2022-09-18  4518  
ac35a490237446 Yu Zhao 2022-09-18  4519 scanned = 
isolate_folios(lruvec, sc, swappiness, , );
ac35a490237446 Yu Zhao 2022-09-18  4520  
ac35a490237446 Yu Zhao 2022-09-18  4521 scanned += 
try_to_inc_min_seq(lruvec, swappiness);
ac35a490237446 Yu Zhao 2022-09-18  4522  
ac35a490237446 Yu Zhao 2022-09-18  4523 if 
(get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
ac35a490237446 Yu Zhao 2022-09-18  4524 scanned 
= 0;
ac35a490237446 Yu Zhao 2022-09-18  4525  
ac35a490237446 Yu Zhao 2022-09-18  4526 
spin_unlock_irq(>lru_lock);
ac35a490237446 Yu Zhao 2022-09-18  4527  
ac35a490237446 Yu Zhao 2022-09-18  4528 if 
(list_empty())
ac35a490237446 Yu Zhao 2022-09-18  4529 return 
scanned;
359a5e1416caaf Yu Zhao 2022-11-15  4530  retry:
49fd9b6df54e61 Matthew Wilcox (Oracle  2022-09-02  4531)reclaimed = 
shrink_folio_list(, pgdat, sc, , false);
359a5e1416caaf Yu Zhao 2022-11-15  4532 
sc->nr_reclaimed += reclaimed;
8c2214fc9a470a Jaewon Kim  2023-10-03 @4533 
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
8c2214fc9a470a Jaewon Kim  2023-10-03  4534 
scanned, reclaimed, , sc->priority,
8c2214fc9a470a Jaewon Kim  2023-10-03  4535 
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
359a5e1416caaf Yu Zhao 2022-11-15  4536  
359a5e1416caaf Yu Zhao 2022-11-15  4537 
list_for_each_entry_safe_reverse(folio, next, , lru) {
359a5e1

Re: [PATCH vhost v2 4/8] vdpa/mlx5: Mark vq addrs for modification in hw vq

On Thu, Dec 14, 2023 at 01:39:55PM +, Dragos Tatulea wrote:
> On Tue, 2023-12-12 at 15:44 -0800, Si-Wei Liu wrote:
> > 
> > On 12/12/2023 11:21 AM, Eugenio Perez Martin wrote:
> > > On Tue, Dec 5, 2023 at 11:46 AM Dragos Tatulea  
> > > wrote:
> > > > Addresses get set by .set_vq_address. hw vq addresses will be updated on
> > > > next modify_virtqueue.
> > > > 
> > > > Signed-off-by: Dragos Tatulea 
> > > > Reviewed-by: Gal Pressman 
> > > > Acked-by: Eugenio Pérez 
> > > I'm kind of ok with this patch and the next one about state, but I
> > > didn't ack them in the previous series.
> > > 
> > > My main concern is that it is not valid to change the vq address after
> > > DRIVER_OK in VirtIO, which vDPA follows. Only memory maps are ok to
> > > change at this moment. I'm not sure about vq state in vDPA, but vhost
> > > forbids changing it with an active backend.
> > > 
> > > Suspend is not defined in VirtIO at this moment though, so maybe it is
> > > ok to decide that all of these parameters may change during suspend.
> > > Maybe the best thing is to protect this with a vDPA feature flag.
> > I think protect with vDPA feature flag could work, while on the other 
> > hand vDPA means vendor specific optimization is possible around suspend 
> > and resume (in case it helps performance), which doesn't have to be 
> > backed by virtio spec. Same applies to vhost user backend features, 
> > variations there were not backed by spec either. Of course, we should 
> > try best to make the default behavior backward compatible with 
> > virtio-based backend, but that circles back to no suspend definition in 
> > the current virtio spec, for which I hope we don't cease development on 
> > vDPA indefinitely. After all, the virtio based vdap backend can well 
> > define its own feature flag to describe (minor difference in) the 
> > suspend behavior based on the later spec once it is formed in future.
> > 
> So what is the way forward here? From what I understand the options are:
> 
> 1) Add a vdpa feature flag for changing device properties while suspended.
> 
> 2) Drop these 2 patches from the series for now. Not sure if this makes sense 
> as
> this. But then Si-Wei's qemu device suspend/resume poc [0] that exercises this
> code won't work anymore. This means the series would be less well tested.
> 
> Are there other possible options? What do you think?
> 
> [0] https://github.com/siwliu-kernel/qemu/tree/svq-resume-wip

I am fine with either of these.

> Thanks,
> Dragos
> 
> > Regards,
> > -Siwei
> > 
> > 
> > 
> > > 
> > > Jason, what do you think?
> > > 
> > > Thanks!
> > > 
> > > > ---
> > > >   drivers/vdpa/mlx5/net/mlx5_vnet.c  | 9 +
> > > >   include/linux/mlx5/mlx5_ifc_vdpa.h | 1 +
> > > >   2 files changed, 10 insertions(+)
> > > > 
> > > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > index f8f088cced50..80e066de0866 100644
> > > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > > @@ -1209,6 +1209,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net 
> > > > *ndev,
> > > >  bool state_change = false;
> > > >  void *obj_context;
> > > >  void *cmd_hdr;
> > > > +   void *vq_ctx;
> > > >  void *in;
> > > >  int err;
> > > > 
> > > > @@ -1230,6 +1231,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net 
> > > > *ndev,
> > > >  MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, 
> > > > ndev->mvdev.res.uid);
> > > > 
> > > >  obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, 
> > > > obj_context);
> > > > +   vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
> > > > virtio_q_context);
> > > > 
> > > >  if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
> > > >  if (!is_valid_state_change(mvq->fw_state, state, 
> > > > is_resumable(ndev))) {
> > > > @@ -1241,6 +1243,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net 
> > > > *ndev,
> > > >  state_change = true;
> > > >  }
> > > > 
> > > > +   if (mvq->modified_fields & 
> > > > MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
> > > > +   MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
> > > > +   MLX5_SET64(virtio_q, vq_ctx, used_addr, 
> > > > mvq->device_addr);
> > > > +   MLX5_SET64(virtio_q, vq_ctx, available_addr, 
> > > > mvq->driver_addr);
> > > > +   }
> > > > +
> > > >  MLX5_SET64(virtio_net_q_object, obj_context, 
> > > > modify_field_select, mvq->modified_fields);
> > > >  err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, 
> > > > sizeof(out));
> > > >  if (err)
> > > > @@ -2202,6 +2210,7 @@ static int mlx5_vdpa_set_vq_address(struct 
> > > > vdpa_device *vdev, u16 idx, u64 desc_
> > > >  mvq->desc_addr = desc_area;
> > > >  mvq->device_addr = device_area;
> > > >  mvq->driver_addr = driver_area;
> > > > +

Re: [PATCH vhost v2 4/8] vdpa/mlx5: Mark vq addrs for modification in hw vq

2023-12-14 Thread Dragos Tatulea

On Tue, 2023-12-12 at 15:44 -0800, Si-Wei Liu wrote:
> 
> On 12/12/2023 11:21 AM, Eugenio Perez Martin wrote:
> > On Tue, Dec 5, 2023 at 11:46 AM Dragos Tatulea  wrote:
> > > Addresses get set by .set_vq_address. hw vq addresses will be updated on
> > > next modify_virtqueue.
> > > 
> > > Signed-off-by: Dragos Tatulea 
> > > Reviewed-by: Gal Pressman 
> > > Acked-by: Eugenio Pérez 
> > I'm kind of ok with this patch and the next one about state, but I
> > didn't ack them in the previous series.
> > 
> > My main concern is that it is not valid to change the vq address after
> > DRIVER_OK in VirtIO, which vDPA follows. Only memory maps are ok to
> > change at this moment. I'm not sure about vq state in vDPA, but vhost
> > forbids changing it with an active backend.
> > 
> > Suspend is not defined in VirtIO at this moment though, so maybe it is
> > ok to decide that all of these parameters may change during suspend.
> > Maybe the best thing is to protect this with a vDPA feature flag.
> I think protect with vDPA feature flag could work, while on the other 
> hand vDPA means vendor specific optimization is possible around suspend 
> and resume (in case it helps performance), which doesn't have to be 
> backed by virtio spec. Same applies to vhost user backend features, 
> variations there were not backed by spec either. Of course, we should 
> try best to make the default behavior backward compatible with 
> virtio-based backend, but that circles back to no suspend definition in 
> the current virtio spec, for which I hope we don't cease development on 
> vDPA indefinitely. After all, the virtio based vdap backend can well 
> define its own feature flag to describe (minor difference in) the 
> suspend behavior based on the later spec once it is formed in future.
> 
So what is the way forward here? From what I understand the options are:

1) Add a vdpa feature flag for changing device properties while suspended.

2) Drop these 2 patches from the series for now. Not sure if this makes sense as
this. But then Si-Wei's qemu device suspend/resume poc [0] that exercises this
code won't work anymore. This means the series would be less well tested.

Are there other possible options? What do you think?

[0] https://github.com/siwliu-kernel/qemu/tree/svq-resume-wip

Thanks,
Dragos

> Regards,
> -Siwei
> 
> 
> 
> > 
> > Jason, what do you think?
> > 
> > Thanks!
> > 
> > > ---
> > >   drivers/vdpa/mlx5/net/mlx5_vnet.c  | 9 +
> > >   include/linux/mlx5/mlx5_ifc_vdpa.h | 1 +
> > >   2 files changed, 10 insertions(+)
> > > 
> > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> > > b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > index f8f088cced50..80e066de0866 100644
> > > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> > > @@ -1209,6 +1209,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net 
> > > *ndev,
> > >  bool state_change = false;
> > >  void *obj_context;
> > >  void *cmd_hdr;
> > > +   void *vq_ctx;
> > >  void *in;
> > >  int err;
> > > 
> > > @@ -1230,6 +1231,7 @@ static int modify_virtqueue(struct mlx5_vdpa_net 
> > > *ndev,
> > >  MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, 
> > > ndev->mvdev.res.uid);
> > > 
> > >  obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, 
> > > obj_context);
> > > +   vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, 
> > > virtio_q_context);
> > > 
> > >  if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
> > >  if (!is_valid_state_change(mvq->fw_state, state, 
> > > is_resumable(ndev))) {
> > > @@ -1241,6 +1243,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net 
> > > *ndev,
> > >  state_change = true;
> > >  }
> > > 
> > > +   if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) 
> > > {
> > > +   MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
> > > +   MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
> > > +   MLX5_SET64(virtio_q, vq_ctx, available_addr, 
> > > mvq->driver_addr);
> > > +   }
> > > +
> > >  MLX5_SET64(virtio_net_q_object, obj_context, 
> > > modify_field_select, mvq->modified_fields);
> > >  err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, 
> > > sizeof(out));
> > >  if (err)
> > > @@ -2202,6 +2210,7 @@ static int mlx5_vdpa_set_vq_address(struct 
> > > vdpa_device *vdev, u16 idx, u64 desc_
> > >  mvq->desc_addr = desc_area;
> > >  mvq->device_addr = device_area;
> > >  mvq->driver_addr = driver_area;
> > > +   mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS;
> > >  return 0;
> > >   }
> > > 
> > > diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h 
> > > b/include/linux/mlx5/mlx5_ifc_vdpa.h
> > > index b86d51a855f6..9594ac405740 100644
> > > --- a/include/linux/mlx5/mlx5_ifc_vdpa.h
> > > +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
>

Re: [PATCH net-next v9 3/4] vsock: update SO_RCVLOWAT setting callback




On 14.12.2023 13:29, Michael S. Tsirkin wrote:
> On Thu, Dec 14, 2023 at 12:19:46PM +0300, Arseniy Krasnov wrote:
>> Do not return if transport callback for SO_RCVLOWAT is set (only in
>> error case). In this case we don't need to set 'sk_rcvlowat' field in
>> each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
>> is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
>> to 'notify_set_rcvlowat'.
>>
>> Signed-off-by: Arseniy Krasnov 
>> Reviewed-by: Stefano Garzarella 
>> Acked-by: Michael S. Tsirkin 
> 
> Maybe squash this with patch 2/4?

Done in v10

Thanks, Arseniy

> 
>> ---
>>  Changelog:
>>  v3 -> v4:
>>   * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
>>   * Commit message updated.
>>
>>  include/net/af_vsock.h   | 2 +-
>>  net/vmw_vsock/af_vsock.c | 9 +++--
>>  net/vmw_vsock/hyperv_transport.c | 4 ++--
>>  3 files changed, 10 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>> index e302c0e804d0..535701efc1e5 100644
>> --- a/include/net/af_vsock.h
>> +++ b/include/net/af_vsock.h
>> @@ -137,7 +137,6 @@ struct vsock_transport {
>>  u64 (*stream_rcvhiwat)(struct vsock_sock *);
>>  bool (*stream_is_active)(struct vsock_sock *);
>>  bool (*stream_allow)(u32 cid, u32 port);
>> -int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
>>  
>>  /* SEQ_PACKET. */
>>  ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
>> @@ -168,6 +167,7 @@ struct vsock_transport {
>>  struct vsock_transport_send_notify_data *);
>>  /* sk_lock held by the caller */
>>  void (*notify_buffer_size)(struct vsock_sock *, u64 *);
>> +int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
>>  
>>  /* Shutdown. */
>>  int (*shutdown)(struct vsock_sock *, int);
>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>> index 816725af281f..54ba7316f808 100644
>> --- a/net/vmw_vsock/af_vsock.c
>> +++ b/net/vmw_vsock/af_vsock.c
>> @@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int 
>> val)
>>  
>>  transport = vsk->transport;
>>  
>> -if (transport && transport->set_rcvlowat)
>> -return transport->set_rcvlowat(vsk, val);
>> +if (transport && transport->notify_set_rcvlowat) {
>> +int err;
>> +
>> +err = transport->notify_set_rcvlowat(vsk, val);
>> +if (err)
>> +return err;
>> +}
>>  
>>  WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
>>  return 0;
> 
> 
> 
> I would s
> 
>> diff --git a/net/vmw_vsock/hyperv_transport.c 
>> b/net/vmw_vsock/hyperv_transport.c
>> index 7cb1a9d2cdb4..e2157e387217 100644
>> --- a/net/vmw_vsock/hyperv_transport.c
>> +++ b/net/vmw_vsock/hyperv_transport.c
>> @@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
>> ssize_t written,
>>  }
>>  
>>  static
>> -int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
>> +int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
>>  {
>>  return -EOPNOTSUPP;
>>  }
>> @@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
>>  .notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
>>  .notify_send_post_enqueue = hvs_notify_send_post_enqueue,
>>  
>> -.set_rcvlowat = hvs_set_rcvlowat
>> +.notify_set_rcvlowat  = hvs_notify_set_rcvlowat
>>  };
>>  
>>  static bool hvs_check_transport(struct vsock_sock *vsk)
>> -- 
>> 2.25.1
>

[PATCH net-next v10 1/3] virtio/vsock: fix logic which reduces credit update messages

Add one more condition for sending credit update during dequeue from
stream socket: when number of bytes in the rx queue is smaller than
SO_RCVLOWAT value of the socket. This is actual for non-default value
of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up user for reading data (in corner case it is also
possible to stuck both tx and rx sides, this is why 'Fixes' is used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v6 -> v7:
  * Handle wrap of 'fwd_cnt'.
  * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
 v7 -> v8:
  * Remove unneeded/wrong handling of wrap for 'fwd_cnt'.

 net/vmw_vsock/virtio_transport_common.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index c8e162c9d1df..7eabe5219ef7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -557,6 +557,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   u32 fwd_cnt_delta;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;
 
@@ -600,7 +602,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
}
 
-   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
+   free_space = vvs->buf_alloc - fwd_cnt_delta;
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
 
spin_unlock_bh(>rx_lock);
 
@@ -610,9 +615,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (fwd_cnt_delta &&
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
virtio_transport_send_credit_update(vsk);
 
return total;
-- 
2.25.1

[PATCH net-next v10 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
O_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Rename 'set_rcvlowat' callback to 'notify_set_rcvlowat' and set
'sk->sk_rcvlowat' only in one place (i.e. 'vsock_set_rcvlowat'), so the
transport doesn't need to do it.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v9 -> v10:
  * This is squash of 0002 and 0003 from v9.

 drivers/vhost/vsock.c   |  1 +
 include/linux/virtio_vsock.h|  1 +
 include/net/af_vsock.h  |  2 +-
 net/vmw_vsock/af_vsock.c|  9 ++--
 net/vmw_vsock/hyperv_transport.c|  4 ++--
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 30 +
 net/vmw_vsock/vsock_loopback.c  |  1 +
 8 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
-   int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+   int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->notify_set_rcvlowat) {
+   int err;
+
+   err = transport->notify_set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-   .set_rcvlowat = hvs_set_rcvlowat
+   .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index

[PATCH net-next v10 0/3] send credit update during setting SO_RCVLOWAT

Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=9bab51bd662be4c3ebb18a28879981d69f3ef15a

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
Link to v6:
https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
Link to v7:
https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/
Link to v8:
https://lore.kernel.org/netdev/20231211211658.2904268-1-avkras...@salutedevices.com/
Link to v9:
https://lore.kernel.org/netdev/20231214091947.395892-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v4 -> v5:
 * Change patchset tag 'RFC' -> 'net-next'.
 * See per-patch changelog after ---.
v5 -> v6:
 * New patch 0003 which sends credit update during reading bytes from
   socket.
 * See per-patch changelog after ---.
v6 -> v7:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v7 -> v8:
 * See per-patch changelog after ---.
v8 -> v9:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * Add 'Fixes' tag for the current 0002.
 * Reorder patches by moving two fixes first.
v9 -> v10:
 * Squash 0002 and 0003 and update commit message in result.

Arseniy Krasnov (3):
  virtio/vsock: fix logic which reduces credit update messages
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock/test: two tests to check credit update logic

 drivers/vhost/vsock.c   |   1 +
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   1 +
 net/vmw_vsock/virtio_transport_common.c |  43 +-
 net/vmw_vsock/vsock_loopback.c  |   1 +
 tools/testing/vsock/vsock_test.c| 175 
 9 files changed, 229 insertions(+), 8 deletions(-)

-- 
2.25.1

[PATCH net-next v10 3/3] vsock/test: two tests to check credit update logic

Both tests are almost same, only differs in two 'if' conditions, so
implemented in a single function. Tests check, that credit update
message is sent:

1) During setting SO_RCVLOWAT value of the socket.
2) When number of 'rx_bytes' become smaller than SO_RCVLOWAT value.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.
 v2 -> v3:
  * Replace synchronization based on control TCP socket with vsock
data socket - this is needed to allow sender transmit data only
when new buffer size of receiver is visible to sender. Otherwise
there is race and test fails sometimes.
 v3 -> v4:
  * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
in server part. This is needed to ensure that 'poll()' wake up us
when number of bytes ready to read is equal to SO_RCVLOWAT value.
 v4 -> v5:
  * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.
 v5 -> v6:
  * Add second test which checks, that credit update is sent during
reading data from socket.
  * Update commit message.

 tools/testing/vsock/vsock_test.c | 175 +++
 1 file changed, 175 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..66246d81d654 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_credit_update_test(const struct test_opts *opts,
+  bool low_rx_bytes_test)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  _size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   if (low_rx_bytes_test) {
+   /* Set new SO_RCVLOWAT here. This enables sending credit
+* update when number of bytes if our rx queue become <
+* SO_RCVLOWAT value.
+*/
+   recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  _buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+

Re: [PATCH net-next v9 3/4] vsock: update SO_RCVLOWAT setting callback

On Thu, Dec 14, 2023 at 01:52:50PM +0300, Arseniy Krasnov wrote:
> 
> 
> On 14.12.2023 13:29, Michael S. Tsirkin wrote:
> > On Thu, Dec 14, 2023 at 12:19:46PM +0300, Arseniy Krasnov wrote:
> >> Do not return if transport callback for SO_RCVLOWAT is set (only in
> >> error case). In this case we don't need to set 'sk_rcvlowat' field in
> >> each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
> >> is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
> >> to 'notify_set_rcvlowat'.
> >>
> >> Signed-off-by: Arseniy Krasnov 
> >> Reviewed-by: Stefano Garzarella 
> >> Acked-by: Michael S. Tsirkin 
> > 
> > Maybe squash this with patch 2/4?
> 
> You mean just do 'git squash' without updating commit message manually?
> 
> Thanks, Arseniy

commit message should reflect all that patch does.

> > 
> >> ---
> >>  Changelog:
> >>  v3 -> v4:
> >>   * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
> >>   * Commit message updated.
> >>
> >>  include/net/af_vsock.h   | 2 +-
> >>  net/vmw_vsock/af_vsock.c | 9 +++--
> >>  net/vmw_vsock/hyperv_transport.c | 4 ++--
> >>  3 files changed, 10 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> >> index e302c0e804d0..535701efc1e5 100644
> >> --- a/include/net/af_vsock.h
> >> +++ b/include/net/af_vsock.h
> >> @@ -137,7 +137,6 @@ struct vsock_transport {
> >>u64 (*stream_rcvhiwat)(struct vsock_sock *);
> >>bool (*stream_is_active)(struct vsock_sock *);
> >>bool (*stream_allow)(u32 cid, u32 port);
> >> -  int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
> >>  
> >>/* SEQ_PACKET. */
> >>ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
> >> @@ -168,6 +167,7 @@ struct vsock_transport {
> >>struct vsock_transport_send_notify_data *);
> >>/* sk_lock held by the caller */
> >>void (*notify_buffer_size)(struct vsock_sock *, u64 *);
> >> +  int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
> >>  
> >>/* Shutdown. */
> >>int (*shutdown)(struct vsock_sock *, int);
> >> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> >> index 816725af281f..54ba7316f808 100644
> >> --- a/net/vmw_vsock/af_vsock.c
> >> +++ b/net/vmw_vsock/af_vsock.c
> >> @@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int 
> >> val)
> >>  
> >>transport = vsk->transport;
> >>  
> >> -  if (transport && transport->set_rcvlowat)
> >> -  return transport->set_rcvlowat(vsk, val);
> >> +  if (transport && transport->notify_set_rcvlowat) {
> >> +  int err;
> >> +
> >> +  err = transport->notify_set_rcvlowat(vsk, val);
> >> +  if (err)
> >> +  return err;
> >> +  }
> >>  
> >>WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
> >>return 0;
> > 
> > 
> > 
> > I would s
> > 
> >> diff --git a/net/vmw_vsock/hyperv_transport.c 
> >> b/net/vmw_vsock/hyperv_transport.c
> >> index 7cb1a9d2cdb4..e2157e387217 100644
> >> --- a/net/vmw_vsock/hyperv_transport.c
> >> +++ b/net/vmw_vsock/hyperv_transport.c
> >> @@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock 
> >> *vsk, ssize_t written,
> >>  }
> >>  
> >>  static
> >> -int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
> >> +int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
> >>  {
> >>return -EOPNOTSUPP;
> >>  }
> >> @@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
> >>.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
> >>.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
> >>  
> >> -  .set_rcvlowat = hvs_set_rcvlowat
> >> +  .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
> >>  };
> >>  
> >>  static bool hvs_check_transport(struct vsock_sock *vsk)
> >> -- 
> >> 2.25.1
> >

Re: [PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler

On Thu, 14 Dec 2023 at 10:20, Lukasz Luba  wrote:
>
>
>
> On 12/12/23 14:27, Vincent Guittot wrote:
> > Provide to the scheduler a feedback about the temporary max available
> > capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> > filtered as the pressure will happen for dozens ms or more.
> >
> > Signed-off-by: Vincent Guittot 
> > ---
> >   drivers/cpufreq/cpufreq.c | 48 +++
> >   include/linux/cpufreq.h   | 10 
> >   2 files changed, 58 insertions(+)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 44db4f59c4cc..7d5f71be8d29 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy 
> > *policy, unsigned int cpu)
> >   }
> >   EXPORT_SYMBOL(cpufreq_get_policy);
> >
> > +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> > +EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
>
> Why do we export this variable when we have get/update functions?
> Do we expect modules would manipulate those per-cpu variables
> independently and not like we do per-cpumask in the update func.?

No, I will remove the EXPORT_PER_CPU_SYMBOL_GPL

Re: [PATCH net-next v9 3/4] vsock: update SO_RCVLOWAT setting callback




On 14.12.2023 13:29, Michael S. Tsirkin wrote:
> On Thu, Dec 14, 2023 at 12:19:46PM +0300, Arseniy Krasnov wrote:
>> Do not return if transport callback for SO_RCVLOWAT is set (only in
>> error case). In this case we don't need to set 'sk_rcvlowat' field in
>> each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
>> is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
>> to 'notify_set_rcvlowat'.
>>
>> Signed-off-by: Arseniy Krasnov 
>> Reviewed-by: Stefano Garzarella 
>> Acked-by: Michael S. Tsirkin 
> 
> Maybe squash this with patch 2/4?

You mean just do 'git squash' without updating commit message manually?

Thanks, Arseniy

> 
>> ---
>>  Changelog:
>>  v3 -> v4:
>>   * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
>>   * Commit message updated.
>>
>>  include/net/af_vsock.h   | 2 +-
>>  net/vmw_vsock/af_vsock.c | 9 +++--
>>  net/vmw_vsock/hyperv_transport.c | 4 ++--
>>  3 files changed, 10 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>> index e302c0e804d0..535701efc1e5 100644
>> --- a/include/net/af_vsock.h
>> +++ b/include/net/af_vsock.h
>> @@ -137,7 +137,6 @@ struct vsock_transport {
>>  u64 (*stream_rcvhiwat)(struct vsock_sock *);
>>  bool (*stream_is_active)(struct vsock_sock *);
>>  bool (*stream_allow)(u32 cid, u32 port);
>> -int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
>>  
>>  /* SEQ_PACKET. */
>>  ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
>> @@ -168,6 +167,7 @@ struct vsock_transport {
>>  struct vsock_transport_send_notify_data *);
>>  /* sk_lock held by the caller */
>>  void (*notify_buffer_size)(struct vsock_sock *, u64 *);
>> +int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
>>  
>>  /* Shutdown. */
>>  int (*shutdown)(struct vsock_sock *, int);
>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>> index 816725af281f..54ba7316f808 100644
>> --- a/net/vmw_vsock/af_vsock.c
>> +++ b/net/vmw_vsock/af_vsock.c
>> @@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int 
>> val)
>>  
>>  transport = vsk->transport;
>>  
>> -if (transport && transport->set_rcvlowat)
>> -return transport->set_rcvlowat(vsk, val);
>> +if (transport && transport->notify_set_rcvlowat) {
>> +int err;
>> +
>> +err = transport->notify_set_rcvlowat(vsk, val);
>> +if (err)
>> +return err;
>> +}
>>  
>>  WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
>>  return 0;
> 
> 
> 
> I would s
> 
>> diff --git a/net/vmw_vsock/hyperv_transport.c 
>> b/net/vmw_vsock/hyperv_transport.c
>> index 7cb1a9d2cdb4..e2157e387217 100644
>> --- a/net/vmw_vsock/hyperv_transport.c
>> +++ b/net/vmw_vsock/hyperv_transport.c
>> @@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
>> ssize_t written,
>>  }
>>  
>>  static
>> -int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
>> +int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
>>  {
>>  return -EOPNOTSUPP;
>>  }
>> @@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
>>  .notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
>>  .notify_send_post_enqueue = hvs_notify_send_post_enqueue,
>>  
>> -.set_rcvlowat = hvs_set_rcvlowat
>> +.notify_set_rcvlowat  = hvs_notify_set_rcvlowat
>>  };
>>  
>>  static bool hvs_check_transport(struct vsock_sock *vsk)
>> -- 
>> 2.25.1
>

Re: [PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler





On 12/14/23 09:40, Rafael J. Wysocki wrote:

On Thu, Dec 14, 2023 at 10:07 AM Lukasz Luba  wrote:


On 12/14/23 07:57, Vincent Guittot wrote:

On Thu, 14 Dec 2023 at 06:43, Viresh Kumar  wrote:


On 12-12-23, 15:27, Vincent Guittot wrote:

@@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy 
*policy,
policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
trace_cpu_frequency_limits(policy);

+ cpus = policy->related_cpus;
+ cpufreq_update_pressure(cpus, policy->max);
+
policy->cached_target_freq = UINT_MAX;


One more question, why are you doing this from cpufreq_set_policy ? If
due to cpufreq cooling or from userspace, we end up limiting the
maximum possible frequency, will this routine always get called ?


Yes, any update of a FREQ_QOS_MAX ends up calling cpufreq_set_policy()
to update the policy->max



Agree, cpufreq sysfs scaling_max_freq is also important to handle
in this new design. Currently we don't reflect that as reduced CPU
capacity in the scheduler. There was discussion when I proposed to feed
that CPU frequency reduction into thermal_pressure [1].

The same applies for the DTPM which is missing currently the proper
impact to the CPU reduced capacity in the scheduler.

IMHO any limit set into FREQ_QOS_MAX should be visible in this
new design of capacity reduction signaling.

[1] https://lore.kernel.org/lkml/20220930094821.31665-2-lukasz.l...@arm.com/


Actually, freq_qos_read_value(>constraints, FREQ_QOS_MAX) will
return the requisite limit.


Yes, but we need to translate that information from freq domain
into capacity domain and plumb ii into scheduler as stolen CPU capacity.
Ideally, w/o any 'smoothing' but just instant value.
That's the hope of this patch set re-design.

Re: [PATCH net-next v9 3/4] vsock: update SO_RCVLOWAT setting callback

On Thu, Dec 14, 2023 at 12:19:46PM +0300, Arseniy Krasnov wrote:
> Do not return if transport callback for SO_RCVLOWAT is set (only in
> error case). In this case we don't need to set 'sk_rcvlowat' field in
> each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
> is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
> to 'notify_set_rcvlowat'.
> 
> Signed-off-by: Arseniy Krasnov 
> Reviewed-by: Stefano Garzarella 
> Acked-by: Michael S. Tsirkin 

Maybe squash this with patch 2/4?

> ---
>  Changelog:
>  v3 -> v4:
>   * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
>   * Commit message updated.
> 
>  include/net/af_vsock.h   | 2 +-
>  net/vmw_vsock/af_vsock.c | 9 +++--
>  net/vmw_vsock/hyperv_transport.c | 4 ++--
>  3 files changed, 10 insertions(+), 5 deletions(-)
> 
> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
> index e302c0e804d0..535701efc1e5 100644
> --- a/include/net/af_vsock.h
> +++ b/include/net/af_vsock.h
> @@ -137,7 +137,6 @@ struct vsock_transport {
>   u64 (*stream_rcvhiwat)(struct vsock_sock *);
>   bool (*stream_is_active)(struct vsock_sock *);
>   bool (*stream_allow)(u32 cid, u32 port);
> - int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
>  
>   /* SEQ_PACKET. */
>   ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
> @@ -168,6 +167,7 @@ struct vsock_transport {
>   struct vsock_transport_send_notify_data *);
>   /* sk_lock held by the caller */
>   void (*notify_buffer_size)(struct vsock_sock *, u64 *);
> + int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
>  
>   /* Shutdown. */
>   int (*shutdown)(struct vsock_sock *, int);
> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> index 816725af281f..54ba7316f808 100644
> --- a/net/vmw_vsock/af_vsock.c
> +++ b/net/vmw_vsock/af_vsock.c
> @@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
>  
>   transport = vsk->transport;
>  
> - if (transport && transport->set_rcvlowat)
> - return transport->set_rcvlowat(vsk, val);
> + if (transport && transport->notify_set_rcvlowat) {
> + int err;
> +
> + err = transport->notify_set_rcvlowat(vsk, val);
> + if (err)
> + return err;
> + }
>  
>   WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
>   return 0;



I would s

> diff --git a/net/vmw_vsock/hyperv_transport.c 
> b/net/vmw_vsock/hyperv_transport.c
> index 7cb1a9d2cdb4..e2157e387217 100644
> --- a/net/vmw_vsock/hyperv_transport.c
> +++ b/net/vmw_vsock/hyperv_transport.c
> @@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
> ssize_t written,
>  }
>  
>  static
> -int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
> +int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
>  {
>   return -EOPNOTSUPP;
>  }
> @@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
>   .notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
>   .notify_send_post_enqueue = hvs_notify_send_post_enqueue,
>  
> - .set_rcvlowat = hvs_set_rcvlowat
> + .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
>  };
>  
>  static bool hvs_check_transport(struct vsock_sock *vsk)
> -- 
> 2.25.1

Re: [PATCH net-next v9 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

On Thu, Dec 14, 2023 at 12:19:45PM +0300, Arseniy Krasnov wrote:
> Send credit update message when SO_RCVLOWAT is updated and it is bigger
> than number of bytes in rx queue. It is needed, because 'poll()' will
> wait until number of bytes in rx queue will be not smaller than
> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
> for tx/rx is possible: sender waits for free space and receiver is
> waiting data in 'poll()'.
> 
> Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
> Signed-off-by: Arseniy Krasnov 
> Reviewed-by: Stefano Garzarella 
> Acked-by: Michael S. Tsirkin 
> ---
>  Changelog:
>  v1 -> v2:
>   * Update commit message by removing 'This patch adds XXX' manner.
>   * Do not initialize 'send_update' variable - set it directly during
> first usage.
>  v3 -> v4:
>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
>  v4 -> v5:
>   * Do not change callbacks order in transport structures.
>  v5 -> v6:
>   * Reorder callbacks in transport structures.
>   * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
>  v8 -> v9:
>   * Add 'Fixes' tag.
> 
>  drivers/vhost/vsock.c   |  1 +
>  include/linux/virtio_vsock.h|  1 +
>  net/vmw_vsock/virtio_transport.c|  1 +
>  net/vmw_vsock/virtio_transport_common.c | 30 +
>  net/vmw_vsock/vsock_loopback.c  |  1 +
>  5 files changed, 34 insertions(+)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index f75731396b7e..ec20ecff85c7 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
>   .notify_send_pre_enqueue  = 
> virtio_transport_notify_send_pre_enqueue,
>   .notify_send_post_enqueue = 
> virtio_transport_notify_send_post_enqueue,
>   .notify_buffer_size   = virtio_transport_notify_buffer_size,
> + .notify_set_rcvlowat  = 
> virtio_transport_notify_set_rcvlowat,
>  

This needs to be set_rcvlowat.

>   .read_skb = virtio_transport_read_skb,
>   },
> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> index ebb3ce63d64d..c82089dee0c8 100644
> --- a/include/linux/virtio_vsock.h
> +++ b/include/linux/virtio_vsock.h
> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
> *vvs, u32 credit);
>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>  int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
> read_actor);
> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
>  #endif /* _LINUX_VIRTIO_VSOCK_H */
> diff --git a/net/vmw_vsock/virtio_transport.c 
> b/net/vmw_vsock/virtio_transport.c
> index af5bab1acee1..f495b9e5186b 100644
> --- a/net/vmw_vsock/virtio_transport.c
> +++ b/net/vmw_vsock/virtio_transport.c
> @@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = {
>   .notify_send_pre_enqueue  = 
> virtio_transport_notify_send_pre_enqueue,
>   .notify_send_post_enqueue = 
> virtio_transport_notify_send_post_enqueue,
>   .notify_buffer_size   = virtio_transport_notify_buffer_size,
> + .notify_set_rcvlowat  = 
> virtio_transport_notify_set_rcvlowat,
>  
>   .read_skb = virtio_transport_read_skb,
>   },


This, too.

> diff --git a/net/vmw_vsock/virtio_transport_common.c 
> b/net/vmw_vsock/virtio_transport_common.c
> index 7eabe5219ef7..9d2305fdc65c 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -1690,6 +1690,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
> skb_read_actor_t recv_acto
>  }
>  EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>  
> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
> +{
> + struct virtio_vsock_sock *vvs = vsk->trans;
> + bool send_update;
> +
> + spin_lock_bh(>rx_lock);
> +
> + /* If number of available bytes is less than new SO_RCVLOWAT value,
> +  * kick sender to send more data, because sender may sleep in its
> +  * 'send()' syscall waiting for enough space at our side. Also
> +  * don't send credit update when peer already knows actual value -
> +  * such transmission will be useless.
> +  */
> + send_update = (vvs->rx_bytes < val) &&
> +   (vvs->fwd_cnt != vvs->last_fwd_cnt);
> +
> + spin_unlock_bh(>rx_lock);
> +
> + if (send_update) {
> + int err;
> +
> + err = virtio_transport_send_credit_update(vsk);
> + if (err < 0)
> + return err;
> + }
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
> +
>  MODULE_LICENSE("GPL v2");
>  MODULE_AUTHOR("Asias He");
>  MODULE_DESCRIPTION("common code for virtio vsock");

Re: [PATCH net-next v9 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Stefano Garzarella


On Thu, Dec 14, 2023 at 12:19:45PM +0300, Arseniy Krasnov wrote:

Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Do not initialize 'send_update' variable - set it directly during
   first usage.
v3 -> v4:
 * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
v4 -> v5:
 * Do not change callbacks order in transport structures.
v5 -> v6:
 * Reorder callbacks in transport structures.
 * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
v8 -> v9:
 * Add 'Fixes' tag.

drivers/vhost/vsock.c   |  1 +
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport.c|  1 +
net/vmw_vsock/virtio_transport_common.c | 30 +
net/vmw_vsock/vsock_loopback.c  |  1 +
5 files changed, 34 insertions(+)


As I already mentioned in the cover letter, this patch doesn't compile
unless we apply patch 3 before this one, so:

Nacked-by: Stefano Garzarella

Re: [PATCH net-next v9 0/4] send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Stefano Garzarella

On Thu, Dec 14, 2023 at 12:19:43PM +0300, Arseniy Krasnov wrote:

Hello,

DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.

STEPS

SENDER RECEIVER
1) sends 128KB + 1 byte in a
single buffer. 128KB will
be sent, but for 1 byte
sender will wait for free
space at peer. Sender goes
to sleep.

2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless).

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=9bab51bd662be4c3ebb18a28879981d69f3ef15a

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
Link to v6:
https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
Link to v7:
https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/
Link to v8:
https://lore.kernel.org/netdev/20231211211658.2904268-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* New patch is added as 0001 - it removes return from SO_RCVLOWAT set
callback in 'af_vsock.c' when transport callback is set - with that
we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
not copy-paste it to every transport. It was discussed in v1.
* See per-patch changelog after ---.
v2 -> v3:
* See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.
v4 -> v5:
* Change patchset tag 'RFC' -> 'net-next'.
* See per-patch changelog after ---.
v5 -> v6:
* New patch 0003 which sends credit update during reading bytes from
socket.
* See per-patch changelog after ---.
v6 -> v7:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.
v7 -> v8:
* See per-patch changelog after ---.
v8 -> v9:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* Add 'Fixes' tag for the current 0002.
* Reorder patches by moving two fixes first.

Arseniy Krasnov (4):
virtio/vsock: fix logic which reduces credit update messages
virtio/vsock: send credit update during setting SO_RCVLOWAT
vsock: update SO_RCVLOWAT setting callback
vsock/test: two tests to check credit update logic

This order will break the bisectability, since now patch 2 will not
build if patch 3 is not applied.

So you need to implement in patch 2 `set_rcvlowat` and in patch 3
updated it to `notify_set_rcvlowat`, otherwise we always need to
backport patch 3 in stable branches, that should be applied before
patch 2.

You have 2 options:
a. move patch 3 before patch 2 without changing the code
b. change patch 2 to use `set_rcvlowat` and updated that code in patch 3

I don't have a strong opinion, but I slightly prefer option a. BTW that
forces us to backport more patches on stable branches, so I'm fine with
option b as well.

That said:
Nacked-by: Stefano Garzarella

Re: [PATCH v2] virtio: Add support for no-reset virtio PCI PM

2023-12-14 Thread David Hildenbrand


On 08.12.23 08:07, David Stevens wrote:

If a virtio_pci_device supports native PCI power management and has the
No_Soft_Reset bit set, then skip resetting and reinitializing the device
when suspending and restoring the device. This allows system-wide low
power states like s2idle to be used in systems with stateful virtio
devices that can't simply be re-initialized (e.g. virtio-fs).

Signed-off-by: David Stevens 
---
v1 -> v2:
  - Check the No_Soft_Reset bit

  drivers/virtio/virtio_pci_common.c | 34 +-
  1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index c2524a7207cf..3a95ecaf12dc 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -492,8 +492,40 @@ static int virtio_pci_restore(struct device *dev)
return virtio_device_restore(_dev->vdev);
  }
  
+static bool vp_supports_pm_no_reset(struct device *dev)

+{
+   struct pci_dev *pci_dev = to_pci_dev(dev);
+   u16 pmcsr;
+
+   if (!pci_dev->pm_cap)
+   return false;
+
+   pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, );
+   if (PCI_POSSIBLE_ERROR(pmcsr)) {
+   dev_err(dev, "Unable to query pmcsr");
+   return false;
+   }
+
+   return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET;
+}
+
+static int virtio_pci_suspend(struct device *dev)
+{
+   return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev);
+}
+
+static int virtio_pci_resume(struct device *dev)
+{
+   return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev);
+}
+
  static const struct dev_pm_ops virtio_pci_pm_ops = {
-   SET_SYSTEM_SLEEP_PM_OPS(virtio_pci_freeze, virtio_pci_restore)
+   .suspend = virtio_pci_suspend,
+   .resume = virtio_pci_resume,
+   .freeze = virtio_pci_freeze,
+   .thaw = virtio_pci_restore,
+   .poweroff = virtio_pci_freeze,
+   .restore = virtio_pci_restore,
  };
  #endif
  


Am I correct with my assumption that this will make s2idle work with 
virtio-mem-pci as well?

Right now, all suspending is disabled, but really only s4/hibernate is 
problematic.

[root@vm-0 ~]# echo "s2idle" > /sys/power/mem_sleep
[root@vm-0 ~]# echo "mem" > /sys/power/state
[  101.822991] PM: suspend entry (s2idle)
[  101.828978] Filesystems sync: 0.004 seconds
[  101.831618] Freezing user space processes
[  101.834569] Freezing user space processes completed (elapsed 0.001 seconds)
[  101.836915] OOM killer disabled.
[  101.838072] Freezing remaining freezable tasks
[  101.841054] Freezing remaining freezable tasks completed (elapsed 0.001 
seconds)
[  101.843538] printk: Suspending console(s) (use no_console_suspend to debug)
[  101.957676] virtio_mem virtio0: save/restore not supported.
[  101.957683] virtio-pci :00:02.0: PM: pci_pm_suspend(): 
virtio_pci_freeze+0x0/0x50 returns -1
[  101.957702] virtio-pci :00:02.0: PM: dpm_run_callback(): 
pci_pm_suspend+0x0/0x170 returns -1
[  101.957718] virtio-pci :00:02.0: PM: failed to suspend async: error -1


--
Cheers,

David / dhildenb

Re: [PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler

2023-12-14 Thread Rafael J. Wysocki

On Thu, Dec 14, 2023 at 10:07 AM Lukasz Luba  wrote:
>
> On 12/14/23 07:57, Vincent Guittot wrote:
> > On Thu, 14 Dec 2023 at 06:43, Viresh Kumar  wrote:
> >>
> >> On 12-12-23, 15:27, Vincent Guittot wrote:
> >>> @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy 
> >>> *policy,
> >>>policy->max = __resolve_freq(policy, policy->max, 
> >>> CPUFREQ_RELATION_H);
> >>>trace_cpu_frequency_limits(policy);
> >>>
> >>> + cpus = policy->related_cpus;
> >>> + cpufreq_update_pressure(cpus, policy->max);
> >>> +
> >>>policy->cached_target_freq = UINT_MAX;
> >>
> >> One more question, why are you doing this from cpufreq_set_policy ? If
> >> due to cpufreq cooling or from userspace, we end up limiting the
> >> maximum possible frequency, will this routine always get called ?
> >
> > Yes, any update of a FREQ_QOS_MAX ends up calling cpufreq_set_policy()
> > to update the policy->max
> >
>
> Agree, cpufreq sysfs scaling_max_freq is also important to handle
> in this new design. Currently we don't reflect that as reduced CPU
> capacity in the scheduler. There was discussion when I proposed to feed
> that CPU frequency reduction into thermal_pressure [1].
>
> The same applies for the DTPM which is missing currently the proper
> impact to the CPU reduced capacity in the scheduler.
>
> IMHO any limit set into FREQ_QOS_MAX should be visible in this
> new design of capacity reduction signaling.
>
> [1] https://lore.kernel.org/lkml/20220930094821.31665-2-lukasz.l...@arm.com/

Actually, freq_qos_read_value(>constraints, FREQ_QOS_MAX) will
return the requisite limit.

[PATCH net-next v9 1/4] virtio/vsock: fix logic which reduces credit update messages

Add one more condition for sending credit update during dequeue from
stream socket: when number of bytes in the rx queue is smaller than
SO_RCVLOWAT value of the socket. This is actual for non-default value
of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up user for reading data (in corner case it is also
possible to stuck both tx and rx sides, this is why 'Fixes' is used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v6 -> v7:
  * Handle wrap of 'fwd_cnt'.
  * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
 v7 -> v8:
  * Remove unneeded/wrong handling of wrap for 'fwd_cnt'.

 net/vmw_vsock/virtio_transport_common.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index c8e162c9d1df..7eabe5219ef7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -557,6 +557,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   u32 fwd_cnt_delta;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;
 
@@ -600,7 +602,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
}
 
-   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
+   free_space = vvs->buf_alloc - fwd_cnt_delta;
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
 
spin_unlock_bh(>rx_lock);
 
@@ -610,9 +615,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (fwd_cnt_delta &&
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
virtio_transport_send_credit_update(vsk);
 
return total;
-- 
2.25.1

[PATCH net-next v9 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.
 v3 -> v4:
  * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
 v4 -> v5:
  * Do not change callbacks order in transport structures.
 v5 -> v6:
  * Reorder callbacks in transport structures.
  * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
 v8 -> v9:
  * Add 'Fixes' tag.

 drivers/vhost/vsock.c   |  1 +
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 30 +
 net/vmw_vsock/vsock_loopback.c  |  1 +
 5 files changed, 34 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..f495b9e5186b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 7eabe5219ef7..9d2305fdc65c 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1690,6 +1690,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(>rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side. Also
+* don't send credit update when peer already knows actual value -
+* such transmission will be useless.
+*/
+   send_update = (vvs->rx_bytes < val) &&
+ (vvs->fwd_cnt != vvs->last_fwd_cnt);
+
+   spin_unlock_bh(>rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..6dea6119f5b2 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -96,6 +96,7 @@ static struct virtio_transport loopback_transport = {

[PATCH net-next v9 0/4] send credit update during setting SO_RCVLOWAT

Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=9bab51bd662be4c3ebb18a28879981d69f3ef15a

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
Link to v6:
https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
Link to v7:
https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/
Link to v8:
https://lore.kernel.org/netdev/20231211211658.2904268-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v4 -> v5:
 * Change patchset tag 'RFC' -> 'net-next'.
 * See per-patch changelog after ---.
v5 -> v6:
 * New patch 0003 which sends credit update during reading bytes from
   socket.
 * See per-patch changelog after ---.
v6 -> v7:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v7 -> v8:
 * See per-patch changelog after ---.
v8 -> v9:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * Add 'Fixes' tag for the current 0002.
 * Reorder patches by moving two fixes first.

Arseniy Krasnov (4):
  virtio/vsock: fix logic which reduces credit update messages
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock: update SO_RCVLOWAT setting callback
  vsock/test: two tests to check credit update logic

 drivers/vhost/vsock.c   |   1 +
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   1 +
 net/vmw_vsock/virtio_transport_common.c |  43 +-
 net/vmw_vsock/vsock_loopback.c  |   1 +
 tools/testing/vsock/vsock_test.c| 175 
 9 files changed, 229 insertions(+), 8 deletions(-)

-- 
2.25.1

[PATCH net-next v9 4/4] vsock/test: two tests to check credit update logic

Both tests are almost same, only differs in two 'if' conditions, so
implemented in a single function. Tests check, that credit update
message is sent:

1) During setting SO_RCVLOWAT value of the socket.
2) When number of 'rx_bytes' become smaller than SO_RCVLOWAT value.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.
 v2 -> v3:
  * Replace synchronization based on control TCP socket with vsock
data socket - this is needed to allow sender transmit data only
when new buffer size of receiver is visible to sender. Otherwise
there is race and test fails sometimes.
 v3 -> v4:
  * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
in server part. This is needed to ensure that 'poll()' wake up us
when number of bytes ready to read is equal to SO_RCVLOWAT value.
 v4 -> v5:
  * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.
 v5 -> v6:
  * Add second test which checks, that credit update is sent during
reading data from socket.
  * Update commit message.

 tools/testing/vsock/vsock_test.c | 175 +++
 1 file changed, 175 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..66246d81d654 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_credit_update_test(const struct test_opts *opts,
+  bool low_rx_bytes_test)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  _size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   if (low_rx_bytes_test) {
+   /* Set new SO_RCVLOWAT here. This enables sending credit
+* update when number of bytes if our rx queue become <
+* SO_RCVLOWAT value.
+*/
+   recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  _buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+

[PATCH net-next v9 3/4] vsock: update SO_RCVLOWAT setting callback

Do not return if transport callback for SO_RCVLOWAT is set (only in
error case). In this case we don't need to set 'sk_rcvlowat' field in
each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
to 'notify_set_rcvlowat'.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v3 -> v4:
  * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
  * Commit message updated.

 include/net/af_vsock.h   | 2 +-
 net/vmw_vsock/af_vsock.c | 9 +++--
 net/vmw_vsock/hyperv_transport.c | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
-   int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+   int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->notify_set_rcvlowat) {
+   int err;
+
+   err = transport->notify_set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-   .set_rcvlowat = hvs_set_rcvlowat
+   .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
-- 
2.25.1

Re: [PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler





On 12/12/23 14:27, Vincent Guittot wrote:

Provide to the scheduler a feedback about the temporary max available
capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
filtered as the pressure will happen for dozens ms or more.

Signed-off-by: Vincent Guittot 
---
  drivers/cpufreq/cpufreq.c | 48 +++
  include/linux/cpufreq.h   | 10 
  2 files changed, 58 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44db4f59c4cc..7d5f71be8d29 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, 
unsigned int cpu)
  }
  EXPORT_SYMBOL(cpufreq_get_policy);
  
+DEFINE_PER_CPU(unsigned long, cpufreq_pressure);

+EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);


Why do we export this variable when we have get/update functions?
Do we expect modules would manipulate those per-cpu variables
independently and not like we do per-cpumask in the update func.?

Re: [PATCH v5 3/4] mm/memory_hotplug: export mhp_supports_memmap_on_memory()

2023-12-14 Thread David Hildenbrand


On 14.12.23 08:37, Vishal Verma wrote:

In preparation for adding sysfs ABI to toggle memmap_on_memory semantics
for drivers adding memory, export the mhp_supports_memmap_on_memory()
helper. This allows drivers to check if memmap_on_memory support is
available before trying to request it, and display an appropriate
message if it isn't available. As part of this, remove the size argument
to this - with recent updates to allow memmap_on_memory for larger
ranges, and the internal splitting of altmaps into respective memory
blocks, the size argument is meaningless.

Cc: Andrew Morton 
Cc: David Hildenbrand 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: Dan Williams 
Cc: Dave Jiang 
Cc: Dave Hansen 
Cc: Huang Ying 
Suggested-by: David Hildenbrand 
Signed-off-by: Vishal Verma 
---
  include/linux/memory_hotplug.h |  6 ++
  mm/memory_hotplug.c| 17 ++---
  2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7d2076583494..ebc9d528f00c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -121,6 +121,7 @@ struct mhp_params {
  
  bool mhp_range_allowed(u64 start, u64 size, bool need_mapping);

  struct range mhp_get_pluggable_range(bool need_mapping);
+bool mhp_supports_memmap_on_memory(void);
  
  /*

   * Zone resizing functions
@@ -262,6 +263,11 @@ static inline bool movable_node_is_enabled(void)
return false;
  }
  
+static bool mhp_supports_memmap_on_memory(void)

+{
+   return false;
+}
+
  static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {}
  static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
  static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 926e1cfb10e9..751664c519f7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1325,7 +1325,7 @@ static inline bool 
arch_supports_memmap_on_memory(unsigned long vmemmap_size)
  }
  #endif
  
-static bool mhp_supports_memmap_on_memory(unsigned long size)

+bool mhp_supports_memmap_on_memory(void)
  {
unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long memmap_pages = memory_block_memmap_on_memory_pages();
@@ -1334,17 +1334,11 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 * Besides having arch support and the feature enabled at runtime, we
 * need a few more assumptions to hold true:
 *
-* a) We span a single memory block: memory onlining/offlinin;g happens
-*in memory block granularity. We don't want the vmemmap of online
-*memory blocks to reside on offline memory blocks. In the future,
-*we might want to support variable-sized memory blocks to make the
-*feature more versatile.
-*
-* b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+* a) The vmemmap pages span complete PMDs: We don't want vmemmap code
 *to populate memory from the altmap for unrelated parts (i.e.,
 *other memory blocks)
 *
-* c) The vmemmap pages (and thereby the pages that will be exposed to
+* b) The vmemmap pages (and thereby the pages that will be exposed to
 *the buddy) have to cover full pageblocks: memory 
onlining/offlining
 *code requires applicable ranges to be page-aligned, for example, 
to
 *set the migratetypes properly.
@@ -1356,7 +1350,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 *   altmap as an alternative source of memory, and we do not 
exactly
 *   populate a single PMD.
 */
-   if (!mhp_memmap_on_memory() || size != memory_block_size_bytes())
+   if (!mhp_memmap_on_memory())
return false;
  
  	/*

@@ -1379,6 +1373,7 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
  
  	return arch_supports_memmap_on_memory(vmemmap_size);

  }
+EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
  
  static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)

  {
@@ -1512,7 +1507,7 @@ int __ref add_memory_resource(int nid, struct resource 
*res, mhp_t mhp_flags)
 * Self hosted memmap array
 */
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
-   mhp_supports_memmap_on_memory(memory_block_size_bytes())) {
+   mhp_supports_memmap_on_memory()) {
ret = create_altmaps_and_memory_blocks(nid, group, start, size);
if (ret)
goto error;



Acked-by: David Hildenbrand 

--
Cheers,

David / dhildenb

Re: [PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler





On 12/14/23 07:57, Vincent Guittot wrote:

On Thu, 14 Dec 2023 at 06:43, Viresh Kumar  wrote:


On 12-12-23, 15:27, Vincent Guittot wrote:

@@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy 
*policy,
   policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
   trace_cpu_frequency_limits(policy);

+ cpus = policy->related_cpus;
+ cpufreq_update_pressure(cpus, policy->max);
+
   policy->cached_target_freq = UINT_MAX;


One more question, why are you doing this from cpufreq_set_policy ? If
due to cpufreq cooling or from userspace, we end up limiting the
maximum possible frequency, will this routine always get called ?


Yes, any update of a FREQ_QOS_MAX ends up calling cpufreq_set_policy()
to update the policy->max



Agree, cpufreq sysfs scaling_max_freq is also important to handle
in this new design. Currently we don't reflect that as reduced CPU
capacity in the scheduler. There was discussion when I proposed to feed
that CPU frequency reduction into thermal_pressure [1].

The same applies for the DTPM which is missing currently the proper
impact to the CPU reduced capacity in the scheduler.

IMHO any limit set into FREQ_QOS_MAX should be visible in this
new design of capacity reduction signaling.

[1] https://lore.kernel.org/lkml/20220930094821.31665-2-lukasz.l...@arm.com/

Re: [PATCH 4/4] sched: Rename arch_update_thermal_pressure into arch_update_hw_pressure

On Thu, 14 Dec 2023 at 09:53, Lukasz Luba  wrote:
>
>
>
> On 12/14/23 08:36, Vincent Guittot wrote:
> > On Thu, 14 Dec 2023 at 09:30, Lukasz Luba  wrote:
> >>
> >>
> >> On 12/12/23 14:27, Vincent Guittot wrote:
>
> [snip]
>
> >>>update_rq_clock(rq);
> >>> - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
> >>> - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
> >>> + hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> >>> + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> >>
> >> We switch to task clock here, could you tell me why?
> >> Don't we have to maintain the boot command parameter for the shift?
> >
> > This should have been part of the patch5 that I finally removed. IMO,
> > the additional time shift with rq_clock_thermal is no more needed now
> > that we have 2 separates signals
> >
>
> I didn't like the left-shift which causes the signal to converge slowly.
> I rather wanted right-shift to converge (react faster), so you have my
> vote for this change. Also, I agree that with the two-signal approach
> this shift trick can go away now. I just worry about the dropped boot
> parameter.
>
> So, are going to send that patach5 which removes the
> 'sched_thermal_decay_shift' and documentation bit?

Yes, i will add it back for the next version

Re: [PATCH 4/4] sched: Rename arch_update_thermal_pressure into arch_update_hw_pressure





On 12/14/23 08:36, Vincent Guittot wrote:

On Thu, 14 Dec 2023 at 09:30, Lukasz Luba  wrote:



On 12/12/23 14:27, Vincent Guittot wrote:


[snip]


   update_rq_clock(rq);
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);


We switch to task clock here, could you tell me why?
Don't we have to maintain the boot command parameter for the shift?


This should have been part of the patch5 that I finally removed. IMO,
the additional time shift with rq_clock_thermal is no more needed now
that we have 2 separates signals



I didn't like the left-shift which causes the signal to converge slowly.
I rather wanted right-shift to converge (react faster), so you have my
vote for this change. Also, I agree that with the two-signal approach
this shift trick can go away now. I just worry about the dropped boot
parameter.

So, are going to send that patach5 which removes the
'sched_thermal_decay_shift' and documentation bit?

[PATCH RT 1/1] Linux 4.19.302-rt131

2023-12-14 Thread Daniel Wagner

v4.19.302-rt131-rc1 stable review patch.
If anyone has any objections, please let me know.

---


Signed-off-by: Daniel Wagner 
---
 localversion-rt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/localversion-rt b/localversion-rt
index 6fa797e5b850..a328b97369c2 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt130
+-rt131
-- 
2.43.0

[PATCH RT 0/1] Linux v4.19.302-rt131-rc1

2023-12-14 Thread Daniel Wagner

Dear RT Folks,

This is the RT stable review cycle of patch 4.19.302-rt131-rc1.

Please scream at me if I messed something up. Please test the patches
too.

The -rc release is also available on kernel.org

  https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git

on the v4.19-rt-next branch.

If all goes well, this patch will be converted to the next main
release on 2023-12-21.

Signing key fingerprint:

  5BF6 7BC5 0826 72CA BB45  ACAE 587C 5ECA 5D0A 306C

All keys used for the above files and repositories can be found on the
following git repository:

   git://git.kernel.org/pub/scm/docs/kernel/pgpkeys.git

Enjoy!
Daniel

Changes from v4.19.299-rt130:


Daniel Wagner (1):
  Linux 4.19.302-rt131

 localversion-rt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.43.0

Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Stefano Garzarella


On Wed, Dec 13, 2023 at 08:11:57PM +0300, Arseniy Krasnov wrote:



On 13.12.2023 18:13, Michael S. Tsirkin wrote:

On Wed, Dec 13, 2023 at 10:05:44AM -0500, Michael S. Tsirkin wrote:

On Wed, Dec 13, 2023 at 12:08:27PM +0300, Arseniy Krasnov wrote:



On 13.12.2023 11:43, Stefano Garzarella wrote:

On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote:



On 12.12.2023 19:12, Michael S. Tsirkin wrote:

On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote:



On 12.12.2023 18:54, Michael S. Tsirkin wrote:

On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:

Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/



Patchset:

Acked-by: Michael S. Tsirkin 


Thanks!




But I worry whether we actually need 3/8 in net not in net-next.


Because of "Fixes" tag ? I think this problem is not critical and reproducible
only in special cases, but i'm not familiar with netdev process so good, so I 
don't
have strong opinion. I guess @Stefano knows better.

Thanks, Arseniy


Fixes means "if you have that other commit then you need this commit
too". I think as a minimum you need to rearrange patches to make the
fix go in first. We don't want a regression followed by a fix.


I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, because this
patch fixes problem that is not related with the new patches from this patchset.


I agree, patch 3 is for sure net material (I'm fine with both rearrangement or 
send it separately), but IMHO also patch 2 could be.
I think with the same fixes tag, since before commit b89d882dc9fc ("vsock/virtio: 
reduce credit update messages") we sent a credit update
for every bytes we read, so we should not have this problem, right?


Agree for 2, so I think I can rearrange: two fixes go first, then current 0001, 
and then tests. And send it as V9 for 'net' only ?

Thanks, Arseniy



hmm why not net-next?


Oh I missed your previous discussion. I think everything in net-next is
safer.  Having said that, I won't nack it net, either.


So, summarizing all above:
1) This patchset entirely goes to net-next as v9
2) I reorder patches like 3 - 2 - 1 - 4, e.g. two fixes goes first with Fixes 
tag
3) Add Acked-by: Michael S. Tsirkin  to each patch

@Michael, @Stefano ?


Okay, let's do that ;-)

Stefano

[ANNOUNCE] 4.19.299-rt130

2023-12-14 Thread Daniel Wagner

Hello RT-list!

I'm pleased to announce the 4.19.299-rt130 stable release. This
is just updating the stable release 4.19.299. No RT specific changes.

There was a conflict due to the backport of 56e894982522 ("sched/rt:
Provide migrate_disable/enable() inlines") to stable. Thus I dropped the
backport because v4.19-rt already provides these two functions.

You can get this release via the git tree at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git

  branch: v4.19-rt
  Head SHA1: a279ca52df99df429ac873bcd8cf7df8569758ae

Or to build 4.19.299-rt130 directly, the following patches should be applied:

  https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.19.tar.xz

  https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.19.299.xz

  
https://www.kernel.org/pub/linux/kernel/projects/rt/4.19/older/patch-4.19.299-rt130.patch.xz

Signing key fingerprint:

  5BF6 7BC5 0826 72CA BB45  ACAE 587C 5ECA 5D0A 306C

All keys used for the above files and repositories can be found on the
following git repository:

   git://git.kernel.org/pub/scm/docs/kernel/pgpkeys.git

Enjoy!
Daniel

Changes from v4.19.295-rt129:
---

Daniel Wagner (2):
  Revert "sched/rt: Provide migrate_disable/enable() inlines"
  Linux 4.19.299-rt130
---
include/linux/preempt.h | 30 --
 localversion-rt |  2 +-
 2 files changed, 1 insertion(+), 31 deletions(-)
---
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 29ecd13afdda..9c74a019bf57 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -422,34 +422,4 @@ static inline void preempt_notifier_init(struct 
preempt_notifier *notifier,
 
 #endif
 
-/**
- * migrate_disable - Prevent migration of the current task
- *
- * Maps to preempt_disable() which also disables preemption. Use
- * migrate_disable() to annotate that the intent is to prevent migration,
- * but not necessarily preemption.
- *
- * Can be invoked nested like preempt_disable() and needs the corresponding
- * number of migrate_enable() invocations.
- */
-static __always_inline void migrate_disable(void)
-{
-   preempt_disable();
-}
-
-/**
- * migrate_enable - Allow migration of the current task
- *
- * Counterpart to migrate_disable().
- *
- * As migrate_disable() can be invoked nested, only the outermost invocation
- * reenables migration.
- *
- * Currently mapped to preempt_enable().
- */
-static __always_inline void migrate_enable(void)
-{
-   preempt_enable();
-}
-
 #endif /* __LINUX_PREEMPT_H */
diff --git a/localversion-rt b/localversion-rt
index 90303f5aabcf..6fa797e5b850 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt129
+-rt130

[PATCH] [v2] nvdimm-btt: simplify code with the scope based resource management

2023-12-14 Thread Dinghao Liu

Use the scope based resource management (defined in
linux/cleanup.h) to automate resource lifetime
control on struct btt_sb *super in discover_arenas().

Signed-off-by: Dinghao Liu 
---

Changelog:

v2: Set the __free attribute before kzalloc.
---
 drivers/nvdimm/btt.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index d5593b0dc700..32a9e2f543c5 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "btt.h"
 #include "nd.h"
 
@@ -847,23 +848,20 @@ static int discover_arenas(struct btt *btt)
 {
int ret = 0;
struct arena_info *arena;
-   struct btt_sb *super;
size_t remaining = btt->rawsize;
u64 cur_nlba = 0;
size_t cur_off = 0;
int num_arenas = 0;
 
-   super = kzalloc(sizeof(*super), GFP_KERNEL);
+   struct btt_sb *super __free(kfree) = kzalloc(sizeof(*super), 
GFP_KERNEL);
if (!super)
return -ENOMEM;
 
while (remaining) {
/* Alloc memory for arena */
arena = alloc_arena(btt, 0, 0, 0);
-   if (!arena) {
-   ret = -ENOMEM;
-   goto out_super;
-   }
+   if (!arena)
+   return -ENOMEM;
 
arena->infooff = cur_off;
ret = btt_info_read(arena, super);
@@ -919,14 +917,11 @@ static int discover_arenas(struct btt *btt)
btt->nlba = cur_nlba;
btt->init_state = INIT_READY;
 
-   kfree(super);
return ret;
 
  out:
kfree(arena);
free_arenas(btt);
- out_super:
-   kfree(super);
return ret;
 }
 
-- 
2.17.1

Re: [PATCH 4/4] sched: Rename arch_update_thermal_pressure into arch_update_hw_pressure

On Thu, 14 Dec 2023 at 09:30, Lukasz Luba  wrote:
>
>
> On 12/12/23 14:27, Vincent Guittot wrote:
> > Now that cpufreq provides a pressure value to the scheduler, rename
> > arch_update_thermal_pressure into hw pressure to reflect that it returns
> > a pressure applied by HW with a high frequency and which needs filtering.
>
> I would elaborte this meaning 'filtering' here. Something like:
> '... high frequency and which needs filtering to smooth the singal and
> get an average value. That reflects available capacity of the CPU in
> longer period'

Ok I will update the commit message to provide more details

>
> > This pressure is not always related to thermal mitigation but can also be
> > generated by max current limitation as an example.
> >
> > Signed-off-by: Vincent Guittot 
> > ---
> >   arch/arm/include/asm/topology.h   |  6 ++---
> >   arch/arm64/include/asm/topology.h |  6 ++---
> >   drivers/base/arch_topology.c  | 26 +--
> >   drivers/cpufreq/qcom-cpufreq-hw.c |  4 +--
> >   include/linux/arch_topology.h |  8 +++---
> >   include/linux/sched/topology.h|  8 +++---
> >   .../{thermal_pressure.h => hw_pressure.h} | 14 +-
> >   include/trace/events/sched.h  |  2 +-
> >   init/Kconfig  | 12 -
> >   kernel/sched/core.c   |  8 +++---
> >   kernel/sched/fair.c   | 12 -
> >   kernel/sched/pelt.c   | 18 ++---
> >   kernel/sched/pelt.h   | 16 ++--
> >   kernel/sched/sched.h  |  4 +--
> >   14 files changed, 72 insertions(+), 72 deletions(-)
> >   rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
> >
> > diff --git a/arch/arm/include/asm/topology.h 
> > b/arch/arm/include/asm/topology.h
> > index 853c4f81ba4a..e175e8596b5d 100644
> > --- a/arch/arm/include/asm/topology.h
> > +++ b/arch/arm/include/asm/topology.h
> > @@ -22,9 +22,9 @@
> >   /* Enable topology flag updates */
> >   #define arch_update_cpu_topology topology_update_cpu_topology
> >
> > -/* Replace task scheduler's default thermal pressure API */
> > -#define arch_scale_thermal_pressure topology_get_thermal_pressure
> > -#define arch_update_thermal_pressure topology_update_thermal_pressure
> > +/* Replace task scheduler's default hw pressure API */
> > +#define arch_scale_hw_pressure topology_get_hw_pressure
> > +#define arch_update_hw_pressure  topology_update_hw_pressure
> >
> >   #else
> >
> > diff --git a/arch/arm64/include/asm/topology.h 
> > b/arch/arm64/include/asm/topology.h
> > index a323b109b9c4..a427650bdfba 100644
> > --- a/arch/arm64/include/asm/topology.h
> > +++ b/arch/arm64/include/asm/topology.h
> > @@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
> >   /* Enable topology flag updates */
> >   #define arch_update_cpu_topology topology_update_cpu_topology
> >
> > -/* Replace task scheduler's default thermal pressure API */
> > -#define arch_scale_thermal_pressure topology_get_thermal_pressure
> > -#define arch_update_thermal_pressure topology_update_thermal_pressure
> > +/* Replace task scheduler's default hw pressure API */
>
> s/hw/HW/ ?
>
> > +#define arch_scale_hw_pressure topology_get_hw_pressure
> > +#define arch_update_hw_pressure  topology_update_hw_pressure
> >
> >   #include 
> >
> > diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> > index 0906114963ff..3d8dc9d5c3ad 100644
> > --- a/drivers/base/arch_topology.c
> > +++ b/drivers/base/arch_topology.c
> > @@ -22,7 +22,7 @@
> >   #include 
> >
> >   #define CREATE_TRACE_POINTS
> > -#include 
> > +#include 
> >
> >   static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
> >   static struct cpumask scale_freq_counters_mask;
> > @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, 
> > unsigned long capacity)
> >   per_cpu(cpu_scale, cpu) = capacity;
> >   }
> >
> > -DEFINE_PER_CPU(unsigned long, thermal_pressure);
> > +DEFINE_PER_CPU(unsigned long, hw_pressure);
> >
> >   /**
> > - * topology_update_thermal_pressure() - Update thermal pressure for CPUs
> > + * topology_update_hw_pressure() - Update hw pressure for CPUs
>
> same here: HW?
>
> >* @cpus: The related CPUs for which capacity has been reduced
> >* @capped_freq : The maximum allowed frequency that CPUs can run at
> >*
> > - * Update the value of thermal pressure for all @cpus in the mask. The
> > + * Update the value of hw pressure for all @cpus in the mask. The
>
> HW?
>
> >* cpumask should include all (online+offline) affected CPUs, to avoid
> >* operating on stale data when hot-plug is used for some CPUs. The
> >* @capped_freq reflects the currently allowed max CPUs frequency due to
> > - * thermal capping. It might be also a boost frequency value, which is 
> > bigger
> > + * hw

Re: [PATCH 0/5] Rework system pressure interface to the scheduler





On 12/14/23 08:29, Vincent Guittot wrote:

On Thu, 14 Dec 2023 at 09:21, Lukasz Luba  wrote:


Hi Vincent,

I've been waiting for this feature, thanks!


On 12/12/23 14:27, Vincent Guittot wrote:

Following the consolidation and cleanup of CPU capacity in [1], this serie
reworks how the scheduler gets the pressures on CPUs. We need to take into
account all pressures applied by cpufreq on the compute capacity of a CPU
for dozens of ms or more and not only cpufreq cooling device or HW
mitigiations. we split the pressure applied on CPU's capacity in 2 parts:
- one from cpufreq and freq_qos
- one from HW high freq mitigiation.

The next step will be to add a dedicated interface for long standing
capping of the CPU capacity (i.e. for seconds or more) like the
scaling_max_freq of cpufreq sysfs. The latter is already taken into
account by this serie but as a temporary pressure which is not always the
best choice when we know that it will happen for seconds or more.

[1] 
https://lore.kernel.org/lkml/20231211104855.558096-1-vincent.guit...@linaro.org/

Vincent Guittot (4):
cpufreq: Add a cpufreq pressure feedback for the scheduler
sched: Take cpufreq feedback into account
thermal/cpufreq: Remove arch_update_thermal_pressure()
sched: Rename arch_update_thermal_pressure into
  arch_update_hw_pressure

   arch/arm/include/asm/topology.h   |  6 +--
   arch/arm64/include/asm/topology.h |  6 +--
   drivers/base/arch_topology.c  | 26 -
   drivers/cpufreq/cpufreq.c | 48 +
   drivers/cpufreq/qcom-cpufreq-hw.c |  4 +-
   drivers/thermal/cpufreq_cooling.c |  3 --
   include/linux/arch_topology.h |  8 +--
   include/linux/cpufreq.h   | 10 
   include/linux/sched/topology.h|  8 +--
   .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
   include/trace/events/sched.h  |  2 +-
   init/Kconfig  | 12 ++---
   kernel/sched/core.c   |  8 +--
   kernel/sched/fair.c   | 53 ++-
   kernel/sched/pelt.c   | 18 +++
   kernel/sched/pelt.h   | 16 +++---
   kernel/sched/sched.h  |  4 +-
   17 files changed, 152 insertions(+), 94 deletions(-)
   rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)



I would like to test it, but something worries me. Why there is 0/5 in
this subject and only 4 patches?


I removed a patch from the series but copied/pasted the cover letter
subject without noticing the /5 instead of /4


OK





Could you tell me your base branch that I can apply this, please?


It applies on top of tip/sched/core + [1]
and you can find it here:
https://git.linaro.org/people/vincent.guittot/kernel.git/log/?h=sched/system-pressure


Thanks for the info and handy link.

Re: [PATCH v5 4/4] dax: add a sysfs knob to control memmap_on_memory behavior

2023-12-14 Thread Greg Kroah-Hartman

On Thu, Dec 14, 2023 at 12:37:57AM -0700, Vishal Verma wrote:
> +static ssize_t memmap_on_memory_show(struct device *dev,
> +  struct device_attribute *attr, char *buf)
> +{
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> +
> + return sprintf(buf, "%d\n", dev_dax->memmap_on_memory);

checkpatch should have noticed that this should be sysfs_emit(), right?
If not, please make the change anyway.

> diff --git a/Documentation/ABI/testing/sysfs-bus-dax 
> b/Documentation/ABI/testing/sysfs-bus-dax
> index 6359f7bc9bf4..40d9965733b2 100644
> --- a/Documentation/ABI/testing/sysfs-bus-dax
> +++ b/Documentation/ABI/testing/sysfs-bus-dax
> @@ -134,3 +134,20 @@ KernelVersion:   v5.1
>  Contact: nvd...@lists.linux.dev
>  Description:
>   (RO) The id attribute indicates the region id of a dax region.
> +
> +What:/sys/bus/dax/devices/daxX.Y/memmap_on_memory
> +Date:October, 2023

It's not October anymore :)

thanks,

greg k-h

Re: [PATCH 4/4] sched: Rename arch_update_thermal_pressure into arch_update_hw_pressure




On 12/12/23 14:27, Vincent Guittot wrote:

Now that cpufreq provides a pressure value to the scheduler, rename
arch_update_thermal_pressure into hw pressure to reflect that it returns
a pressure applied by HW with a high frequency and which needs filtering.


I would elaborte this meaning 'filtering' here. Something like:
'... high frequency and which needs filtering to smooth the singal and
get an average value. That reflects available capacity of the CPU in
longer period'


This pressure is not always related to thermal mitigation but can also be
generated by max current limitation as an example.

Signed-off-by: Vincent Guittot 
---
  arch/arm/include/asm/topology.h   |  6 ++---
  arch/arm64/include/asm/topology.h |  6 ++---
  drivers/base/arch_topology.c  | 26 +--
  drivers/cpufreq/qcom-cpufreq-hw.c |  4 +--
  include/linux/arch_topology.h |  8 +++---
  include/linux/sched/topology.h|  8 +++---
  .../{thermal_pressure.h => hw_pressure.h} | 14 +-
  include/trace/events/sched.h  |  2 +-
  init/Kconfig  | 12 -
  kernel/sched/core.c   |  8 +++---
  kernel/sched/fair.c   | 12 -
  kernel/sched/pelt.c   | 18 ++---
  kernel/sched/pelt.h   | 16 ++--
  kernel/sched/sched.h  |  4 +--
  14 files changed, 72 insertions(+), 72 deletions(-)
  rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 853c4f81ba4a..e175e8596b5d 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -22,9 +22,9 @@
  /* Enable topology flag updates */
  #define arch_update_cpu_topology topology_update_cpu_topology
  
-/* Replace task scheduler's default thermal pressure API */

-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure   topology_update_thermal_pressure
+/* Replace task scheduler's default hw pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressuretopology_update_hw_pressure
  
  #else
  
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h

index a323b109b9c4..a427650bdfba 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
  /* Enable topology flag updates */
  #define arch_update_cpu_topology topology_update_cpu_topology
  
-/* Replace task scheduler's default thermal pressure API */

-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure   topology_update_thermal_pressure
+/* Replace task scheduler's default hw pressure API */


s/hw/HW/ ?


+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressuretopology_update_hw_pressure
  
  #include 
  
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c

index 0906114963ff..3d8dc9d5c3ad 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,7 +22,7 @@
  #include 
  
  #define CREATE_TRACE_POINTS

-#include 
+#include 
  
  static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);

  static struct cpumask scale_freq_counters_mask;
@@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned 
long capacity)
per_cpu(cpu_scale, cpu) = capacity;
  }
  
-DEFINE_PER_CPU(unsigned long, thermal_pressure);

+DEFINE_PER_CPU(unsigned long, hw_pressure);
  
  /**

- * topology_update_thermal_pressure() - Update thermal pressure for CPUs
+ * topology_update_hw_pressure() - Update hw pressure for CPUs


same here: HW?


   * @cpus: The related CPUs for which capacity has been reduced
   * @capped_freq : The maximum allowed frequency that CPUs can run at
   *
- * Update the value of thermal pressure for all @cpus in the mask. The
+ * Update the value of hw pressure for all @cpus in the mask. The


HW?


   * cpumask should include all (online+offline) affected CPUs, to avoid
   * operating on stale data when hot-plug is used for some CPUs. The
   * @capped_freq reflects the currently allowed max CPUs frequency due to
- * thermal capping. It might be also a boost frequency value, which is bigger
+ * hw capping. It might be also a boost frequency value, which is bigger


HW?


   * than the internal 'capacity_freq_ref' max frequency. In such case the
   * pressure value should simply be removed, since this is an indication that
- * there is no thermal throttling. The @capped_freq must be provided in kHz.
+ * there is no hw throttling. The @capped_freq must be provided in kHz.


HW?


   */
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void

Re: [PATCH 0/5] Rework system pressure interface to the scheduler

On Thu, 14 Dec 2023 at 09:21, Lukasz Luba  wrote:
>
> Hi Vincent,
>
> I've been waiting for this feature, thanks!
>
>
> On 12/12/23 14:27, Vincent Guittot wrote:
> > Following the consolidation and cleanup of CPU capacity in [1], this serie
> > reworks how the scheduler gets the pressures on CPUs. We need to take into
> > account all pressures applied by cpufreq on the compute capacity of a CPU
> > for dozens of ms or more and not only cpufreq cooling device or HW
> > mitigiations. we split the pressure applied on CPU's capacity in 2 parts:
> > - one from cpufreq and freq_qos
> > - one from HW high freq mitigiation.
> >
> > The next step will be to add a dedicated interface for long standing
> > capping of the CPU capacity (i.e. for seconds or more) like the
> > scaling_max_freq of cpufreq sysfs. The latter is already taken into
> > account by this serie but as a temporary pressure which is not always the
> > best choice when we know that it will happen for seconds or more.
> >
> > [1] 
> > https://lore.kernel.org/lkml/20231211104855.558096-1-vincent.guit...@linaro.org/
> >
> > Vincent Guittot (4):
> >cpufreq: Add a cpufreq pressure feedback for the scheduler
> >sched: Take cpufreq feedback into account
> >thermal/cpufreq: Remove arch_update_thermal_pressure()
> >sched: Rename arch_update_thermal_pressure into
> >  arch_update_hw_pressure
> >
> >   arch/arm/include/asm/topology.h   |  6 +--
> >   arch/arm64/include/asm/topology.h |  6 +--
> >   drivers/base/arch_topology.c  | 26 -
> >   drivers/cpufreq/cpufreq.c | 48 +
> >   drivers/cpufreq/qcom-cpufreq-hw.c |  4 +-
> >   drivers/thermal/cpufreq_cooling.c |  3 --
> >   include/linux/arch_topology.h |  8 +--
> >   include/linux/cpufreq.h   | 10 
> >   include/linux/sched/topology.h|  8 +--
> >   .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
> >   include/trace/events/sched.h  |  2 +-
> >   init/Kconfig  | 12 ++---
> >   kernel/sched/core.c   |  8 +--
> >   kernel/sched/fair.c   | 53 ++-
> >   kernel/sched/pelt.c   | 18 +++
> >   kernel/sched/pelt.h   | 16 +++---
> >   kernel/sched/sched.h  |  4 +-
> >   17 files changed, 152 insertions(+), 94 deletions(-)
> >   rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
> >
>
> I would like to test it, but something worries me. Why there is 0/5 in
> this subject and only 4 patches?

I removed a patch from the series but copied/pasted the cover letter
subject without noticing the /5 instead of /4

>
> Could you tell me your base branch that I can apply this, please?

It applies on top of tip/sched/core + [1]
and you can find it here:
https://git.linaro.org/people/vincent.guittot/kernel.git/log/?h=sched/system-pressure

>
> Regards,
> Lukasz

Re: [PATCH 0/5] Rework system pressure interface to the scheduler