from:"Carlos"

Re: [PATCH v3 0/2] Properly initialize speed/duplex and remove vDPA config updates

2024-09-30 Thread Carlos Bilbao

On 9/11/24 11:55 AM, Carlos Bilbao wrote:

> Hello,
>
> On 9/10/24 10:42 PM, Jason Wang wrote:
>> On Tue, Sep 10, 2024 at 2:29 PM Michael S. Tsirkin  wrote:
>>> On Wed, Sep 04, 2024 at 10:11:13AM -0500, Carlos Bilbao wrote:
>>>> From: Carlos Bilbao 
>>>>
>>>> Initialize speed and duplex for virtio_net_config to UNKNOWN (mlx5_vdpa
>>>> vDPA devices currently do not support VIRTIO_NET_F_SPEED_DUPLEX). Remove
>>>> ioctl VHOST_VDPA_SET_CONFIG and its related logic as it is not supported;
>>>> see: https://docs.oasis-open.org/virtio/virtio/v1.3/virtio-v1.3.html
>>>>
>>>> Carlos:
>>>>   vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN
>>>>   vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance
>>> This will need a rebase. Will apply once you post one.
>>> Thanks!


I successfully patched linux-next without any issues. Could you please
specify the repo/branch you need me to rebase onto? Thanks in advance


>> Note that I think patch 2 is probably not right as we indeed allow
>> config write for some device.
>
> I'll rebase patch 1 and drop patch 2.
>
>
>> Thanks
>>
> Thanks, Carlos
>

Re: [PATCH v3 0/2] Properly initialize speed/duplex and remove vDPA config updates

2024-09-11 Thread Carlos Bilbao

Hello,

On 9/10/24 10:42 PM, Jason Wang wrote:
> On Tue, Sep 10, 2024 at 2:29 PM Michael S. Tsirkin  wrote:
>> On Wed, Sep 04, 2024 at 10:11:13AM -0500, Carlos Bilbao wrote:
>>> From: Carlos Bilbao 
>>>
>>> Initialize speed and duplex for virtio_net_config to UNKNOWN (mlx5_vdpa
>>> vDPA devices currently do not support VIRTIO_NET_F_SPEED_DUPLEX). Remove
>>> ioctl VHOST_VDPA_SET_CONFIG and its related logic as it is not supported;
>>> see: https://docs.oasis-open.org/virtio/virtio/v1.3/virtio-v1.3.html
>>>
>>> Carlos:
>>>   vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN
>>>   vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance
>> This will need a rebase. Will apply once you post one.
>> Thanks!
> Note that I think patch 2 is probably not right as we indeed allow
> config write for some device.


I'll rebase patch 1 and drop patch 2.


>
> Thanks
>

Thanks, Carlos

[PATCH v3 2/2] vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance

2024-09-04 Thread Carlos Bilbao

From: Carlos Bilbao 

Remove invalid ioctl VHOST_VDPA_SET_CONFIG and all its implementations with
vdpa_config_ops->set_config(). This is needed per virtio spec requirements:
virtio-spec v1.3 Sec 5.1.4 states that "All of the device configuration
fields are read-only for the driver.". This excludes legacy Alibaba's ENI
so make it use vda_config_ops->set_config_legacy() to avoid future
confusion.

Signed-off-by: Carlos Bilbao 
---
 drivers/vdpa/alibaba/eni_vdpa.c|  2 +-
 drivers/vdpa/ifcvf/ifcvf_main.c| 10 --
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  7 ---
 drivers/vdpa/pds/vdpa_dev.c| 16 
 drivers/vdpa/solidrun/snet_main.c  | 18 --
 drivers/vdpa/vdpa.c| 16 
 drivers/vdpa/vdpa_sim/vdpa_sim.c   | 16 
 drivers/vdpa/vdpa_sim/vdpa_sim.h   |  1 -
 drivers/vdpa/vdpa_user/vduse_dev.c |  7 ---
 drivers/vdpa/virtio_pci/vp_vdpa.c  | 14 --
 drivers/vhost/vdpa.c   | 26 --
 drivers/virtio/virtio_vdpa.c   |  9 -
 include/linux/vdpa.h   |  7 ---
 include/uapi/linux/vhost.h |  8 
 14 files changed, 9 insertions(+), 148 deletions(-)

diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c
index cce3d1837104..64b0c0be89ae 100644
--- a/drivers/vdpa/alibaba/eni_vdpa.c
+++ b/drivers/vdpa/alibaba/eni_vdpa.c
@@ -429,7 +429,7 @@ static const struct vdpa_config_ops eni_vdpa_ops = {
.get_vq_align   = eni_vdpa_get_vq_align,
.get_config_size = eni_vdpa_get_config_size,
.get_config = eni_vdpa_get_config,
-   .set_config = eni_vdpa_set_config,
+   .set_config_legacy = eni_vdpa_set_config,
.set_config_cb  = eni_vdpa_set_config_cb,
.get_vq_irq = eni_vdpa_get_vq_irq,
 };
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index e98fa8100f3c..7cbac787ad5f 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -568,15 +568,6 @@ static void ifcvf_vdpa_get_config(struct vdpa_device 
*vdpa_dev,
ifcvf_read_dev_config(vf, offset, buf, len);
 }
 
-static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev,
- unsigned int offset, const void *buf,
- unsigned int len)
-{
-   struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
-
-   ifcvf_write_dev_config(vf, offset, buf, len);
-}
-
 static void ifcvf_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
 struct vdpa_callback *cb)
 {
@@ -640,7 +631,6 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
.get_vq_group   = ifcvf_vdpa_get_vq_group,
.get_config_size= ifcvf_vdpa_get_config_size,
.get_config = ifcvf_vdpa_get_config,
-   .set_config = ifcvf_vdpa_set_config,
.set_config_cb  = ifcvf_vdpa_set_config_cb,
.get_vq_notification = ifcvf_get_vq_notification,
 };
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 41ca268d43ff..35ed46c65b4d 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2918,12 +2918,6 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
*vdev, unsigned int offset,
memcpy(buf, (u8 *)&ndev->config + offset, len);
 }
 
-static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
offset, const void *buf,
-unsigned int len)
-{
-   /* not supported */
-}
-
 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
@@ -3218,7 +3212,6 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
.reset = mlx5_vdpa_reset,
.get_config_size = mlx5_vdpa_get_config_size,
.get_config = mlx5_vdpa_get_config,
-   .set_config = mlx5_vdpa_set_config,
.get_generation = mlx5_vdpa_get_generation,
.set_map = mlx5_vdpa_set_map,
.set_group_asid = mlx5_set_group_asid,
diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
index 25c0fe5ec3d5..553dcd2aa065 100644
--- a/drivers/vdpa/pds/vdpa_dev.c
+++ b/drivers/vdpa/pds/vdpa_dev.c
@@ -553,21 +553,6 @@ static void pds_vdpa_get_config(struct vdpa_device 
*vdpa_dev,
memcpy_fromio(buf, device + offset, len);
 }
 
-static void pds_vdpa_set_config(struct vdpa_device *vdpa_dev,
-   unsigned int offset, const void *buf,
-   unsigned int len)
-{
-   struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
-   void __iomem *device;
-
-   if (offset + len > sizeof(struct virtio_net_config)) {
-   WARN(true, "%s: bad read, offset %d len %d\n", __func__, 
offset, len);
-   return;
-   }
-
-   device = pdsv->vdpa_aux->vd_mdev.device;
-   memcpy_toio

[PATCH v3 0/2] Properly initialize speed/duplex and remove vDPA config updates

2024-09-04 Thread Carlos Bilbao

From: Carlos Bilbao 

Initialize speed and duplex for virtio_net_config to UNKNOWN (mlx5_vdpa
vDPA devices currently do not support VIRTIO_NET_F_SPEED_DUPLEX). Remove
ioctl VHOST_VDPA_SET_CONFIG and its related logic as it is not supported;
see: https://docs.oasis-open.org/virtio/virtio/v1.3/virtio-v1.3.html

Carlos:
  vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN
  vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance

---

Changes since v1:
 Link: https://lkml.org/lkml/2024/8/29/1368
 - Fix prefix of the first commit and add Reviewed-By tag.
 - Redo second commit completely: instead of attempting to add support to
   set configuration fields, remove ioctl and support entirely from vDPA
   implementations -- because it's not allowed by spec.

Changes since v2:
 Link: https://lkml.org/lkml/2024/9/3/1407
 - Fix first commit by changing 4 spaces for a tab.
 - In second commit, ENI is legacy and should keep set_config(). Change it
   to set_config_legacy() to avoid future confusion and erroneous
   implementations.

---
 drivers/vdpa/alibaba/eni_vdpa.c|  2 +-
 drivers/vdpa/ifcvf/ifcvf_main.c| 10 --
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 19 ---
 drivers/vdpa/pds/vdpa_dev.c| 16 
 drivers/vdpa/solidrun/snet_main.c  | 18 --
 drivers/vdpa/vdpa.c| 16 
 drivers/vdpa/vdpa_sim/vdpa_sim.c   | 16 
 drivers/vdpa/vdpa_sim/vdpa_sim.h   |  1 -
 drivers/vdpa/vdpa_user/vduse_dev.c |  7 ---
 drivers/vdpa/virtio_pci/vp_vdpa.c  | 14 --
 drivers/vhost/vdpa.c   | 26 --
 drivers/virtio/virtio_vdpa.c   |  9 -
 include/linux/vdpa.h   |  7 ---
 include/uapi/linux/vhost.h |  8 
 14 files changed, 21 insertions(+), 148 deletions(-)

[PATCH v3 1/2] vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN

2024-09-04 Thread Carlos Bilbao

From: Carlos Bilbao 

Initialize the speed and duplex fields in virtio_net_config to UNKNOWN.
This is needed because mlx5_vdpa vDPA devices currently do not support the
VIRTIO_NET_F_SPEED_DUPLEX feature which reports speed and duplex. Add
needed helper cpu_to_mlx5vdpa32() to convert endianness of speed.

Signed-off-by: Carlos Bilbao 
Reviewed-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b56aae3f7be3..41ca268d43ff 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -173,6 +173,11 @@ static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev 
*mvdev, u16 val)
return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
 }
 
+static __virtio32 cpu_to_mlx5vdpa32(struct mlx5_vdpa_dev *mvdev, u32 val)
+{
+   return __cpu_to_virtio32(mlx5_vdpa_is_little_endian(mvdev), val);
+}
+
 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
 {
if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
@@ -3433,6 +3438,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
init_rwsem(&ndev->reslock);
config = &ndev->config;
 
+   /*
+* mlx5_vdpa vDPA devices currently don't support reporting or
+* setting the speed or duplex.
+*/
+   config->speed  = cpu_to_mlx5vdpa32(mvdev, SPEED_UNKNOWN);
+   config->duplex = DUPLEX_UNKNOWN;
+
if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
err = config_func_mtu(mdev, add_config->net.mtu);
if (err)
-- 
2.34.1

Re: [PATCH v2 2/2] vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance

2024-09-04 Thread Carlos Bilbao

Hello,

On 9/4/24 12:58 AM, Dragos Tatulea wrote:
>
> On 04.09.24 05:38, Jason Wang wrote:
>> On Wed, Sep 4, 2024 at 1:15 AM Carlos Bilbao
>>  wrote:
>>> From: Carlos Bilbao 
>>>
>>> Remove invalid ioctl VHOST_VDPA_SET_CONFIG and all its implementations
>>> with vdpa_config_ops->set_config(). This is needed per virtio spec
>>> requirements; virtio-spec v3.1 Sec 5.1.4 states that "All of the device
>>> configuration fields are read-only for the driver."
>>>
>>> Signed-off-by: Carlos Bilbao 
>> Note that only the config space of the modern device is read only. So
>> it should be fine to remove vp_vdpa which only works for modern
>> devices.
> Just out of curiosity: how will this work for devices that are not
> v1.3 compliant but are v1.2 compliant? Or is this true of all devices
> except eni?
>
> Thanks,
> Dragos
>> And for eni, it is a legacy only device, so we should not move the
>> set_config there.


Understood. I'll keep the ENI as is for v3 and will reach out to the 
maintainers.


>>
>> For the rest, we need the acks for those maintainers.


Thanks, Carlos


>>
>> Thanks
>>

Re: [PATCH v2 1/2] vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN

2024-09-04 Thread Carlos Bilbao

Hello Nelson,

On 9/3/24 12:37 PM, Nelson, Shannon wrote:
> On 9/3/2024 10:15 AM, Carlos Bilbao wrote:
>> From: Carlos Bilbao 
>>
>> Initialize the speed and duplex fields in virtio_net_config to UNKNOWN.
>> This is needed because mlx5_vdpa vDPA devicess currently do not support the
>> VIRTIO_NET_F_SPEED_DUPLEX feature which reports speed and duplex. Add
>> needed helper cpu_to_mlx5vdpa32() to convert endianness of speed.
>>
>> Signed-off-by: Carlos Bilbao 
>> Reviewed-by: Dragos Tatulea 
>> ---
>>   drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 
>>   1 file changed, 12 insertions(+)
>>
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index b56aae3f7be3..5fce6d62af4f 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -173,6 +173,11 @@ static __virtio16 cpu_to_mlx5vdpa16(struct 
>> mlx5_vdpa_dev *mvdev, u16 val)
>>  return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
>>   }
>>
>> +static __virtio32 cpu_to_mlx5vdpa32(struct mlx5_vdpa_dev *mvdev, u32 val)
>> +{
>> +    return __cpu_to_virtio32(mlx5_vdpa_is_little_endian(mvdev), val);
>
> I hate picking at little things like this, but this should be a tab rather 
> than 4 spaces.
>

Correct, thanks, will fix in v3.


> sln
>
>
>> +}
>> +
>>   static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
>>   {
>>  if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
>> @@ -3433,6 +3438,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
>> *v_mdev, const char *name,
>>  init_rwsem(&ndev->reslock);
>>  config = &ndev->config;
>>
>> +   /*
>> +    * mlx5_vdpa vDPA devices currently don't support reporting or
>> +    * setting the speed or duplex.
>> +    */
>> +   config->speed  = cpu_to_mlx5vdpa32(mvdev, SPEED_UNKNOWN);
>> +   config->duplex = DUPLEX_UNKNOWN;
>> +
>>  if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
>>  err = config_func_mtu(mdev, add_config->net.mtu);
>>  if (err)
>> -- 
>> 2.34.1
>>

Thanks, Carlos

[PATCH v2 2/2] vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance

2024-09-03 Thread Carlos Bilbao

From: Carlos Bilbao 

Remove invalid ioctl VHOST_VDPA_SET_CONFIG and all its implementations
with vdpa_config_ops->set_config(). This is needed per virtio spec
requirements; virtio-spec v3.1 Sec 5.1.4 states that "All of the device
configuration fields are read-only for the driver."

Signed-off-by: Carlos Bilbao 
---
 drivers/vdpa/alibaba/eni_vdpa.c| 17 -
 drivers/vdpa/ifcvf/ifcvf_main.c| 10 --
 drivers/vdpa/mlx5/net/mlx5_vnet.c  |  7 ---
 drivers/vdpa/pds/vdpa_dev.c| 16 
 drivers/vdpa/solidrun/snet_main.c  | 18 --
 drivers/vdpa/vdpa.c| 16 
 drivers/vdpa/vdpa_sim/vdpa_sim.c   | 16 
 drivers/vdpa/vdpa_sim/vdpa_sim.h   |  1 -
 drivers/vdpa/vdpa_user/vduse_dev.c |  7 ---
 drivers/vdpa/virtio_pci/vp_vdpa.c  | 14 --
 drivers/vhost/vdpa.c   | 26 --
 drivers/virtio/virtio_vdpa.c   |  9 -
 include/linux/vdpa.h   |  9 -
 include/uapi/linux/vhost.h |  8 
 14 files changed, 4 insertions(+), 170 deletions(-)

diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c
index cce3d1837104..d8f70b385661 100644
--- a/drivers/vdpa/alibaba/eni_vdpa.c
+++ b/drivers/vdpa/alibaba/eni_vdpa.c
@@ -383,22 +383,6 @@ static void eni_vdpa_get_config(struct vdpa_device *vdpa,
*p++ = ioread8(ioaddr + i);
 }
 
-static void eni_vdpa_set_config(struct vdpa_device *vdpa,
-   unsigned int offset, const void *buf,
-   unsigned int len)
-{
-   struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
-   struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
-   void __iomem *ioaddr = ldev->ioaddr +
-   VIRTIO_PCI_CONFIG_OFF(eni_vdpa->vectors) +
-   offset;
-   const u8 *p = buf;
-   int i;
-
-   for (i = 0; i < len; i++)
-   iowrite8(*p++, ioaddr + i);
-}
-
 static void eni_vdpa_set_config_cb(struct vdpa_device *vdpa,
   struct vdpa_callback *cb)
 {
@@ -429,7 +413,6 @@ static const struct vdpa_config_ops eni_vdpa_ops = {
.get_vq_align   = eni_vdpa_get_vq_align,
.get_config_size = eni_vdpa_get_config_size,
.get_config = eni_vdpa_get_config,
-   .set_config = eni_vdpa_set_config,
.set_config_cb  = eni_vdpa_set_config_cb,
.get_vq_irq = eni_vdpa_get_vq_irq,
 };
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index e98fa8100f3c..7cbac787ad5f 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -568,15 +568,6 @@ static void ifcvf_vdpa_get_config(struct vdpa_device 
*vdpa_dev,
ifcvf_read_dev_config(vf, offset, buf, len);
 }
 
-static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev,
- unsigned int offset, const void *buf,
- unsigned int len)
-{
-   struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
-
-   ifcvf_write_dev_config(vf, offset, buf, len);
-}
-
 static void ifcvf_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
 struct vdpa_callback *cb)
 {
@@ -640,7 +631,6 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
.get_vq_group   = ifcvf_vdpa_get_vq_group,
.get_config_size= ifcvf_vdpa_get_config_size,
.get_config = ifcvf_vdpa_get_config,
-   .set_config = ifcvf_vdpa_set_config,
.set_config_cb  = ifcvf_vdpa_set_config_cb,
.get_vq_notification = ifcvf_get_vq_notification,
 };
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 5fce6d62af4f..fd0db86ba2cf 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2918,12 +2918,6 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
*vdev, unsigned int offset,
memcpy(buf, (u8 *)&ndev->config + offset, len);
 }
 
-static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
offset, const void *buf,
-unsigned int len)
-{
-   /* not supported */
-}
-
 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
 {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
@@ -3218,7 +3212,6 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
.reset = mlx5_vdpa_reset,
.get_config_size = mlx5_vdpa_get_config_size,
.get_config = mlx5_vdpa_get_config,
-   .set_config = mlx5_vdpa_set_config,
.get_generation = mlx5_vdpa_get_generation,
.set_map = mlx5_vdpa_set_map,
.set_group_asid = mlx5_set_group_asid,
diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
index 25c0fe5ec3d5..553dcd2aa065 100644
--- a/drivers/vdpa/pds/vdpa_dev.c
+++ b/drivers/vdpa/pds/vdpa_dev.c
@@ -553,21 +553,6 @@ st

[PATCH v2 1/2] vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN

2024-09-03 Thread Carlos Bilbao

From: Carlos Bilbao 

Initialize the speed and duplex fields in virtio_net_config to UNKNOWN.
This is needed because mlx5_vdpa vDPA devicess currently do not support the
VIRTIO_NET_F_SPEED_DUPLEX feature which reports speed and duplex. Add
needed helper cpu_to_mlx5vdpa32() to convert endianness of speed.

Signed-off-by: Carlos Bilbao 
Reviewed-by: Dragos Tatulea 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b56aae3f7be3..5fce6d62af4f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -173,6 +173,11 @@ static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev 
*mvdev, u16 val)
return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
 }
 
+static __virtio32 cpu_to_mlx5vdpa32(struct mlx5_vdpa_dev *mvdev, u32 val)
+{
+return __cpu_to_virtio32(mlx5_vdpa_is_little_endian(mvdev), val);
+}
+
 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
 {
if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
@@ -3433,6 +3438,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
init_rwsem(&ndev->reslock);
config = &ndev->config;
 
+   /*
+* mlx5_vdpa vDPA devices currently don't support reporting or
+* setting the speed or duplex.
+*/
+   config->speed  = cpu_to_mlx5vdpa32(mvdev, SPEED_UNKNOWN);
+   config->duplex = DUPLEX_UNKNOWN;
+
if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
err = config_func_mtu(mdev, add_config->net.mtu);
if (err)
-- 
2.34.1

[PATCH v2 0/2] Properly initialize speed/duplex and remove vDPA config updates

2024-09-03 Thread Carlos Bilbao

From: Carlos Bilbao 

Initialize speed and duplex for virtio_net_config to UNKNOWN (mlx5_vdpa
vDPA devices currently do not support VIRTIO_NET_F_SPEED_DUPLEX). Remove
ioctl VHOST_VDPA_SET_CONFIG and its related logic as it is not supported;
see: https://docs.oasis-open.org/virtio/virtio/v1.3/virtio-v1.3.html

Carlos:
  vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN
  vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance

---

Changes since v1:
 Link: https://lkml.org/lkml/2024/8/29/1368
 - Fix prefix of the first commit and add Reviewed-By tag.
 - Redo second commit completely: instead of attempting to add support to
   set configuration fields, remove ioctl and support entirely from vDPA
   implementations -- because it's not allowed by spec.

---
 drivers/vdpa/alibaba/eni_vdpa.c| 17 -
 drivers/vdpa/ifcvf/ifcvf_main.c| 10 --
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 19 ---
 drivers/vdpa/pds/vdpa_dev.c| 16 
 drivers/vdpa/solidrun/snet_main.c  | 18 --
 drivers/vdpa/vdpa.c| 16 
 drivers/vdpa/vdpa_sim/vdpa_sim.c   | 16 
 drivers/vdpa/vdpa_sim/vdpa_sim.h   |  1 -
 drivers/vdpa/vdpa_user/vduse_dev.c |  7 ---
 drivers/vdpa/virtio_pci/vp_vdpa.c  | 14 --
 drivers/vhost/vdpa.c   | 26 --
 drivers/virtio/virtio_vdpa.c   |  9 -
 include/linux/vdpa.h   |  9 -
 include/uapi/linux/vhost.h |  8 
 14 files changed, 16 insertions(+), 170 deletions(-)

Re: [PATCH 2/2] vdpa: Add support to update speed/duplex in vDPA/mlx5_vnet

2024-09-03 Thread Carlos Bilbao

Hello,

On 9/1/24 11:27 PM, Jason Wang wrote:
> On Fri, Aug 30, 2024 at 9:15 PM Carlos Bilbao  
> wrote:
>> Hello,
>>
>> On 8/29/24 9:31 PM, Jason Wang wrote:
>>> On Fri, Aug 30, 2024 at 5:08 AM Dragos Tatulea  wrote:
>>>> (resending as I accidentally replied only to Carlos)
>>>>
>>>> On 29.08.24 18:16, Carlos Bilbao wrote:
>>>>> From: Carlos Bilbao 
>>>>>
>>>>> Include support to update the vDPA configuration fields of speed and
>>>>> duplex (as needed by VHOST_VDPA_SET_CONFIG). This includes function
>>>>> mlx5_vdpa_set_config() as well as changes in vdpa.c to fill the initial
>>>>> values to UNKNOWN. Also add a warning message for when
>>>>> mlx5_vdpa_get_config() receives offset and length out of bounds.
>>>>>
>>>>> Signed-off-by: Carlos Bilbao 
>>>>> ---
>>>>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 34 ++-
>>>>>  drivers/vdpa/vdpa.c   | 27 
>>>>>  include/uapi/linux/vdpa.h |  2 ++
>>>>>  3 files changed, 62 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>>>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>>>> index c47009a8b472..a44bb2072eec 100644
>>>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>>>> @@ -3221,12 +3221,44 @@ static void mlx5_vdpa_get_config(struct 
>>>>> vdpa_device *vdev, unsigned int offset,
>>>>>
>>>>>   if (offset + len <= sizeof(struct virtio_net_config))
>>>>>   memcpy(buf, (u8 *)&ndev->config + offset, len);
>>>>> + else
>>>>> + mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
>>>>>  }
>>>>>
>>>>>  static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
>>>>> offset, const void *buf,
>>>>>unsigned int len)
>>>>>  {
>>>>> - /* not supported */
>>>>> + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
>>>>> + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
>>>>> +
>>>>> + if (offset + len > sizeof(struct virtio_net_config)) {
>>>>> + mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
>>>>> + return;
>>>>> + }
>>>>> +
>>>>> + /*
>>>>> +  * Note that this will update the speed/duplex configuration fields
>>>>> +  * but the hardware support to actually perform this change does
>>>>> +  * not exist yet.
>>>>> +  */
>>>>> + switch (offset) {
>>>>> + case offsetof(struct virtio_net_config, speed):
>>>>> + if (len == sizeof(((struct virtio_net_config *) 0)->speed))
>>>>> + memcpy(&ndev->config.speed, buf, len);
>>>>> + else
>>>>> + mlx5_vdpa_warn(mvdev, "Invalid length for 
>>>>> speed.\n");
>>>>> + break;
>>>>> +
>>>>> + case offsetof(struct virtio_net_config, duplex):
>>>>> + if (len == sizeof(((struct virtio_net_config *)0)->duplex))
>>>>> + memcpy(&ndev->config.duplex, buf, len);
>>>>> + else
>>>>> + mlx5_vdpa_warn(mvdev, "Invalid length for 
>>>>> duplex.\n");
>>>>> + break;
>>>>> +
>>>>> + default:
>>>>> + mlx5_vdpa_warn(mvdev, "Configuration field not 
>>>>> supported.\n");
>>>> This will trigger noise in dmesg because there is a MAC configuration here.
>>>>> + }
>>>> I would prefer that the .set_config remains a stub TBH. Setting the fields 
>>>> here is
>>>> misleading: the user might deduce that the configuration worked when they 
>>>> read the
>>>> values and see that they were updated.
>>> Yes, and actually, those fields are read-only according to the spec:
>>>
>>> """
>>> The network device has the following device configuration layout. All
>>> of the device configuration fields are read-only for the driver.
>>> """
>>>
>>> Thanks
>>
>> Should I go ahead and remove the ioctl then?
> If you meant mlx5_vdpa_set_config, I think yes.


Ack, I will send a new patch set in which the second commit gets rid of the
ioctl -- but not only for mlx5 but for all vDPA implementations.


>
> Thanks
>

Thanks, Carlos

Re: [PATCH 2/2] vdpa: Add support to update speed/duplex in vDPA/mlx5_vnet

2024-08-30 Thread Carlos Bilbao

Hello,

On 8/29/24 9:31 PM, Jason Wang wrote:
> On Fri, Aug 30, 2024 at 5:08 AM Dragos Tatulea  wrote:
>> (resending as I accidentally replied only to Carlos)
>>
>> On 29.08.24 18:16, Carlos Bilbao wrote:
>>> From: Carlos Bilbao 
>>>
>>> Include support to update the vDPA configuration fields of speed and
>>> duplex (as needed by VHOST_VDPA_SET_CONFIG). This includes function
>>> mlx5_vdpa_set_config() as well as changes in vdpa.c to fill the initial
>>> values to UNKNOWN. Also add a warning message for when
>>> mlx5_vdpa_get_config() receives offset and length out of bounds.
>>>
>>> Signed-off-by: Carlos Bilbao 
>>> ---
>>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 34 ++-
>>>  drivers/vdpa/vdpa.c   | 27 
>>>  include/uapi/linux/vdpa.h |  2 ++
>>>  3 files changed, 62 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>> index c47009a8b472..a44bb2072eec 100644
>>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>>> @@ -3221,12 +3221,44 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
>>> *vdev, unsigned int offset,
>>>
>>>   if (offset + len <= sizeof(struct virtio_net_config))
>>>   memcpy(buf, (u8 *)&ndev->config + offset, len);
>>> + else
>>> + mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
>>>  }
>>>
>>>  static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
>>> offset, const void *buf,
>>>unsigned int len)
>>>  {
>>> - /* not supported */
>>> + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
>>> + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
>>> +
>>> + if (offset + len > sizeof(struct virtio_net_config)) {
>>> + mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
>>> + return;
>>> + }
>>> +
>>> + /*
>>> +  * Note that this will update the speed/duplex configuration fields
>>> +  * but the hardware support to actually perform this change does
>>> +  * not exist yet.
>>> +  */
>>> + switch (offset) {
>>> + case offsetof(struct virtio_net_config, speed):
>>> + if (len == sizeof(((struct virtio_net_config *) 0)->speed))
>>> + memcpy(&ndev->config.speed, buf, len);
>>> + else
>>> + mlx5_vdpa_warn(mvdev, "Invalid length for speed.\n");
>>> + break;
>>> +
>>> + case offsetof(struct virtio_net_config, duplex):
>>> + if (len == sizeof(((struct virtio_net_config *)0)->duplex))
>>> + memcpy(&ndev->config.duplex, buf, len);
>>> + else
>>> + mlx5_vdpa_warn(mvdev, "Invalid length for duplex.\n");
>>> + break;
>>> +
>>> + default:
>>> + mlx5_vdpa_warn(mvdev, "Configuration field not supported.\n");
>> This will trigger noise in dmesg because there is a MAC configuration here.
>>> + }
>> I would prefer that the .set_config remains a stub TBH. Setting the fields 
>> here is
>> misleading: the user might deduce that the configuration worked when they 
>> read the
>> values and see that they were updated.
> Yes, and actually, those fields are read-only according to the spec:
>
> """
> The network device has the following device configuration layout. All
> of the device configuration fields are read-only for the driver.
> """
>
> Thanks


Should I go ahead and remove the ioctl then?


>
>> Thanks,
>> dragos
>>>  }
>>>
>>>  static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
>>> diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
>>> index 4dbd2e55a288..b920e4405f6d 100644
>>> --- a/drivers/vdpa/vdpa.c
>>> +++ b/drivers/vdpa/vdpa.c
>>> @@ -15,6 +15,7 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>
>>>  static LIST_HEAD(mdev_head);
>>>  /* A global mutex that protects vdpa management device and device level 
>>> operat

Re: [PATCH 2/2] vdpa: Add support to update speed/duplex in vDPA/mlx5_vnet

2024-08-29 Thread Carlos Bilbao

Hello,

On 8/29/24 4:07 PM, Dragos Tatulea wrote:
> (resending as I accidentally replied only to Carlos)
>
> On 29.08.24 18:16, Carlos Bilbao wrote:
>> From: Carlos Bilbao 
>>
>> Include support to update the vDPA configuration fields of speed and
>> duplex (as needed by VHOST_VDPA_SET_CONFIG). This includes function
>> mlx5_vdpa_set_config() as well as changes in vdpa.c to fill the initial
>> values to UNKNOWN. Also add a warning message for when
>> mlx5_vdpa_get_config() receives offset and length out of bounds.
>>
>> Signed-off-by: Carlos Bilbao 
>> ---
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 34 ++-
>>  drivers/vdpa/vdpa.c   | 27 
>>  include/uapi/linux/vdpa.h |  2 ++
>>  3 files changed, 62 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index c47009a8b472..a44bb2072eec 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -3221,12 +3221,44 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
>> *vdev, unsigned int offset,
>>  
>>  if (offset + len <= sizeof(struct virtio_net_config))
>>  memcpy(buf, (u8 *)&ndev->config + offset, len);
>> +else
>> +mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
>>  }
>>  
>>  static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
>> offset, const void *buf,
>>   unsigned int len)
>>  {
>> -/* not supported */
>> +struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
>> +struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
>> +
>> +if (offset + len > sizeof(struct virtio_net_config)) {
>> +mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
>> +return;
>> +}
>> +
>> +/*
>> + * Note that this will update the speed/duplex configuration fields
>> + * but the hardware support to actually perform this change does
>> + * not exist yet.
>> + */
>> +switch (offset) {
>> +case offsetof(struct virtio_net_config, speed):
>> +if (len == sizeof(((struct virtio_net_config *) 0)->speed))
>> +memcpy(&ndev->config.speed, buf, len);
>> +else
>> +mlx5_vdpa_warn(mvdev, "Invalid length for speed.\n");
>> +break;
>> +
>> +case offsetof(struct virtio_net_config, duplex):
>> +if (len == sizeof(((struct virtio_net_config *)0)->duplex))
>> +memcpy(&ndev->config.duplex, buf, len);
>> +else
>> +mlx5_vdpa_warn(mvdev, "Invalid length for duplex.\n");
>> +break;
>> +
>> +default:
>> +mlx5_vdpa_warn(mvdev, "Configuration field not supported.\n");
> This will trigger noise in dmesg because there is a MAC configuration here.
>> +}
> I would prefer that the .set_config remains a stub TBH. Setting the fields 
> here is
> misleading: the user might deduce that the configuration worked when they 
> read the
> values and see that they were updated.


Well, people might already assume that the values are updated because there
is an ioctl available to user space (VHOST_VDPA_SET_CONFIG) that doesn't
warn or return an error - or at least I did.

But, I understand your concern. I propose that we at least return an error
in the ioctl if the requested config updated is not supported. We could
also structure this patch so that it can be used when or if hardware
support becomes available in the future.

Is there a way to query the hardware capabilities of the card, e.g., MSRs
or other methods? Do you recommend any technical manual?

Or, if the ioctl uses, for example, /dev/vhost-vdpa-2, does its speed and
duplex settings correspond to those of the tap device it links to? This
information can be checked and changed using the definitions in
include/uapi/linux/ethtool.h.

Thank you for taking the time to read and answer me.


>
> Thanks,
> dragos
>>  }
>>  
>>  static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
>> diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
>> index 4dbd2e55a288..b920e4405f6d 100644
>> --- a/drivers/vdpa/vdpa.c
>> +++ b/drivers/vdpa/vdpa.c
>> @@ -15,6 +15,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  static LIST_HEAD(mdev_head);
>&

Re: [PATCH 1/2] mlx5_vnet: Set speed and duplex of vDPA devices to UNKNOWN

2024-08-29 Thread Carlos Bilbao

Hello,

On 8/29/24 1:44 PM, Dragos Tatulea wrote:
>
> On 29.08.24 18:16, Carlos Bilbao wrote:
>> From: Carlos Bilbao 
>>
>> Initialize the speed and duplex fields in virtio_net_config to UNKNOWN.
>> This is needed because mlx5_vdpa vDPA devicess currently do not support the
>> VIRTIO_NET_F_SPEED_DUPLEX feature which reports speed and duplex. Add
>> needed helper cpu_to_mlx5vdpa32() to convert endianness of speed.
>>
>> Signed-off-by: Carlos Bilbao 
> Nit: prefix is vdpa/mlx5. Once that is fixed, for this patch:
> Reviewed-by: Dragos Tatulea 


Thank you, will keep that in mind to fix in v2.


>
>> ---
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 
>>  1 file changed, 12 insertions(+)
>>
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index fa78e8288ebb..c47009a8b472 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -193,6 +193,11 @@ static __virtio16 cpu_to_mlx5vdpa16(struct 
>> mlx5_vdpa_dev *mvdev, u16 val)
>>  return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
>>  }
>>  
>> +static __virtio32 cpu_to_mlx5vdpa32(struct mlx5_vdpa_dev *mvdev, u32 val)
>> +{
>> +return __cpu_to_virtio32(mlx5_vdpa_is_little_endian(mvdev), val);
>> +}
>> +
>>  static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
>>  {
>>  if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
>> @@ -3795,6 +3800,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
>> *v_mdev, const char *name,
>>  init_rwsem(&ndev->reslock);
>>  config = &ndev->config;
>>  
>> +/*
>> + * mlx5_vdpa vDPA devices currently don't support reporting or
>> + * setting the speed or duplex.
>> + */
>> +    config->speed  = cpu_to_mlx5vdpa32(mvdev, SPEED_UNKNOWN);
>> +config->duplex = DUPLEX_UNKNOWN;
>> +
>>  if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
>>  err = config_func_mtu(mdev, add_config->net.mtu);
>>  if (err)


Thanks, Carlos

[PATCH 1/2] mlx5_vnet: Set speed and duplex of vDPA devices to UNKNOWN

2024-08-29 Thread Carlos Bilbao

From: Carlos Bilbao 

Initialize the speed and duplex fields in virtio_net_config to UNKNOWN.
This is needed because mlx5_vdpa vDPA devicess currently do not support the
VIRTIO_NET_F_SPEED_DUPLEX feature which reports speed and duplex. Add
needed helper cpu_to_mlx5vdpa32() to convert endianness of speed.

Signed-off-by: Carlos Bilbao 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index fa78e8288ebb..c47009a8b472 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -193,6 +193,11 @@ static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev 
*mvdev, u16 val)
return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
 }
 
+static __virtio32 cpu_to_mlx5vdpa32(struct mlx5_vdpa_dev *mvdev, u32 val)
+{
+   return __cpu_to_virtio32(mlx5_vdpa_is_little_endian(mvdev), val);
+}
+
 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
 {
if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
@@ -3795,6 +3800,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
init_rwsem(&ndev->reslock);
config = &ndev->config;
 
+   /*
+* mlx5_vdpa vDPA devices currently don't support reporting or
+* setting the speed or duplex.
+*/
+   config->speed  = cpu_to_mlx5vdpa32(mvdev, SPEED_UNKNOWN);
+   config->duplex = DUPLEX_UNKNOWN;
+
if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
err = config_func_mtu(mdev, add_config->net.mtu);
if (err)
-- 
2.34.1

[PATCH 2/2] vdpa: Add support to update speed/duplex in vDPA/mlx5_vnet

2024-08-29 Thread Carlos Bilbao

From: Carlos Bilbao 

Include support to update the vDPA configuration fields of speed and
duplex (as needed by VHOST_VDPA_SET_CONFIG). This includes function
mlx5_vdpa_set_config() as well as changes in vdpa.c to fill the initial
values to UNKNOWN. Also add a warning message for when
mlx5_vdpa_get_config() receives offset and length out of bounds.

Signed-off-by: Carlos Bilbao 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 34 ++-
 drivers/vdpa/vdpa.c   | 27 
 include/uapi/linux/vdpa.h |  2 ++
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index c47009a8b472..a44bb2072eec 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3221,12 +3221,44 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
*vdev, unsigned int offset,
 
if (offset + len <= sizeof(struct virtio_net_config))
memcpy(buf, (u8 *)&ndev->config + offset, len);
+   else
+   mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
 }
 
 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
offset, const void *buf,
 unsigned int len)
 {
-   /* not supported */
+   struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+   struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+   if (offset + len > sizeof(struct virtio_net_config)) {
+   mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n");
+   return;
+   }
+
+   /*
+* Note that this will update the speed/duplex configuration fields
+* but the hardware support to actually perform this change does
+* not exist yet.
+*/
+   switch (offset) {
+   case offsetof(struct virtio_net_config, speed):
+   if (len == sizeof(((struct virtio_net_config *) 0)->speed))
+   memcpy(&ndev->config.speed, buf, len);
+   else
+   mlx5_vdpa_warn(mvdev, "Invalid length for speed.\n");
+   break;
+
+   case offsetof(struct virtio_net_config, duplex):
+   if (len == sizeof(((struct virtio_net_config *)0)->duplex))
+   memcpy(&ndev->config.duplex, buf, len);
+   else
+   mlx5_vdpa_warn(mvdev, "Invalid length for duplex.\n");
+   break;
+
+   default:
+   mlx5_vdpa_warn(mvdev, "Configuration field not supported.\n");
+   }
 }
 
 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 4dbd2e55a288..b920e4405f6d 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static LIST_HEAD(mdev_head);
 /* A global mutex that protects vdpa management device and device level 
operations. */
@@ -919,6 +920,22 @@ static int vdpa_dev_net_status_config_fill(struct sk_buff 
*msg, u64 features,
return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16);
 }
 
+static int vdpa_dev_net_speed_config_fill(struct sk_buff *msg, u64 features,
+   struct virtio_net_config *config)
+{
+   __le32 speed = cpu_to_le32(SPEED_UNKNOWN);
+
+   return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_SPEED, sizeof(speed), &speed);
+}
+
+static int vdpa_dev_net_duplex_config_fill(struct sk_buff *msg, u64 features,
+   struct virtio_net_config *config)
+{
+   u8 duplex = DUPLEX_UNKNOWN;
+
+   return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_DUPLEX, sizeof(duplex), 
&duplex);
+}
+
 static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff 
*msg)
 {
struct virtio_net_config config = {};
@@ -940,6 +957,16 @@ static int vdpa_dev_net_config_fill(struct vdpa_device 
*vdev, struct sk_buff *ms
 
if (vdpa_dev_net_status_config_fill(msg, features_device, &config))
return -EMSGSIZE;
+   /*
+* mlx5_vdpa vDPA devicess currently do not support the
+* VIRTIO_NET_F_SPEED_DUPLEX feature, which reports speed and
+* duplex; hence these are set to UNKNOWN for now.
+*/
+   if (vdpa_dev_net_speed_config_fill(msg, features_device, &config))
+   return -EMSGSIZE;
+
+   if (vdpa_dev_net_duplex_config_fill(msg, features_device, &config))
+   return -EMSGSIZE;
 
return vdpa_dev_net_mq_config_fill(msg, features_device, &config);
 }
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index 842bf1201ac4..1c64ee0dd7b1 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -43,6 +43,8 @@ enum vdpa_attr {
VDPA_ATTR_DEV_NET_STATUS,   /* u8 */

[PATCH 0/2] Initialize vDPA speed/duplex and support their updates

2024-08-29 Thread Carlos Bilbao

From: Carlos Bilbao 

Initialize speed and duplex for virtio_net_config to UNKNOWN (mlx5_vdpa
vDPA devices currently do not support VIRTIO_NET_F_SPEED_DUPLEX). Include
logic to update these fields (for VHOST_VDPA_SET_CONFIG) -- even if
hardware support is not capable yet (Make a note of that). Also add warning
messages for out of bounds errors in mlx5_vnet get/set_config logic.

Note: You can test these changes from user space passing a fd of your
file "/dev/vhost-vdpa-%d" to:

void check_config_speed_duplex(int fd) {

uint8_t *buf;
uint32_t size;
struct vhost_vdpa_config *config;
struct virtio_net_config *net_config;

if (ioctl(fd, VHOST_VDPA_GET_CONFIG_SIZE, &size) < 0) {
perror("ioctl failed");
return;
}

config = malloc(sizeof(struct vhost_vdpa_config) + size);

if (!config) {
perror("malloc failed");
return;
}

memset(config, 0, sizeof(struct vhost_vdpa_config) + size);
config->len = size;
config->off = 0;

buf = config->buf;

if (ioctl(fd, VHOST_VDPA_GET_CONFIG, config) < 0) {
perror("ioctl failed");
}
else {
net_config = (struct virtio_net_config *)buf;

printf("  Speed: %u Mb\n", net_config->speed);

if (net_config->duplex == 0)
printf("  Half Duplex\n);
else if (net_config->duplex == 1)
printf("  Full Duplex\n");
else
printf("  Unknown Duplex\n");
}

free(config);
}

Carlos Bilbao:
  mlx5_vnet: Set speed and duplex of vDPA devices to UNKNOWN
  vdpa: Add support to update speed/duplex in vDPA/mlx5_vnet

---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 46 ++-
 drivers/vdpa/vdpa.c   | 27 ++
 include/uapi/linux/vdpa.h |  2 ++
 3 files changed, 74 insertions(+), 1 deletion(-)

Re: [PATCH] vdpa: Set speed and duplex of mlx5_vnet to UNKNOWN

2024-08-29 Thread Carlos Bilbao

Hello,

On 8/29/24 5:20 AM, Dragos Tatulea wrote:
>
> On 28.08.24 20:16, Carlos Bilbao wrote:
>> From: Carlos Bilbao 
>>
>> mlx5_vdpa vDPA devices currently don't support reporting or setting the
>> speed and duplex and hence should be UNKNOWN instead of zero.
>>
>> Signed-off-by: Carlos Bilbao 
>> ---
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c |  7 +++
>>  drivers/vdpa/vdpa.c   | 23 +++
>>  include/uapi/linux/vdpa.h |  2 ++
>>  3 files changed, 32 insertions(+)
>>
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index fa78e8288ebb..319f5c6121de 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -3795,6 +3795,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
>> *v_mdev, const char *name,
>>  init_rwsem(&ndev->reslock);
>>  config = &ndev->config;
>>  
>> +/*
>> + * mlx5_vdpa vDPA devices currently don't support reporting or
>> + * setting the speed or duplex.
>> + */
>> +config->speed  = SPEED_UNKNOWN;
>> +config->duplex = DUPLEX_UNKNOWN;
>> +
> The values in virtio_net_config are little endian so you'll need to explicitly
> convert them. As speed is a u32, you'll need to add a cpu_to_mlx5vdpa32() 
> helper.
>
> Thanks,
> Dragos
>
>>  if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
>>  err = config_func_mtu(mdev, add_config->net.mtu);
>>  if (err)
>> diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
>> index 4dbd2e55a288..abde23e0041d 100644
>> --- a/drivers/vdpa/vdpa.c
>> +++ b/drivers/vdpa/vdpa.c
>> @@ -15,6 +15,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  static LIST_HEAD(mdev_head);
>>  /* A global mutex that protects vdpa management device and device level 
>> operations. */
>> @@ -919,6 +920,22 @@ static int vdpa_dev_net_status_config_fill(struct 
>> sk_buff *msg, u64 features,
>>  return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16);
>>  }
>>  
>> +static int vdpa_dev_net_speed_config_fill(struct sk_buff *msg, u64 features,
>> +struct virtio_net_config *config)
>> +{
>> +__le32 speed = cpu_to_le32(SPEED_UNKNOWN);
>> +
>> +return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_SPEED, sizeof(speed), &speed);
>> +}
>> +
>> +static int vdpa_dev_net_duplex_config_fill(struct sk_buff *msg, u64 
>> features,
>> +struct virtio_net_config *config)
>> +{
>> +u8 duplex = DUPLEX_UNKNOWN;
>> +
>> +return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_DUPLEX, sizeof(duplex), 
>> &duplex);
>> +}
>> +
>>  static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct 
>> sk_buff *msg)
>>  {
>>  struct virtio_net_config config = {};
>> @@ -941,6 +958,12 @@ static int vdpa_dev_net_config_fill(struct vdpa_device 
>> *vdev, struct sk_buff *ms
>>  if (vdpa_dev_net_status_config_fill(msg, features_device, &config))
>>  return -EMSGSIZE;
>>  
>> +if (vdpa_dev_net_speed_config_fill(msg, features_device, &config))
>> +return -EMSGSIZE;
>> +
>> +if (vdpa_dev_net_duplex_config_fill(msg, features_device, &config))
>> +return -EMSGSIZE;
>> +
>>  return vdpa_dev_net_mq_config_fill(msg, features_device, &config);
>>  }
>>  
>> diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
>> index 842bf1201ac4..1c64ee0dd7b1 100644
>> --- a/include/uapi/linux/vdpa.h
>> +++ b/include/uapi/linux/vdpa.h
>> @@ -43,6 +43,8 @@ enum vdpa_attr {
>>  VDPA_ATTR_DEV_NET_STATUS,   /* u8 */
>>  VDPA_ATTR_DEV_NET_CFG_MAX_VQP,  /* u16 */
>>  VDPA_ATTR_DEV_NET_CFG_MTU,  /* u16 */
>> +VDPA_ATTR_DEV_NET_CFG_SPEED,/* u32 */
>> +VDPA_ATTR_DEV_NET_CFG_DUPLEX,   /* u8 */
>>  
>>  VDPA_ATTR_DEV_NEGOTIATED_FEATURES,  /* u64 */
>>  VDPA_ATTR_DEV_MGMTDEV_MAX_VQS,  /* u32 */


Thank you all for the feedback, everyone. I will prepare a patch set with
two commits:

 - One to set speed and duplex to UNKNOWN (in little-endian format, thank
   you, Dragos).

 - Another to update the vDPA tool to allow setting the values for speed
   and duplex, including support in set_config(). I'll add a comment
   clarifying that this support isn't fully implemented at the hardware
   level yet.

Regards,
Carlos

[PATCH] vdpa: Set speed and duplex of mlx5_vnet to UNKNOWN

2024-08-28 Thread Carlos Bilbao

From: Carlos Bilbao 

mlx5_vdpa vDPA devices currently don't support reporting or setting the
speed and duplex and hence should be UNKNOWN instead of zero.

Signed-off-by: Carlos Bilbao 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c |  7 +++
 drivers/vdpa/vdpa.c   | 23 +++
 include/uapi/linux/vdpa.h |  2 ++
 3 files changed, 32 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index fa78e8288ebb..319f5c6121de 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3795,6 +3795,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name,
init_rwsem(&ndev->reslock);
config = &ndev->config;
 
+   /*
+* mlx5_vdpa vDPA devices currently don't support reporting or
+* setting the speed or duplex.
+*/
+   config->speed  = SPEED_UNKNOWN;
+   config->duplex = DUPLEX_UNKNOWN;
+
if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
err = config_func_mtu(mdev, add_config->net.mtu);
if (err)
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 4dbd2e55a288..abde23e0041d 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static LIST_HEAD(mdev_head);
 /* A global mutex that protects vdpa management device and device level 
operations. */
@@ -919,6 +920,22 @@ static int vdpa_dev_net_status_config_fill(struct sk_buff 
*msg, u64 features,
return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16);
 }
 
+static int vdpa_dev_net_speed_config_fill(struct sk_buff *msg, u64 features,
+   struct virtio_net_config *config)
+{
+   __le32 speed = cpu_to_le32(SPEED_UNKNOWN);
+
+   return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_SPEED, sizeof(speed), &speed);
+}
+
+static int vdpa_dev_net_duplex_config_fill(struct sk_buff *msg, u64 features,
+   struct virtio_net_config *config)
+{
+   u8 duplex = DUPLEX_UNKNOWN;
+
+   return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_DUPLEX, sizeof(duplex), 
&duplex);
+}
+
 static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff 
*msg)
 {
struct virtio_net_config config = {};
@@ -941,6 +958,12 @@ static int vdpa_dev_net_config_fill(struct vdpa_device 
*vdev, struct sk_buff *ms
if (vdpa_dev_net_status_config_fill(msg, features_device, &config))
return -EMSGSIZE;
 
+   if (vdpa_dev_net_speed_config_fill(msg, features_device, &config))
+   return -EMSGSIZE;
+
+   if (vdpa_dev_net_duplex_config_fill(msg, features_device, &config))
+   return -EMSGSIZE;
+
return vdpa_dev_net_mq_config_fill(msg, features_device, &config);
 }
 
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index 842bf1201ac4..1c64ee0dd7b1 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -43,6 +43,8 @@ enum vdpa_attr {
VDPA_ATTR_DEV_NET_STATUS,   /* u8 */
VDPA_ATTR_DEV_NET_CFG_MAX_VQP,  /* u16 */
VDPA_ATTR_DEV_NET_CFG_MTU,  /* u16 */
+   VDPA_ATTR_DEV_NET_CFG_SPEED,/* u32 */
+   VDPA_ATTR_DEV_NET_CFG_DUPLEX,   /* u8 */
 
VDPA_ATTR_DEV_NEGOTIATED_FEATURES,  /* u64 */
VDPA_ATTR_DEV_MGMTDEV_MAX_VQS,  /* u32 */
-- 
2.34.1

Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-28 Thread Carlos Bilbao

Hello,

On 8/27/24 11:54 AM, Dragos Tatulea wrote:
>
> On 27.08.24 04:03, Jason Wang wrote:
>> On Tue, Aug 27, 2024 at 12:11 AM Dragos Tatulea  wrote:
>>>
>>> On 26.08.24 16:24, Andrew Lunn wrote:
>>>> On Mon, Aug 26, 2024 at 11:06:09AM +0200, Dragos Tatulea wrote:
>>>>>
>>>>> On 23.08.24 18:54, Carlos Bilbao wrote:
>>>>>> Hello,
>>>>>>
>>>>>> I'm debugging my vDPA setup, and when using ioctl to retrieve the
>>>>>> configuration, I noticed that it's running in half duplex mode:
>>>>>>
>>>>>> Configuration data (24 bytes):
>>>>>>   MAC address: (Mac address)
>>>>>>   Status: 0x0001
>>>>>>   Max virtqueue pairs: 8
>>>>>>   MTU: 1500
>>>>>>   Speed: 0 Mb
>>>>>>   Duplex: Half Duplex
>>>>>>   RSS max key size: 0
>>>>>>   RSS max indirection table length: 0
>>>>>>   Supported hash types: 0x
>>>>>>
>>>>>> I believe this might be contributing to the underperformance of vDPA.
>>>>> mlx5_vdpa vDPA devicess currently do not support the 
>>>>> VIRTIO_NET_F_SPEED_DUPLEX
>>>>> feature which reports speed and duplex. You can check the state on the
>>>>> PF.
>>>> Then it should probably report DUPLEX_UNKNOWN.
>>>>
>>>> The speed of 0 also suggests SPEED_UNKNOWN is not being returned. So
>>>> this just looks buggy in general.
>>>>
>>> The virtio spec doesn't mention what those values should be when
>>> VIRTIO_NET_F_SPEED_DUPLEX is not supported.
>>>
>>> Jason, should vdpa_dev_net_config_fill() initialize the speed/duplex
>>> fields to SPEED/DUPLEX_UNKNOWN instead of 0?
>> Spec said
>>
>> """
>> The following two fields, speed and duplex, only exist if
>> VIRTIO_NET_F_SPEED_DUPLEX is set.
>> """
>>
>> So my understanding is that it is undefined behaviour, and those
>> fields seems useless before feature negotiation. For safety, it might
>> be better to initialize them as UNKOWN.
>>
> After a closer look my statement doesn't make sense: the device will copy
> the virtio_net_config bytes on top.
>
> The solution is to initialize these fields to UNKNOWN in the driver. Will send
> a patch to fix this.


With Dragos' permission, I'm sending a first draft of this now.


>
> Thanks,
> Dragos


Thanks, Carlos

Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-28 Thread Carlos Bilbao

Hello,

On 8/26/24 9:07 PM, Jason Wang wrote:
> On Tue, Aug 27, 2024 at 3:23 AM Carlos Bilbao  
> wrote:
>> Hello,
>>
>> On 8/26/24 10:53 AM, Dragos Tatulea wrote:
>>> On 26.08.24 16:26, Carlos Bilbao wrote:
>>>> Hello Dragos,
>>>>
>>>> On 8/26/24 4:06 AM, Dragos Tatulea wrote:
>>>>> On 23.08.24 18:54, Carlos Bilbao wrote:
>>>>>> Hello,
>>>>>>
>>>>>> I'm debugging my vDPA setup, and when using ioctl to retrieve the
>>>>>> configuration, I noticed that it's running in half duplex mode:
>>>>>>
>>>>>> Configuration data (24 bytes):
>>>>>>   MAC address: (Mac address)
>>>>>>   Status: 0x0001
>>>>>>   Max virtqueue pairs: 8
>>>>>>   MTU: 1500
>>>>>>   Speed: 0 Mb
>>>>>>   Duplex: Half Duplex
>>>>>>   RSS max key size: 0
>>>>>>   RSS max indirection table length: 0
>>>>>>   Supported hash types: 0x
>>>>>>
>>>>>> I believe this might be contributing to the underperformance of vDPA.
>>>>> mlx5_vdpa vDPA devicess currently do not support the 
>>>>> VIRTIO_NET_F_SPEED_DUPLEX
>>>>> feature which reports speed and duplex. You can check the state on the
>>>>> PF.
>>>> According to ethtool, all my devices are running at full duplex. I assume I
>>>> can disregard this configuration output from the module then.
>>>>
>>> Yep.
>>>
>>>>>> While looking into how to change this option for Mellanox, I read the 
>>>>>> following
>>>>>> kernel code in mlx5_vnet.c:
>>>>>>
>>>>>> static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
>>>>>> offset, const void *buf,
>>>>>>  unsigned int len)
>>>>>> {
>>>>>> /* not supported */
>>>>>> }
>>>>>>
>>>>>> I was wondering why this is the case.
>>>>> TBH, I don't know why it was not added. But in general, the control VQ is 
>>>>> the
>>>>> better way as it's dynamic.
>>>>>
>>>>>> Is there another way for me to change
>>>>>> these configuration settings?
>>>>>>
>>>>> The configuration is done using control VQ for most things (MTU, MAC, VQs,
>>>>> etc). Make sure that you have the CTRL_VQ feature set (should be on by
>>>>> default). It should appear in `vdpa mgmtdev show` and `vdpa dev config
>>>>> show`.
>>>> I see that CTRL_VQ is indeed enabled. Is there any documentation on how to
>>>> use the control VQ to get/set vDPA configuration values?
>>>>
>>>>
>>> You are most likely using it already through through qemu. You can check
>>> if the CTR_VQ feature also shows up in the output of `vdpa dev config show`.
>>>
>>> What values are you trying to configure btw?
>>
>> Yes, CTRL_VQ also shows up in vdpa dev config show. There isn't a specific
>> value I want to configure ATM, but my vDPA isn't performing as expected, so
>> I'm investigating potential issues. Below is the code I used to retrieve
>> the configuration from the driver; I'd be happy to send it as a patch if
>> you or someone else reviews it.
>>
>>
>>> Thanks,
>>> Dragos
>>
>> Thanks,
>> Carlos
>>
>> ---
>>
>> From ab6ea66c926eaf1e95eb5d73bc23183e0021ee27 Mon Sep 17 00:00:00 2001
>> From: Carlos Bilbao 
>> Date: Sat, 24 Aug 2024 00:24:56 +
>> Subject: [PATCH] mlx5: Add support to update the vDPA configuration
>>
>> This is needed for VHOST_VDPA_SET_CONFIG.
>>
>> Signed-off-by: Carlos Bilbao 
>> ---
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 22 --
>>  1 file changed, 20 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index b56aae3f7be3..da31c743b2b9 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -2909,14 +2909,32 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
>> *vdev, unsigned int offset,
>>  struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
>>  struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-27 Thread Carlos Bilbao

Hello,

On 8/26/24 9:07 PM, Jason Wang wrote:
> On Tue, Aug 27, 2024 at 3:23 AM Carlos Bilbao  
> wrote:
>> Hello,
>>
>> On 8/26/24 10:53 AM, Dragos Tatulea wrote:
>>> On 26.08.24 16:26, Carlos Bilbao wrote:
>>>> Hello Dragos,
>>>>
>>>> On 8/26/24 4:06 AM, Dragos Tatulea wrote:
>>>>> On 23.08.24 18:54, Carlos Bilbao wrote:
>>>>>> Hello,
>>>>>>
>>>>>> I'm debugging my vDPA setup, and when using ioctl to retrieve the
>>>>>> configuration, I noticed that it's running in half duplex mode:
>>>>>>
>>>>>> Configuration data (24 bytes):
>>>>>>   MAC address: (Mac address)
>>>>>>   Status: 0x0001
>>>>>>   Max virtqueue pairs: 8
>>>>>>   MTU: 1500
>>>>>>   Speed: 0 Mb
>>>>>>   Duplex: Half Duplex
>>>>>>   RSS max key size: 0
>>>>>>   RSS max indirection table length: 0
>>>>>>   Supported hash types: 0x
>>>>>>
>>>>>> I believe this might be contributing to the underperformance of vDPA.
>>>>> mlx5_vdpa vDPA devicess currently do not support the 
>>>>> VIRTIO_NET_F_SPEED_DUPLEX
>>>>> feature which reports speed and duplex. You can check the state on the
>>>>> PF.
>>>> According to ethtool, all my devices are running at full duplex. I assume I
>>>> can disregard this configuration output from the module then.
>>>>
>>> Yep.
>>>
>>>>>> While looking into how to change this option for Mellanox, I read the 
>>>>>> following
>>>>>> kernel code in mlx5_vnet.c:
>>>>>>
>>>>>> static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
>>>>>> offset, const void *buf,
>>>>>>  unsigned int len)
>>>>>> {
>>>>>> /* not supported */
>>>>>> }
>>>>>>
>>>>>> I was wondering why this is the case.
>>>>> TBH, I don't know why it was not added. But in general, the control VQ is 
>>>>> the
>>>>> better way as it's dynamic.
>>>>>
>>>>>> Is there another way for me to change
>>>>>> these configuration settings?
>>>>>>
>>>>> The configuration is done using control VQ for most things (MTU, MAC, VQs,
>>>>> etc). Make sure that you have the CTRL_VQ feature set (should be on by
>>>>> default). It should appear in `vdpa mgmtdev show` and `vdpa dev config
>>>>> show`.
>>>> I see that CTRL_VQ is indeed enabled. Is there any documentation on how to
>>>> use the control VQ to get/set vDPA configuration values?
>>>>
>>>>
>>> You are most likely using it already through through qemu. You can check
>>> if the CTR_VQ feature also shows up in the output of `vdpa dev config show`.
>>>
>>> What values are you trying to configure btw?
>>
>> Yes, CTRL_VQ also shows up in vdpa dev config show. There isn't a specific
>> value I want to configure ATM, but my vDPA isn't performing as expected, so
>> I'm investigating potential issues. Below is the code I used to retrieve
>> the configuration from the driver; I'd be happy to send it as a patch if
>> you or someone else reviews it.
>>
>>
>>> Thanks,
>>> Dragos
>>
>> Thanks,
>> Carlos
>>
>> ---
>>
>> From ab6ea66c926eaf1e95eb5d73bc23183e0021ee27 Mon Sep 17 00:00:00 2001
>> From: Carlos Bilbao 
>> Date: Sat, 24 Aug 2024 00:24:56 +
>> Subject: [PATCH] mlx5: Add support to update the vDPA configuration
>>
>> This is needed for VHOST_VDPA_SET_CONFIG.
>>
>> Signed-off-by: Carlos Bilbao 
>> ---
>>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 22 --
>>  1 file changed, 20 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
>> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index b56aae3f7be3..da31c743b2b9 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -2909,14 +2909,32 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
>> *vdev, unsigned int offset,
>>  struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
>>  struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_

Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-26 Thread Carlos Bilbao

Hello,

On 8/26/24 10:53 AM, Dragos Tatulea wrote:
>
> On 26.08.24 16:26, Carlos Bilbao wrote:
>> Hello Dragos,
>>
>> On 8/26/24 4:06 AM, Dragos Tatulea wrote:
>>> On 23.08.24 18:54, Carlos Bilbao wrote:
>>>> Hello,
>>>>
>>>> I'm debugging my vDPA setup, and when using ioctl to retrieve the
>>>> configuration, I noticed that it's running in half duplex mode:
>>>>
>>>> Configuration data (24 bytes):
>>>>   MAC address: (Mac address)
>>>>   Status: 0x0001
>>>>   Max virtqueue pairs: 8
>>>>   MTU: 1500
>>>>   Speed: 0 Mb
>>>>   Duplex: Half Duplex
>>>>   RSS max key size: 0
>>>>   RSS max indirection table length: 0
>>>>   Supported hash types: 0x
>>>>
>>>> I believe this might be contributing to the underperformance of vDPA.
>>> mlx5_vdpa vDPA devicess currently do not support the 
>>> VIRTIO_NET_F_SPEED_DUPLEX
>>> feature which reports speed and duplex. You can check the state on the
>>> PF.
>>
>> According to ethtool, all my devices are running at full duplex. I assume I
>> can disregard this configuration output from the module then.
>>
> Yep.
>
>>>> While looking into how to change this option for Mellanox, I read the 
>>>> following
>>>> kernel code in mlx5_vnet.c:
>>>>
>>>> static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
>>>> offset, const void *buf,
>>>>  unsigned int len)
>>>> {
>>>> /* not supported */
>>>> }
>>>>
>>>> I was wondering why this is the case.
>>> TBH, I don't know why it was not added. But in general, the control VQ is 
>>> the
>>> better way as it's dynamic.
>>>
>>>> Is there another way for me to change
>>>> these configuration settings?
>>>>
>>> The configuration is done using control VQ for most things (MTU, MAC, VQs,
>>> etc). Make sure that you have the CTRL_VQ feature set (should be on by
>>> default). It should appear in `vdpa mgmtdev show` and `vdpa dev config
>>> show`.
>>
>> I see that CTRL_VQ is indeed enabled. Is there any documentation on how to
>> use the control VQ to get/set vDPA configuration values?
>>
>>
> You are most likely using it already through through qemu. You can check
> if the CTR_VQ feature also shows up in the output of `vdpa dev config show`.
>
> What values are you trying to configure btw?


Yes, CTRL_VQ also shows up in vdpa dev config show. There isn't a specific
value I want to configure ATM, but my vDPA isn't performing as expected, so
I'm investigating potential issues. Below is the code I used to retrieve
the configuration from the driver; I'd be happy to send it as a patch if
you or someone else reviews it.


>
> Thanks,
> Dragos


Thanks,
Carlos

---

>From ab6ea66c926eaf1e95eb5d73bc23183e0021ee27 Mon Sep 17 00:00:00 2001
From: Carlos Bilbao 
Date: Sat, 24 Aug 2024 00:24:56 +
Subject: [PATCH] mlx5: Add support to update the vDPA configuration

This is needed for VHOST_VDPA_SET_CONFIG.

Signed-off-by: Carlos Bilbao 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b56aae3f7be3..da31c743b2b9 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2909,14 +2909,32 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
*vdev, unsigned int offset,
 struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
 struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);

-    if (offset + len <= sizeof(struct virtio_net_config))
+    if (offset + len <= sizeof(struct virtio_net_config)) {
 memcpy(buf, (u8 *)&ndev->config + offset, len);
+    }
+    else
+    {
+    printk(KERN_ERR "%s: Offset and length out of bounds\n",
+    __func__);
+    }
+
 }

 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
offset, const void *buf,
  unsigned int len)
 {
-    /* not supported */
+    struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+    struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+    if (offset + len <= sizeof(struct virtio_net_config))
+    {
+    memcpy((u8 *)&ndev->config + offset, buf, len);
+    }
+    else
+    {
+    printk(KERN_ERR "%s: Offset and length out of bounds\n",
+    __func__);
+    }
 }

 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
--
2.34.1

Re: [RFC] Why is set_config not supported in mlx5_vnet?

2024-08-26 Thread Carlos Bilbao

Hello Dragos,

On 8/26/24 4:06 AM, Dragos Tatulea wrote:
>
> On 23.08.24 18:54, Carlos Bilbao wrote:
>> Hello,
>>
>> I'm debugging my vDPA setup, and when using ioctl to retrieve the
>> configuration, I noticed that it's running in half duplex mode:
>>
>> Configuration data (24 bytes):
>>   MAC address: (Mac address)
>>   Status: 0x0001
>>   Max virtqueue pairs: 8
>>   MTU: 1500
>>   Speed: 0 Mb
>>   Duplex: Half Duplex
>>   RSS max key size: 0
>>   RSS max indirection table length: 0
>>   Supported hash types: 0x
>>
>> I believe this might be contributing to the underperformance of vDPA.
> mlx5_vdpa vDPA devicess currently do not support the VIRTIO_NET_F_SPEED_DUPLEX
> feature which reports speed and duplex. You can check the state on the
> PF.


According to ethtool, all my devices are running at full duplex. I assume I
can disregard this configuration output from the module then.


>
>> While looking into how to change this option for Mellanox, I read the 
>> following
>> kernel code in mlx5_vnet.c:
>>
>> static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
>> offset, const void *buf,
>>  unsigned int len)
>> {
>> /* not supported */
>> }
>>
>> I was wondering why this is the case.
> TBH, I don't know why it was not added. But in general, the control VQ is the
> better way as it's dynamic.
>
>> Is there another way for me to change
>> these configuration settings?
>>
> The configuration is done using control VQ for most things (MTU, MAC, VQs,
> etc). Make sure that you have the CTRL_VQ feature set (should be on by
> default). It should appear in `vdpa mgmtdev show` and `vdpa dev config
> show`.


I see that CTRL_VQ is indeed enabled. Is there any documentation on how to
use the control VQ to get/set vDPA configuration values?


>
> Thanks,
> Dragos


Thank you!
Carlos

[RFC] Why is set_config not supported in mlx5_vnet?

2024-08-23 Thread Carlos Bilbao

Hello,

I'm debugging my vDPA setup, and when using ioctl to retrieve the
configuration, I noticed that it's running in half duplex mode:

Configuration data (24 bytes):
  MAC address: (Mac address)
  Status: 0x0001
  Max virtqueue pairs: 8
  MTU: 1500
  Speed: 0 Mb
  Duplex: Half Duplex
  RSS max key size: 0
  RSS max indirection table length: 0
  Supported hash types: 0x

I believe this might be contributing to the underperformance of vDPA. While
looking into how to change this option for Mellanox, I read the following
kernel code in mlx5_vnet.c:

static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, 
const void *buf,
 unsigned int len)
{
    /* not supported */
}

I was wondering why this is the case. Is there another way for me to change
these configuration settings?

Thank you in advance,
Carlos

Re: [RFC] vDPA: Trying to make sense of config data

2024-08-23 Thread Carlos Bilbao

Hello again, 

Answering my own question:

https://elixir.bootlin.com/linux/v6.10.2/source/include/uapi/linux/virtio_net.h#L92

Thanks, Carlos

On 8/22/24 1:21 PM, Carlos Bilbao wrote:
> Hello folks,
>
> I'm using the code below to retrieve configuration data for my vDPA file
> via ioctl. I get as output:
>
> Configuration data (24 bytes):
> 5a c3 5f 68 48 a9 01 00 08 00 dc 05 00 00 00 00
> 00 00 00 00 00 00 00 00
> ASCII representation:
> Z._hH...
>
> Could a good Samaritan point me in the right direction for the docs I need
> to understand these values and convert them to a human-readable format?
> hank you in advance!
>
> Regards,
> Carlos
>
> ---
>
> void check_config(int fd) {
>
>     uint32_t size;
>     struct vhost_vdpa_config *config;
>     uint8_t *buf;
>
>     if (ioctl(fd, VHOST_VDPA_GET_CONFIG_SIZE, &size) < 0) {
>     perror("ioctl failed");
>     return;
>     }
>
>     config = malloc(sizeof(struct vhost_vdpa_config) + size);
>     if (!config) {
>     perror("malloc failed");
>     return;
>     }
>
>     memset(config, 0, sizeof(struct vhost_vdpa_config) + size);
>     config->len = size;
>     config->off = 0;
>
>     buf = config->buf;
>
>     if (ioctl(fd, VHOST_VDPA_GET_CONFIG, config) < 0) {
>     perror("ioctl failed");
>     } else {
>     printf("Configuration data (%u bytes):\n", size);
>
>     /* Print the data in a human-readable format */
>     for (unsigned int i = 0; i < size; i++) {
>     if (i % 16 == 0 && i != 0) printf("\n");
>     printf("%02x ", buf[i]);
>     }
>     printf("\n");
>
>     printf("ASCII representation:\n");
>     for (unsigned int i = 0; i < size; i++) {
>     if (buf[i] >= 32 && buf[i] <= 126) {
>     printf("%c", buf[i]);
>     } else {
>     printf(".");
>     }
>     }
>     printf("\n");
>     }
>
>     free(config);
> }

[RFC] vDPA: Trying to make sense of config data

2024-08-22 Thread Carlos Bilbao

Hello folks,

I'm using the code below to retrieve configuration data for my vDPA file
via ioctl. I get as output:

Configuration data (24 bytes):
5a c3 5f 68 48 a9 01 00 08 00 dc 05 00 00 00 00
00 00 00 00 00 00 00 00
ASCII representation:
Z._hH...

Could a good Samaritan point me in the right direction for the docs I need
to understand these values and convert them to a human-readable format?
hank you in advance!

Regards,
Carlos

---

void check_config(int fd) {

    uint32_t size;
    struct vhost_vdpa_config *config;
    uint8_t *buf;

    if (ioctl(fd, VHOST_VDPA_GET_CONFIG_SIZE, &size) < 0) {
    perror("ioctl failed");
    return;
    }

    config = malloc(sizeof(struct vhost_vdpa_config) + size);
    if (!config) {
    perror("malloc failed");
    return;
    }

    memset(config, 0, sizeof(struct vhost_vdpa_config) + size);
    config->len = size;
    config->off = 0;

    buf = config->buf;

    if (ioctl(fd, VHOST_VDPA_GET_CONFIG, config) < 0) {
    perror("ioctl failed");
    } else {
    printf("Configuration data (%u bytes):\n", size);

    /* Print the data in a human-readable format */
    for (unsigned int i = 0; i < size; i++) {
    if (i % 16 == 0 && i != 0) printf("\n");
    printf("%02x ", buf[i]);
    }
    printf("\n");

    printf("ASCII representation:\n");
    for (unsigned int i = 0; i < size; i++) {
    if (buf[i] >= 32 && buf[i] <= 126) {
    printf("%c", buf[i]);
    } else {
    printf(".");
    }
    }
    printf("\n");
    }

    free(config);
}

Re: [PATCH v4 2/2] rust: add tracepoint support

2024-07-04 Thread Carlos Llamas

On Fri, Jun 28, 2024 at 01:23:32PM +, Alice Ryhl wrote:
> Make it possible to have Rust code call into tracepoints defined by C
> code. It is still required that the tracepoint is declared in a C
> header, and that this header is included in the input to bindgen.
> 
> Signed-off-by: Alice Ryhl 
> ---
>  include/linux/tracepoint.h  | 18 +++-
>  include/trace/define_trace.h| 12 +++
>  rust/bindings/bindings_helper.h |  1 +
>  rust/kernel/lib.rs  |  1 +
>  rust/kernel/tracepoint.rs   | 47 
> +
>  5 files changed, 78 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index 689b6d71590e..d82af4d77c9f 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -238,6 +238,20 @@ static inline struct tracepoint 
> *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  #define __DECLARE_TRACE_RCU(name, proto, args, cond)
>  #endif
>  
> +/*
> + * Declare an exported function that Rust code can call to trigger this
> + * tracepoint. This function does not include the static branch; that is done
> + * in Rust to avoid a function call when the tracepoint is disabled.
> + */
> +#define DEFINE_RUST_DO_TRACE(name, proto, args)
> +#define DEFINE_RUST_DO_TRACE_REAL(name, proto, args) \
nit: IMO using a __* prefix would be a better option to describe the
internal use of the macro instead of the _REAL suffix.

Other than that, this patch looks good to me:

Reviewed-by: Carlos Llamas

[PATCH v2] tracing/probes: fix error check in parse_btf_field()

2024-05-27 Thread Carlos López

btf_find_struct_member() might return NULL or an error via the
ERR_PTR() macro. However, its caller in parse_btf_field() only checks
for the NULL condition. Fix this by using IS_ERR() and returning the
error up the stack.

Fixes: c440adfbe3025 ("tracing/probes: Support BTF based data structure field 
access")
Signed-off-by: Carlos López 
---
v2: added call to trace_probe_log_err()

 kernel/trace/trace_probe.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5e263c141574..39877c80d6cb 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -554,6 +554,10 @@ static int parse_btf_field(char *fieldname, const struct 
btf_type *type,
anon_offs = 0;
field = btf_find_struct_member(ctx->btf, type, 
fieldname,
   &anon_offs);
+   if (IS_ERR(field)) {
+   trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+   return PTR_ERR(field);
+   }
if (!field) {
trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
return -ENOENT;
-- 
2.35.3

Re: [PATCH] tracing/probes: fix error check in parse_btf_field()

2024-05-26 Thread Carlos López




Hi,

On 26/5/24 12:17, Masami Hiramatsu (Google) wrote:

On Sat, 25 May 2024 20:21:32 +0200
Carlos López  wrote:


btf_find_struct_member() might return NULL or an error via the
ERR_PTR() macro. However, its caller in parse_btf_field() only checks
for the NULL condition. Fix this by using IS_ERR() and returning the
error up the stack.



Thanks for finding it!
I think this requires new error message for error_log file.
Can you add the log as

trace_probe_log_err(ctx->offset, BTF_ERROR);

And define BTF_ERROR in ERRORS@kernel/trace/trace_probe.h ?


Sounds good, but should we perhaps reuse BAD_BTF_TID?

```
C(BAD_BTF_TID,  "Failed to get BTF type info."),\
```

`btf_find_struct_member()` fails if `type` is not a struct or if it runs
OOM while allocating the anon stack, so it seems appropriate.

Best,
Carlos


Thank you,


Fixes: c440adfbe3025 ("tracing/probes: Support BTF based data structure field 
access")
Signed-off-by: Carlos López 
---
  kernel/trace/trace_probe.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5e263c141574..5417e9712157 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -554,6 +554,8 @@ static int parse_btf_field(char *fieldname, const struct 
btf_type *type,
anon_offs = 0;
field = btf_find_struct_member(ctx->btf, type, 
fieldname,
   &anon_offs);
+   if (IS_ERR(field))
+   return PTR_ERR(field);
if (!field) {
trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
return -ENOENT;
--
2.35.3






--
Carlos López
Security Engineer
SUSE Software Solutions

[PATCH] tracing/probes: fix error check in parse_btf_field()

2024-05-25 Thread Carlos López

btf_find_struct_member() might return NULL or an error via the
ERR_PTR() macro. However, its caller in parse_btf_field() only checks
for the NULL condition. Fix this by using IS_ERR() and returning the
error up the stack.

Fixes: c440adfbe3025 ("tracing/probes: Support BTF based data structure field 
access")
Signed-off-by: Carlos López 
---
 kernel/trace/trace_probe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5e263c141574..5417e9712157 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -554,6 +554,8 @@ static int parse_btf_field(char *fieldname, const struct 
btf_type *type,
anon_offs = 0;
field = btf_find_struct_member(ctx->btf, type, 
fieldname,
   &anon_offs);
+   if (IS_ERR(field))
+   return PTR_ERR(field);
if (!field) {
trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
return -ENOENT;
-- 
2.35.3

Re: [PATCH v2] mm: Update mark_victim tracepoints fields

2024-02-23 Thread Carlos Galo

On Thu, Feb 22, 2024 at 9:59 AM Carlos Galo  wrote:
>
> On Thu, Feb 22, 2024 at 6:16 AM Michal Hocko  wrote:
> >
> > On Wed 21-02-24 13:30:51, Carlos Galo wrote:
> > > On Tue, Feb 20, 2024 at 11:55 PM Michal Hocko  wrote:
> > > >
> > > > Hi,
> > > > sorry I have missed this before.
> > > >
> > > > On Thu 11-01-24 21:05:30, Carlos Galo wrote:
> > > > > The current implementation of the mark_victim tracepoint provides only
> > > > > the process ID (pid) of the victim process. This limitation poses
> > > > > challenges for userspace tools that need additional information
> > > > > about the OOM victim. The association between pid and the additional
> > > > > data may be lost after the kill, making it difficult for userspace to
> > > > > correlate the OOM event with the specific process.
> > > >
> > > > You are correct that post OOM all per-process information is lost. On
> > > > the other hand we do dump all this information to the kernel log. Could
> > > > you explain why that is not suitable for your purpose?
> > >
> > > Userspace tools often need real-time visibility into OOM situations
> > > for userspace intervention. Our use case involves utilizing BPF
> > > programs, along with BPF ring buffers, to provide OOM notification to
> > > userspace. Parsing kernel logs would be significant overhead as
> > > opposed to the event based BPF approach.
> >
> > Please put that into the changelog.
>
> Will do.
>
> > > > > In order to mitigate this limitation, add the following fields:
> > > > >
> > > > > - UID
> > > > >In Android each installed application has a unique UID. Including
> > > > >the `uid` assists in correlating OOM events with specific apps.
> > > > >
> > > > > - Process Name (comm)
> > > > >Enables identification of the affected process.
> > > > >
> > > > > - OOM Score
> > > > >Allows userspace to get additional insights of the relative kill
> > > > >priority of the OOM victim.
> > > >
> > > > What is the oom score useful for?
> > > >
> > > The OOM score provides us a measure of the victim's importance. On the
> > > android side, it allows us to identify if top or foreground apps are
> > > killed, which have user perceptible impact.
> >
> > But the value on its own (wihtout knowing scores of other tasks) doesn't
> > really tell you anything, does it?
>
> Android uses the OOM adj_score ranges  to categorize app state
> (foreground, background, ...). I'll resend a v3 with the commit text
> updated to include details about this.
>
> > > > Is there any reason to provide a different information from the one
> > > > reported to the kernel log?
> > > > __oom_kill_process:
> > > > pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, 
> > > > file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB 
> > > > oom_score_adj:%hd\n",
> > > > message, task_pid_nr(victim), victim->comm, 
> > > > K(mm->total_vm),
> > > > K(get_mm_counter(mm, MM_ANONPAGES)),
> > > > K(get_mm_counter(mm, MM_FILEPAGES)),
> > > > K(get_mm_counter(mm, MM_SHMEMPAGES)),
> > > > from_kuid(&init_user_ns, task_uid(victim)),
> > > >     mm_pgtables_bytes(mm) >> 10, 
> > > > victim->signal->oom_score_adj);
> > > >
> > >
> > > We added these fields we need (UID, process name, and OOM score), but
> > > we're open to adding the others if you prefer that for consistency
> > > with the kernel log.
> >
> > yes, I think the consistency would be better here. For one it reports
> > numbers which can tell quite a lot about the killed victim. It is a
> > superset of what you already asking for. With a notable exception of the
> > oom_score which is really dubious without a wider context.
>
> Sounds good, I'll resend a v3 that includes these fields as well.
>
> Thanks,
> Carlos
>

I posted V3 here:
https://lore.kernel.org/all/20240223173258.174828-1-carlosg...@google.com/

Thanks,
Carlos
> > --
> > Michal Hocko
> > SUSE Labs

[PATCH v3] mm: Update mark_victim tracepoints fields

2024-02-23 Thread Carlos Galo

The current implementation of the mark_victim tracepoint provides only
the process ID (pid) of the victim process. This limitation poses
challenges for userspace tools requiring real-time OOM analysis and
intervention. Although this information is available from the kernel
logs, it’s not the appropriate format to provide OOM notifications. In
Android, BPF programs are used with the mark_victim trace events to
notify userspace of an OOM kill. For consistency, update the trace
event to include the same information about the OOMed victim as the
kernel logs.

- UID
   In Android each installed application has a unique UID. Including
   the `uid` assists in correlating OOM events with specific apps.

- Process Name (comm)
   Enables identification of the affected process.

- OOM Score
  Will allow userspace to get additional insight of the relative kill
  priority of the OOM victim. In Android, the oom_score_adj is used to
  categorize app state (foreground, background, etc.), which aids in
  analyzing user-perceptible impacts of OOM events [1].

- Total VM, RSS Stats, and pgtables
  Amount of memory used by the victim that will, potentially, be freed up
  by killing it.

[1] 
https://cs.android.com/android/platform/superproject/main/+/246dc8fc95b6d93afcba5c6d6c133307abb3ac2e:frameworks/base/services/core/java/com/android/server/am/ProcessList.java;l=188-283

Cc: Steven Rostedt 
Cc: Andrew Morton 
Cc: Suren Baghdasaryan 
Cc: Michal Hocko 
Reviewed-by: Steven Rostedt 
Signed-off-by: Carlos Galo 
---

v3:
- Added total_vm, rss fields, and pgtables  per Michal Hocko.
- Added Steven Rostedt reviewed by
- Updated commit description to include android usecase

v2: Fixed build error. Added missing comma when printing `__entry->uid`.

 include/trace/events/oom.h | 36 
 mm/oom_kill.c  |  6 +-
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index 26a11e4a2c36..b799f3bcba82 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -7,6 +7,8 @@
 #include 
 #include 
 
+#define PG_COUNT_TO_KB(x) ((x) << (PAGE_SHIFT - 10))
+
 TRACE_EVENT(oom_score_adj_update,
 
TP_PROTO(struct task_struct *task),
@@ -72,19 +74,45 @@ TRACE_EVENT(reclaim_retry_zone,
 );
 
 TRACE_EVENT(mark_victim,
-   TP_PROTO(int pid),
+   TP_PROTO(struct task_struct *task, uid_t uid),
 
-   TP_ARGS(pid),
+   TP_ARGS(task, uid),
 
TP_STRUCT__entry(
__field(int, pid)
+   __string(comm, task->comm)
+   __field(unsigned long, total_vm)
+   __field(unsigned long, anon_rss)
+   __field(unsigned long, file_rss)
+   __field(unsigned long, shmem_rss)
+   __field(uid_t, uid)
+   __field(unsigned long, pgtables)
+   __field(short, oom_score_adj)
),
 
TP_fast_assign(
-   __entry->pid = pid;
+   __entry->pid = task->pid;
+   __assign_str(comm, task->comm);
+   __entry->total_vm = PG_COUNT_TO_KB(task->mm->total_vm);
+   __entry->anon_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, 
MM_ANONPAGES));
+   __entry->file_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, 
MM_FILEPAGES));
+   __entry->shmem_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, 
MM_SHMEMPAGES));
+   __entry->uid = uid;
+   __entry->pgtables = mm_pgtables_bytes(task->mm) >> 10;
+   __entry->oom_score_adj = task->signal->oom_score_adj;
),
 
-   TP_printk("pid=%d", __entry->pid)
+   TP_printk("pid=%d comm=%s total-vm=%lukB anon-rss=%lukB file-rss:%lukB 
shmem-rss:%lukB uid=%u pgtables=%lukB oom_score_adj=%hd",
+   __entry->pid,
+   __get_str(comm),
+   __entry->total_vm,
+   __entry->anon_rss,
+   __entry->file_rss,
+   __entry->shmem_rss,
+   __entry->uid,
+   __entry->pgtables,
+   __entry->oom_score_adj
+   )
 );
 
 TRACE_EVENT(wake_reaper,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 91ccd82097c2..8d6a207c3c59 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include "internal.h"
@@ -754,6 +755,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk)
  */
 static void mark_oom_victim(struct task_struct *tsk)
 {
+   const struct cred *cred;
struct mm_struct *mm = tsk->mm;
 
WARN_ON(oom_killer_disabled);
@@ -773,7 +775,9 @@ static void mark_oom_victim(struct task_struct *tsk)
 */
__thaw_task(tsk);
atomic_inc(&oom_victims);
-   trace_mark_victim(tsk->pid);
+   cred = get_task_cred(tsk);
+   trace_mark_victim(tsk, cred->uid.val);
+   put_cred(cred);
 }
 
 /**
-- 
2.44.0.rc0.258.g7320e95886-goog

Re: [PATCH v2] mm: Update mark_victim tracepoints fields

2024-02-22 Thread Carlos Galo

On Thu, Feb 22, 2024 at 6:16 AM Michal Hocko  wrote:
>
> On Wed 21-02-24 13:30:51, Carlos Galo wrote:
> > On Tue, Feb 20, 2024 at 11:55 PM Michal Hocko  wrote:
> > >
> > > Hi,
> > > sorry I have missed this before.
> > >
> > > On Thu 11-01-24 21:05:30, Carlos Galo wrote:
> > > > The current implementation of the mark_victim tracepoint provides only
> > > > the process ID (pid) of the victim process. This limitation poses
> > > > challenges for userspace tools that need additional information
> > > > about the OOM victim. The association between pid and the additional
> > > > data may be lost after the kill, making it difficult for userspace to
> > > > correlate the OOM event with the specific process.
> > >
> > > You are correct that post OOM all per-process information is lost. On
> > > the other hand we do dump all this information to the kernel log. Could
> > > you explain why that is not suitable for your purpose?
> >
> > Userspace tools often need real-time visibility into OOM situations
> > for userspace intervention. Our use case involves utilizing BPF
> > programs, along with BPF ring buffers, to provide OOM notification to
> > userspace. Parsing kernel logs would be significant overhead as
> > opposed to the event based BPF approach.
>
> Please put that into the changelog.

Will do.

> > > > In order to mitigate this limitation, add the following fields:
> > > >
> > > > - UID
> > > >In Android each installed application has a unique UID. Including
> > > >the `uid` assists in correlating OOM events with specific apps.
> > > >
> > > > - Process Name (comm)
> > > >Enables identification of the affected process.
> > > >
> > > > - OOM Score
> > > >Allows userspace to get additional insights of the relative kill
> > > >priority of the OOM victim.
> > >
> > > What is the oom score useful for?
> > >
> > The OOM score provides us a measure of the victim's importance. On the
> > android side, it allows us to identify if top or foreground apps are
> > killed, which have user perceptible impact.
>
> But the value on its own (wihtout knowing scores of other tasks) doesn't
> really tell you anything, does it?

Android uses the OOM adj_score ranges  to categorize app state
(foreground, background, ...). I'll resend a v3 with the commit text
updated to include details about this.

> > > Is there any reason to provide a different information from the one
> > > reported to the kernel log?
> > > __oom_kill_process:
> > > pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, 
> > > file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB 
> > > oom_score_adj:%hd\n",
> > > message, task_pid_nr(victim), victim->comm, 
> > > K(mm->total_vm),
> > > K(get_mm_counter(mm, MM_ANONPAGES)),
> > > K(get_mm_counter(mm, MM_FILEPAGES)),
> > > K(get_mm_counter(mm, MM_SHMEMPAGES)),
> > > from_kuid(&init_user_ns, task_uid(victim)),
> > > mm_pgtables_bytes(mm) >> 10, 
> > > victim->signal->oom_score_adj);
> > >
> >
> > We added these fields we need (UID, process name, and OOM score), but
> > we're open to adding the others if you prefer that for consistency
> > with the kernel log.
>
> yes, I think the consistency would be better here. For one it reports
> numbers which can tell quite a lot about the killed victim. It is a
> superset of what you already asking for. With a notable exception of the
> oom_score which is really dubious without a wider context.

Sounds good, I'll resend a v3 that includes these fields as well.

Thanks,
Carlos

> --
> Michal Hocko
> SUSE Labs

Re: [PATCH v2] mm: Update mark_victim tracepoints fields

2024-02-21 Thread Carlos Galo

On Tue, Feb 20, 2024 at 11:55 PM Michal Hocko  wrote:
>
> Hi,
> sorry I have missed this before.
>
> On Thu 11-01-24 21:05:30, Carlos Galo wrote:
> > The current implementation of the mark_victim tracepoint provides only
> > the process ID (pid) of the victim process. This limitation poses
> > challenges for userspace tools that need additional information
> > about the OOM victim. The association between pid and the additional
> > data may be lost after the kill, making it difficult for userspace to
> > correlate the OOM event with the specific process.
>
> You are correct that post OOM all per-process information is lost. On
> the other hand we do dump all this information to the kernel log. Could
> you explain why that is not suitable for your purpose?

Userspace tools often need real-time visibility into OOM situations
for userspace intervention. Our use case involves utilizing BPF
programs, along with BPF ring buffers, to provide OOM notification to
userspace. Parsing kernel logs would be significant overhead as
opposed to the event based BPF approach.

> > In order to mitigate this limitation, add the following fields:
> >
> > - UID
> >In Android each installed application has a unique UID. Including
> >the `uid` assists in correlating OOM events with specific apps.
> >
> > - Process Name (comm)
> >Enables identification of the affected process.
> >
> > - OOM Score
> >Allows userspace to get additional insights of the relative kill
> >priority of the OOM victim.
>
> What is the oom score useful for?
>
The OOM score provides us a measure of the victim's importance. On the
android side, it allows us to identify if top or foreground apps are
killed, which have user perceptible impact.

> Is there any reason to provide a different information from the one
> reported to the kernel log?
> __oom_kill_process:
> pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, 
> file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
> message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
> K(get_mm_counter(mm, MM_ANONPAGES)),
> K(get_mm_counter(mm, MM_FILEPAGES)),
> K(get_mm_counter(mm, MM_SHMEMPAGES)),
> from_kuid(&init_user_ns, task_uid(victim)),
> mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
>

We added these fields we need (UID, process name, and OOM score), but
we're open to adding the others if you prefer that for consistency
with the kernel log.

Thanks,
Carlos

> --
> Michal Hocko
> SUSE Labs

Re: WARNING in shmem_release_dquot

2024-02-20 Thread Carlos Maiolino

  ud2
> >2:   90  nop
> >3:   90  nop
> >4:   e9 18 fb ff ff  jmp0xfb21
> >9:   e8 f5 d8 18 00  call   0x18d903
> >e:   e9 a2 fa ff ff  jmp0xfab5
> >   13:   e8  .byte 0xe8
> >   14:   0b d9   or %ecx,%ebx
> > [  246.249288][ T4096] RSP: 0018:c9000604fbc0 EFLAGS: 00010286
> > [  246.250033][ T4096] RAX:  RBX:  RCX: 
> > 814c77da
> > [  246.251035][ T4096] RDX: 888049a58000 RSI: 814c77e7 RDI: 
> > 0001
> > [  246.252036][ T4096] RBP:  R08: 0001 R09: 
> > 
> > [  246.253028][ T4096] R10: 0001 R11: 0001 R12: 
> > 4000
> > [  246.254060][ T4096] R13: 888051bd3000 R14: dc00 R15: 
> > 888051bd3040
> > [ 246.255058][ T4096] ? __warn_printk 
> > (./include/linux/context_tracking.h:155 kernel/panic.c:726)
> > [ 246.255694][ T4096] ? __warn_printk (kernel/panic.c:717)
> > [ 246.256256][ T4096] quota_release_workfn (fs/quota/dquot.c:839)
> > [ 246.256877][ T4096] ? dquot_release (fs/quota/dquot.c:810)
> > [ 246.257467][ T4096] process_one_work (kernel/workqueue.c:2638)
> > [ 246.258126][ T4096] ? lock_sync (kernel/locking/lockdep.c:5722)
> > [ 246.258718][ T4096] ? workqueue_congested (kernel/workqueue.c:2542)
> > [ 246.259339][ T4096] ? assign_work (kernel/workqueue.c:1102)
> > [ 246.259915][ T4096] worker_thread (kernel/workqueue.c:2700 
> > kernel/workqueue.c:2787)
> > [ 246.260529][ T4096] ? lockdep_hardirqs_on (kernel/locking/lockdep.c:4423)
> > [ 246.261176][ T4096] ? process_one_work (kernel/workqueue.c:2733)
> > [ 246.261855][ T4096] kthread (kernel/kthread.c:388)
> > [ 246.262382][ T4096] ? kthread_complete_and_exit (kernel/kthread.c:341)
> > [ 246.263077][ T4096] ret_from_fork (arch/x86/kernel/process.c:153)
> > [ 246.263620][ T4096] ? kthread_complete_and_exit (kernel/kthread.c:341)
> > [ 246.264331][ T4096] ret_from_fork_asm (arch/x86/entry/entry_64.S:250)
> > [  246.264910][ T4096]  
> > [  246.265598][ T4096] Kernel Offset: disabled
> > [  246.266259][ T4096] Rebooting in 86400 seconds..
> >
> > Thank you for taking the time to read this email and we look forward to 
> > working with you further.
> 
> Carlos, this looks like one for you to puzzle over -
> thanks,
> Hugh

I'll look into it, thanks!

Carlos

Re: [PATCH] mm: Update mark_victim tracepoints fields

2024-01-11 Thread Carlos Galo

On Thu, Jan 11, 2024 at 9:08 AM kernel test robot  wrote:
>
> Hi Carlos,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on 0dd3ee31125508cd67f7e7172247f05b7fd1753a]
>
> url:
> https://github.com/intel-lab-lkp/linux/commits/Carlos-Galo/mm-Update-mark_victim-tracepoints-fields/20240111-081635
> base:   0dd3ee31125508cd67f7e7172247f05b7fd1753a
> patch link:
> https://lore.kernel.org/r/20240111001155.746-1-carlosgalo%40google.com
> patch subject: [PATCH] mm: Update mark_victim tracepoints fields
> config: x86_64-defconfig 
> (https://download.01.org/0day-ci/archive/20240112/202401120052.rdfjpivg-...@intel.com/config)
> compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
> reproduce (this is a W=1 build): 
> (https://download.01.org/0day-ci/archive/20240112/202401120052.rdfjpivg-...@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version 
> of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot 
> | Closes: 
> https://lore.kernel.org/oe-kbuild-all/202401120052.rdfjpivg-...@intel.com/
>

Sorry, I missed a comma in my final editing.
I posted a V2 here:
https://lore.kernel.org/lkml/20240111210539.636607-1-carlosg...@google.com/

Thanks,
Carlos

> All errors (new ones prefixed by >>):
>
>In file included from include/trace/define_trace.h:102,
> from include/trace/events/oom.h:206,
> from mm/oom_kill.c:54:
>include/trace/events/oom.h: In function 'trace_raw_output_mark_victim':
> >> include/trace/stages/stage3_trace_output.h:6:17: error: called object is 
> >> not a function or function pointer
>6 | #define __entry field
>  | ^
>include/trace/trace_events.h:203:34: note: in definition of macro 
> 'DECLARE_EVENT_CLASS'
>  203 | trace_event_printf(iter, print);   
>  \
>  |  ^
>include/trace/trace_events.h:45:30: note: in expansion of macro 'PARAMS'
>   45 |  PARAMS(print));   \
>  |  ^~
>include/trace/events/oom.h:74:1: note: in expansion of macro 'TRACE_EVENT'
>   74 | TRACE_EVENT(mark_victim,
>  | ^~~
>include/trace/events/oom.h:93:9: note: in expansion of macro 'TP_printk'
>   93 | TP_printk("pid=%d uid=%u comm=%s oom_score_adj=%hd",
>  | ^
>include/trace/events/oom.h:95:17: note: in expansion of macro '__entry'
>   95 | __entry->uid
>  | ^~~
> >> include/trace/trace_events.h:203:39: error: expected expression before ')' 
> >> token
>  203 | trace_event_printf(iter, print);   
>  \
>  |   ^
>include/trace/trace_events.h:40:9: note: in expansion of macro 
> 'DECLARE_EVENT_CLASS'
>   40 | DECLARE_EVENT_CLASS(name,  \
>  | ^~~
>include/trace/events/oom.h:74:1: note: in expansion of macro 'TRACE_EVENT'
>   74 | TRACE_EVENT(mark_victim,
>  | ^~~
>
>
> vim +6 include/trace/stages/stage3_trace_output.h
>
> af6b9668e85ffd include/trace/stages/stage3_defines.h Steven Rostedt (Google  
> 2022-03-03  4)
> af6b9668e85ffd include/trace/stages/stage3_defines.h Steven Rostedt (Google  
> 2022-03-03  5) #undef __entry
> af6b9668e85ffd include/trace/stages/stage3_defines.h Steven Rostedt (Google  
> 2022-03-03 @6) #define __entry field
> af6b9668e85ffd include/trace/stages/stage3_defines.h Steven Rostedt (Google  
> 2022-03-03  7)
>
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki

[PATCH v2] mm: Update mark_victim tracepoints fields

2024-01-11 Thread Carlos Galo

The current implementation of the mark_victim tracepoint provides only
the process ID (pid) of the victim process. This limitation poses
challenges for userspace tools that need additional information
about the OOM victim. The association between pid and the additional
data may be lost after the kill, making it difficult for userspace to
correlate the OOM event with the specific process.

In order to mitigate this limitation, add the following fields:

- UID
   In Android each installed application has a unique UID. Including
   the `uid` assists in correlating OOM events with specific apps.

- Process Name (comm)
   Enables identification of the affected process.

- OOM Score
   Allows userspace to get additional insights of the relative kill
   priority of the OOM victim.

Cc: Steven Rostedt 
Cc: Andrew Morton 
Cc: Suren Baghdasaryan 
Signed-off-by: Carlos Galo 
---
v2: Fixed build error. Added missing comma when printing `__entry->uid`.

 include/trace/events/oom.h | 19 +++
 mm/oom_kill.c  |  6 +-
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index 26a11e4a2c36..fb8a5d1b8a0a 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -72,19 +72,30 @@ TRACE_EVENT(reclaim_retry_zone,
 );
 
 TRACE_EVENT(mark_victim,
-   TP_PROTO(int pid),
+   TP_PROTO(struct task_struct *task, uid_t uid),
 
-   TP_ARGS(pid),
+   TP_ARGS(task, uid),
 
TP_STRUCT__entry(
__field(int, pid)
+   __field(uid_t, uid)
+   __string(comm, task->comm)
+   __field(short, oom_score_adj)
),
 
TP_fast_assign(
-   __entry->pid = pid;
+   __entry->pid = task->pid;
+   __entry->uid = uid;
+   __assign_str(comm, task->comm);
+   __entry->oom_score_adj = task->signal->oom_score_adj;
),
 
-   TP_printk("pid=%d", __entry->pid)
+   TP_printk("pid=%d uid=%u comm=%s oom_score_adj=%hd",
+   __entry->pid,
+   __entry->uid,
+   __get_str(comm),
+   __entry->oom_score_adj
+   )
 );
 
 TRACE_EVENT(wake_reaper,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e6071fde34a..0698c00c5da6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include "internal.h"
@@ -753,6 +754,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk)
  */
 static void mark_oom_victim(struct task_struct *tsk)
 {
+   const struct cred *cred;
struct mm_struct *mm = tsk->mm;
 
WARN_ON(oom_killer_disabled);
@@ -772,7 +774,9 @@ static void mark_oom_victim(struct task_struct *tsk)
 */
__thaw_task(tsk);
atomic_inc(&oom_victims);
-   trace_mark_victim(tsk->pid);
+   cred = get_task_cred(tsk);
+   trace_mark_victim(tsk, cred->uid.val);
+   put_cred(cred);
 }
 
 /**

base-commit: 0dd3ee31125508cd67f7e7172247f05b7fd1753a
-- 
2.43.0.275.g3460e3d667-goog

[PATCH] mm: Update mark_victim tracepoints fields

2024-01-10 Thread Carlos Galo

The current implementation of the mark_victim tracepoint provides only
the process ID (pid) of the victim process. This limitation poses
challenges for userspace tools that need additional information
about the OOM victim. The association between pid and the additional
data may be lost after the kill, making it difficult for userspace to
correlate the OOM event with the specific process.

In order to mitigate this limitation, add the following fields:

- UID
   In Android each installed application has a unique UID. Including
   the `uid` assists in correlating OOM events with specific apps.

- Process Name (comm)
   Enables identification of the affected process.

- OOM Score
   Allows userspace to get additional insights of the relative kill
   priority of the OOM victim.

Cc: Steven Rostedt 
Cc: Andrew Morton 
Cc: Suren Baghdasaryan 
Signed-off-by: Carlos Galo 
---
 include/trace/events/oom.h | 19 +++
 mm/oom_kill.c  |  6 +-
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index 26a11e4a2c36..fb8a5d1b8a0a 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -72,19 +72,30 @@ TRACE_EVENT(reclaim_retry_zone,
 );
 
 TRACE_EVENT(mark_victim,
-   TP_PROTO(int pid),
+   TP_PROTO(struct task_struct *task, uid_t uid),
 
-   TP_ARGS(pid),
+   TP_ARGS(task, uid),
 
TP_STRUCT__entry(
__field(int, pid)
+   __field(uid_t, uid)
+   __string(comm, task->comm)
+   __field(short, oom_score_adj)
),
 
TP_fast_assign(
-   __entry->pid = pid;
+   __entry->pid = task->pid;
+   __entry->uid = uid;
+   __assign_str(comm, task->comm);
+   __entry->oom_score_adj = task->signal->oom_score_adj;
),
 
-   TP_printk("pid=%d", __entry->pid)
+   TP_printk("pid=%d uid=%u comm=%s oom_score_adj=%hd",
+   __entry->pid,
+   __entry->uid
+   __get_str(comm),
+   __entry->oom_score_adj,
+   )
 );
 
 TRACE_EVENT(wake_reaper,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e6071fde34a..0698c00c5da6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include "internal.h"
@@ -753,6 +754,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk)
  */
 static void mark_oom_victim(struct task_struct *tsk)
 {
+   const struct cred *cred;
struct mm_struct *mm = tsk->mm;
 
WARN_ON(oom_killer_disabled);
@@ -772,7 +774,9 @@ static void mark_oom_victim(struct task_struct *tsk)
 */
__thaw_task(tsk);
atomic_inc(&oom_victims);
-   trace_mark_victim(tsk->pid);
+   cred = get_task_cred(tsk);
+   trace_mark_victim(tsk, cred->uid.val);
+   put_cred(cred);
 }
 
 /**

base-commit: 0dd3ee31125508cd67f7e7172247f05b7fd1753a
-- 
2.43.0.275.g3460e3d667-goog

[PATCH] static_call: fix unused variable warning

2021-04-16 Thread Carlos Llamas

'mod' is not used when !CONFIG_MODULES. Mark as __maybe_unused to
fix the following warning:

kernel/static_call.c: In function ‘__static_call_update’:
kernel/static_call.c:153:18: warning: unused variable ‘mod’ [-Wunused-variable]
  153 |   struct module *mod = site_mod->mod;
  |  ^~~

Fixes: 9183c3f9ed71 ("static_call: Add inline static call infrastructure")
Reported-by: kernelci.org bot 
Signed-off-by: Carlos Llamas 
---
 kernel/static_call.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/static_call.c b/kernel/static_call.c
index 2c5950b0b90e..8211a34251f8 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -150,7 +150,7 @@ void __static_call_update(struct static_call_key *key, void 
*tramp, void *func)
 
for (site_mod = &first; site_mod; site_mod = site_mod->next) {
bool init = system_state < SYSTEM_RUNNING;
-   struct module *mod = site_mod->mod;
+   struct module __maybe_unused *mod = site_mod->mod;
 
if (!site_mod->sites) {
/*
-- 
2.31.1.368.gbe11c130af-goog

[PATCH] kbuild: buildtar: add riscv support

2021-03-16 Thread Carlos de Paula

Make 'make tar-pkg' and 'tarbz2-pkg' work on riscv.

Signed-off-by: Carlos de Paula 
---
 scripts/package/buildtar | 8 
 1 file changed, 8 insertions(+)

diff --git a/scripts/package/buildtar b/scripts/package/buildtar
index 936198a90477..221aa7df008d 100755
--- a/scripts/package/buildtar
+++ b/scripts/package/buildtar
@@ -123,10 +123,18 @@ case "${ARCH}" in
cp -v -- "${objtree}/arch/arm64/boot/${i}" 
"${tmpdir}/boot/vmlinuz-${KERNELRELEASE}"
break
fi
done
;;
+   riscv)
+   for i in Image.bz2 Image.gz Image; do
+   if [ -f "${objtree}/arch/riscv/boot/${i}" ] ; then
+   cp -v -- "${objtree}/arch/riscv/boot/${i}" 
"${tmpdir}/boot/vmlinux-${KERNELRELEASE}"
+   break
+   fi
+   done
+   ;;
*)
[ -f "${KBUILD_IMAGE}" ] && cp -v -- "${KBUILD_IMAGE}" 
"${tmpdir}/boot/vmlinux-kbuild-${KERNELRELEASE}"
echo "" >&2
echo '** ** **  WARNING  ** ** **' >&2
echo "" >&2
-- 
2.20.1

Re: [PATCH v5] media: vimc: get pixformat info from v4l2_format_info

2021-03-04 Thread Carlos Eduardo Climaco Barbosa

Hi!
After a long while, I'm working in the patch again.
In regard to Dafna's suggestion, I'd have to rename and change the
returning types of vimc pix map related functions.
Would it be better if I just send a patch with the proposed changes or
should we talk about it here?

Regards,

Carlos


On Wed, Jun 24, 2020 at 11:08 AM Hans Verkuil  wrote:
>
> Hi Carlos,
>
> On 30/04/2020 04:32, Carlos E. C. Barbosa wrote:
> > There is overlapping code over two distinct lists. This repurposes
> > vimc_pix_map for mapping formats and remaps the calls to the matching
> > v4l2_format_info.
> >
> > Signed-off-by: Carlos E. C. Barbosa 
>
> This patch no longer applies to the latest vimc in our master tree.
> Can you rebase and post a v6?
>
> Regards,
>
> Hans
>
> >
> > ---
> >
> > Changes in v5:
> > - .bayer member was removed and replaced by v4l2 functions calls
> >
> > Changes in v4:
> > - Unused variables were removed
> >
> > Changes in v3:
> > - Change declaration order of variables and some minor style changes
> >
> > Changes in v2:
> > - Const qualifiers are not removed
> > - Bayer flag is kept
> > - Unnecessary changes are not made anymore
> >
> > v4l2-compliance -m /dev/media0 output:
> > https://pastebin.com/VQV4vrTW
> >
> > ---
> > ---
> >  .../media/test-drivers/vimc/vimc-capture.c| 16 ---
> >  drivers/media/test-drivers/vimc/vimc-common.c | 46 ---
> >  drivers/media/test-drivers/vimc/vimc-common.h |  4 --
> >  .../media/test-drivers/vimc/vimc-debayer.c|  7 ++-
> >  drivers/media/test-drivers/vimc/vimc-scaler.c | 18 ++--
> >  drivers/media/test-drivers/vimc/vimc-sensor.c |  9 +++-
> >  6 files changed, 35 insertions(+), 65 deletions(-)
> >
> > diff --git a/drivers/media/test-drivers/vimc/vimc-capture.c 
> > b/drivers/media/test-drivers/vimc/vimc-capture.c
> > index 5315c201314c..3c280a642ec7 100644
> > --- a/drivers/media/test-drivers/vimc/vimc-capture.c
> > +++ b/drivers/media/test-drivers/vimc/vimc-capture.c
> > @@ -85,6 +85,7 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, 
> > void *priv,
> >   struct v4l2_format *f)
> >  {
> >   struct v4l2_pix_format *format = &f->fmt.pix;
> > + const struct v4l2_format_info *vinfo;
> >   const struct vimc_pix_map *vpix;
> >
> >   format->width = clamp_t(u32, format->width, VIMC_FRAME_MIN_WIDTH,
> > @@ -94,12 +95,13 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, 
> > void *priv,
> >
> >   /* Don't accept a pixelformat that is not on the table */
> >   vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
> > - if (!vpix) {
> > + if (!vpix)
> >   format->pixelformat = fmt_default.pixelformat;
> > - vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
> > - }
> > +
> > + vinfo = v4l2_format_info(format->pixelformat);
> > +
> >   /* TODO: Add support for custom bytesperline values */
> > - format->bytesperline = format->width * vpix->bpp;
> > + format->bytesperline = format->width * vinfo->bpp[0];
> >   format->sizeimage = format->bytesperline * format->height;
> >
> >   if (format->field == V4L2_FIELD_ANY)
> > @@ -386,7 +388,7 @@ static struct vimc_ent_device *vimc_cap_add(struct 
> > vimc_device *vimc,
> >   const char *vcfg_name)
> >  {
> >   struct v4l2_device *v4l2_dev = &vimc->v4l2_dev;
> > - const struct vimc_pix_map *vpix;
> > + const struct v4l2_format_info *vinfo;
> >   struct vimc_cap_device *vcap;
> >   struct video_device *vdev;
> >   struct vb2_queue *q;
> > @@ -434,8 +436,8 @@ static struct vimc_ent_device *vimc_cap_add(struct 
> > vimc_device *vimc,
> >
> >   /* Set default frame format */
> >   vcap->format = fmt_default;
> > - vpix = vimc_pix_map_by_pixelformat(vcap->format.pixelformat);
> > - vcap->format.bytesperline = vcap->format.width * vpix->bpp;
> > + vinfo = v4l2_format_info(vcap->format.pixelformat);
> > + vcap->format.bytesperline = vcap->format.width * vinfo->bpp[0];
> >   vcap->format.sizeimage = vcap->format.bytesperline *
> >vcap->format.height;
> >
> > diff --git a/drivers/media/test-drivers/vimc/vimc-common.c 
> > b/drive

Re: [PATCH v4 2/4] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ

2021-01-20 Thread Carlos O'Donell

endif
>  
>  #endif /* _ASM_X86_AUXVEC_H */
> diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
> index 138a9f5b78d8..761d856f8ef7 100644
> --- a/arch/x86/kernel/signal.c
> +++ b/arch/x86/kernel/signal.c
> @@ -716,6 +716,11 @@ void __init init_sigframe_size(void)
>   max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
>  }
>  
> +unsigned long get_sigframe_size(void)
> +{
> + return max_frame_size;
> +}
> +
>  static inline int is_ia32_compat_frame(struct ksignal *ksig)
>  {
>   return IS_ENABLED(CONFIG_IA32_EMULATION) &&
> 


-- 
Cheers,
Carlos.

Re: [Y2038][time namespaces] Question regarding CLOCK_REALTIME support plans in Linux time namespaces

2020-11-25 Thread Carlos O'Donell

On 11/25/20 7:17 PM, Thomas Gleixner wrote:
> Carlos, Petr,
> 
> On Wed, Nov 25 2020 at 15:37, Carlos O'Donell wrote:
>> On 11/19/20 7:14 PM, Thomas Gleixner wrote:
>>> So from my point of view asking for distorted time still _is_ a request
>>> for ponies.
>>
>> I'm happy if you say it's more work than the value it provides.
> 
> Thinking more about it. Would a facility which provides:
> 
>  CLOCK_FAKE_MONOTONIC|BOOTTIME|REALTIME
> 
> where you can go wild on setting time to whatever you want solve
> your problem?

We would need a way to inject CLOCK_FAKE_* in lieu of the real
constants.

There are only two straight forward ways I know how to do that
and they aren't very useful e.g. alternative build, syscall hot-path
debug code to alter the constant.

We might write a syscall interception framework using seccomp
and SECCOMP_RET_TRACE, but that involves ptrace'ing the process
under test, and is equivalent to a micro-sandbox. I'm not against
that idea for testing; we would test what we ship.

I don't think eBPF can modify the incoming arguments.

I need to go check if systemtap can modify incoming arguments;
I've never done that in any script.

In what other ways can we inject the new constants?

-- 
Cheers,
Carlos.

Re: [Y2038][time namespaces] Question regarding CLOCK_REALTIME support plans in Linux time namespaces

2020-11-25 Thread Carlos O'Donell

On 11/19/20 7:14 PM, Thomas Gleixner wrote:
> I hope you are aware that the time namespace offsets have to be set
> _before_ the process starts and can't be changed afterwards,
> i.e. settime() is not an option.

I not interested in settime(). I saw Petr's request and forwarded it on
here to further the educational conversation about CLOCK_REALTIME and
cement a consensus around this issue. I'm happy to evangelize that we
won't support settime() for the specific reasons you call out and that
way I can give architectural guidance to setup systems in a particular
way to use CRIU or VM+container if needed.

> That might limit the usability for your use case and this can't be
> changed at all because there might be armed timers and other time
> related things which would start to go into full confusion mode.

The use of time in these cases, from first principles, seems to
degenerate into:

* Verify a time-dependent action is correct.

In glibc's case:

* Verify various APIs after y2038.

In Petr's case he could split the test into two such tests
but he would have to reproduce the system state for the second
test and I expect he wants to avoid that.

In that case I think Petr has to use CRIU to start the container,
stop it, advance time, and restart. That should work perfectly
in that use case and solve all the problems by relying on the work
done by CRIU.

> The supported use case is container life migration and that _is_ very
> careful about restoring time and armed timers and if their user space
> tools screw it up then they can keep the bits and pieces.

Agreed.

> So in order to utilize that you'd have to checkpoint the container,
> manipulate the offsets and restore it.

Or use the same mechanisms CRIU uses.

I can't rely on CRIU because I have to bootstrap a toolchain and userspace.
We should be able to write a thin veneer into our own testing wrapper and
emulate whatever CRIU does. We know apriori that our test framework starts
the test without anything having been executed yet. So we have that benefit.
Currently we unshare() for NEWUSER/NEWPID/NEWNS, but I expect that will
get a little more advanced. Already having these namespaces helps immensely
when adding fs-related tests.

> Aside of this, there are other things, e.g. file times, packet
> timestamps etc. which are based on CLOCK_REALTIME. What to do about
> them? Translate these to/from name space time or not? There is a long
> list of other horrors which are related to that.

We haven't even started testing for the upcoming negative leap second ;-)

> So _you_ might say, that you don't care about file times, RTC, timers
> expiring at the wrong time, packet timestamps and whatever.

I do care about them, but only given certain contexts.

> But then the next test dude comes around and want's to test exactly
> these interfaces and we have to slap the time namespace conversions for
> REALTIME and TAI all over the place because we already support the
> minimal thing.

That's a decision you need to make when asked those questions.

> Can you see why this is a slippery slope and why I'm extremly reluctant
> to even provide the minimal 'distort realtime when the namespace starts'
> support?

I would argue this is a slippery slope fallacy. If and when we get better
vm+container support we just tear all this code out and tell people to
start using those frameworks. The vm+container frameworks have independent
reasons to exist and so will continue to improve for security isolation
purposes and end up solving time testing issues by allowing us complete
control over the VMs time.

>> Hopefully this ilustrates that real time name space is not "request for
>> ponny" :-)
> 
> I can understand your pain and why you want to distort time, but please
> understand that timekeeping is complex. The primary focus must be
> correctness, scalability and maintainability which is already hard
> enough to achieve. Just for the perspective: It took us only 8 years to
> get the kernel halfways 2038 ready (filesystems still outstanding).

I agree. The upstream glibc community has been working on y2038 since 2018;
not as long as the kernel.

> So from my point of view asking for distorted time still _is_ a request
> for ponies.

I'm happy if you say it's more work than the value it provides.

> The fixed offsets for clock MONOTONIC/BOOTTIME are straight forward,
> absolutely make sense and they have a limited scope of exposure. clock
> REALTIME/TAI are very different beasts which entail a slew of horrors.
> Adding settime() to the mix makes it exponentially harder.

Right.

-- 
Cheers,
Carlos.

Re: [Y2038][time namespaces] Question regarding CLOCK_REALTIME support plans in Linux time namespaces

2020-11-19 Thread Carlos O'Donell

On 11/6/20 7:47 PM, Thomas Gleixner wrote:
> On Thu, Nov 05 2020 at 12:25, Carlos O'Donell wrote:
>> On 10/30/20 9:38 PM, Thomas Gleixner wrote:
>> If kata grows up quickly perhaps this entire problem becomes solved, but 
>> until
>> then I continue to have a testing need for a distinct CLOCK_REALTIME in a
>> time namespace (and it need not be unconditional, if I have to engage magic
>> then I'm happy to do that).
> 
> Conditional, that might be a way to go.
> 
> Would CONFIG_DEBUG_DISTORTED_CLOCK_REALTIME be a way to go? IOW,
> something which is clearly in the debug section of the kernel which wont
> get turned on by distros (*cough*) and comes with a description that any
> bug reports against it vs. time correctness are going to be ignored.

Yes. I would be requiring CONFIG_DEBUG_DISTORTED_CLOCK_REALTIME.

Let me be clear though, the distros have *+debug kernels for which this
CONFIG_DEBUG_* could get turned on? In Fedora *+debug kernels we enable all
sorts of things like CONFIG_DEBUG_OBJECTS_* and CONFIG_DEBUG_SPINLOCK etc.
etc. etc.

I would push Fedora/RHEL to ship this in the *+debug kernels. That way I can 
have
this on for local test/build cycle. Would you be OK with that?

We could have it disabled by default but enabled via proc like
unprivileged_userns_clone was at one point? I want to avoid accidental use in
Fedora *+debug kernels unless the developer is actively going to run tests that
require time manipulation e.g. thousands of DNSSEC tests with timeouts [1].

I also need a way to determine the feature is enabled or disabled so I can XFAIL
the tests and tell the developer they need to turn on the feature in the host
kernel (and not to complain when CLOCK_REALTIME is wrong). A proc interface 
solves
this in a straight forward way.

I could then also tell my hardware partners to turn it on during certain 
test/build
cycles. It violates "ship what you test" but increases test coverage and can be
run as a distinct test cycle. I could also have our internal builders turn this
feature on so we can run rpm %check phases with this feature enabled (operations
might refuse, but in that case my day-to-day developer testing still helps by
orders of magnitude).

Notes:
[1] Petr Špaček commented on DNSSEC and expiration testing as another 
real-world testing
scenario: https://sourceware.org/pipermail/libc-alpha/2020-November/119785.html
Still a testing scenario, but an example outside of glibc for networking, where 
they
have a need to execute thousands of tests with accelerated timeout. If 
vm+containers
catches up, and I think they will, we'll have a solution in a few years.

>> * Adding CLOCK_REALTIME to the kernel is a lot of work given the expected
>>   guarantees for a local system.
> 
> Correct.
> 
>> * CLOCK_REALTIME is an expensive resource to maintain, even more expensive
>>   than other resources where the kernel can balance their usage.
> 
> Correct.
> 
>> * On balance it would be better to use vm or vm+containers e.g. kata as a
>>   solution to having CLOCK_REALTIME distinct in the container.
> 
> That'd be the optimal solution, but the above might be a middle ground.

Agreed.

-- 
Cheers,
Carlos.

Re: [Y2038][time namespaces] Question regarding CLOCK_REALTIME support plans in Linux time namespaces

2020-11-05 Thread Carlos O'Donell

On 10/30/20 9:38 PM, Thomas Gleixner wrote:
> Coming back to your test coverage argument. I really don't see a problem
> with the requirement of having qemu installed in order to run 'make
> check'.

Cost. It's is cheaper and easier to maintain and deploy containers.

A full VM requires maintaining and updating images, and kernel builds
independent of what the developer is using for their development system.
This goes out of date quickly and needs a lot of resources to maintain.

When you get away from a VM you can then engage the entire developer
community to just run your userspace testing on *their* hardware on *their*
kernels. So I can go to Arm, Intel, AMD, IBM, SUSE, Red Hat, etc. and say:
"All you need to do is run 'make check' and the tests will verify your
hardware and kernel are working correctly." Those developers don't want their
system clocks adjusted during testing, and they are as busy as you and I are.

Container registries and tooling are much lighter weight and support layering
changes on top of base images in ways which allow different testing scenarios.
I don't have any desire to build a similar ecosystem for VM images or wait for
VM+container (kata) tooling to grow up.

If kata grows up quickly perhaps this entire problem becomes solved, but until
then I continue to have a testing need for a distinct CLOCK_REALTIME in a
time namespace (and it need not be unconditional, if I have to engage magic
then I'm happy to do that).

> If you can't ask that from your contributors, then asking me to provide
> you a namespace magic is just hillarious. The contributor who refuses to
> install qemu will also insist to run on some last century kernel which
> does not even know about name spaces at all.

I don't disagree with you, it is *absolutely* a VM tooling issue, and that
containers are easier to maintain, and deploy. With namespaces I can build
glibc, a sysroot, and run it in isolation very quickly.

Just so I understand, let me reiterate your position:

* Adding CLOCK_REALTIME to the kernel is a lot of work given the expected
  guarantees for a local system.

* CLOCK_REALTIME is an expensive resource to maintain, even more expensive
  than other resources where the kernel can balance their usage.

* On balance it would be better to use vm or vm+containers e.g. kata as a
  solution to having CLOCK_REALTIME distinct in the container.
 Thanks for your feedback.

-- 
Cheers,
Carlos.

Re: [Y2038][time namespaces] Question regarding CLOCK_REALTIME support plans in Linux time namespaces

2020-10-30 Thread Carlos O'Donell

On 10/30/20 4:06 PM, Thomas Gleixner wrote:
> On Fri, Oct 30 2020 at 12:58, Carlos O'Donell wrote:
>> I expect that more requests for further time isolation will happen
>> given the utility of this in containers.
> 
> There was a lengthy discussion about this and the only "usecase" which
> was brought up was having different NTP servers in name spaces, i.e. the
> leap second ones and the smearing ones.

In the non-"request for ponies" category:

* Running legacy 32-bit applications in containers with CLOCK_REALTIME set
  to some value below y2038.

* Testing kernel and userspace clock handling code without needing to
  run on bare-metal, VM, or other.

> Now imagine 1000 containers each running their own NTP. Guess what the
> host does in each timer interrupt? Chasing 1000 containers and update
> their notion of CLOCK_REALTIME. In the remaining 5% CPU time the 1000
> containers can do their computations.

How is this different than balancing any other resource that you give
to a container/vm on a host?

Can you enable 1000 containers running smbd/nmbd and expect to get
great IO performance?

> But even if you restrict it to a trivial offset without NTP
> capabilities, what's the semantics of that offset when the host time is
> set?

Now you're talking about an implementation. This thread is simply
"Would we implement CLOCK_REALTIME?" Is the answer "Maybe, if we solve
all these other problems?"

>> If we have to use qemu today then that's where we're at, but again
>> I expect our use case is representative of more than just glibc.
> 
> For testing purposes it might be. For real world use cases not so
> much. People tend to rely on the coordinated nature of CLOCK_TAI and
> CLOCK_REALTIME.

Except we have two real world use cases, at the top of this email, 
that could extend to a lot of software. We know legacy 32-bit 
applications exist that will break with CLOCK_REALTIME past
y2038. Software exists that manipulates time and needs testing
with specific time values e.g. month crossings, day crossings,
leap year crossings, etc.

>> Does checkpointing work robustly when userspace APIS use 
>> CLOCK_REALTIME (directly or indirectly) in the container?
> 
> AFAICT, yes. That was the conclusion over the lenghty discussion about
> time name spaces and their requirements.

If this is the case then have we established behaviours that
happen when such processes are migrated to other systems with
different CLOCK_REALTIME clocks? Would these behaviours serve
as the basis of how CLOCK_REALTIME in a namespace would behave?

That is to say that migrating a container to a system with a
different CLOCK_REALTIME should behave similarly to what happens
when CLOCK_REALTIME is changed locally and you have a container
with a unique CLOCK_REALTIME?

> Here is the Linux plumber session related to that:
>  https://www.youtube.com/watch?v=sjRUiqJVzOA

Thanks. I watched the session. Informative :-)

-- 
Cheers,
Carlos.

Re: [Y2038][time namespaces] Question regarding CLOCK_REALTIME support plans in Linux time namespaces

2020-10-30 Thread Carlos O'Donell

On 10/30/20 11:10 AM, Thomas Gleixner via Libc-alpha wrote:
> On Fri, Oct 30 2020 at 10:02, Zack Weinberg wrote:
>> On Fri, Oct 30, 2020 at 9:57 AM Cyril Hrubis  wrote:
>>>> According to patch description [1] and time_namespaces documentation
>>>> [2] the CLOCK_REALTIME is not supported (for now?) to avoid complexity
>>>> and overhead in the kernel.
>> ...
>>>> To be more specific - [if this were supported] it would be possible to 
>>>> modify time after time_t
>>>> 32 bit overflow (i.e. Y2038 bug) on the process running Y2038
>>>> regression tests on the host system (64 bit one). By using Linux time
>>>> namespaces the system time will not be affected in any way.
>>>
>>> And what's exactly wrong with moving the system time forward for a
>>> duration of the test?
>>
>> Interference with other processes on the same computer?  Some of us
>> *do* like to run the glibc test suite on computers not entirely
>> devoted to glibc CI.
> 
> That's what virtual machines are for.

Certainly, that is always an option, just like real hardware.

However, every requirement we add to testing reduces the number of
times that developer will run the test on their system and potentially
catch a problem during development. Yes, CI helps, but "make check"
gives more coverage. More kernel variants tested in all downstream rpm
%check builds or developer systems. Just like kernel self tests help
today.

glibc uses namespaces in "make check" to increase the number of userspace
and kernel features we can test immediately and easily on developer
*or* distribution build systems.

So the natural extension is to further isolate the testing namespace
using the time namespace to test and verify y2038. If we can't use
namespaces then we'll have to move the tests out to the less
frequently run scripts we use for cross-target toolchain testing,
and so we'll see a 100x drop in coverage.

I expect that more requests for further time isolation will happen
given the utility of this in containers.

If we have to use qemu today then that's where we're at, but again
I expect our use case is representative of more than just glibc.

Does checkpointing work robustly when userspace APIS use 
CLOCK_REALTIME (directly or indirectly) in the container?

-- 
Cheers,
Carlos.

Re: [PATCH] clocksource/drivers/timer-ti-dm: Fix suspend and resume for am3 and am4

2020-07-21 Thread Carlos Hernandez




On 7/17/20 6:29 AM, Daniel Lezcano wrote:

On 13/07/2020 18:26, Tony Lindgren wrote:

Carlos Hernandez  reported that we now have a suspend and
resume regresssion on am3 and am4 compared to the earlier kernels. While
suspend and resume works with v5.8-rc3, we now get errors with rtcwake:

pm33xx pm33xx: PM: Could not transition all powerdomains to target state
...
rtcwake: write error

This is because we now fail to idle the system timer clocks that the
idle code checks and the error gets propagated to the rtcwake.

Turns out there are several issues that need to be fixed:

1. Ignore no-idle and no-reset configured timers for the ti-sysc
interconnect target driver as otherwise it will keep the system timer
clocks enabled

2. Toggle the system timer functional clock for suspend for am3 and am4
(but not for clocksource on am3)

3. Only reconfigure type1 timers in dmtimer_systimer_disable()

4. Use of_machine_is_compatible() instead of of_device_is_compatible()
for checking the SoC type

Fixes: 52762fbd1c47 ("clocksource/drivers/timer-ti-dm: Add clockevent and 
clocksource support")
Reported-by: Carlos Hernandez 
Signed-off-by: Tony Lindgren 
---


Tested-by: Carlos Hernandez 



Carlos, were you able to test this patch ?


--
Carlos

Re: [PATCH] clocksource/drivers/timer-ti-dm: Fix suspend and resume for am3 and am4

2020-07-20 Thread Carlos Hernandez




On 7/17/20 5:34 PM, Carlos Hernandez wrote:



On 7/17/20 6:29 AM, Daniel Lezcano wrote:

On 13/07/2020 18:26, Tony Lindgren wrote:

Carlos Hernandez  reported that we now have a suspend and
resume regresssion on am3 and am4 compared to the earlier kernels. While
suspend and resume works with v5.8-rc3, we now get errors with rtcwake:

pm33xx pm33xx: PM: Could not transition all powerdomains to target state
...
rtcwake: write error

This is because we now fail to idle the system timer clocks that the
idle code checks and the error gets propagated to the rtcwake.

Turns out there are several issues that need to be fixed:

1. Ignore no-idle and no-reset configured timers for the ti-sysc
interconnect target driver as otherwise it will keep the system timer
clocks enabled

2. Toggle the system timer functional clock for suspend for am3 and am4
(but not for clocksource on am3)

3. Only reconfigure type1 timers in dmtimer_systimer_disable()

4. Use of_machine_is_compatible() instead of of_device_is_compatible()
for checking the SoC type

Fixes: 52762fbd1c47 ("clocksource/drivers/timer-ti-dm: Add clockevent and 
clocksource support")
Reported-by: Carlos Hernandez
Signed-off-by: Tony Lindgren
---

Carlos, were you able to test this patch ?


Tested the patch on top of 5.8-rc5.

cbdb2617290d (HEAD) clocksource/drivers/timer-ti-dm: Fix suspend and 
resume for am3 and am4

11ba468877bb (tag: v5.8-rc5) Linux 5.8-rc5

It works on am335x-evm but fails on am437x-evm

am3:

[ 122.541423] PM: suspend entry (deep)
[ 122.545498] Filesystems sync: 0.000 seconds
[ 122.549711] PM: Preparing system for sleep (deep)
[ 122.564217] Freezing user space processes ... (elapsed 0.003 
seconds) done.

[ 122.575110] OOM killer disabled.
[ 122.578370] Freezing remaining freezable tasks ... (elapsed 0.001 
seconds) done.

[ 122.587604] PM: Suspending system (deep)
[ 122.591572] printk: Suspending console(s) (use no_console_suspend to 
debug)

[ 122.735877] cpsw 4a10.ethernet eth0: Link is Down
[ 122.742365] PM: suspend of devices complete after 142.546 msecs
[ 122.742397] PM: start suspend of devices complete after 143.777 msecs
[ 122.748722] PM: late suspend of devices complete after 6.257 msecs
[ 122.754662] PM: noirq suspend of devices complete after 5.632 msecs
[ 122.754689] Disabling non-boot CPUs ...
[ 122.754715] pm33xx pm33xx: PM: Successfully put all powerdomains to 
target state

[ 122.754715] PM: Wakeup source RTC Alarm
[ 122.766169] ti-sysc 4a101200.target-module: OCP softreset timed out
[ 122.769222] PM: noirq resume of devices complete after 14.367 msecs
[ 122.772956] PM: early resume of devices complete after 3.428 msecs
[ 122.775749] cpsw 4a10.ethernet: initializing cpsw version 1.12 (0)
[ 122.857132] Qualcomm Atheros AR8031/AR8033 4a101000.mdio:00: 
attached PHY driver [Qualcomm Atheros AR8031/AR8033] 
(mii_bus:phy_addr=4a101000.mdio:00, irq=POLL)

[ 122.874236] ti-sysc 4802a000.target-module: OCP softreset timed out
[ 122.879559] PM: Timekeeping suspended for 18.212 seconds
[ 122.994120] PM: resume of devices complete after 221.091 msecs
[ 123.101133] PM: Finishing wakeup.
[ 123.104493] OOM killer enabled.
[ 123.107657] Restarting tasks ... done.
[ 123.168294] PM: suspend exit
[ 126.005262] cpsw 4a10.ethernet eth0: Link is Up - 1Gbps/Full - 
flow control off

[ 122.742365] PM: suspend of devices complete after 142.546 msecs


am4:

|TRACE LOG|Inside do_cmd:CMD=echo 1 > 
/sys/kernel/debug/pm_debug/enable_off_mode|
|TRACE LOG|suspend function: power_state: mem|
|TRACE LOG|suspend function: max_stime: 10|
|TRACE LOG|suspend function: max_atime: 5|
|TRACE LOG|suspend function: iterations: 2|
|TRACE LOG|suspend function: usb_remove: 0|
|TRACE LOG|suspend function: usb_module: |
|TRACE LOG|===suspend iteration 0===|
sh1577:1590898691->1590898692(1):: wakeup - 9 sec 0 msec
sh1577:1590898691->1590898692(1):: Use rtc to suspend resume, adding 10 secs to 
suspend time
rtcwake: wakeup from "mem" using /dev/rtc0 at Sun May 31 04:18:33 2020
[  106.401004] PM: suspend entry (deep)
[  106.420151] Filesystems sync: 0.015 seconds
[  106.424598] PM: Preparing system for sleep (deep)
[  106.434312] Freezing user space processes ... (elapsed 0.001 seconds) done.
[  106.442808] OOM killer disabled.
[  106.446152] Freezing remaining freezable tasks ... (elapsed 0.001 seconds) 
done.
[  106.454894] PM: Suspending system (deep)
[  106.458893] printk: Suspending console(s) (use no_console_suspend to debug)
** 1196 printk messages dropped **
[  107.379605] [] (gic_handle_irq) from [] 
(__irq_svc+0x6c/0x90)
** 41 printk messages dropped **
[  107.412635] CPU: 0 PID: 0 Comm: swapper/0 Tainted: GW 
5.8.0-rc5-1-gcbdb2617290d #1
** 37 printk messages dropped **
[  107.429822] [] (handle_irq_event) from [] 
(handle_fasteoi_irq+0xc4/0x188)
** 37 printk messages dropped **
[  107.446915] [] (cpu_idle_poll) from [] 
(do_idle+0x7c/0x2b4)
[  107.446946] [] (do_idle) fro

Re: [PATCH v4 0/2] Syscall User Redirection

2020-07-16 Thread Carlos O'Donell

On 7/16/20 4:30 PM, Gabriel Krisman Bertazi wrote:
> Christian Brauner  writes:
> 
>> On Thu, Jul 16, 2020 at 01:25:43PM -0700, Kees Cook wrote:
>>> On Thu, Jul 16, 2020 at 10:22:34PM +0200, Christian Brauner wrote:
>>>> On Thu, Jul 16, 2020 at 01:04:38PM -0700, Kees Cook wrote:
>>>>> On Thu, Jul 16, 2020 at 03:31:39PM -0400, Gabriel Krisman Bertazi wrote:
>>>>>> This is v4 of Syscall User Redirection.  The implementation itself is
>>>>>> not modified from v3, it only applies the latest round of reviews to the
>>>>>> selftests.
>>>>>>
>>>>>> __NR_syscalls is not really exported in header files other than
>>>>>> asm-generic for every architecture, so it felt safer to optionally
>>>>>> expose it with a fallback to a high value.
>>>>>>
>>>>>> Also, I didn't expose tests for PR_GET as that is not currently
>>>>>> implemented.  If possible, I'd have it supported by a future patchset,
>>>>>> since it is not immediately necessary to support this feature.
>>>>>
>>>>> Thanks! That all looks good to me.
>>>>
>>>> Don't have any problem with this but did this ever get exposure on
>>>> linux-api? This is the first time I see this pop up.
>>>
>>> I thought I'd added it to CC in the past, but that might have been other
>>> recent unrelated threads. Does this need a full repost there too, you
>>> think?
>>
>> Nah, wasn't my intention to force a repost. Seems that several people
>> have looked this over. :) Just curious why it didn't get to linux-api
>> and we know quite some people who only do look at linux-api (for sanity). :)
> 
> That's my mistake.  I didn't think about it when submitting :(
> 
> If this get re-spinned again I will make sure to CC linux-api.

Thank you! It helps C library implementors stay up to date and comment
on changes that impact userspace ABIs and APIs. This patch set was new
to me. Interesting new feature.

-- 
Cheers,
Carlos.

Re: [RFC PATCH 0/4] rseq: Introduce extensible struct rseq

2020-07-16 Thread Carlos O'Donell

On 7/15/20 9:02 AM, Mathieu Desnoyers wrote:
> At this point, the main question I would like answered is whether
> it would be acceptable to increase the size and alignment of
> the __rseq_abi symbol (which will be exposed by glibc) between
> e.g. glibc 2.32 and 2.33. If it's not possible, then we can
> find other solutions, for instance using an indirection with
> a pointer to an extended structure, but this appears to be
> slightly less efficient.

The answer is always a soft "maybe" because it depends exactly
on how we do it and what consequences we are willing to accept
in the design.

For example, static applications that call dlopen will fail if
we increase the alignment beyond 32 because we had to special
case this scenario. Why did we have to special case it? Because
the "static" part of the runtime needs to create the initial
thread's static TLS space, and since it doesn't know apriori
what will be loaded in the shared library, it needs to make a
"best guess" at the alignment requirement at startup.
We need to discuss this and agree that it's OK. We already want
to deprecate dynamic loading from static applications, so this
may not be a problem in general, but I hope you see my point.
That there are corner cases to be considered and ironed out.

I want to see a detailed design document explaining the various
compatibility issues and how we solve them along with the way
the extension mechanism would work and how it would be compliant
with C/C++ language rules in userspace without adding undue burden
of potentially having to use atomic instructions all the time.
This includes discussing how the headers change. We should also
talk out the options for symbol versioning and their consequences.

I haven't seen enough details, and there isn't really enough
time to discuss this. I think it is *great* that we are discussing
it, but it's safest if we revert rseq, finish the discussion,
and then finalize the inclusion for 2.33 with these details
ironed out.

I feel like we've made all the technical process we need to actually
include rseq in glibc, but this discussion, and the google example
(even if it doesn't match our use case) shows that if we spend another
month hammering out the extension details could yield something we
can use for years to come while we work out other details e.g. cpu_opv.

I can set aside time in the next month to write up such a document
and discuss these issues with you and Florian. The text would form
even more of the language we'd have to include in the man page for
the feature.

In the meantime I think we should revert rseq in glibc and take
our time to hash this out without the looming deadline of August 1st
for the ABI going out the door.

I know this is disappointing, but I think in a month you'll look
back at this, we'll have Fedora Rawhide using the new extensible
version (and you'll be able to point people at that), and we'll
only be 5 months away from an official release with extensible
rseq.

Could you please respond to Florian's request to revert here?
https://sourceware.org/pipermail/libc-alpha/2020-July/116368.html

I'm looking for a Signed-off-by from you that you're OK with
reverting.

-- 
Cheers,
Carlos.

Re: [RFC PATCH 2/4] rseq: Allow extending struct rseq

2020-07-14 Thread Carlos O'Donell

On 7/14/20 9:19 AM, Mathieu Desnoyers wrote:
> Is there an arch-agnostic way to get the thread pointer from user-space code 
> ? That
> would be needed by all rseq critical section implementations.

Yes, and no. We have void *__builtin_thread_pointer (void), but
few architectures implement the builtin so we'd have to go through
a round of compiler updates and backports. All targets know how to
access the thread pointer because the compiler has to generate
IE-mode accesses to the TLS variables.

I have filed an enhancement request:
Bug 96200 - Implement __builtin_thread_pointer() and 
__builtin_set_thread_pointer() if TLS is supported
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96200

We have glibc internal macro APIs to access the thread pointer,
but I would rather the compiler handle the access since it can
schedule the resulting sequence better.

On some arches setting the therad pointer needs a syscall or
equivalent operation (hppa), and for some arches there is no
fixed register (arm) hence the need for __builtin_thread_pointer()
to force the compiler to place the pointer into a register for
function return.

-- 
Cheers,
Carlos.

Re: [RFC PATCH 0/4] rseq: Introduce extensible struct rseq

2020-07-14 Thread Carlos O'Donell

On 7/13/20 11:03 PM, Mathieu Desnoyers wrote:
> Recent discussion led to a solution for extending struct rseq. This is
> an implementation of the proposed solution.
> 
> Now is a good time to agree on this scheme before the release of glibc
> 2.32, just in case there are small details to fix on the user-space
> side in order to allow extending struct rseq.

Adding extensibility to the rseq registration process would be great,
but we are out of time for the glibc 2.32 release.

Should we revert rseq for glibc 2.32 and spend quality time discussing
the implications of an extensible design, something that Google already
says they are doing?

We can, with a clear head, and an agreed upon extension mechanism
include rseq in glibc 2.33 (release scheduled for Feburary 1st 2021).
We release time boxed every 6 months, no deviation, so you know when
your next merge window will be.

We have already done the hard work of fixing the nesting signal
handler issues, and glibc integration. If we revert today that will 
also give time for Firefox and Chrome to adjust their sandboxes.

Do you wish to go forward with rseq as we have it in glibc 2.32,
or do you wish to revert rseq from glibc 2.32, discuss the extension
mechanism, and put it back into glibc 2.33 with adjustments?

-- 
Cheers,
Carlos.

Re: [RFC PATCH for 5.8 3/4] rseq: Introduce RSEQ_FLAG_RELIABLE_CPU_ID

2020-07-07 Thread Carlos O'Donell

On 7/7/20 8:06 AM, Mathieu Desnoyers wrote:
> - On Jul 7, 2020, at 7:32 AM, Florian Weimer f...@deneb.enyo.de wrote:
> 
>> * Mathieu Desnoyers:
>>
>>> Those are very good points. One possibility we have would be to let
>>> glibc do the rseq registration without the RSEQ_FLAG_RELIABLE_CPU_ID
>>> flag. On kernels with the bug present, the cpu_id field is still good
>>> enough for typical uses of sched_getcpu() which does not appear to
>>> have a very strict correctness requirement on returning the right
>>> cpu number.
>>>
>>> Then libraries and applications which require a reliable cpu_id
>>> field could check this on their own by calling rseq with the
>>> RSEQ_FLAG_RELIABLE_CPU_ID flag. This would not make the state more
>>> complex in __rseq_abi, and let each rseq user decide about its own
>>> fate: whether it uses rseq or keeps using an rseq-free fallback.
>>>
>>> I am still tempted to allow combining RSEQ_FLAG_REGISTER |
>>> RSEQ_FLAG_RELIABLE_CPU_ID for applications which would not be using
>>> glibc, and want to check this flag on thread registration.
>>
>> Well, you could add a bug fix level field to the __rseq_abi variable.
> 
> Even though I initially planned to make the struct rseq_abi extensible,
> the __rseq_abi variable ends up being fix-sized, so we need to be very
> careful in choosing what we place in the remaining 12 bytes of padding.
> I suspect we'd want to keep 8 bytes to express a pointer to an
> "extended" structure.
> 
> I wonder if a bug fix level "version" is the right approach. We could
> instead have a bitmask of fixes, which the application could independently
> check. For instance, some applications may care about cpu_id field
> reliability, and others not.

I agree with Florian.

Developers are not interested in a bitmask of fixes.

They want a version they can check and that at a given version everything
works as expected.

In reality today this is the kernel version.

So rseq is broken from a developer perspective until they can get a new
kernel or an agreement from their downstream vendor that revision Z of
the kernel they are using has the fix you've just committed.

Essentially this problem solves itself at level higher than the interfaces
we're talking about today.

Encoding this as a bitmask of fixes is an overengineered solution for a
problem that the downstream communities already know how to solve.

I would strongly suggest a "version" or nothing.

>> Then applications could check if the kernel has the appropriate level
>> of non-buggyness.  But the same thing could be useful for many other
>> kernel interfaces, and I haven't seen such a fix level value for them.
>> What makes rseq so special?
> 
> I guess my only answer is because I care as a user of the system call, and
> what is a system call without users ? I have real applications which work
> today with end users deploying them on various kernels, old and new, and I
> want to take advantage of this system call to speed them up. However, if I
> have to choose between speed and correctness (in other words, not crashing
> a critical application), I will choose correctness. So if I cannot detect
> that I can safely use the system call, it becomes pretty much useless even
> for my own use-cases.

Yes.

In the case of RHEL we have *tons*  of users in the same predicament.

They just wait until the RHEL kernel team releases a fixed kernel version
and check for that version and deploy with that version.

Likewise every other user of a kernel. They solve it by asking their
kernel provider, internal or external, to verify the fix is applied to
the deployment kernel.

If they are an ISV they have to test and deploy similar strategies for
multiple kernel version.

By trying to automate this you are encoding into the API some level of
package management concepts e.g. patch level, revision level, etc.

This is difficult to do without a more generalized API. Why do it just
for rseq? Why do it with the few bits you have?

It's not a great fit IMO. Just let the kernel version be the arbiter of
correctness.

>> It won't help with the present bug, but maybe we should add an rseq
>> API sub-call that blocks future rseq registration for the thread.
>> Then we can add a glibc tunable that flips off rseq reliably if people
>> do not want to use it for some reason (and switch to the non-rseq
>> fallback code instead).  But that's going to help with future bugs
>> only.
> 
> I don't think it's needed. All I really need is to have _some_ way to
> let lttng-ust or liburcu query whether the cpu_id field is reliable. This
> state does not have to be made quickly accessible to other libraries,
> nor does it have to be shared between libraries. It would allow each
> user library or application to make its own mind on whether it can use
> rseq or not.

That check is "kernel version > x.y.z" :-)

or

"My kernel vendor told me it's fixed."

-- 
Cheers,
Carlos.

Re: [PATCH 1/3] glibc: Perform rseq registration at C startup and thread creation (v22)

2020-07-03 Thread Carlos O'Donell

On 7/2/20 10:46 AM, Florian Weimer wrote:
> * Mathieu Desnoyers via Libc-alpha:
> 
>> Register rseq TLS for each thread (including main), and unregister for
>> each thread (excluding main).  "rseq" stands for Restartable Sequences.
>>
>> See the rseq(2) man page proposed here:
>>   https://lkml.org/lkml/2018/9/19/647
>>
>> Those are based on glibc master branch commit 3ee1e0ec5c.
>> The rseq system call was merged into Linux 4.18.
>>
>> The TLS_STATIC_SURPLUS define is increased to leave additional room for
>> dlopen'd initial-exec TLS, which keeps elf/tst-auditmany working.
>>
>> The increase (76 bytes) is larger than 32 bytes because it has not been
>> increased in quite a while.  The cost in terms of additional TLS storage
>> is quite significant, but it will also obscure some initial-exec-related
>> dlopen failures.
> 
> We need another change to get this working on most non-x86
> architectures:
> 
> diff --git a/elf/dl-tls.c b/elf/dl-tls.c
> index 817bcbbf59..ca13778ca9 100644
> --- a/elf/dl-tls.c
> +++ b/elf/dl-tls.c
> @@ -134,6 +134,12 @@ void
>  _dl_determine_tlsoffset (void)
>  {
>size_t max_align = TLS_TCB_ALIGN;
> +  /* libc.so with rseq has TLS with 32-byte alignment.  Since TLS is
> + initialized before audit modules are loaded and slotinfo
> + information is available, this is not taken into account below in
> + the audit case.  */
> +  max_align = MAX (max_align, 32U);
> +
>size_t freetop = 0;
>size_t freebottom = 0;
> 
> This isn't visible on x86-64 because TLS_TCB_ALIGN is already 64 there.
> 
> I plan to re-test with this fix and push the series.
> 
> Carlos, is it okay if I fold in the dl-tls.c change if testing looks
> good?

I have reviewed the above and I think it is the correct *pragmatic* fix.

The reality is that to fix this fully you must use a two stage loading
process to pre-examine all audit modules *before* setting the fundamental
alignment of the TCB.  This isn't easy with the current loader framework.
Therefore the above is a good pragmatic solution.

There is always going to be a bit of a chicken and an egg situation.
We want to provide a fundamental alignment requirement but we haven't
yet seen all the requirements on alignment. So the best we could do is
look over DT_NEEDED, DT_AUDIT, LD_PRELOAD, etc. get the best answer
and then fail any subsequent dlopen's that load objects with higher
fundamental requirements for alignment of the TCB.

The audit modules are problematic becuase they are loaded *before*
anything else is loaded, *before* we've examined any of the actual
objects we're about to load because they can influence the search
paths. Again, this means the above solution is a perfect pragmatic
choice. The real solution is to rearchitect the early audit module
loading into two stages and that's work we can do later.

OK with the above change.

-- 
Cheers,
Carlos.

Re: [PATCH] futex: send SIGBUS if argument is not aligned on a four-byte boundary

2020-05-19 Thread Carlos O'Donell

On Fri, May 15, 2020 at 1:50 PM Carlos O'Donell  wrote:
> This isn't fixed because this is the older code in pthread_mutex_lock
> which we haven't ported to futex-internal.h yet, otherwise we would abort
> the process.

I filed this upstream as a QoI issue so I don't forget to port the existing code
to the newer internal interfaces for futex handling.

"Bug 25997 - pthread_mutex_lock QoI issue for unaligned futex."
https://sourceware.org/bugzilla/show_bug.cgi?id=25997

I checked that -Wcast-align=strict does warn about this case, but it's
rarely used
in production code that I've worked with. I'm following up with the
compiler people
to see if we can consistently warn in these cases and so avoid this kind of code
existing in the first place.

Cheers,
Carlos.

Re: [PATCH] futex: send SIGBUS if argument is not aligned on a four-byte boundary

2020-05-15 Thread Carlos O'Donell

 case that we would want to support.
To get efficient cache-line usage you'll want to pack things by hand.

>> This patch sends SIGBUS signal to slay task and break busy-loop.
>>
>> Signed-off-by: Konstantin Khlebnikov 
>> Reported-by: Maxim Samoylov 
> 
> Seems like a sensible idea to me.

Please do try to update the linux kernel man pages update to note
the change in behaviour and the version and commit of the released
kernel where this changed.

Please keep `man 2 futex` as accurate as possible for userspace
libc implementations.

Thanks.
 
> Acked-by: Peter Zijlstra (Intel) 
> 
>> ---
>>  kernel/futex.c |   13 -
>>  1 file changed, 12 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/futex.c b/kernel/futex.c
>> index b59532862bc0..8a6d35fa56bc 100644
>> --- a/kernel/futex.c
>> +++ b/kernel/futex.c
>> @@ -508,10 +508,21 @@ get_futex_key(u32 __user *uaddr, int fshared, union 
>> futex_key *key, enum futex_a
>>  
>>  /*
>>   * The futex address must be "naturally" aligned.
>> + * Also send signal to break busy-loop if user-space ignore error.
>> + * EFAULT case should trigger SIGSEGV at access from user-space.
>>   */
>>  key->both.offset = address % PAGE_SIZE;
>> -if (unlikely((address % sizeof(u32)) != 0))
>> +if (unlikely((address % sizeof(u32)) != 0)) {
>> +struct kernel_siginfo info;
>> +
>> +clear_siginfo(&info);
>> +info.si_signo = SIGBUS;
>> +info.si_code  = BUS_ADRALN;
>> +info.si_addr  = uaddr;
>> +force_sig_info(&info);
>> +
>>  return -EINVAL;
>> +}
>>  address -= key->both.offset;
>>  
>>  if (unlikely(!access_ok(uaddr, sizeof(u32
>>
> 

-- 
Cheers,
Carlos.

[PATCH v5] media: vimc: get pixformat info from v4l2_format_info

2020-04-29 Thread Carlos E. C. Barbosa

There is overlapping code over two distinct lists. This repurposes
vimc_pix_map for mapping formats and remaps the calls to the matching
v4l2_format_info.

Signed-off-by: Carlos E. C. Barbosa 

---

Changes in v5:
- .bayer member was removed and replaced by v4l2 functions calls

Changes in v4:
- Unused variables were removed

Changes in v3:
- Change declaration order of variables and some minor style changes

Changes in v2:
- Const qualifiers are not removed
- Bayer flag is kept
- Unnecessary changes are not made anymore

v4l2-compliance -m /dev/media0 output:
https://pastebin.com/VQV4vrTW

---
---
 .../media/test-drivers/vimc/vimc-capture.c| 16 ---
 drivers/media/test-drivers/vimc/vimc-common.c | 46 ---
 drivers/media/test-drivers/vimc/vimc-common.h |  4 --
 .../media/test-drivers/vimc/vimc-debayer.c|  7 ++-
 drivers/media/test-drivers/vimc/vimc-scaler.c | 18 ++--
 drivers/media/test-drivers/vimc/vimc-sensor.c |  9 +++-
 6 files changed, 35 insertions(+), 65 deletions(-)

diff --git a/drivers/media/test-drivers/vimc/vimc-capture.c 
b/drivers/media/test-drivers/vimc/vimc-capture.c
index 5315c201314c..3c280a642ec7 100644
--- a/drivers/media/test-drivers/vimc/vimc-capture.c
+++ b/drivers/media/test-drivers/vimc/vimc-capture.c
@@ -85,6 +85,7 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, void 
*priv,
struct v4l2_format *f)
 {
struct v4l2_pix_format *format = &f->fmt.pix;
+   const struct v4l2_format_info *vinfo;
const struct vimc_pix_map *vpix;
 
format->width = clamp_t(u32, format->width, VIMC_FRAME_MIN_WIDTH,
@@ -94,12 +95,13 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, void 
*priv,
 
/* Don't accept a pixelformat that is not on the table */
vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
-   if (!vpix) {
+   if (!vpix)
format->pixelformat = fmt_default.pixelformat;
-   vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
-   }
+
+   vinfo = v4l2_format_info(format->pixelformat);
+
/* TODO: Add support for custom bytesperline values */
-   format->bytesperline = format->width * vpix->bpp;
+   format->bytesperline = format->width * vinfo->bpp[0];
format->sizeimage = format->bytesperline * format->height;
 
if (format->field == V4L2_FIELD_ANY)
@@ -386,7 +388,7 @@ static struct vimc_ent_device *vimc_cap_add(struct 
vimc_device *vimc,
const char *vcfg_name)
 {
struct v4l2_device *v4l2_dev = &vimc->v4l2_dev;
-   const struct vimc_pix_map *vpix;
+   const struct v4l2_format_info *vinfo;
struct vimc_cap_device *vcap;
struct video_device *vdev;
struct vb2_queue *q;
@@ -434,8 +436,8 @@ static struct vimc_ent_device *vimc_cap_add(struct 
vimc_device *vimc,
 
/* Set default frame format */
vcap->format = fmt_default;
-   vpix = vimc_pix_map_by_pixelformat(vcap->format.pixelformat);
-   vcap->format.bytesperline = vcap->format.width * vpix->bpp;
+   vinfo = v4l2_format_info(vcap->format.pixelformat);
+   vcap->format.bytesperline = vcap->format.width * vinfo->bpp[0];
vcap->format.sizeimage = vcap->format.bytesperline *
 vcap->format.height;
 
diff --git a/drivers/media/test-drivers/vimc/vimc-common.c 
b/drivers/media/test-drivers/vimc/vimc-common.c
index c95c17c048f2..00f0e301b4e3 100644
--- a/drivers/media/test-drivers/vimc/vimc-common.c
+++ b/drivers/media/test-drivers/vimc/vimc-common.c
@@ -21,146 +21,100 @@ static const struct vimc_pix_map vimc_pix_map_list[] = {
{
.code = MEDIA_BUS_FMT_BGR888_1X24,
.pixelformat = V4L2_PIX_FMT_BGR24,
-   .bpp = 3,
-   .bayer = false,
},
{
.code = MEDIA_BUS_FMT_RGB888_1X24,
.pixelformat = V4L2_PIX_FMT_RGB24,
-   .bpp = 3,
-   .bayer = false,
},
{
.code = MEDIA_BUS_FMT_ARGB_1X32,
.pixelformat = V4L2_PIX_FMT_ARGB32,
-   .bpp = 4,
-   .bayer = false,
},
 
/* Bayer formats */
{
.code = MEDIA_BUS_FMT_SBGGR8_1X8,
.pixelformat = V4L2_PIX_FMT_SBGGR8,
-   .bpp = 1,
-   .bayer = true,
},
{
.code = MEDIA_BUS_FMT_SGBRG8_1X8,
.pixelformat = V4L2_PIX_FMT_SGBRG8,
-   .bpp = 1,
-   .bayer = true,
},
{
.code = MEDIA_BUS_FMT_SGRBG8_1X8,
.pixelformat = V4L2_PIX_FMT_SGRBG8,
-   .bpp = 1,
-   .bayer = true,
},
{
.code = MEDIA_BUS_FMT_SRGGB8_1X8,

[PATCH] media: vimc: get pixformat info from v4l2_format_info to avoid code repetition

2019-10-15 Thread Carlos E. C. Barbosa

From: "Carlos E.C. Barbosa" 

There is overlapping code over two distinct lists. This repurposes one
of vimc_pix_map for strictly mapping formats and remaps other calls to
the matching v4l2_format_info.

---

Change in v2:
- Change commit message
- Remove struct with mbus code and pointer to v4l2_format_info
- The map and info are directly called when strictly needed

Signed-off-by: Carlos E. C. Barbosa 
---
 drivers/media/platform/vimc/vimc-capture.c | 14 ++--
 drivers/media/platform/vimc/vimc-common.c  | 78 ++
 drivers/media/platform/vimc/vimc-common.h  |  9 +--
 drivers/media/platform/vimc/vimc-debayer.c |  9 ++-
 drivers/media/platform/vimc/vimc-scaler.c  | 39 +--
 drivers/media/platform/vimc/vimc-sensor.c  | 26 +---
 6 files changed, 89 insertions(+), 86 deletions(-)

diff --git a/drivers/media/platform/vimc/vimc-capture.c 
b/drivers/media/platform/vimc/vimc-capture.c
index 602f80323031..65282d4d8a1e 100644
--- a/drivers/media/platform/vimc/vimc-capture.c
+++ b/drivers/media/platform/vimc/vimc-capture.c
@@ -85,7 +85,8 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, void 
*priv,
struct v4l2_format *f)
 {
struct v4l2_pix_format *format = &f->fmt.pix;
-   const struct vimc_pix_map *vpix;
+   struct vimc_pix_map *vpix;
+   struct v4l2_format_info *vinfo;
 
format->width = clamp_t(u32, format->width, VIMC_FRAME_MIN_WIDTH,
VIMC_FRAME_MAX_WIDTH) & ~1;
@@ -99,7 +100,8 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, void 
*priv,
vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
}
/* TODO: Add support for custom bytesperline values */
-   format->bytesperline = format->width * vpix->bpp;
+   vinfo = v4l2_format_info(vpix->pixelformat);
+   format->bytesperline = format->width * vinfo->bpp[0];
format->sizeimage = format->bytesperline * format->height;
 
if (format->field == V4L2_FIELD_ANY)
@@ -159,7 +161,7 @@ static int vimc_cap_enum_fmt_vid_cap(struct file *file, 
void *priv,
 static int vimc_cap_enum_framesizes(struct file *file, void *fh,
struct v4l2_frmsizeenum *fsize)
 {
-   const struct vimc_pix_map *vpix;
+   struct vimc_pix_map *vpix;
 
if (fsize->index)
return -EINVAL;
@@ -387,7 +389,8 @@ struct vimc_ent_device *vimc_cap_add(struct vimc_device 
*vimc,
 const char *vcfg_name)
 {
struct v4l2_device *v4l2_dev = &vimc->v4l2_dev;
-   const struct vimc_pix_map *vpix;
+   struct vimc_pix_map *vpix;
+   struct v4l2_format_info *vinfo;
struct vimc_cap_device *vcap;
struct video_device *vdev;
struct vb2_queue *q;
@@ -443,7 +446,8 @@ struct vimc_ent_device *vimc_cap_add(struct vimc_device 
*vimc,
/* Set default frame format */
vcap->format = fmt_default;
vpix = vimc_pix_map_by_pixelformat(vcap->format.pixelformat);
-   vcap->format.bytesperline = vcap->format.width * vpix->bpp;
+   vinfo = v4l2_format_info(vpix->pixelformat);
+   vcap->format.bytesperline = vcap->format.width * vinfo->bpp[0];
vcap->format.sizeimage = vcap->format.bytesperline *
 vcap->format.height;
 
diff --git a/drivers/media/platform/vimc/vimc-common.c 
b/drivers/media/platform/vimc/vimc-common.c
index a3120f4f7a90..73feea089921 100644
--- a/drivers/media/platform/vimc/vimc-common.c
+++ b/drivers/media/platform/vimc/vimc-common.c
@@ -14,185 +14,151 @@
  * NOTE: non-bayer formats need to come first (necessary for enum_mbus_code
  * in the scaler)
  */
-static const struct vimc_pix_map vimc_pix_map_list[] = {
+static struct vimc_pix_map vimc_pix_map_list[] = {
/* TODO: add all missing formats */
 
/* RGB formats */
{
.code = MEDIA_BUS_FMT_BGR888_1X24,
.pixelformat = V4L2_PIX_FMT_BGR24,
-   .bpp = 3,
-   .bayer = false,
},
{
.code = MEDIA_BUS_FMT_RGB888_1X24,
.pixelformat = V4L2_PIX_FMT_RGB24,
-   .bpp = 3,
-   .bayer = false,
},
{
.code = MEDIA_BUS_FMT_ARGB_1X32,
.pixelformat = V4L2_PIX_FMT_ARGB32,
-   .bpp = 4,
-   .bayer = false,
},
 
/* Bayer formats */
{
.code = MEDIA_BUS_FMT_SBGGR8_1X8,
.pixelformat = V4L2_PIX_FMT_SBGGR8,
-   .bpp = 1,
-   .bayer = true,
},
{
.code = MEDIA_BUS_FMT_SGBRG8_1X8,
.pixelformat = V4L2_PIX_FMT_SGBRG8,
-   .bpp = 1,
-   .bayer = true,
},
{
.code = MEDIA_BUS_FMT_SGRBG8_1X8

[PATCH] media: vimc: get pixformat info from v4l2_format_info to avoid code repetition

2019-10-05 Thread Carlos E. C. Barbosa

From: "Carlos E.C. Barbosa" 

As the info found in vim_pix_map members are already available in
v4l2_format_info those were removed and their calls remapped to it.

Signed-off-by: Carlos E. C. Barbosa 
---
 drivers/media/platform/vimc/vimc-capture.c |  20 ++--
 drivers/media/platform/vimc/vimc-common.c  | 107 +
 drivers/media/platform/vimc/vimc-common.h  |  27 --
 drivers/media/platform/vimc/vimc-debayer.c |   6 +-
 drivers/media/platform/vimc/vimc-scaler.c  |  26 +++--
 drivers/media/platform/vimc/vimc-sensor.c  |  25 ++---
 6 files changed, 105 insertions(+), 106 deletions(-)

diff --git a/drivers/media/platform/vimc/vimc-capture.c 
b/drivers/media/platform/vimc/vimc-capture.c
index 602f80323031..8be2f81d5da3 100644
--- a/drivers/media/platform/vimc/vimc-capture.c
+++ b/drivers/media/platform/vimc/vimc-capture.c
@@ -85,7 +85,7 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, void 
*priv,
struct v4l2_format *f)
 {
struct v4l2_pix_format *format = &f->fmt.pix;
-   const struct vimc_pix_map *vpix;
+   struct vimc_pix_map vpix;
 
format->width = clamp_t(u32, format->width, VIMC_FRAME_MIN_WIDTH,
VIMC_FRAME_MAX_WIDTH) & ~1;
@@ -94,12 +94,12 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, void 
*priv,
 
/* Don't accept a pixelformat that is not on the table */
vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
-   if (!vpix) {
+   if (!vpix.info) {
format->pixelformat = fmt_default.pixelformat;
vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
}
/* TODO: Add support for custom bytesperline values */
-   format->bytesperline = format->width * vpix->bpp;
+   format->bytesperline = format->width * vpix.info->bpp[0];
format->sizeimage = format->bytesperline * format->height;
 
if (format->field == V4L2_FIELD_ANY)
@@ -146,12 +146,12 @@ static int vimc_cap_s_fmt_vid_cap(struct file *file, void 
*priv,
 static int vimc_cap_enum_fmt_vid_cap(struct file *file, void *priv,
 struct v4l2_fmtdesc *f)
 {
-   const struct vimc_pix_map *vpix = vimc_pix_map_by_index(f->index);
+   const struct vimc_pix_map vpix = vimc_pix_map_by_index(f->index);
 
-   if (!vpix)
+   if (!vpix.info)
return -EINVAL;
 
-   f->pixelformat = vpix->pixelformat;
+   f->pixelformat = vpix.info->format;
 
return 0;
 }
@@ -159,14 +159,14 @@ static int vimc_cap_enum_fmt_vid_cap(struct file *file, 
void *priv,
 static int vimc_cap_enum_framesizes(struct file *file, void *fh,
struct v4l2_frmsizeenum *fsize)
 {
-   const struct vimc_pix_map *vpix;
+   struct vimc_pix_map vpix;
 
if (fsize->index)
return -EINVAL;
 
/* Only accept code in the pix map table */
vpix = vimc_pix_map_by_code(fsize->pixel_format);
-   if (!vpix)
+   if (!vpix.info)
return -EINVAL;
 
fsize->type = V4L2_FRMSIZE_TYPE_CONTINUOUS;
@@ -387,7 +387,7 @@ struct vimc_ent_device *vimc_cap_add(struct vimc_device 
*vimc,
 const char *vcfg_name)
 {
struct v4l2_device *v4l2_dev = &vimc->v4l2_dev;
-   const struct vimc_pix_map *vpix;
+   struct vimc_pix_map vpix;
struct vimc_cap_device *vcap;
struct video_device *vdev;
struct vb2_queue *q;
@@ -443,7 +443,7 @@ struct vimc_ent_device *vimc_cap_add(struct vimc_device 
*vimc,
/* Set default frame format */
vcap->format = fmt_default;
vpix = vimc_pix_map_by_pixelformat(vcap->format.pixelformat);
-   vcap->format.bytesperline = vcap->format.width * vpix->bpp;
+   vcap->format.bytesperline = vcap->format.width * vpix.info->bpp[0];
vcap->format.sizeimage = vcap->format.bytesperline *
 vcap->format.height;
 
diff --git a/drivers/media/platform/vimc/vimc-common.c 
b/drivers/media/platform/vimc/vimc-common.c
index a3120f4f7a90..9ea698810e1a 100644
--- a/drivers/media/platform/vimc/vimc-common.c
+++ b/drivers/media/platform/vimc/vimc-common.c
@@ -14,186 +14,167 @@
  * NOTE: non-bayer formats need to come first (necessary for enum_mbus_code
  * in the scaler)
  */
-static const struct vimc_pix_map vimc_pix_map_list[] = {
+static struct vimc_pix_map_fmt vimc_pix_map_fmt_list[] = {
/* TODO: add all missing formats */
 
/* RGB formats */
{
.code = MEDIA_BUS_FMT_BGR888_1X24,
.pixelformat = V4L2_PIX_FMT_BGR24,
-   .bpp = 3,
-   .bayer = false,
},
{
.code = MEDIA_BUS_FMT_RGB888_1X24,
.pixelformat = V4L2_PIX_FMT

[PATCH 2/2] media: vimc: include pointer to info in struct vimc_pix_map

2019-10-03 Thread Carlos Eduardo Clímaco Barbosa

Change struct vimc_pix_map and add vimc_pix_map_fmt to clean up and
increase readability
---
 drivers/media/platform/vimc/vimc-capture.c | 24 -
 drivers/media/platform/vimc/vimc-common.c  | 61 --
 drivers/media/platform/vimc/vimc-common.h  | 25 ++---
 drivers/media/platform/vimc/vimc-debayer.c |  8 ++-
 drivers/media/platform/vimc/vimc-scaler.c  | 26 -
 drivers/media/platform/vimc/vimc-sensor.c  | 29 +-
 6 files changed, 100 insertions(+), 73 deletions(-)

diff --git a/drivers/media/platform/vimc/vimc-capture.c
b/drivers/media/platform/vimc/vimc-capture.c
index 49bc8c133980..42719605f9d1 100644
--- a/drivers/media/platform/vimc/vimc-capture.c
+++ b/drivers/media/platform/vimc/vimc-capture.c
@@ -91,8 +91,7 @@ static int vimc_cap_try_fmt_vid_cap(struct file
*file, void *priv,
 struct v4l2_format *f)
 {
 struct v4l2_pix_format *format = &f->fmt.pix;
-const struct vimc_pix_map *vpix;
-const struct v4l2_format_info *vinfo;
+struct vimc_pix_map vpix;

 format->width = clamp_t(u32, format->width, VIMC_FRAME_MIN_WIDTH,
 VIMC_FRAME_MAX_WIDTH) & ~1;
@@ -101,13 +100,12 @@ static int vimc_cap_try_fmt_vid_cap(struct file
*file, void *priv,

 /* Don't accept a pixelformat that is not on the table */
 vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
-if (!vpix) {
+if (!vpix.info) {
 format->pixelformat = fmt_default.pixelformat;
 vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
 }
-vinfo = v4l2_format_info(format->pixelformat);
 /* TODO: Add support for custom bytesperline values */
-format->bytesperline = format->width * vinfo->bpp[0];
+format->bytesperline = format->width * vpix.info->bpp[0];
 format->sizeimage = format->bytesperline * format->height;

 if (format->field == V4L2_FIELD_ANY)
@@ -154,12 +152,12 @@ static int vimc_cap_s_fmt_vid_cap(struct file
*file, void *priv,
 static int vimc_cap_enum_fmt_vid_cap(struct file *file, void *priv,
  struct v4l2_fmtdesc *f)
 {
-const struct vimc_pix_map *vpix = vimc_pix_map_by_index(f->index);
+const struct vimc_pix_map vpix = vimc_pix_map_by_index(f->index);

-if (!vpix)
+if (!vpix.info)
 return -EINVAL;

-f->pixelformat = vpix->pixelformat;
+f->pixelformat = vpix.info->format;

 return 0;
 }
@@ -167,14 +165,14 @@ static int vimc_cap_enum_fmt_vid_cap(struct file
*file, void *priv,
 static int vimc_cap_enum_framesizes(struct file *file, void *fh,
 struct v4l2_frmsizeenum *fsize)
 {
-const struct vimc_pix_map *vpix;
+struct vimc_pix_map vpix;

 if (fsize->index)
 return -EINVAL;

 /* Only accept code in the pix map table */
 vpix = vimc_pix_map_by_code(fsize->pixel_format);
-if (!vpix)
+if (!vpix.info)
 return -EINVAL;

 fsize->type = V4L2_FRMSIZE_TYPE_CONTINUOUS;
@@ -398,8 +396,7 @@ static int vimc_cap_comp_bind(struct device *comp,
struct device *master,
 {
 struct v4l2_device *v4l2_dev = master_data;
 struct vimc_platform_data *pdata = comp->platform_data;
-const struct vimc_pix_map *vpix;
-const struct v4l2_format_info *vinfo;
+struct vimc_pix_map vpix;
 struct vimc_cap_device *vcap;
 struct video_device *vdev;
 struct vb2_queue *q;
@@ -455,8 +452,7 @@ static int vimc_cap_comp_bind(struct device *comp,
struct device *master,
 /* Set default frame format */
 vcap->format = fmt_default;
 vpix = vimc_pix_map_by_pixelformat(vcap->format.pixelformat);
-vinfo = v4l2_format_info(vpix->pixelformat);
-vcap->format.bytesperline = vcap->format.width * vinfo->bpp[0];
+vcap->format.bytesperline = vcap->format.width * vpix.info->bpp[0];
 vcap->format.sizeimage = vcap->format.bytesperline *
  vcap->format.height;

diff --git a/drivers/media/platform/vimc/vimc-common.c
b/drivers/media/platform/vimc/vimc-common.c
index 2d1205f0a600..e807734bb373 100644
--- a/drivers/media/platform/vimc/vimc-common.c
+++ b/drivers/media/platform/vimc/vimc-common.c
@@ -14,7 +14,7 @@
  * NOTE: non-bayer formats need to come first (necessary for enum_mbus_code
  * in the scaler)
  */
-static const struct vimc_pix_map vimc_pix_map_list[] = {
+static struct vimc_pix_map_fmt vimc_pix_map_fmt_list[] = {
 /* TODO: add all missing formats */

 /* RGB formats */
@@ -118,36 +118,63 @@ static const struct vimc_pix_map vimc_pix_map_list[] = {
 },
 };

-const struct vimc_pix_map *vimc_pix_map_by_index(unsigned int i)
+struct vimc_pix_map vimc_pix_map_fmt_info(const struct vimc_pix_map_fmt *vfmt)
 {
-if (i >= ARRAY_SIZE(vimc_pix_map_list))
-return NULL;

-return &vimc_pix_map_list[i];
+struct vimc_pix_map vpix = {
+.code = vfmt->code,
+.info = v4l2_format_info(vfmt->pixelformat),
+};
+return vpix;
+}
+
+const struct vimc_pix_map vimc_pix_map_by_index(unsigned int i)
+{
+struct vimc_pix_map

[PATCH 1/2] media: vimc: get pixformat info from v4l2_format_info to avoid code repetition

2019-10-03 Thread Carlos Eduardo Clímaco Barbosa

As the info found in vim_pix_map members are already available in
v4l2_format_info so those were removed to optimize the code and their
calls were remapped to matching ones.
---
 drivers/media/platform/vimc/vimc-capture.c |  8 +++-
 drivers/media/platform/vimc/vimc-common.c  | 46 --
 drivers/media/platform/vimc/vimc-common.h  |  2 -
 drivers/media/platform/vimc/vimc-debayer.c |  6 ++-
 drivers/media/platform/vimc/vimc-scaler.c  | 20 +++---
 drivers/media/platform/vimc/vimc-sensor.c  |  8 +++-
 6 files changed, 31 insertions(+), 59 deletions(-)

diff --git a/drivers/media/platform/vimc/vimc-capture.c
b/drivers/media/platform/vimc/vimc-capture.c
index 1d56b91830ba..49bc8c133980 100644
--- a/drivers/media/platform/vimc/vimc-capture.c
+++ b/drivers/media/platform/vimc/vimc-capture.c
@@ -92,6 +92,7 @@ static int vimc_cap_try_fmt_vid_cap(struct file
*file, void *priv,
 {
 struct v4l2_pix_format *format = &f->fmt.pix;
 const struct vimc_pix_map *vpix;
+const struct v4l2_format_info *vinfo;

 format->width = clamp_t(u32, format->width, VIMC_FRAME_MIN_WIDTH,
 VIMC_FRAME_MAX_WIDTH) & ~1;
@@ -104,8 +105,9 @@ static int vimc_cap_try_fmt_vid_cap(struct file
*file, void *priv,
 format->pixelformat = fmt_default.pixelformat;
 vpix = vimc_pix_map_by_pixelformat(format->pixelformat);
 }
+vinfo = v4l2_format_info(format->pixelformat);
 /* TODO: Add support for custom bytesperline values */
-format->bytesperline = format->width * vpix->bpp;
+format->bytesperline = format->width * vinfo->bpp[0];
 format->sizeimage = format->bytesperline * format->height;

 if (format->field == V4L2_FIELD_ANY)
@@ -397,6 +399,7 @@ static int vimc_cap_comp_bind(struct device *comp,
struct device *master,
 struct v4l2_device *v4l2_dev = master_data;
 struct vimc_platform_data *pdata = comp->platform_data;
 const struct vimc_pix_map *vpix;
+const struct v4l2_format_info *vinfo;
 struct vimc_cap_device *vcap;
 struct video_device *vdev;
 struct vb2_queue *q;
@@ -452,7 +455,8 @@ static int vimc_cap_comp_bind(struct device *comp,
struct device *master,
 /* Set default frame format */
 vcap->format = fmt_default;
 vpix = vimc_pix_map_by_pixelformat(vcap->format.pixelformat);
-vcap->format.bytesperline = vcap->format.width * vpix->bpp;
+vinfo = v4l2_format_info(vpix->pixelformat);
+vcap->format.bytesperline = vcap->format.width * vinfo->bpp[0];
 vcap->format.sizeimage = vcap->format.bytesperline *
  vcap->format.height;

diff --git a/drivers/media/platform/vimc/vimc-common.c
b/drivers/media/platform/vimc/vimc-common.c
index 7e1ae0b12f1e..2d1205f0a600 100644
--- a/drivers/media/platform/vimc/vimc-common.c
+++ b/drivers/media/platform/vimc/vimc-common.c
@@ -21,146 +21,100 @@ static const struct vimc_pix_map vimc_pix_map_list[] = {
 {
 .code = MEDIA_BUS_FMT_BGR888_1X24,
 .pixelformat = V4L2_PIX_FMT_BGR24,
-.bpp = 3,
-.bayer = false,
 },
 {
 .code = MEDIA_BUS_FMT_RGB888_1X24,
 .pixelformat = V4L2_PIX_FMT_RGB24,
-.bpp = 3,
-.bayer = false,
 },
 {
 .code = MEDIA_BUS_FMT_ARGB_1X32,
 .pixelformat = V4L2_PIX_FMT_ARGB32,
-.bpp = 4,
-.bayer = false,
 },

 /* Bayer formats */
 {
 .code = MEDIA_BUS_FMT_SBGGR8_1X8,
 .pixelformat = V4L2_PIX_FMT_SBGGR8,
-.bpp = 1,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SGBRG8_1X8,
 .pixelformat = V4L2_PIX_FMT_SGBRG8,
-.bpp = 1,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SGRBG8_1X8,
 .pixelformat = V4L2_PIX_FMT_SGRBG8,
-.bpp = 1,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SRGGB8_1X8,
 .pixelformat = V4L2_PIX_FMT_SRGGB8,
-.bpp = 1,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SBGGR10_1X10,
 .pixelformat = V4L2_PIX_FMT_SBGGR10,
-.bpp = 2,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SGBRG10_1X10,
 .pixelformat = V4L2_PIX_FMT_SGBRG10,
-.bpp = 2,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SGRBG10_1X10,
 .pixelformat = V4L2_PIX_FMT_SGRBG10,
-.bpp = 2,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SRGGB10_1X10,
 .pixelformat = V4L2_PIX_FMT_SRGGB10,
-.bpp = 2,
-.bayer = true,
 },

 /* 10bit raw bayer a-law compressed to 8 bits */
 {
 .code = MEDIA_BUS_FMT_SBGGR10_ALAW8_1X8,
 .pixelformat = V4L2_PIX_FMT_SBGGR10ALAW8,
-.bpp = 1,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SGBRG10_ALAW8_1X8,
 .pixelformat = V4L2_PIX_FMT_SGBRG10ALAW8,
-.bpp = 1,
-.bayer = true,
 },
 {
 .code = MEDIA_BUS_FMT_SGRBG10_ALAW8_1X8,
 .pixel

Re: [PATCH glibc 2.31 1/5] glibc: Perform rseq(2) registration at C startup and thread creation (v12)

2019-09-11 Thread Carlos O'Donell

On 9/11/19 3:08 PM, Florian Weimer wrote:
> * Carlos O'Donell:
> 
>> It would be easier to merge the patch set if it were just an unconditional
>> registration like we do for set_robust_list().
> 
> Note that this depends on the in-tree system call numbers list, which I
> still need to finish according to Joseph's specifications.

Which work is this? Do you have a URL reference to WIP?

> (We have something that should work for us as long as we can get large
> machines from the lab, but I agree that it's not very useful if glibc
> bot-cycle time is roughly one business day.)

Yeah, we have to discuss how to accelerate this.

-- 
Cheers,
Carlos.

Re: [PATCH glibc 2.31 1/5] glibc: Perform rseq(2) registration at C startup and thread creation (v12)

2019-09-11 Thread Carlos O'Donell

On 9/11/19 2:26 PM, Florian Weimer wrote:
> * Mathieu Desnoyers:
> 
>> +#ifdef SHARED
>> +  if (rtld_active ())
>> +{
>> +  /* Register rseq ABI to the kernel.   */
>> +  (void) rseq_register_current_thread ();
>> +}
>> +#else
> 
> I think this will need *another* check for the inner libc in an audit
> module.  See what we do in malloc.  __libc_multiple_libcs is supposed to
> cover that, but it's unfortunately not reliable.
> 
> I believe without that additional check, the first audit module we load
> will claim rseq, and the main program will not have control over the
> rseq area.  Reversed roles would be desirable here.
> 
> In October, I hope to fix __libc_multiple_libcs, and then you can use
> just that.  (We have a Fedora patch that is supposed to fix it, I need
> to documen the mechanism and upstream it.)

This is a technical issue we can resolve.

>> +/* Advertise Restartable Sequences registration ownership across
>> +   application and shared libraries.
>> +
>> +   Libraries and applications must check whether this variable is zero or
>> +   non-zero if they wish to perform rseq registration on their own. If it
>> +   is zero, it means restartable sequence registration is not handled, and
>> +   the library or application is free to perform rseq registration. In
>> +   that case, the library or application is taking ownership of rseq
>> +   registration, and may set __rseq_handled to 1. It may then set it back
>> +   to 0 after it completes unregistering rseq.
>> +
>> +   If __rseq_handled is found to be non-zero, it means that another
>> +   library (or the application) is currently handling rseq registration.
>> +
>> +   Typical use of __rseq_handled is within library constructors and
>> +   destructors, or at program startup.  */
>> +
>> +int __rseq_handled;
> 
> Are there any programs that use __rseq_handled *today*?
> 
> I'm less convinced that we actually need this.  I don't think we have
> ever done anything like that before, and I don't think it's necessary.
> Any secondary rseq library just needs to note if it could perform
> registration, and if it failed to do so, do not perform unregistration
> in a pthread destructor callback.
> 
> Sure, there's the matter of pthread destructor ordering, but that
> problem is not different from any other singleton (thread-local or not),
> and the fix for the last time this has come up (TLS destructors vs
> dlclose) was to upgrade glibc.

This is a braoder issue.

Mathieu,

It would be easier to merge the patch set if it were just an unconditional
registration like we do for set_robust_list().

What's your thought there?

-- 
Cheers,
Carlos.

Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER

2019-08-23 Thread Carlos Eduardo de Paula

On Thu, Aug 22, 2019 at 5:56 PM David Abdurachmanov
 wrote:
>
> This patch was extensively tested on Fedora/RISCV (applied by default on
> top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc
> on QEMU and SiFive Unleashed board.
>
> libseccomp (userspace) was rebased:
> https://github.com/seccomp/libseccomp/pull/134
>
> Fully passes libseccomp regression testing (simulation and live).
>
> There is one failing kernel selftest: global.user_notification_signal
>
> v1 -> v2:
>   - return immediatly if secure_computing(NULL) returns -1
>   - fixed whitespace issues
>   - add missing seccomp.h
>   - remove patch #2 (solved now)
>   - add riscv to seccomp kernel selftest
>
> Cc: keesc...@chromium.org
> Cc: m...@carlosedp.com
>
> Signed-off-by: David Abdurachmanov 
> Tested-by: Carlos de Paula 
> ---
>  arch/riscv/Kconfig| 14 ++
>  arch/riscv/include/asm/seccomp.h  | 10 +++
>  arch/riscv/include/asm/thread_info.h  |  5 +++-
>  arch/riscv/kernel/entry.S | 27 +--
>  arch/riscv/kernel/ptrace.c| 10 +++
>  tools/testing/selftests/seccomp/seccomp_bpf.c |  8 +-
>  6 files changed, 70 insertions(+), 4 deletions(-)
>  create mode 100644 arch/riscv/include/asm/seccomp.h
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 59a4727ecd6c..441e63ff5adc 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -31,6 +31,7 @@ config RISCV
> select GENERIC_SMP_IDLE_THREAD
> select GENERIC_ATOMIC64 if !64BIT
> select HAVE_ARCH_AUDITSYSCALL
> +   select HAVE_ARCH_SECCOMP_FILTER
> select HAVE_MEMBLOCK_NODE_MAP
> select HAVE_DMA_CONTIGUOUS
> select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -235,6 +236,19 @@ menu "Kernel features"
>
>  source "kernel/Kconfig.hz"
>
> +config SECCOMP
> +   bool "Enable seccomp to safely compute untrusted bytecode"
> +   help
> + This kernel feature is useful for number crunching applications
> + that may need to compute untrusted bytecode during their
> + execution. By using pipes or other transports made available to
> + the process as file descriptors supporting the read/write
> + syscalls, it's possible to isolate those applications in
> + their own address space using seccomp. Once seccomp is
> + enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> + and the task is only allowed to execute a few safe syscalls
> + defined by each seccomp mode.
> +
>  endmenu
>
>  menu "Boot options"
> diff --git a/arch/riscv/include/asm/seccomp.h 
> b/arch/riscv/include/asm/seccomp.h
> new file mode 100644
> index ..bf7744ee3b3d
> --- /dev/null
> +++ b/arch/riscv/include/asm/seccomp.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _ASM_SECCOMP_H
> +#define _ASM_SECCOMP_H
> +
> +#include 
> +
> +#include 
> +
> +#endif /* _ASM_SECCOMP_H */
> diff --git a/arch/riscv/include/asm/thread_info.h 
> b/arch/riscv/include/asm/thread_info.h
> index 905372d7eeb8..a0b2a29a0da1 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -75,6 +75,7 @@ struct thread_info {
>  #define TIF_MEMDIE 5   /* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT  6   /* syscall tracepoint 
> instrumentation */
>  #define TIF_SYSCALL_AUDIT  7   /* syscall auditing */
> +#define TIF_SECCOMP8   /* syscall secure computing */
>
>  #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
>  #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
> @@ -82,11 +83,13 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED  (1 << TIF_NEED_RESCHED)
>  #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP   (1 << TIF_SECCOMP)
>
>  #define _TIF_WORK_MASK \
> (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>
>  #define _TIF_SYSCALL_WORK \
> -   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> +   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> +_TIF_SECCOMP )
>
>  #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index bc7a56e1ca6f..0bbedfa3e47d 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -203,8 +203,25 @@ check

Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER

2019-08-23 Thread Carlos Eduardo de Paula

to make sure we don't jump to a bogus syscall number. */
> li t0, __NR_syscalls
> la s0, sys_ni_syscall
> -   /* Syscall number held in a7 */
> -   bgeu a7, t0, 1f
> +   /*
> +* The tracer can change syscall number to valid/invalid value.
> +* We use syscall_set_nr helper in syscall_trace_enter thus we
> +* cannot trust the current value in a7 and have to reload from
> +* the current task pt_regs.
> +*/
> +   REG_L a7, PT_A7(sp)
> +   /*
> +* Syscall number held in a7.
> +* If syscall number is above allowed value, redirect to ni_syscall.
> +*/
> +   bge a7, t0, 1f
> +   /*
> +* Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> +* If yes, we pretend it was executed.
> +*/
> +   li t1, -1
> +   beq a7, t1, ret_from_syscall_rejected
> +   /* Call syscall */
> la s0, sys_call_table
> slli t0, a7, RISCV_LGPTR
> add s0, s0, t0
> @@ -215,6 +232,12 @@ check_syscall_nr:
>  ret_from_syscall:
> /* Set user a0 to kernel a0 */
> REG_S a0, PT_A0(sp)
> +   /*
> +* We didn't execute the actual syscall.
> +* Seccomp already set return value for the current task pt_regs.
> +* (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> +*/
> +ret_from_syscall_rejected:
> /* Trace syscalls, but only if requested by the user. */
> REG_L t0, TASK_TI_FLAGS(tp)
> andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index 368751438366..63e47c9f85f0 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs)
> if (tracehook_report_syscall_entry(regs))
> syscall_set_nr(current, regs, -1);
>
> +   /*
> +* Do the secure computing after ptrace; failures should be fast.
> +* If this fails we might have return value in a0 from seccomp
> +* (via SECCOMP_RET_ERRNO/TRACE).
> +*/
> +   if (secure_computing(NULL) == -1) {
> +   syscall_set_nr(current, regs, -1);
> +   return;
> +   }
> +
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
> if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
> trace_sys_enter(regs, syscall_get_nr(current, regs));
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c 
> b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 6ef7f16c4cf5..492e0adad9d3 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -112,6 +112,8 @@ struct seccomp_data {
>  #  define __NR_seccomp 383
>  # elif defined(__aarch64__)
>  #  define __NR_seccomp 277
> +# elif defined(__riscv)
> +#  define __NR_seccomp 277
>  # elif defined(__hppa__)
>  #  define __NR_seccomp 338
>  # elif defined(__powerpc__)
> @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally)
>  # define ARCH_REGS struct user_pt_regs
>  # define SYSCALL_NUM   regs[8]
>  # define SYSCALL_RET   regs[0]
> +#elif defined(__riscv) && __riscv_xlen == 64
> +# define ARCH_REGS struct user_regs_struct
> +# define SYSCALL_NUM   a7
> +# define SYSCALL_RET   a0
>  #elif defined(__hppa__)
>  # define ARCH_REGS struct user_regs_struct
>  # define SYSCALL_NUM   gr[20]
> @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata,
> EXPECT_EQ(0, ret) {}
>
>  #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
> -defined(__s390__) || defined(__hppa__)
> +defined(__s390__) || defined(__hppa__) || defined(__riscv)
> {
> regs.SYSCALL_NUM = syscall;
> }
> --
> 2.21.0
>

Tested-by: Carlos de Paula 
-- 

Carlos Eduardo de Paula
m...@carlosedp.com
http://carlosedp.com
http://twitter.com/carlosedp
Linkedin

Re: [PATCH] rtl8712: rtl871x_ioctl_linux.c: fix unnecessary typecast

2019-08-06 Thread José Carlos Cazarin Filho

Sorry folks, I haven't properly tested this before sending the patch
After I've removed the cast, I got this error:

drivers/staging/rtl8712/rtl871x_ioctl_linux.c:668:13: error: SSE
register return with SSE disabled
(fwrq->m >= 2.412e8) &&
~^~~

But I think it's strange since you said that this compiled fine in
your environments.
I think we need to take a deeper look into this.

Kind regards

Em ter, 6 de ago de 2019 às 10:10, David Laight
 escreveu:
>
> From: Dan Carpenter
> > Sent: 06 August 2019 12:53
> > On Mon, Aug 05, 2019 at 10:33:29PM -0300, Jose Carlos Cazarin Filho wrote:
> > > Fix checkpath warning:
> > > WARNING: Unnecessary typecast of c90 int constant
> > >
> > > Signed-off-by: Jose Carlos Cazarin Filho 
> > > ---
> > >  Hello all!
> > >  This is my first commit to the Linux Kernel, I'm doing this to learn and 
> > > be able
> > >  to contribute more in the future
> > >  Peace all!
> > >  drivers/staging/rtl8712/rtl871x_ioctl_linux.c | 4 ++--
> > >  1 file changed, 2 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/staging/rtl8712/rtl871x_ioctl_linux.c
> > b/drivers/staging/rtl8712/rtl871x_ioctl_linux.c
> > > index 944336e0d..da371072e 100644
> > > --- a/drivers/staging/rtl8712/rtl871x_ioctl_linux.c
> > > +++ b/drivers/staging/rtl8712/rtl871x_ioctl_linux.c
> > > @@ -665,8 +665,8 @@ static int r8711_wx_set_freq(struct net_device *dev,
> > >
> > >  /* If setting by frequency, convert to a channel */
> > > if ((fwrq->e == 1) &&
> > > - (fwrq->m >= (int) 2.412e8) &&
> > > - (fwrq->m <= (int) 2.487e8)) {
> > > + (fwrq->m >= 2.412e8) &&
> > > + (fwrq->m <= 2.487e8)) {
> >
> > I don't think we can do this.  You're not allowed to use floats in the
> > kernel (because they make context switching slow).  I could have sworn
> > that we use the -nofp to stop the compile when people use floats but
> > this compiles fine for me.
>
> My guess is the 'c90 int constant' text.
>
> It rather implies that '2.412e8' has become the same as '214120'.
> Which is rather worrying because suddenly 'int_var * 2.4e8' might
> be an integer multiply rather than a double one and overflow.
> Have the standard people broken code again?
>
> David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 
> 1PT, UK
> Registration No: 1397386 (Wales)
>

[PATCH] staging: rtl8723bs: fix brace position in enum declaration

2019-08-06 Thread Jose Carlos Cazarin Filho

Fix two checkpath errors of type:
"open brace '{' following enum go on the same line"

Signed-off-by: Jose Carlos Cazarin Filho 
---
 drivers/staging/rtl8723bs/include/rtw_mlme.h | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/staging/rtl8723bs/include/rtw_mlme.h 
b/drivers/staging/rtl8723bs/include/rtw_mlme.h
index d3c07d1c3..2223e1f13 100644
--- a/drivers/staging/rtl8723bs/include/rtw_mlme.h
+++ b/drivers/staging/rtl8723bs/include/rtw_mlme.h
@@ -81,15 +81,13 @@ enum dot11AuthAlgrthmNum {
 };
 
 /*  Scan type including active and passive scan. */
-typedef enum _RT_SCAN_TYPE
-{
+typedef enum _RT_SCAN_TYPE {
SCAN_PASSIVE,
SCAN_ACTIVE,
SCAN_MIX,
 }RT_SCAN_TYPE, *PRT_SCAN_TYPE;
 
-enum  _BAND
-{
+enum  _BAND {
GHZ24_50 = 0,
GHZ_50,
GHZ_24,
-- 
2.17.1

[PATCH] rtl8712: rtl871x_ioctl_linux.c: fix unnecessary typecast

2019-08-05 Thread Jose Carlos Cazarin Filho

Fix checkpath warning:
WARNING: Unnecessary typecast of c90 int constant

Signed-off-by: Jose Carlos Cazarin Filho 
---
 Hello all!
 This is my first commit to the Linux Kernel, I'm doing this to learn and be 
able
 to contribute more in the future
 Peace all! 
 drivers/staging/rtl8712/rtl871x_ioctl_linux.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/rtl8712/rtl871x_ioctl_linux.c 
b/drivers/staging/rtl8712/rtl871x_ioctl_linux.c
index 944336e0d..da371072e 100644
--- a/drivers/staging/rtl8712/rtl871x_ioctl_linux.c
+++ b/drivers/staging/rtl8712/rtl871x_ioctl_linux.c
@@ -665,8 +665,8 @@ static int r8711_wx_set_freq(struct net_device *dev,
 
 /* If setting by frequency, convert to a channel */
if ((fwrq->e == 1) &&
- (fwrq->m >= (int) 2.412e8) &&
- (fwrq->m <= (int) 2.487e8)) {
+ (fwrq->m >= 2.412e8) &&
+ (fwrq->m <= 2.487e8)) {
int f = fwrq->m / 10;
int c = 0;
 
-- 
2.17.1

[PATCH] rtl8188eu: hal: phy.c

2019-08-02 Thread Jose Carlos Cazarin Filho

Fix a lot of checkpath errors of the type:
-CHECK: spaces preferred around that
-CHECK: Alignment should match open parenthesis

Signed-off-by: Jose Carlos Cazarin Filho 
---
 My second commit to the kernel, I know you ppl don't like these kind of commits
 fixing style-only erros pointed by the checkpath, but I'm doing this just to 
learn
 Thanks!
 drivers/staging/rtl8188eu/hal/phy.c | 54 ++---
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/drivers/staging/rtl8188eu/hal/phy.c 
b/drivers/staging/rtl8188eu/hal/phy.c
index 51c40abfa..f0d242dec 100644
--- a/drivers/staging/rtl8188eu/hal/phy.c
+++ b/drivers/staging/rtl8188eu/hal/phy.c
@@ -52,7 +52,7 @@ void phy_set_bb_reg(struct adapter *adapt, u32 regaddr, u32 
bitmask, u32 data)
 }
 
 static u32 rf_serial_read(struct adapter *adapt,
-   enum rf_radio_path rfpath, u32 offset)
+ enum rf_radio_path rfpath, u32 offset)
 {
u32 ret = 0;
struct bb_reg_def *phyreg = &adapt->HalData->PHYRegDef[rfpath];
@@ -69,10 +69,10 @@ static u32 rf_serial_read(struct adapter *adapt,
bMaskDWord);
 
tmplong2 = (tmplong2 & (~bLSSIReadAddress)) |
-  (offset<<23) | bLSSIReadEdge;
+  (offset << 23) | bLSSIReadEdge;
 
phy_set_bb_reg(adapt, rFPGA0_XA_HSSIParameter2, bMaskDWord,
-  tmplong&(~bLSSIReadEdge));
+  tmplong & (~bLSSIReadEdge));
udelay(10);
 
phy_set_bb_reg(adapt, phyreg->rfHSSIPara2, bMaskDWord, tmplong2);
@@ -102,12 +102,12 @@ static void rf_serial_write(struct adapter *adapt,
struct bb_reg_def *phyreg = &adapt->HalData->PHYRegDef[rfpath];
 
offset &= 0xff;
-   data_and_addr = ((offset<<20) | (data&0x000f)) & 0x0fff;
+   data_and_addr = ((offset << 20) | (data & 0x000f)) & 0x0fff;
phy_set_bb_reg(adapt, phyreg->rf3wireOffset, bMaskDWord, data_and_addr);
 }
 
 u32 rtw_hal_read_rfreg(struct adapter *adapt, enum rf_radio_path rf_path,
-u32 reg_addr, u32 bit_mask)
+  u32 reg_addr, u32 bit_mask)
 {
u32 original_value, bit_shift;
 
@@ -117,7 +117,7 @@ u32 rtw_hal_read_rfreg(struct adapter *adapt, enum 
rf_radio_path rf_path,
 }
 
 void phy_set_rf_reg(struct adapter *adapt, enum rf_radio_path rf_path,
-u32 reg_addr, u32 bit_mask, u32 data)
+   u32 reg_addr, u32 bit_mask, u32 data)
 {
u32 original_value, bit_shift;
 
@@ -143,20 +143,20 @@ static void get_tx_power_index(struct adapter *adapt, u8 
channel, u8 *cck_pwr,
for (TxCount = 0; TxCount < path_nums; TxCount++) {
if (TxCount == RF_PATH_A) {
cck_pwr[TxCount] = 
hal_data->Index24G_CCK_Base[TxCount][index];
-   ofdm_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[RF_PATH_A][index]+
+   ofdm_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[RF_PATH_A][index] +

hal_data->OFDM_24G_Diff[TxCount][RF_PATH_A];
 
-   bw20_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[RF_PATH_A][index]+
+   bw20_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[RF_PATH_A][index] +

hal_data->BW20_24G_Diff[TxCount][RF_PATH_A];
bw40_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[TxCount][index];
} else if (TxCount == RF_PATH_B) {
cck_pwr[TxCount] = 
hal_data->Index24G_CCK_Base[TxCount][index];
-   ofdm_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[RF_PATH_A][index]+
-   hal_data->BW20_24G_Diff[RF_PATH_A][index]+
+   ofdm_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[RF_PATH_A][index] +
+   hal_data->BW20_24G_Diff[RF_PATH_A][index] +
hal_data->BW20_24G_Diff[TxCount][index];
 
-   bw20_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[RF_PATH_A][index]+
-   hal_data->BW20_24G_Diff[TxCount][RF_PATH_A]+
+   bw20_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[RF_PATH_A][index] +
+   hal_data->BW20_24G_Diff[TxCount][RF_PATH_A] +
hal_data->BW20_24G_Diff[TxCount][index];
bw40_pwr[TxCount] = 
hal_data->Index24G_BW40_Base[TxCount][index];
}
@@ -190,7 +190,7 @@ void phy_set_tx_power_level(struct adapter *adapt, u8 
channel)
 
rtl88eu_phy_rf6052_set_cck_txpower(adapt, &cck_pwr[0]);
rtl88eu_phy_rf6052_set_ofdm_txpower(adapt, &ofdm_pwr[0], &bw20_pwr[0],
- &

[PATCH] isdn: hysdn: Fix error spaces around '*'

2019-08-02 Thread Jose Carlos Cazarin Filho

Fix checkpath error:
CHECK: spaces preferred around that '*' (ctx:WxV)
+extern hysdn_card *card_root;/* pointer to first card */

Signed-off-by: Jose Carlos Cazarin Filho 
---
 Hello all!
 This is my first commit to the Linux Kernel, I'm doing this to learn and be 
able
 to contribute more in the future
 Peace all! 

 drivers/staging/isdn/hysdn/hysdn_defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/isdn/hysdn/hysdn_defs.h 
b/drivers/staging/isdn/hysdn/hysdn_defs.h
index cdac46a21..f20150d62 100644
--- a/drivers/staging/isdn/hysdn/hysdn_defs.h
+++ b/drivers/staging/isdn/hysdn/hysdn_defs.h
@@ -220,7 +220,7 @@ typedef struct hycapictrl_info hycapictrl_info;
 /*/
 /* exported vars */
 /*/
-extern hysdn_card *card_root;  /* pointer to first card */
+extern hysdn_card * card_root; /* pointer to first card */
 
 
 
-- 
2.20.1

Re: [PATCH] fs: xfs: xfs_log: Change return type from int to void

2019-07-03 Thread Carlos Maiolino

On Tue, Jul 02, 2019 at 11:45:47PM +0530, Hariprasad Kelam wrote:
> Change return types of below functions as they never fails
> xfs_log_mount_cancel
> xlog_recover_cancel
> xlog_recover_cancel_intents
> 
> fix below issue reported by coccicheck
> fs/xfs/xfs_log_recover.c:4886:7-12: Unneeded variable: "error". Return
> "0" on line 4926
> 
> Signed-off-by: Hariprasad Kelam 


Looks ok.

You can add:

Reviewed-by: Carlos Maiolino 

> ---
>  fs/xfs/xfs_log.c |  8 ++--
>  fs/xfs/xfs_log.h |  2 +-
>  fs/xfs/xfs_log_priv.h|  2 +-
>  fs/xfs/xfs_log_recover.c | 12 +++-
>  4 files changed, 7 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index cbaf348..00e9f5c 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -769,16 +769,12 @@ xfs_log_mount_finish(
>   * The mount has failed. Cancel the recovery if it hasn't completed and 
> destroy
>   * the log.
>   */
> -int
> +void
>  xfs_log_mount_cancel(
>   struct xfs_mount*mp)
>  {
> - int error;
> -
> - error = xlog_recover_cancel(mp->m_log);
> + xlog_recover_cancel(mp->m_log);
>   xfs_log_unmount(mp);
> -
> - return error;
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
> index f27b1cb..84e0680 100644
> --- a/fs/xfs/xfs_log.h
> +++ b/fs/xfs/xfs_log.h
> @@ -117,7 +117,7 @@ int xfs_log_mount(struct xfs_mount*mp,
>   xfs_daddr_t start_block,
>   int num_bblocks);
>  intxfs_log_mount_finish(struct xfs_mount *mp);
> -int  xfs_log_mount_cancel(struct xfs_mount *);
> +void xfs_log_mount_cancel(struct xfs_mount *);
>  xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
>  xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
>  void   xfs_log_space_wake(struct xfs_mount *mp);
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 8acacbc..b880c23 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -418,7 +418,7 @@ xlog_recover(
>  extern int
>  xlog_recover_finish(
>   struct xlog *log);
> -extern int
> +extern void
>  xlog_recover_cancel(struct xlog *);
>  
>  extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header 
> *rhead,
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index 1fc70ac..13d1d3e 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -4875,12 +4875,11 @@ xlog_recover_process_intents(
>   * A cancel occurs when the mount has failed and we're bailing out.
>   * Release all pending log intent items so they don't pin the AIL.
>   */
> -STATIC int
> +STATIC void
>  xlog_recover_cancel_intents(
>   struct xlog *log)
>  {
>   struct xfs_log_item *lip;
> - int error = 0;
>   struct xfs_ail_cursor   cur;
>   struct xfs_ail  *ailp;
>  
> @@ -4920,7 +4919,6 @@ xlog_recover_cancel_intents(
>  
>   xfs_trans_ail_cursor_done(&cur);
>   spin_unlock(&ailp->ail_lock);
> - return error;
>  }
>  
>  /*
> @@ -5779,16 +5777,12 @@ xlog_recover_finish(
>   return 0;
>  }
>  
> -int
> +void
>  xlog_recover_cancel(
>   struct xlog *log)
>  {
> - int error = 0;
> -
>   if (log->l_flags & XLOG_RECOVERY_NEEDED)
> - error = xlog_recover_cancel_intents(log);
> -
> - return error;
> + xlog_recover_cancel_intents(log);
>  }
>  
>  #if defined(DEBUG)
> -- 
> 2.7.4
> 

-- 
Carlos

Re: [PATCH 1/5] glibc: Perform rseq(2) registration at C startup and thread creation (v10)

2019-06-10 Thread Carlos O'Donell

On 6/6/19 7:57 AM, Florian Weimer wrote:
> Let me ask the key question again: Does it matter if code observes the
> rseq area first without kernel support, and then with kernel support?
> If we don't expect any problems immediately, we do not need to worry
> much about the constructor ordering right now.  I expect that over time,
> fixing this properly will become easier.

I just wanted to chime in and say that splitting this into:

* Ownership (__rseq_handled)

* Initialization (__rseq_abi)

Makes sense to me.

I agree we need an answer to this question of ownership but not yet
initialized, to owned and initialized.

I like the idea of having __rseq_handled in ld.so.

-- 
Cheers,
Carlos.

Re: [PATCH 1/4] glibc: Perform rseq(2) registration at C startup and thread creation (v7)

2019-04-09 Thread Carlos O'Donell


On 4/9/19 9:58 AM, Tulio Magno Quites Machado Filho wrote:

Alan Modra  writes:

Yes, looks fine to me, except that in VLE mode (do we care?)
".long 0x0fe50553" disassembles as
0:  0f e5   se_cmphl r5,r30
2:  05 53   se_mullw r3,r5
No illegal/trap/privileged insn there.

".long 0x0fe5000b" might be better to cover VLE.


Looks good for me too.


The requirement that it be a valid instruction is simply to aid in the
disassembly of rseq regions which may be hand written assembly with a
thin veneer of CFI/DWARF information.

It has already been pointed out that POWER uses data in the instruction
stream for jump tables to implement switch statements, but that specific
use has compiler support and one presumes good debug information. So as
Alan says, there is already data in the insn stream, though such things
can't be good for performance (pollutes D-cache, problematic for
speculative execution).


Actually, it better fits what Carlos O'Donnell had requested:


I think the order of preference is:

1.  An uncommon insn (with random immediate values), in a literal pool, that is
  not a useful ROP/JOP sequence (very uncommon)
2a. A uncommon TRAP hopefully with some immediate data encoded (maybe uncommon)
2b. A NOP to avoid affecting speculative execution (maybe uncommon)

With 2a/2b being roughly equivalent depending on speculative execution policy.


Yes, though "in a literal pool" is something that is not required, since
users might not want literal pools and so we shouldn't require that
feature (it also pollutes D-cache).

Keep in mind the insn will never execute.

If a trap insn calls out the nature of the signature more clearly then
use that instead.

--
Cheers,
Carlos.

Re: [PATCH 1/4] glibc: Perform rseq(2) registration at C startup and thread creation (v7)

2019-04-08 Thread Carlos O'Donell


On 4/8/19 3:20 PM, Tulio Magno Quites Machado Filho wrote:

Carlos O'Donell  writes:


On 4/5/19 5:16 AM, Florian Weimer wrote:

* Carlos O'Donell:

It is valuable that it be a trap, particularly for constant pools because
it means that a jump into the constant pool will trap.


Sorry, I don't understand why this matters in this context.  Would you
please elaborate?


Sorry, I wasn't very clear.

My point is only that any accidental jumps, either with off-by-one (like you
fixed in gcc/glibc's signal unwinding most recently), result in a process fault
rather than executing RSEQ_SIG as a valid instruction *and then* continuing
onwards to the handler.

A process fault is achieved either by a trap, or an invalid instruction, or
a privileged insn (like suggested for MIPS in this thread).


In that case, mtmsr (Move to Machine State Register) seems a good candidate.

mtmsr is available both on 32 and 64 bits since their first implementations.

It's a privileged instruction and should never appear in userspace
code (causes SIGILL).

Any comments?
 
That seems good to me.


Mathieu,

What's required to move this forward for POWER?

--
Cheers,
Carlos.

Re: [PATCH 1/4] glibc: Perform rseq(2) registration at C startup and thread creation (v7)

2019-04-05 Thread Carlos O'Donell


On 4/5/19 5:16 AM, Florian Weimer wrote:

* Carlos O'Donell:

It is valuable that it be a trap, particularly for constant pools because
it means that a jump into the constant pool will trap.


Sorry, I don't understand why this matters in this context.  Would you
please elaborate?


Sorry, I wasn't very clear.

My point is only that any accidental jumps, either with off-by-one (like you
fixed in gcc/glibc's signal unwinding most recently), result in a process fault
rather than executing RSEQ_SIG as a valid instruction *and then* continuing
onwards to the handler.

A process fault is achieved either by a trap, or an invalid instruction, or
a privileged insn (like suggested for MIPS in this thread).

--
Cheers,
Carlos.

Re: [PATCH 1/4] glibc: Perform rseq(2) registration at C startup and thread creation (v7)

2019-04-04 Thread Carlos O'Donell


On 3/25/19 11:54 AM, Mathieu Desnoyers wrote:

Hi Carlos,

- On Mar 22, 2019, at 4:09 PM, Carlos O'Donell codon...@redhat.com wrote:

[...]

I took care of all your comments for an upcoming round of patches, except the
following that remain open (see answer inline). I'm adding Linux maintainers
for ARM, aarch64, MIPS, powerpc, s390, x86 in CC to discuss the choice of
code signature prior to the abort handler for each of those architectures.


Thank you for kicking off this conversation.

Every architecture should have a reasonable RSEQ_SIG that applies to their
ISA with a comment about why that instruction was chosen. It should be a
conscious choice, without a default.


* Support for automatically registering threads with the Linux rseq(2)
   system call has been added.  This system call is implemented starting
   from Linux 4.18.  The Restartable Sequences ABI accelerates user-space
   operations on per-cpu data.  It allows user-space to perform updates
   on per-cpu data without requiring heavy-weight atomic operations. See
   https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/
   for further explanation.

   In order to be activated, it requires that glibc is built against
   kernel headers that include this system call, and that glibc detects
   availability of that system call at runtime.


Suggest:

* Support for automatically registering threads with the Linux rseq(2)
  system call has been added.  This system call is implemented starting
  from Linux 4.18.  The Restartable Sequences ABI accelerates user-space
  operations on per-cpu data.  It allows user-space to perform updates
  on per-cpu data without requiring heavy-weight atomic operations.
  Automatically registering threads allows all libraries, including libc,
  to make immediate use of the rseq(2) support by using the documented ABI.
  See 'man 2 rseq' for the details of the ABI shared between libc and the
  kernel.



For reference the assembly code I'm pointing at below can be found
in the Linux selftests under:

tools/testing/selftests/rseq/rseq-*.h


OK.



+++ b/sysdeps/unix/sysv/linux/arm/bits/rseq.h

[...]

+
+/* Signature required before each abort handler code.  */
+#define RSEQ_SIG 0x53053053


Why isn't this an arm specific op code? Does the user have to mark this
up as part of a constant pool when placing it in front of the abort handler
to avoid disassembling the constant as code? This was at one point required
to get gdb to work properly.



For arm, the abort is defined as:

#define __RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown,   \
 abort_label, version, flags,\
 start_ip, post_commit_offset, abort_ip) \
 ".balign 32\n\t"\
 __rseq_str(table_label) ":\n\t" \
 ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
 ".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 
0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
 ".word " __rseq_str(RSEQ_SIG) "\n\t"\
 __rseq_str(label) ":\n\t"   \
 teardown\
 "b %l[" __rseq_str(abort_label) "]\n\t"

Which contains a copy of the struct rseq_cs for that critical section
close to the actual critical section, within the code, followed by the
signature. The reason why we have a copy of the struct rseq_cs there is
to speed up entry into the critical section by using the "adr" instruction
to compute the address to store into __rseq_cs->rseq_cs.

AFAIU, a literal pool on ARM is defined as something which is always
jumped over (never executed), which is the case here. We always have
an unconditional branch instruction ("b") skipping over each
RSEQ_ASM_DEFINE_ABORT().

Therefore, given that the signature is part of a literal pool on ARM,
it can be any value we choose and should not need to be an actual valid
instruction.

aarch64 defines the abort as:

#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)   
\
 "   b   222f\n"
 \
 "   .inst   "   __rseq_str(RSEQ_SIG) "\n"  
 \
 __rseq_str(label) ":\n"
 \
 "   b   %l[" __rseq_str(abort_label) "]\n" 
 \
 "222:\n"

Where the signature actually maps to a valid instruction. Considering that
aarch64 also have literal pools, and we branch over the signature, I

Re: [PATCH 1/4] glibc: Perform rseq(2) registration at C startup and thread creation (v7)

2019-04-04 Thread Carlos O'Donell


On 4/2/19 3:08 AM, Florian Weimer wrote:

* Michael Ellerman:


I'm a bit vague on what we're trying to do here.

But it seems like you want some sort of "eye catcher" prior to the branch?

That value is a valid instruction on current CPUs (rlwimi.
r5,r24,6,1,9), and even if it wasn't it could become one in future.

If you change it to 0x8053530 that is both a valid instruction and is a
nop (conditional trap immediate but with no conditions set).


I think we need something that is very unlikely to appear in the
instruction stream.  It's just a marker.  The instruction will never be
executed, and it does not have to be a trap, either (I believe that a
standard trap instruction would be a bad choice).


I assume you want to avoid a standard trap instruction because it would
be common, and so not meet the intent of the RSEQ_SIG choice as being something
that is *uncommon* right?

It is valuable that it be a trap, particularly for constant pools because
it means that a jump into the constant pool will trap.

--
Cheers,
Carlos.

Re: [PATCH 1/4] glibc: Perform rseq(2) registration at C startup and thread creation (v7)

2019-04-04 Thread Carlos O'Donell


On 4/2/19 2:02 AM, Michael Ellerman wrote:

Mathieu Desnoyers  writes:

Hi Carlos,

- On Mar 22, 2019, at 4:09 PM, Carlos O'Donell codon...@redhat.com wrote:

...


[...]

+++ b/sysdeps/unix/sysv/linux/powerpc/bits/rseq.h

[...]

+/* Signature required before each abort handler code.  */
+#define RSEQ_SIG 0x53053053


Why isn't this an opcode specific to power?


On powerpc 32/64, the abort is placed in a __rseq_failure executable section:

#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)   
\
 ".pushsection __rseq_failure, \"ax\"\n\t"  
 \
 ".long " __rseq_str(RSEQ_SIG) "\n\t"   
 \
 __rseq_str(label) ":\n\t"  
 \
 "b %l[" __rseq_str(abort_label) "]\n\t"
 \
 ".popsection\n\t"

That section only contains snippets of those trampolines. Arguably, it would be
good if disassemblers could find valid instructions there. Boqun Feng could 
perhaps
shed some light on this signature choice ? Now would be a good time to decide
once and for all whether a valid instruction would be a better choice.


I'm a bit vague on what we're trying to do here.

But it seems like you want some sort of "eye catcher" prior to the branch?

That value is a valid instruction on current CPUs (rlwimi.
r5,r24,6,1,9), and even if it wasn't it could become one in future.

If you change it to 0x8053530 that is both a valid instruction and is a
nop (conditional trap immediate but with no conditions set).


The s390 IBM team needs to respond to this and I want to make sure they ACK
the NOP being used here because it impacts them directly.

I'd like to see Martin's opinion on this.

--
Cheers,
Carlos.

Re: [PATCH 1/4] glibc: Perform rseq(2) registration at C startup and thread creation (v7)

2019-03-27 Thread Carlos O'Donell


On 3/27/19 5:16 AM, Martin Schwidefsky wrote:

On Mon, 25 Mar 2019 11:54:32 -0400 (EDT)
Mathieu Desnoyers  wrote:


+++ b/sysdeps/unix/sysv/linux/s390/bits/rseq.h

[...]

+
+/* Signature required before each abort handler code.  */
+#define RSEQ_SIG 0x53053053


Why not a s390 specific value here?


s390 also has the abort handler in a __rseq_failure section:

#define RSEQ_ASM_DEFINE_ABORT(label, teardown, abort_label) \
 ".pushsection __rseq_failure, \"ax\"\n\t"   \
 ".long " __rseq_str(RSEQ_SIG) "\n\t"\
 __rseq_str(label) ":\n\t"   \
 teardown\
 "j %l[" __rseq_str(abort_label) "]\n\t" \
 ".popsection\n\t"

Same question applies as powerpc: since disassemblers will try to decode
that instruction, would it be better to define it as a valid one ?

[...]


A 4-byte sequence starting with 0x53 is decoded as a "diebr" instruction.
And please replace that "j %l[...]" with a "jg %l[...]", the branch target
range of the "j" instruction is 64K, not enough for the general case.


Why was this particular operated selected?
 
So on s390 the RSEQ_SIG will show up as an unexpected "divide to integer"

instruction that can't be reached by any control flow?

Can we use a NOP with a unique value in an immediate operand?

The goal being to have something that won't confuse during a debug
session, or that the debugger can ignore (like constant pools on Arm)

--
Cheers,
Carlos.

Re: [PATCH 2/4] glibc: sched_getcpu(): use rseq cpu_id TLS on Linux

2019-03-22 Thread Carlos O'Donell


On 2/12/19 2:42 PM, Mathieu Desnoyers wrote:

When available, use the cpu_id field from __rseq_abi on Linux to
implement sched_getcpu(). Fall-back on the vgetcpu vDSO if unavailable.

Benchmarks:

x86-64: Intel E5-2630 v3@2.40GHz, 16-core, hyperthreading


This patch looks good to me for master, but is blocked on patch 1/4
being reworked.

Reviewed-by: Carlos O'Donell 


glibc sched_getcpu(): 13.7 ns (baseline)
glibc sched_getcpu() using rseq:   2.5 ns (speedup:  5.5x)
inline load cpuid from __rseq_abi TLS: 0.8 ns (speedup: 17.1x)

Signed-off-by: Mathieu Desnoyers 
CC: Carlos O'Donell 
CC: Florian Weimer 
CC: Joseph Myers 
CC: Szabolcs Nagy 
CC: Thomas Gleixner 
CC: Ben Maurer 
CC: Peter Zijlstra 
CC: "Paul E. McKenney" 
CC: Boqun Feng 
CC: Will Deacon 
CC: Dave Watson 
CC: Paul Turner 
CC: libc-al...@sourceware.org
CC: linux-kernel@vger.kernel.org
CC: linux-...@vger.kernel.org
---
  sysdeps/unix/sysv/linux/sched_getcpu.c | 25 +++--
  1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/sched_getcpu.c 
b/sysdeps/unix/sysv/linux/sched_getcpu.c
index fb0d317f83..8bfb03778b 100644
--- a/sysdeps/unix/sysv/linux/sched_getcpu.c
+++ b/sysdeps/unix/sysv/linux/sched_getcpu.c
@@ -24,8 +24,8 @@
  #endif
  #include 
  
-int

-sched_getcpu (void)
+static int
+vsyscall_sched_getcpu (void)


OK.


  {
  #ifdef __NR_getcpu
unsigned int cpu;
@@ -37,3 +37,24 @@ sched_getcpu (void)
return -1;
  #endif
  }
+
+#ifdef __NR_rseq
+#include 
+
+extern __attribute__ ((tls_model ("initial-exec")))
+__thread volatile struct rseq __rseq_abi;


OK.


+
+int
+sched_getcpu (void)
+{
+  int cpu_id = __rseq_abi.cpu_id;
+
+  return cpu_id >= 0 ? cpu_id : vsyscall_sched_getcpu ();


OK. Impressive :-)


+}
+#else
+int
+sched_getcpu (void)
+{
+  return vsyscall_sched_getcpu ();


OK.


+}
+#endif




--
Cheers,
Carlos.

Re: [PATCH 1/4] glibc: Perform rseq(2) registration at C startup and thread creation (v7)

2019-03-22 Thread Carlos O'Donell


On 2/12/19 2:42 PM, Mathieu Desnoyers wrote:

Register rseq(2) TLS for each thread (including main), and unregister
for each thread (excluding main). "rseq" stands for Restartable
Sequences.


Thanks, the implementation is looking good, before this goes in it needs
some procedural cleanup and the following:

(a) I would like to see a final position from Florian, either accept,
object (objection recorded), or sustained objection (blocks this patch),
on the issue of the registration protocol.

I suggested a possible way forward to remove the registration protocol
down the line, leaving only the __rseq_lib_abi as the residual artifact
once we deprecate coordination with other libraries. Applications would
have to be written with this in mind, and coordination still remains
with libc for the time being. Perhaps Mathieu has another suggestion.

(b) I would like to see a final position from Rich Felker, either accept,
object (objection recorded), or sustained objection (blocks this patch),
on the patch set as a whole. I trust Rich's opinion as an alternative
libc maintainer in this matter and as a independent implementor of
similar standards. Rich will have to implement this for musl, and it
would be good to have consensus from him.

So while I reviewed this patch, and provided nit-picky feedback below,
I think we need a whole new thread, to finalize the way forward for the
registration process.

Next steps:

- Start a new thread to talk *only* about options for removing the
  refcounted registration.


See the rseq(2) man page proposed here:
   https://lkml.org/lkml/2018/9/19/647


Thank you for having a man page, that makes for a very easy to follow
description of the structures, syscall and use cases.



This patch is based on glibc-2.29. The rseq(2) system call was merged
into Linux 4.18.

Signed-off-by: Mathieu Desnoyers 
CC: Carlos O'Donell 
CC: Florian Weimer 
CC: Joseph Myers 
CC: Szabolcs Nagy 
CC: Thomas Gleixner 
CC: Ben Maurer 
CC: Peter Zijlstra 
CC: "Paul E. McKenney" 
CC: Boqun Feng 
CC: Will Deacon 
CC: Dave Watson 
CC: Paul Turner 
CC: Rich Felker 
CC: libc-al...@sourceware.org
CC: linux-kernel@vger.kernel.org
CC: linux-...@vger.kernel.org
---
Changes since v1:
- Move __rseq_refcount to an extra field at the end of __rseq_abi to
   eliminate one symbol.

   All libraries/programs which try to register rseq (glibc,
   early-adopter applications, early-adopter libraries) should use the
   rseq refcount. It becomes part of the ABI within a user-space
   process, but it's not part of the ABI shared with the kernel per se.

- Restructure how this code is organized so glibc keeps building on
   non-Linux targets.

- Use non-weak symbol for __rseq_abi.

- Move rseq registration/unregistration implementation into its own
   nptl/rseq.c compile unit.

- Move __rseq_abi symbol under GLIBC_2.29.

Changes since v2:
- Move __rseq_refcount to its own symbol, which is less ugly than
   trying to play tricks with the rseq uapi.
- Move __rseq_abi from nptl to csu (C start up), so it can be used
   across glibc, including memory allocator and sched_getcpu(). The
   __rseq_refcount symbol is kept in nptl, because there is no reason
   to use it elsewhere in glibc.

Changes since v3:
- Set __rseq_refcount TLS to 1 on register/set to 0 on unregister
   because glibc is the first/last user.
- Unconditionally register/unregister rseq at thread start/exit, because
   glibc is the first/last user.
- Add missing abilist items.
- Rebase on glibc master commit a502c5294.
- Add NEWS entry.

Changes since v4:
- Do not use "weak" symbols for __rseq_abi and __rseq_refcount. Based on
   "System V Application Binary Interface", weak only affects the link
   editor, not the dynamic linker.
- Install a new sys/rseq.h system header on Linux, which contains the
   RSEQ_SIG definition, __rseq_abi declaration and __rseq_refcount
   declaration. Move those definition/declarations from rseq-internal.h
   to the installed sys/rseq.h header.
- Considering that rseq is only available on Linux, move csu/rseq.c to
   sysdeps/unix/sysv/linux/rseq-sym.c.
- Move __rseq_refcount from nptl/rseq.c to
   sysdeps/unix/sysv/linux/rseq-sym.c, so it is only defined on Linux.
- Move both ABI definitions for __rseq_abi and __rseq_refcount to
   sysdeps/unix/sysv/linux/Versions, so they only appear on Linux.
- Document __rseq_abi and __rseq_refcount volatile.
- Document the RSEQ_SIG signature define.
- Move registration functions from rseq.c to rseq-internal.h static
   inline functions. Introduce empty stubs in misc/rseq-internal.h,
   which can be overridden by architecture code in
   sysdeps/unix/sysv/linux/rseq-internal.h.
- Rename __rseq_register_current_thread and __rseq_unregister_current_thread
   to rseq_register_current_thread and rseq_unregister_current_thread,
   now that those are only visible as internal static inline functions.
-

Re: Official Linux system wrapper library?

2018-12-10 Thread Carlos O'Donell

On 12/10/18 11:27 AM, Jonathan Corbet wrote:
> On Sat, 8 Dec 2018 20:38:56 -0800
> Randy Dunlap  wrote:
> 
>> On 11/12/18 8:08 AM, Jonathan Corbet wrote:
>>> On Sun, 11 Nov 2018 18:36:30 -0800
>>> Greg KH  wrote:
>>>   
>>>> We should have a checklist.  That's a great idea.  Now to find someone
>>>> to write it... :)  
>>>
>>> Do we think the LPC session might have the right people to create such a
>>> thing?  If so, I can try to put together a coherent presentation of the
>>> result.  
>>
>> Hi,
>> Did anything ever happen with this syscall checklist suggestion?
> 
> No, we really didn't have the right people around to do that,
> unfortunately.  

We already have Documentation/process/adding-syscalls.rst.

The documentation there is quite thorough.

It lists things that people commonly forget e.g. email 
linux-...@vger.kernel.org.

Would it be acceptable to attempt to collate per-libc information
into the adding-syscalls.rst under a new section called:

"Integration with libc"

-- 
Cheers,
Carlos.

Re: [PATCH V2] xfs: libxfs: move xfs_perag_put late

2018-11-27 Thread Carlos Maiolino

On Tue, Nov 27, 2018 at 08:33:38AM +0800, Pan Bian wrote:
> The function xfs_alloc_get_freelist calls xfs_perag_put to drop the
> reference. However, pag->pagf_btreeblks is read and write after the 
> put operation. This patch moves the put operation late.

I'm not a native English speaker too, but I believe it should be "is read and
written after..."

But, for the code itself, you can add:

Reviewed-by: Carlos Maiolino 

Cheers

> 
> Signed-off-by: Pan Bian 
> ---
> V2: correct the commit log
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index e1c0c0d..4be387d 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -2435,7 +2435,6 @@ xfs_alloc_get_freelist(
>   be32_add_cpu(&agf->agf_flcount, -1);
>   xfs_trans_agflist_delta(tp, -1);
>   pag->pagf_flcount--;
> - xfs_perag_put(pag);
>  
>   logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
>   if (btreeblk) {
> @@ -2443,6 +2442,7 @@ xfs_alloc_get_freelist(
>   pag->pagf_btreeblks++;
>   logflags |= XFS_AGF_BTREEBLKS;
>   }
> + xfs_perag_put(pag);
>  
>   xfs_alloc_log_agf(tp, agbp, logflags);
>   *bnop = bno;
> -- 
> 2.7.4
> 
> 

-- 
Carlos

Re: [PATCH] xfs: libxfs: move xfs_perag_put late

2018-11-26 Thread Carlos Maiolino

On Sat, Nov 24, 2018 at 05:44:20PM +0800, Pan Bian wrote:
> The function xfs_alloc_get_freelist calls xfs_perag_put to drop the
> reference. In this case, pag may be released. However,
> pag->pagf_btreeblks is read and write after the put operation. This may
> result in a use-after-free bug. This patch moves the put operation late.
> 

The patch looks reasonable, can you detail more how did you find it? Via code
inspection of you hit this user-after-free in some way?

Cheers

> Signed-off-by: Pan Bian 
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index e1c0c0d..4be387d 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -2435,7 +2435,6 @@ xfs_alloc_get_freelist(
>   be32_add_cpu(&agf->agf_flcount, -1);
>   xfs_trans_agflist_delta(tp, -1);
>   pag->pagf_flcount--;
> - xfs_perag_put(pag);
>  
>   logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
>   if (btreeblk) {
> @@ -2443,6 +2442,7 @@ xfs_alloc_get_freelist(
>   pag->pagf_btreeblks++;
>   logflags |= XFS_AGF_BTREEBLKS;
>   }
> + xfs_perag_put(pag);
>  
>   xfs_alloc_log_agf(tp, agbp, logflags);
>   *bnop = bno;
> -- 
> 2.7.4
> 
> 

-- 
Carlos

Re: Official Linux system wrapper library?

2018-11-15 Thread Carlos O'Donell

On 11/15/18 12:08 PM, Theodore Y. Ts'o wrote:
> On Thu, Nov 15, 2018 at 04:29:43PM +, Joseph Myers wrote:
>> On Thu, 15 Nov 2018, Theodore Y. Ts'o wrote:
>>
>>> That's great.  But is it or is it not true (either de jure or de
>>> facto) that "a single active glibc developer" can block a system call
>>> from being supported by glibc by objecting?  And if not, under what is
>>> the process by resolving a conflict?
>>
>> We use a consensus-building process as described at 
>> <https://sourceware.org/glibc/wiki/Consensus>.
> 
> So can a single glibc developer can block Consensus?

Yes.

I think the comparison to the "liberum veto" is not a fair
comparison to the way the glibc community works :-)

(1) Community consensus.

Consensus need not imply unanimity.

Consensus is only from the set of important and concerned
interests. The community gets to decide that you're a troll
that does no real work, and can therefore ignore you.

Consensus is blocked only by sustained objection (not just
normal objections, which are recorded as part of the 
development process e.g. "I don't like it, but I leave it
up to you to decide").

Therefore an involved glibc developer can lodge a sustained
objection, and block consensus.

(2) The GNU package maintainers for glibc.

There are 8 GNU package maintainers for glibc.

The package maintainers created the consensus process to
empower the community, but they can act as a final 
review committee to move issues where there are two
reasonable but competing view points.

As Joseph points out we haven't ever used the GNU pakcage
maintainers to vote on a stuck issue, but I will arrange
it when the need arises. If you think we're at that point
with wrapper functions, just say so, but it doesn't seem
like it to me.

-- 
Cheers,
Carlos.

Re: Official Linux system wrapper library?

2018-11-15 Thread Carlos O'Donell

On 11/14/18 1:47 PM, Joseph Myers wrote:
> On Wed, 14 Nov 2018, Daniel Colascione wrote:
> 
>> A good demonstration of a new commitment to pragmatism would be
>> merging the trivial wrappers for gettid(2).
> 
> I support the addition of gettid (for use with those syscalls that take 
> tids, and with appropriate documentation explaining the properties of 
> tids) - and, generally, wrappers for all non-obsolescent 
> architecture-independent Linux kernel syscalls, including ones that are 
> very Linux-specific, except maybe for a few interfaces fundamentally 
> inconsistent with glibc managing TLS etc. - they are, at least, no worse 
> as a source of APIs than all the old BSD / SVID interfaces we have from 
> when those were used as sources of APIs.
 
I agree. Documentation is important.

-- 
Cheers,
Carlos.

Re: Official Linux system wrapper library?

2018-11-14 Thread Carlos O'Donell

On 11/14/18 6:58 AM, Szabolcs Nagy wrote:
> an actual proposal in the thread that i think is
> worth considering is to make the linux syscall
> design process involve libc devs so the c api is
> designed together with the syscall abi.

Right, I see at least 2 actionable items:

* "The Checklist" which everyone making a syscall should
  follow and we create the checklist with input from both
  sides and it becomes the thing you reference e.g.
  "Did you follow the checklist? Where is X?"

* Programmatic / Machine readable description of syscalls.
  This way the kernel gives users the ability to autogenerate
  all the wrappers *if they want to* in a consistent way that
  matches this syscall description format.

-- 
Cheers,
Carlos.

Re: Official Linux system wrapper library?

2018-11-13 Thread Carlos O'Donell

On 11/12/18 11:43 AM, Joseph Myers wrote:
> On Sun, 11 Nov 2018, Florian Weimer wrote:
> 
>> People may have disappeared from glibc development who have objected to
>> gettid.  I thought this was the case with strlcpy/strlcat, but it was
>> not.
> 
> Well, I know of two main people who were objecting to the notion of adding 
> bindings for all non-obsolescent syscalls, Linux-specific if not suitable 
> for adding to the OS-independent GNU API, and neither seems to have posted 
> in the past year.
> 
>> At present, it takes one semi-active glibc contributor to block addition
>> of a system call.  The process to override a sustained objection has
>> never been used successfully, and it is a lot of work to get it even
>> started.
> 
> We don't have such a process.  (I've suggested, e.g. in conversation with 
> Carlos at the Cauldron, that we should have something involving a 
> supermajority vote of the GNU maintainers for glibc in cases where we're 
> unable to reach a consensus in the community as a whole.)
 
... and I need a good excuse to propose such a process :-)

-- 
Cheers,
Carlos.

Re: Official Linux system wrapper library?

2018-11-11 Thread Carlos O'Donell

On 11/10/18 2:20 PM, Greg KH wrote:
> Also, what about the basic work of making sure our uapi header files can
> actually be used untouched by a libc?  That isn't the case these days as
> the bionic maintainers like to keep reminding me.  That might be a good
> thing to do _before_ trying to add new things like syscall wrappers.
I agree completely. There are many steps in the checklist to writing
a new syscall, heck we should probably have a checklist!

Socially the issue is difficult because the various communities only
marginally share the same network of developers, care about different
features, or the same features with different priorities.

That doesn't mean we shouldn't try to integrate better. As was pointed
out, various people from the userspace and toolchain communities are
going to LPC to do just this.

-- 
Cheers,
Carlos.

Re: Official Linux system wrapper library?

2018-11-11 Thread Carlos O'Donell

On 11/10/18 2:58 PM, Vlastimil Babka wrote:
> On 11/10/18 8:20 PM, Greg KH wrote:
>> On Sat, Nov 10, 2018 at 10:52:06AM -0800, Daniel Colascione wrote:
>>> Now that glibc is basically not adding any new system call wrappers,
>>
>> Why are they not doing that anymore?
> 
> FYI just noticed there's a topic relevant to this in LPC Toolchain MC:
> 
> https://linuxplumbersconf.org/event/2/contributions/149/

Yes, and Adhemerval put it there on purpose to continue the discussion
between glibc developers and kernel developers. Florian Weimer and I have
both provided input to that talk, so if something comes out of the talk
and you want to talk more, please just reach out.

I hope that kernel developers interested in this topic will attend
and discuss the various ways forward on certain interesting topics :-)

-- 
Cheers,
Carlos.

[PATCH 1/2] dt-bindings: iio: pressure: Add support for Honeywell HSC SPI sensors

2018-10-26 Thread Carlos Iglesias

Add device tree bindings for the HSC pressure sensors.

Signed-off-by: Carlos Iglesias 
---
 .../bindings/iio/pressure/hsc_spi.txt | 85 +++
 1 file changed, 85 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/iio/pressure/hsc_spi.txt

diff --git a/Documentation/devicetree/bindings/iio/pressure/hsc_spi.txt 
b/Documentation/devicetree/bindings/iio/pressure/hsc_spi.txt
new file mode 100644
index ..2302d6eef7c7
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/pressure/hsc_spi.txt
@@ -0,0 +1,85 @@
+Honeywell HSC Series of Pressure Sensors
+
+Pressure sensors from Honeywell with analog, I2C and SPI interfaces
+
+Required properties:
+- compatible: selects the sensor model; must be one of the following
+   "honeywell,hsc001baa"
+   "honeywell,hsc001bab"
+   "honeywell,hsc001bac"
+   "honeywell,hsc001baf"
+   "honeywell,hsc1_6baa"
+   "honeywell,hsc1_6bab"
+   "honeywell,hsc1_6bac"
+   "honeywell,hsc1_6baf"
+   "honeywell,hsc2_5baa"
+   "honeywell,hsc2_5bab"
+   "honeywell,hsc2_5bac"
+   "honeywell,hsc2_5baf"
+   "honeywell,hsc004baa"
+   "honeywell,hsc004bab"
+   "honeywell,hsc004bac"
+   "honeywell,hsc004baf"
+   "honeywell,hsc006baa"
+   "honeywell,hsc006bab"
+   "honeywell,hsc006bac"
+   "honeywell,hsc006baf"
+   "honeywell,hsc010baa"
+   "honeywell,hsc010bab"
+   "honeywell,hsc010bac"
+   "honeywell,hsc010baf"
+   "honeywell,hsc100kaa"
+   "honeywell,hsc100kab"
+   "honeywell,hsc100kac"
+   "honeywell,hsc100kaf"
+   "honeywell,hsc160kaa"
+   "honeywell,hsc160kab"
+   "honeywell,hsc160kac"
+   "honeywell,hsc160kaf"
+   "honeywell,hsc250kaa"
+   "honeywell,hsc250kab"
+   "honeywell,hsc250kac"
+   "honeywell,hsc250kaf"
+   "honeywell,hsc400kaa"
+   "honeywell,hsc400kab"
+   "honeywell,hsc400kac"
+   "honeywell,hsc400kaf"
+   "honeywell,hsc600kaa"
+   "honeywell,hsc600kab"
+   "honeywell,hsc600kac"
+   "honeywell,hsc600kaf"
+   "honeywell,hsc001gaa"
+   "honeywell,hsc001gab"
+   "honeywell,hsc001gac"
+   "honeywell,hsc001gaf"
+   "honeywell,hsc015paa"
+   "honeywell,hsc015pab"
+   "honeywell,hsc015pac"
+   "honeywell,hsc015paf"
+   "honeywell,hsc030paa"
+   "honeywell,hsc030pab"
+   "honeywell,hsc030pac"
+   "honeywell,hsc030paf"
+   "honeywell,hsc060paa"
+   "honeywell,hsc060pab"
+   "honeywell,hsc060pac"
+   "honeywell,hsc060paf"
+   "honeywell,hsc100paa"
+   "honeywell,hsc100pab"
+   "honeywell,hsc100pac"
+   "honeywell,hsc100paf"
+   "honeywell,hsc150paa"
+   "honeywell,hsc150pab"
+   "honeywell,hsc150pac"
+   "honeywell,hsc150paf"
+- reg: the SPI chip select number used by the sensor.
+- spi-max-frequency: maximum clock frequency (Hz) used for the SPI bus.
+  The maximum value supported by the sensors is 40.
+
+Example:
+
+   hsc_spi0: hsc@0 {
+   compatible = "honeywell,hsc010baa";
+   reg = <0>;
+   spi-max-frequency = <40>;
+   };
-- 
2.19.1

[PATCH 2/2] iio: pressure: Add support for Honeywell HSC SPI sensors

2018-10-26 Thread Carlos Iglesias

Add device driver for the HSC pressure sensors with SPI interface.
In addition to the main measurement -pressure- these sensors also
provide temperature measurement.

Signed-off-by: Carlos Iglesias 
---
 MAINTAINERS|   6 +
 drivers/iio/pressure/Kconfig   |  10 +
 drivers/iio/pressure/Makefile  |   2 +
 drivers/iio/pressure/hsc_spi.c | 543 +
 4 files changed, 561 insertions(+)
 create mode 100644 drivers/iio/pressure/hsc_spi.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 556f902b3766..a4af4cbd4b0f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6696,6 +6696,12 @@ W:   
http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi
 S: Maintained
 F: fs/hpfs/
 
+HSC SERIES PRESSURE SENSORS
+M; Carlos Iglesias 
+S: Maintained
+F: Documentation/devicetree/bindings/iio/pressure/hsc_spi.txt
+F: drivers/iio/pressure/hsc_spi.c
+
 HSI SUBSYSTEM
 M: Sebastian Reichel 
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/sre/linux-hsi.git
diff --git a/drivers/iio/pressure/Kconfig b/drivers/iio/pressure/Kconfig
index eaa7cfcb4c2a..593663d4ffa8 100644
--- a/drivers/iio/pressure/Kconfig
+++ b/drivers/iio/pressure/Kconfig
@@ -227,4 +227,14 @@ config ZPA2326_SPI
tristate
select REGMAP_SPI
 
+config HSC_SPI
+   tristate "Honeywell HSC pressure sensors (SPI)"
+   depends on SPI_MASTER
+   help
+ Say yes here to build support for the Honeywell HSC range of
+ pressure sensors (SPI interface only).
+
+ To compile this driver as a module, choose M here: the module will
+ be called hsc_spi.
+
 endmenu
diff --git a/drivers/iio/pressure/Makefile b/drivers/iio/pressure/Makefile
index c2058d7b2f93..b1db2116e9f6 100644
--- a/drivers/iio/pressure/Makefile
+++ b/drivers/iio/pressure/Makefile
@@ -31,3 +31,5 @@ obj-$(CONFIG_ZPA2326_SPI) += zpa2326_spi.o
 
 obj-$(CONFIG_IIO_ST_PRESS_I2C) += st_pressure_i2c.o
 obj-$(CONFIG_IIO_ST_PRESS_SPI) += st_pressure_spi.o
+
+obj-$(CONFIG_HSC_SPI) += hsc_spi.o
diff --git a/drivers/iio/pressure/hsc_spi.c b/drivers/iio/pressure/hsc_spi.c
new file mode 100644
index ..efe45a09da8f
--- /dev/null
+++ b/drivers/iio/pressure/hsc_spi.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * hsc_spi.c - Driver for Honeywell HSC pressure sensors with
+ * SPI interface
+ *
+ * Copyright (c) 2018 Carlos Iglesias 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define HSC_MAX_SPI_FREQ_HZ40
+
+#define HSC_TEMP_BITS  11
+#define HSC_PRESS_BITS 14
+#define HSC_TEMP_MASK  (0x7FF)
+#define HSC_TEMP_SHIFT (5)
+
+#define HSC_STATUS_S0  BIT(14)
+#define HSC_STATUS_S1  BIT(15)
+#define HSC_STATUS_MSK ((HSC_STATUS_S0) | (HSC_STATUS_S1))
+#define HSC_STATUS_CMD HSC_STATUS_S0
+#define HSC_STATUS_STALE   HSC_STATUS_S1
+#define HSC_STATUS_DIAG((HSC_STATUS_S0) | (HSC_STATUS_S1))
+
+static inline int hsc_status_error(struct device dev, int val)
+{
+   int st_check = val & HSC_STATUS_MSK;
+
+   if (st_check) {
+   switch (st_check) {
+   case HSC_STATUS_CMD:
+   dev_warn(&dev, "%s:Device in COMMAND MODE\n",
+__func__);
+   return -EIO;
+   case HSC_STATUS_STALE:
+   dev_warn(&dev, "%s:Stale data - sampling too fast?\n",
+__func__);
+   return -EAGAIN;
+   case HSC_STATUS_DIAG:
+   dev_warn(&dev, "%s:Calibration signature changed\n",
+__func__);
+   return -EIO;
+   default:
+   dev_err(&dev, "%s:Invalid status code (%d)\n",
+   __func__, st_check);
+   return -EIO;
+   }
+   }
+
+   return 0;
+}
+
+enum hsc_variant {
+   /* Note: Only the absolute range sensors are supported */
+   HSC001BAA, HSC001BAB, HSC001BAC, HSC001BAF,
+   HSC1_6BAA, HSC1_6BAB, HSC1_6BAC, HSC1_6BAF,
+   HSC2_5BAA, HSC2_5BAB, HSC2_5BAC, HSC2_5BAF,
+   HSC004BAA, HSC004BAB, HSC004BAC, HSC004BAF,
+   HSC006BAA, HSC006BAB, HSC006BAC, HSC006BAF,
+   HSC010BAA, HSC010BAB, HSC010BAC, HSC010BAF,
+
+   HSC100KAA, HSC100KAB, HSC100KAC, HSC100KAF,
+   HSC160KAA, HSC160KAB, HSC160KAC, HSC160KAF,
+   HSC250KAA, HSC250KAB, HSC250KAC, HSC250KAF,
+   HSC400KAA, HSC400KAB, HSC400KAC, HSC400KAF,
+   HSC600KAA, HSC600KAB, HSC600KAC, HSC600KAF,
+   HSC001GAA, HSC001GAB, HSC001GAC, HSC001GAF,
+
+   HSC015PAA, HSC015PAB, HSC015PAC, HSC015PAF,
+   HSC030PAA, HSC030PAB, HSC030PAC, HSC030PAF,
+   HSC060PAA, HSC060PAB, HSC060PAC, HSC060PAF,
+   HSC100PAA, HS

[PATCH 0/2] iio: pressure: Add support for Honeywell HSC SPI sensors

2018-10-26 Thread Carlos Iglesias

Hi all,

The Honeywell HSC is a series of pressure sensors that provide different
pressure ranges and measurement types (absolute, differential, gauge). The
available output types are analog, I2C or SPI (see [1]).

This driver provides support for a limited set of the HSC sensor models,
namely the absolute pressure sensors with SPI interface.

References:
[1] Datasheet 
https://sensing.honeywell.com/honeywell-sensing-trustability-hsc-series-high-accuracy-board-mount-pressure-sensors-50099148-a-en.pdf
[2] SPI Communications with Honeywell Digital Output Pressure Sensors 
https://sensing.honeywell.com/spi-comms-digital-ouptu-pressure-sensors-tn-008202-3-en-final-30may12.pdf

Carlos Iglesias (2):
  dt-bindings: iio: pressure: Add support for Honeywell HSC SPI sensors
  iio: pressure: Add support for Honeywell HSC SPI sensors

 .../bindings/iio/pressure/hsc_spi.txt |  85 +++
 MAINTAINERS   |   6 +
 drivers/iio/pressure/Kconfig  |  10 +
 drivers/iio/pressure/Makefile |   2 +
 drivers/iio/pressure/hsc_spi.c| 543 ++
 5 files changed, 646 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/iio/pressure/hsc_spi.txt
 create mode 100644 drivers/iio/pressure/hsc_spi.c

-- 
2.19.1

Request for Partnership-

2018-08-17 Thread Juan Carlos

Sir/Madam,
 
I have access to very vital information that can be used to move huge amount of 
money. I have done my homework very well and i have the machineries in place to 
get it done since I am still in active service. If it was possible for me to do 
it alone I would not have bothered contacting you. Ultimately I need you to 
play an important role in the completion of this business transaction.
 
Reply if you are willing to do the business.
 
Regards,
Juan Carlos

[no subject]

2018-08-07 Thread Joao Carlos Camargo de Oliveira

Re: [PATCH v2 0/4] vfs: track per-sb writeback errors and report them via fsinfo()

2018-07-12 Thread Carlos Maiolino

On Tue, Jul 10, 2018 at 10:01:23AM -0400, Jeff Layton wrote:
> v2: drop buffer.c patch to record wb errors when underlying blockdev
> flush fails. We may eventually want that, but at this point I don't have
> a clear way to test it to determine its efficacy.
> 
> At LSF/MM this year, the PostgreSQL developers mentioned that they'd
> like to have some mechanism to check whether there have been any
> writeback errors on a filesystem, without necessarily flushing any of
> the cached data first.
> 
> Given that we have a new fsinfo syscall being introduced, we may as well
> use it to report writeback errors on a per superblock basis. This allows
> us to provide the info that the PostgreSQL developers wanted, without
> needing to change an existing interface.
> 
> This seems to do the right thing when tested by hand, but I don't yet
> have an xfstest for it, since the syscall is still quite new. Once that
> goes in and we get fsinfo support in xfs_io, it should be rather
> trivial to roll a testcase for this.
> 

Whole patch sounds fine, you can add:

Reviewed-by: Carlos Maiolino 

Cheers

> Al, if this looks ok, could you pull this into the branch where you
> have David's fsinfo patches queued up?
> 
> Thanks,
> Jeff
> 
> Jeff Layton (4):
>   vfs: track per-sb writeback errors
>   errseq: add a new errseq_scrape function
>   vfs: allow fsinfo to fetch the current state of s_wb_err
>   samples: extend test-fsinfo to access error_state
> 
>  fs/statfs.c |  9 +
>  include/linux/errseq.h  |  1 +
>  include/linux/fs.h  |  3 +++
>  include/linux/pagemap.h |  5 -
>  include/uapi/linux/fsinfo.h | 11 +++
>  lib/errseq.c| 33 +++--
>  samples/statx/test-fsinfo.c | 13 +
>  7 files changed, 72 insertions(+), 3 deletions(-)
> 
> -- 
> 2.17.1
> 

-- 
Carlos

Re: [PATCH] vfs: remove iterate_supers_type

2018-05-24 Thread Carlos Maiolino

On Thu, May 17, 2018 at 11:47:37AM -0400, Jeff Layton wrote:
> From: Jeff Layton 
> 
> Nothing calls this.
> 
> Signed-off-by: Jeff Layton 

looks like there are no reviews here yet, and it looks good to me, so, feel free
to add.

Reviewed-by: Carlos Maiolino 

> ---
>  fs/super.c | 36 
>  include/linux/fs.h |  2 --
>  2 files changed, 38 deletions(-)
> 
> diff --git a/fs/super.c b/fs/super.c
> index 30b7490bd049..456276033e59 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -631,42 +631,6 @@ void iterate_supers(void (*f)(struct super_block *, 
> int), int arg)
>   spin_unlock(&sb_lock);
>  }
>  
> -/**
> - *   iterate_supers_type - call function for superblocks of given type
> - *   @type: fs type
> - *   @f: function to call
> - *   @arg: argument to pass to it
> - *
> - *   Scans the superblock list and calls given function, passing it
> - *   locked superblock and given argument.
> - */
> -void iterate_supers_type(struct file_system_type *type,
> - void (*f)(struct super_block *, void *), void *arg)
> -{
> - struct super_block *sb, *p = NULL;
> -
> - spin_lock(&sb_lock);
> - hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
> - sb->s_count++;
> - spin_unlock(&sb_lock);
> -
> - down_read(&sb->s_umount);
> - if (sb->s_root && (sb->s_flags & SB_BORN))
> - f(sb, arg);
> - up_read(&sb->s_umount);
> -
> - spin_lock(&sb_lock);
> - if (p)
> - __put_super(p);
> - p = sb;
> - }
> - if (p)
> - __put_super(p);
> - spin_unlock(&sb_lock);
> -}
> -
> -EXPORT_SYMBOL(iterate_supers_type);
> -
>  static struct super_block *__get_super(struct block_device *bdev, bool excl)
>  {
>   struct super_block *sb;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 7107d291d853..c4761eba3b44 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3116,8 +3116,6 @@ extern struct super_block *get_active_super(struct 
> block_device *bdev);
>  extern void drop_super(struct super_block *sb);
>  extern void drop_super_exclusive(struct super_block *sb);
>  extern void iterate_supers(void (*)(struct super_block *, int), int);
> -extern void iterate_supers_type(struct file_system_type *,
> - void (*)(struct super_block *, void *), void *);
>  
>  extern int dcache_dir_open(struct inode *, struct file *);
>  extern int dcache_dir_close(struct inode *, struct file *);
> -- 
> 2.17.0
> 

-- 
Carlos

1 2 3 4 >

1 - 100 of 343 matches

Mail list logo