Re: [PATCH net-next v3 1/3] vsock/virtio: use skb_frag_*() helpers

2024-01-02 Thread Stefano Garzarella

On Wed, Dec 20, 2023 at 01:45:00PM -0800, Mina Almasry wrote:

Minor fix for virtio: code wanting to access the fields inside an skb
frag should use the skb_frag_*() helpers, instead of accessing the
fields directly. This allows for extensions where the underlying
memory is not a page.

Signed-off-by: Mina Almasry 

---

v2:

- Also fix skb_frag_off() + skb_frag_size() (David)
- Did not apply the reviewed-by from Stefano since the patch changed
relatively much.


Sorry for the delay, I was off.

LGTM!

Acked-by: Stefano Garzarella 

Possibly we can also send this patch alone if the series is still under
discussion because it's definitely an improvement to the current code.

Thanks,
Stefano



---
net/vmw_vsock/virtio_transport.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index f495b9e5186b..1748268e0694 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -153,10 +153,10 @@ virtio_transport_send_pkt_work(struct work_struct *work)
 * 'virt_to_phys()' later to fill the buffer 
descriptor.
 * We don't touch memory at "virtual" address 
of this page.
 */
-   va = page_to_virt(skb_frag->bv_page);
+   va = page_to_virt(skb_frag_page(skb_frag));
sg_init_one(sgs[out_sg],
-   va + skb_frag->bv_offset,
-   skb_frag->bv_len);
+   va + skb_frag_off(skb_frag),
+   skb_frag_size(skb_frag));
out_sg++;
}
}
--
2.43.0.472.g3155946c3a-goog






Re: [RFC PATCH v1] vsock/test: add '--peer-port' input argument

2024-01-15 Thread Stefano Garzarella

Hi Arseniy,
thanks for this patch!

On Sat, Jan 13, 2024 at 12:21:10AM +0300, Arseniy Krasnov wrote:

Implement the port for a given CID as an input argument instead of using
the hardcoded value '1234'. This allows running different test instances
on a single CID. The port argument is not a required parameter; if it is
not set, the default value will be '1234' - thus we preserve the previous
behaviour.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/util.c| 17 +++-
tools/testing/vsock/util.h|  4 +
tools/testing/vsock/vsock_diag_test.c | 18 -
tools/testing/vsock/vsock_test.c  | 96 +--
tools/testing/vsock/vsock_test_zerocopy.c | 12 +--
tools/testing/vsock/vsock_uring_test.c| 16 +++-
6 files changed, 107 insertions(+), 56 deletions(-)

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index ae2b33c21c45..554b290fefdc 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -33,8 +33,7 @@ void init_signals(void)
signal(SIGPIPE, SIG_IGN);
}

-/* Parse a CID in string representation */
-unsigned int parse_cid(const char *str)
+static unsigned int parse_uint(const char *str, const char *err_str)
{
char *endptr = NULL;
unsigned long n;
@@ -42,12 +41,24 @@ unsigned int parse_cid(const char *str)
errno = 0;
n = strtoul(str, &endptr, 10);
if (errno || *endptr != '\0') {
-   fprintf(stderr, "malformed CID \"%s\"\n", str);
+   fprintf(stderr, "malformed %s \"%s\"\n", err_str, str);
exit(EXIT_FAILURE);
}
return n;
}

+/* Parse a CID in string representation */
+unsigned int parse_cid(const char *str)
+{
+   return parse_uint(str, "CID");
+}
+
+/* Parse a port in string representation */
+unsigned int parse_port(const char *str)
+{
+   return parse_uint(str, "port");
+}
+
/* Wait for the remote to close the connection */
void vsock_wait_remote_close(int fd)
{
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index 03c88d0cb861..e95e62485959 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -12,10 +12,13 @@ enum test_mode {
TEST_MODE_SERVER
};

+#define DEFAULT_PEER_PORT  1234
+
/* Test runner options */
struct test_opts {
enum test_mode mode;
unsigned int peer_cid;
+   unsigned int peer_port;
};

/* A test case definition.  Test functions must print failures to stderr and
@@ -35,6 +38,7 @@ struct test_case {

void init_signals(void);
unsigned int parse_cid(const char *str);
+unsigned int parse_port(const char *str);
int vsock_stream_connect(unsigned int cid, unsigned int port);
int vsock_bind_connect(unsigned int cid, unsigned int port,
   unsigned int bind_port, int type);
diff --git a/tools/testing/vsock/vsock_diag_test.c 
b/tools/testing/vsock/vsock_diag_test.c
index fa927ad16f8a..5e6049226b77 100644
--- a/tools/testing/vsock/vsock_diag_test.c
+++ b/tools/testing/vsock/vsock_diag_test.c
@@ -342,7 +342,7 @@ static void test_listen_socket_server(const struct 
test_opts *opts)
} addr = {
.svm = {
.svm_family = AF_VSOCK,
-   .svm_port = 1234,
+   .svm_port = opts->peer_port,
.svm_cid = VMADDR_CID_ANY,
},
};
@@ -378,7 +378,7 @@ static void test_connect_client(const struct test_opts 
*opts)
LIST_HEAD(sockets);
struct vsock_stat *st;

-   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
@@ -403,7 +403,7 @@ static void test_connect_server(const struct test_opts 
*opts)
LIST_HEAD(sockets);
int client_fd;

-   client_fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   client_fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
if (client_fd < 0) {
perror("accept");
exit(EXIT_FAILURE);
@@ -461,6 +461,11 @@ static const struct option longopts[] = {
.has_arg = required_argument,
.val = 'p',
},
+   {
+   .name = "peer-port",
+   .has_arg = required_argument,
+   .val = 'q',
+   },
{
.name = "list",
.has_arg = no_argument,
@@ -481,7 +486,7 @@ static const struct option longopts[] = {

static void usage(void)
{
-   fprintf(stderr, "Usage: vsock_diag_test [--help] [--control-host=] 
--control-port= --mode=client|server --peer-cid= [--list] [--skip=]\n"
+   fprintf(stderr, "Usage: vsock_diag_test [--help] [--control-host=] --control-port= 
--mode=client|server --peer-cid= [--peer-port=] [--list] [--skip=]\n"
"\n"
"  Server: vsock_diag_test --control-port=1234 --mode=server 
--peer

Re: [PATCH V1] vdpa_sim: reset must not run

2024-01-22 Thread Stefano Garzarella

On Wed, Jan 17, 2024 at 11:23:23AM -0800, Steve Sistare wrote:

vdpasim_do_reset sets running to true, which is wrong, as it allows
vdpasim_kick_vq to post work requests before the device has been
configured.  To fix, do not set running until VIRTIO_CONFIG_S_FEATURES_OK
is set.

Fixes: 0c89e2a3a9d0 ("vdpa_sim: Implement suspend vdpa op")
Signed-off-by: Steve Sistare 
Reviewed-by: Eugenio Pérez 
---
drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index be2925d0d283..6304cb0b4770 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -160,7 +160,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim, u32 
flags)
}
}

-   vdpasim->running = true;
+   vdpasim->running = false;
spin_unlock(&vdpasim->iommu_lock);

vdpasim->features = 0;
@@ -483,6 +483,7 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 
status)

mutex_lock(&vdpasim->mutex);
vdpasim->status = status;
+   vdpasim->running = (status & VIRTIO_CONFIG_S_FEATURES_OK) != 0;
mutex_unlock(&vdpasim->mutex);


Should we do something similar also in vdpasim_resume() ?

I mean something like this:

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index be2925d0d283..55e4633d5442 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -520,7 +520,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa)
int i;

mutex_lock(&vdpasim->mutex);
-   vdpasim->running = true;
+   vdpasim->running = (vdpasim->status & VIRTIO_CONFIG_S_FEATURES_OK) != 0;

if (vdpasim->pending_kick) {
/* Process pending descriptors */

Thanks,
Stefano




Re: Re: [PATCH V1] vdpa_sim: reset must not run

2024-01-22 Thread Stefano Garzarella

On Mon, Jan 22, 2024 at 11:47:22AM +0100, Eugenio Perez Martin wrote:

On Mon, Jan 22, 2024 at 11:22 AM Stefano Garzarella  wrote:


On Wed, Jan 17, 2024 at 11:23:23AM -0800, Steve Sistare wrote:
>vdpasim_do_reset sets running to true, which is wrong, as it allows
>vdpasim_kick_vq to post work requests before the device has been
>configured.  To fix, do not set running until VIRTIO_CONFIG_S_FEATURES_OK
>is set.
>
>Fixes: 0c89e2a3a9d0 ("vdpa_sim: Implement suspend vdpa op")
>Signed-off-by: Steve Sistare 
>Reviewed-by: Eugenio Pérez 
>---
> drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim.c
>index be2925d0d283..6304cb0b4770 100644
>--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
>+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
>@@ -160,7 +160,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim, u32 
flags)
>   }
>   }
>
>-  vdpasim->running = true;
>+  vdpasim->running = false;
>   spin_unlock(&vdpasim->iommu_lock);
>
>   vdpasim->features = 0;
>@@ -483,6 +483,7 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, 
u8 status)
>
>   mutex_lock(&vdpasim->mutex);
>   vdpasim->status = status;
>+  vdpasim->running = (status & VIRTIO_CONFIG_S_FEATURES_OK) != 0;
>   mutex_unlock(&vdpasim->mutex);

Should we do something similar also in vdpasim_resume() ?

I mean something like this:

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index be2925d0d283..55e4633d5442 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -520,7 +520,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa)
 int i;

 mutex_lock(&vdpasim->mutex);
-   vdpasim->running = true;
+   vdpasim->running = (vdpasim->status & VIRTIO_CONFIG_S_FEATURES_OK) != 0;

 if (vdpasim->pending_kick) {
 /* Process pending descriptors */

Thanks,
Stefano



The suspend and resume operation should not be called before
DRIVER_OK, so maybe we should add that protection at
drivers/vhost/vdpa.c actually?


Yeah, I think so!

Anyway, IMHO we should at least return an error in vdpa_sim if 
vdpasim_suspend/resume are called before DRIVER_OK (in another patch of 
course).


Stefano




Re: [PATCH net-next v2] vsock/test: add '--peer-port' input argument

2024-01-23 Thread Stefano Garzarella

On Tue, Jan 23, 2024 at 10:27:50AM +0300, Arseniy Krasnov wrote:

Implement the port for a given CID as an input argument instead of using
the hardcoded value '1234'. This allows running different test instances
on a single CID. The port argument is not a required parameter; if it is
not set, the default value will be '1234' - thus we preserve the previous
behaviour.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Reword usage message.
 * Add commas after last field in 'opts' declaration.
 * 'RFC' -> 'net-next'.


Thanks for the changes, LGTM!

Reviewed-by: Stefano Garzarella 



tools/testing/vsock/util.c|  17 +++-
tools/testing/vsock/util.h|   4 +
tools/testing/vsock/vsock_diag_test.c |  21 +++--
tools/testing/vsock/vsock_test.c  | 102 +-
tools/testing/vsock/vsock_test_zerocopy.c |  12 +--
tools/testing/vsock/vsock_uring_test.c|  17 +++-
6 files changed, 115 insertions(+), 58 deletions(-)

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index ae2b33c21c45..554b290fefdc 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -33,8 +33,7 @@ void init_signals(void)
signal(SIGPIPE, SIG_IGN);
}

-/* Parse a CID in string representation */
-unsigned int parse_cid(const char *str)
+static unsigned int parse_uint(const char *str, const char *err_str)
{
char *endptr = NULL;
unsigned long n;
@@ -42,12 +41,24 @@ unsigned int parse_cid(const char *str)
errno = 0;
n = strtoul(str, &endptr, 10);
if (errno || *endptr != '\0') {
-   fprintf(stderr, "malformed CID \"%s\"\n", str);
+   fprintf(stderr, "malformed %s \"%s\"\n", err_str, str);
exit(EXIT_FAILURE);
}
return n;
}

+/* Parse a CID in string representation */
+unsigned int parse_cid(const char *str)
+{
+   return parse_uint(str, "CID");
+}
+
+/* Parse a port in string representation */
+unsigned int parse_port(const char *str)
+{
+   return parse_uint(str, "port");
+}
+
/* Wait for the remote to close the connection */
void vsock_wait_remote_close(int fd)
{
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index 03c88d0cb861..e95e62485959 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -12,10 +12,13 @@ enum test_mode {
TEST_MODE_SERVER
};

+#define DEFAULT_PEER_PORT  1234
+
/* Test runner options */
struct test_opts {
enum test_mode mode;
unsigned int peer_cid;
+   unsigned int peer_port;
};

/* A test case definition.  Test functions must print failures to stderr and
@@ -35,6 +38,7 @@ struct test_case {

void init_signals(void);
unsigned int parse_cid(const char *str);
+unsigned int parse_port(const char *str);
int vsock_stream_connect(unsigned int cid, unsigned int port);
int vsock_bind_connect(unsigned int cid, unsigned int port,
   unsigned int bind_port, int type);
diff --git a/tools/testing/vsock/vsock_diag_test.c 
b/tools/testing/vsock/vsock_diag_test.c
index fa927ad16f8a..9d61b1f1c4c3 100644
--- a/tools/testing/vsock/vsock_diag_test.c
+++ b/tools/testing/vsock/vsock_diag_test.c
@@ -342,7 +342,7 @@ static void test_listen_socket_server(const struct 
test_opts *opts)
} addr = {
.svm = {
.svm_family = AF_VSOCK,
-   .svm_port = 1234,
+   .svm_port = opts->peer_port,
.svm_cid = VMADDR_CID_ANY,
},
};
@@ -378,7 +378,7 @@ static void test_connect_client(const struct test_opts 
*opts)
LIST_HEAD(sockets);
struct vsock_stat *st;

-   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
@@ -403,7 +403,7 @@ static void test_connect_server(const struct test_opts 
*opts)
LIST_HEAD(sockets);
int client_fd;

-   client_fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   client_fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
if (client_fd < 0) {
perror("accept");
exit(EXIT_FAILURE);
@@ -461,6 +461,11 @@ static const struct option longopts[] = {
.has_arg = required_argument,
.val = 'p',
},
+   {
+   .name = "peer-port",
+   .has_arg = required_argument,
+   .val = 'q',
+   },
{
.name = "list",
.has_arg = no_argument,
@@ -481,7 +486,7 @@ static const struct option longopts[] = {

static void usage(void)
{
-   fprintf(stderr, "Usag

Re: [PATCH net-next v1] vsock/test: print type for SOCK_SEQPACKET

2024-01-25 Thread Stefano Garzarella

On Wed, Jan 24, 2024 at 10:32:55PM +0300, Arseniy Krasnov wrote:

SOCK_SEQPACKET is supported for virtio transport, so do not interpret
such type of socket as unknown.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_diag_test.c | 2 ++
1 file changed, 2 insertions(+)


Yeah, LGTM!

Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/vsock_diag_test.c 
b/tools/testing/vsock/vsock_diag_test.c
index 5e6049226b77..17aeba7cbd14 100644
--- a/tools/testing/vsock/vsock_diag_test.c
+++ b/tools/testing/vsock/vsock_diag_test.c
@@ -39,6 +39,8 @@ static const char *sock_type_str(int type)
return "DGRAM";
case SOCK_STREAM:
return "STREAM";
+   case SOCK_SEQPACKET:
+   return "SEQPACKET";
default:
return "INVALID TYPE";
}
--
2.25.1






[PATCH] vhost-vdpa: fail enabling virtqueue in certain conditions

2024-02-06 Thread Stefano Garzarella
If VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is not negotiated, we expect
the driver to enable virtqueue before setting DRIVER_OK. If the driver
tries anyway, better to fail right away as soon as we get the ioctl.
Let's also update the documentation to make it clearer.

We had a problem in QEMU for not meeting this requirement, see
https://lore.kernel.org/qemu-devel/20240202132521.32714-1-kw...@redhat.com/

Fixes: 9f09fd6171fe ("vdpa: accept VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK 
backend feature")
Cc: epere...@redhat.com
Signed-off-by: Stefano Garzarella 
---
 include/uapi/linux/vhost_types.h | 3 ++-
 drivers/vhost/vdpa.c | 4 
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index d7656908f730..5df49b6021a7 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -182,7 +182,8 @@ struct vhost_vdpa_iova_range {
 /* Device can be resumed */
 #define VHOST_BACKEND_F_RESUME  0x5
 /* Device supports the driver enabling virtqueues both before and after
- * DRIVER_OK
+ * DRIVER_OK. If this feature is not negotiated, the virtqueues must be
+ * enabled before setting DRIVER_OK.
  */
 #define VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK  0x6
 /* Device may expose the virtqueue's descriptor area, driver area and
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index bc4a51e4638b..1fba305ba8c1 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -651,6 +651,10 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, 
unsigned int cmd,
case VHOST_VDPA_SET_VRING_ENABLE:
if (copy_from_user(&s, argp, sizeof(s)))
return -EFAULT;
+   if (!vhost_backend_has_feature(vq,
+   VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK) &&
+   (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
+   return -EINVAL;
ops->set_vq_ready(vdpa, idx, s.num);
return 0;
case VHOST_VDPA_GET_VRING_GROUP:
-- 
2.43.0




Re: Re: [PATCH] vhost-vdpa: fail enabling virtqueue in certain conditions

2024-02-06 Thread Stefano Garzarella

On Tue, Feb 06, 2024 at 10:56:50AM -0500, Michael S. Tsirkin wrote:

better @subj: try late vq enable only if negotiated


I rewrote it 3/4 times, and before sending it I was not happy with the 
result.


Thank you, much better! I'll change it in v2.

Stefano




Re: Re: [PATCH] vhost-vdpa: fail enabling virtqueue in certain conditions

2024-02-07 Thread Stefano Garzarella

On Wed, Feb 07, 2024 at 11:27:14AM +0800, Jason Wang wrote:

On Tue, Feb 6, 2024 at 10:52 PM Stefano Garzarella  wrote:


If VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is not negotiated, we expect
the driver to enable virtqueue before setting DRIVER_OK. If the driver
tries anyway, better to fail right away as soon as we get the ioctl.
Let's also update the documentation to make it clearer.

We had a problem in QEMU for not meeting this requirement, see
https://lore.kernel.org/qemu-devel/20240202132521.32714-1-kw...@redhat.com/


Maybe it's better to only enable cvq when the backend supports
VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK. Eugenio, any comment on this?



Fixes: 9f09fd6171fe ("vdpa: accept VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK backend 
feature")
Cc: epere...@redhat.com
Signed-off-by: Stefano Garzarella 
---
 include/uapi/linux/vhost_types.h | 3 ++-
 drivers/vhost/vdpa.c | 4 
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index d7656908f730..5df49b6021a7 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -182,7 +182,8 @@ struct vhost_vdpa_iova_range {
 /* Device can be resumed */
 #define VHOST_BACKEND_F_RESUME  0x5
 /* Device supports the driver enabling virtqueues both before and after
- * DRIVER_OK
+ * DRIVER_OK. If this feature is not negotiated, the virtqueues must be
+ * enabled before setting DRIVER_OK.
  */
 #define VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK  0x6
 /* Device may expose the virtqueue's descriptor area, driver area and
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index bc4a51e4638b..1fba305ba8c1 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -651,6 +651,10 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, 
unsigned int cmd,
case VHOST_VDPA_SET_VRING_ENABLE:
if (copy_from_user(&s, argp, sizeof(s)))
return -EFAULT;
+   if (!vhost_backend_has_feature(vq,
+   VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK) &&
+   (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
+   return -EINVAL;


As discussed, without VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK, we don't
know if parents can do vq_ready after driver_ok.

So maybe we need to keep this behaviour to unbreak some "legacy" userspace?


I'm not sure it's a good idea, since "legacy" userspace is currently
broken if used with a VDUSE device. So we need to fix userspace in any
case, and IMHO it is better if we start to return an error, so the user
understands what went wrong: with the problem in QEMU, it took us quite
some time to figure out that we couldn't enable a vq after DRIVER_OK.


Since userspace is unable to understand if a vhost-vdpa device is VDUSE
or not, I think we have only 2 options: either merge this patch or fix
VDUSE somehow. But I think the latter is more complicated/intrusive.


Thanks,
Stefano



For example ifcvf did:

static void ifcvf_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev,
   u16 qid, bool ready)
{
 struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);

   ifcvf_set_vq_ready(vf, qid, ready);
}

And it did:

void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready)
{
   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;

   vp_iowrite16(qid, &cfg->queue_select);
   vp_iowrite16(ready, &cfg->queue_enable);
}

Though it didn't advertise VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK?

Adding LingShan for more thought.

Thanks


ops->set_vq_ready(vdpa, idx, s.num);
return 0;
case VHOST_VDPA_GET_VRING_GROUP:
--
2.43.0








Re: [PATCH net-next 1/2] net/vsockmon: Leverage core stats allocator

2024-02-26 Thread Stefano Garzarella

On Fri, Feb 23, 2024 at 03:58:37AM -0800, Breno Leitao wrote:

With commit 34d21de99cea9 ("net: Move {l,t,d}stats allocation to core and
convert veth & vrf"), stats allocation could be done on net core
instead of this driver.

With this new approach, the driver doesn't have to bother with error
handling (allocation failure checking, making sure free happens in the
right spot, etc). This is core responsibility now.

Remove the allocation in the vsockmon driver and leverage the network
core allocation instead.

Signed-off-by: Breno Leitao 
---
drivers/net/vsockmon.c | 16 +---
1 file changed, 1 insertion(+), 15 deletions(-)


Thanks for this patch!

Reviewed-by: Stefano Garzarella 



diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c
index b1bb1b04b664..a0b4dca36baf 100644
--- a/drivers/net/vsockmon.c
+++ b/drivers/net/vsockmon.c
@@ -13,19 +13,6 @@
#define DEFAULT_MTU (VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + \
 sizeof(struct af_vsockmon_hdr))

-static int vsockmon_dev_init(struct net_device *dev)
-{
-   dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
-   if (!dev->lstats)
-   return -ENOMEM;
-   return 0;
-}
-
-static void vsockmon_dev_uninit(struct net_device *dev)
-{
-   free_percpu(dev->lstats);
-}
-
struct vsockmon {
struct vsock_tap vt;
};
@@ -79,8 +66,6 @@ static int vsockmon_change_mtu(struct net_device *dev, int 
new_mtu)
}

static const struct net_device_ops vsockmon_ops = {
-   .ndo_init = vsockmon_dev_init,
-   .ndo_uninit = vsockmon_dev_uninit,
.ndo_open = vsockmon_open,
.ndo_stop = vsockmon_close,
.ndo_start_xmit = vsockmon_xmit,
@@ -112,6 +97,7 @@ static void vsockmon_setup(struct net_device *dev)
dev->flags = IFF_NOARP;

dev->mtu = DEFAULT_MTU;
+   dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS;
}

static struct rtnl_link_ops vsockmon_link_ops __read_mostly = {
--
2.39.3






Re: [PATCH net-next 2/2] net/vsockmon: Do not set zeroed statistics

2024-02-26 Thread Stefano Garzarella

On Fri, Feb 23, 2024 at 03:58:38AM -0800, Breno Leitao wrote:

Do not set rtnl_link_stats64 fields to zero, since they are zeroed
before ops->ndo_get_stats64 is called in core dev_get_stats() function.

Signed-off-by: Breno Leitao 
---
drivers/net/vsockmon.c | 3 ---
1 file changed, 3 deletions(-)


Reviewed-by: Stefano Garzarella 



diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c
index a0b4dca36baf..a1ba5169ed5d 100644
--- a/drivers/net/vsockmon.c
+++ b/drivers/net/vsockmon.c
@@ -46,9 +46,6 @@ static void
vsockmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes);
-
-   stats->tx_packets = 0;
-   stats->tx_bytes = 0;
}

static int vsockmon_is_valid_mtu(int new_mtu)
--
2.39.3






Re: [PATCH v3] vhost/vdpa: Add MSI translation tables to iommu for software-managed MSI

2024-03-20 Thread Stefano Garzarella

On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote:

From: Rong Wang 

Once an iommu domain is enabled for a device, the MSI
translation tables have to be there for software-managed MSI.
Otherwise, a platform with software-managed MSI and without an
irq bypass function cannot get a correct memory write event
from PCIe, and will not get irqs.
The solution is to obtain the MSI physical base address from the
iommu reserved region, and set it in the iommu MSI cookie;
the translation tables will then be created when requesting the irq.

Change log
--

v1->v2:
- add resv iotlb to avoid overlap mapping.
v2->v3:
- there is no need to export the iommu symbol anymore.

Signed-off-by: Rong Wang 
---
drivers/vhost/vdpa.c | 59 +---
1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ba52d128aeb7..28b56b10372b 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -49,6 +49,7 @@ struct vhost_vdpa {
struct completion completion;
struct vdpa_device *vdpa;
struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS];
+   struct vhost_iotlb resv_iotlb;
struct device dev;
struct cdev cdev;
atomic_t opened;
@@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v)
static int vhost_vdpa_reset(struct vhost_vdpa *v)
{
v->in_batch = 0;
+   vhost_iotlb_reset(&v->resv_iotlb);
return _compat_vdpa_reset(v);
}

@@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct 
vhost_vdpa *v,
msg->iova + msg->size - 1 > v->range.last)
return -EINVAL;

+   if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova,
+   msg->iova + msg->size - 1))
+   return -EINVAL;
+
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;

+


Unnecessary new line here.


if (vdpa->use_va)
return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size,
 msg->uaddr, msg->perm);
@@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb 
*iocb,
return vhost_chr_write_iter(dev, from);
}

+static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct 
device *dma_dev,
+   struct vhost_iotlb *resv_iotlb)
+{
+   struct list_head dev_resv_regions;
+   phys_addr_t resv_msi_base = 0;
+   struct iommu_resv_region *region;
+   int ret = 0;
+   bool with_sw_msi = false;
+   bool with_hw_msi = false;
+
+   INIT_LIST_HEAD(&dev_resv_regions);
+   iommu_get_resv_regions(dma_dev, &dev_resv_regions);
+
+   list_for_each_entry(region, &dev_resv_regions, list) {
+   ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start,
+   region->start + region->length - 1,
+   0, 0, NULL);
+   if (ret) {
+   vhost_iotlb_reset(resv_iotlb);
+   break;
+   }
+
+   if (region->type == IOMMU_RESV_MSI)
+   with_hw_msi = true;
+
+   if (region->type == IOMMU_RESV_SW_MSI) {
+   resv_msi_base = region->start;


Can it happen that there are multiple regions of the IOMMU_RESV_SW_MSI 
type?


In this case, is it correct to overwrite `resv_msi_base`?


+   with_sw_msi = true;
+   }
+   }
+
+   if (!ret && !with_hw_msi && with_sw_msi)
+   ret = iommu_get_msi_cookie(domain, resv_msi_base);


If `iommu_get_msi_cookie()` fails:
 - Should we avoid calling iommu_put_resv_regions()?
 - Should we also call `vhost_iotlb_reset(resv_iotlb)` like for the
   vhost_iotlb_add_range_ctx() failure ?

If that is the case, maybe it's better to add an error label where we do
the cleanup.



+
+   iommu_put_resv_regions(dma_dev, &dev_resv_regions);
+
+   return ret;
+}
+
static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
{
struct vdpa_device *vdpa = v->vdpa;
@@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)

ret = iommu_attach_device(v->domain, dma_dev);
if (ret)
-   goto err_attach;
+   goto err_alloc_domain;

-   return 0;
+   ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb);
+   if (ret)
+   goto err_attach_device;

-err_attach:
+   return 0;


I suggest adding a new line here to separate the error path from the
success path.



+err_attach_device:
+   iommu_detach_device(v->domain, dma_dev);
+err_alloc_domain:
iommu_domain_free(v->domain);
v->domain = NULL;
return ret;
@@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
goto err;
}

+   vhost_iotlb_init(&v->resv_iotlb, 0, 0);
+


IIUC t

Re: [PATCH] vsock/virtio: fix packet delivery to tap device

2024-03-25 Thread Stefano Garzarella

On Mon, Mar 25, 2024 at 06:12:38PM +0100, Marco Pinna wrote:

Commit 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") added
virtio_transport_deliver_tap_pkt() for handing packets to the
vsockmon device. However, in virtio_transport_send_pkt_work(),
the function is called before actually sending the packet (i.e.
before placing it in the virtqueue with virtqueue_add_sgs() and checking
whether it returned successfully).


From here..


This may cause timing issues since
the sending of the packet may fail, causing it to be re-queued
(possibly multiple times), while the tap device would show the
packet being sent correctly.


to here...

This is a bit unclear; I would rephrase it with something like this:

Queuing the packet in the virtqueue can fail even multiple times.
However, in virtio_transport_deliver_tap_pkt() we deliver the packet
to the monitoring tap interface only the first time we call it.
This certainly avoids seeing the same packet replicated multiple
times in the monitoring interface, but it can show the packet
sent with the wrong timestamp or even before we succeed to queue
it in the virtqueue.



Move virtio_transport_deliver_tap_pkt() after calling virtqueue_add_sgs()
and making sure it returned successfully.

Fixes: 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks")
Signed-off-by: Marco Pinna 
---
net/vmw_vsock/virtio_transport.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 1748268e0694..ee5d306a96d0 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -120,7 +120,6 @@ virtio_transport_send_pkt_work(struct work_struct *work)
if (!skb)
break;

-   virtio_transport_deliver_tap_pkt(skb);
reply = virtio_vsock_skb_reply(skb);
sgs = vsock->out_sgs;
sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
@@ -170,6 +169,8 @@ virtio_transport_send_pkt_work(struct work_struct *work)
break;
}

+   virtio_transport_deliver_tap_pkt(skb);
+


I was just worried that consume_skb(), called in
virtio_transport_tx_work() when the host sends an interrupt to the guest
after it has consumed the packet, might be called before this point,
but both run with `vsock->tx_lock` held, so we are protected from
this case.

So, the patch LGTM, I would just clarify the commit message.

Thanks,
Stefano


if (reply) {
struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
int val;
--
2.44.0






Re: [PATCH v3 1/3] vhost: Add smp_rmb() in vhost_vq_avail_empty()

2024-03-28 Thread Stefano Garzarella

On Thu, Mar 28, 2024 at 10:21:47AM +1000, Gavin Shan wrote:

A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by
Will. Otherwise, it's not ensured the available ring entries pushed
by guest can be observed by vhost in time, leading to stale available
ring entries fetched by vhost in vhost_get_vq_desc(), as reported by
Yihuang Yu on NVidia's grace-hopper (ARM64) platform.

 /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
 -accel kvm -machine virt,gic-version=host -cpu host  \
 -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
 -m 4096M,slots=16,maxmem=64G \
 -object memory-backend-ram,id=mem0,size=4096M\
  :   \
 -netdev tap,id=vnet0,vhost=true  \
 -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
  :
 guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
 virtio_net virtio0: output.0:id 100 is not a head!

Add the missed smp_rmb() in vhost_vq_avail_empty(). When tx_can_batch()
returns true, it means there's still pending tx buffers. Since it might
read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc().
Note that it should be safe until vq->avail_idx is changed by commit
275bf960ac697 ("vhost: better detection of available buffers").

Fixes: 275bf960ac69 ("vhost: better detection of available buffers")
Cc:  # v4.11+
Reported-by: Yihuang Yu 
Suggested-by: Will Deacon 
Signed-off-by: Gavin Shan 
Acked-by: Jason Wang 
---
drivers/vhost/vhost.c | 12 +++-
1 file changed, 11 insertions(+), 1 deletion(-)


Reviewed-by: Stefano Garzarella 



diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..29df65b2ebf2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2799,9 +2799,19 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
r = vhost_get_avail_idx(vq, &avail_idx);
if (unlikely(r))
return false;
+
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Since we have updated avail_idx, the following
+* call to vhost_get_vq_desc() will read available
+* ring entries. Make sure that read happens after
+* the avail_idx read.
+*/
+   smp_rmb();
+   return false;
+   }

-   return vq->avail_idx == vq->last_avail_idx;
+   return true;
}
EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);

--
2.44.0






Re: [PATCH v3 2/3] vhost: Add smp_rmb() in vhost_enable_notify()

2024-03-28 Thread Stefano Garzarella

On Thu, Mar 28, 2024 at 10:21:48AM +1000, Gavin Shan wrote:

A smp_rmb() has been missed in vhost_enable_notify(), inspired by
Will. Otherwise, it's not ensured the available ring entries pushed
by guest can be observed by vhost in time, leading to stale available
ring entries fetched by vhost in vhost_get_vq_desc(), as reported by
Yihuang Yu on NVidia's grace-hopper (ARM64) platform.

 /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64  \
 -accel kvm -machine virt,gic-version=host -cpu host  \
 -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \
 -m 4096M,slots=16,maxmem=64G \
 -object memory-backend-ram,id=mem0,size=4096M\
  :   \
 -netdev tap,id=vnet0,vhost=true  \
 -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0
  :
 guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM
 virtio_net virtio0: output.0:id 100 is not a head!

Add the missing smp_rmb() in vhost_enable_notify(). When it returns true,
it means there are still pending tx buffers. Since it might read indices,
it can still bypass the smp_rmb() in vhost_get_vq_desc(). Note that
it should have been safe until vq->avail_idx was changed by commit d3bb267bbdcb
("vhost: cache avail index in vhost_enable_notify()").

Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()")
Cc:  # v5.18+
Reported-by: Yihuang Yu 
Suggested-by: Will Deacon 
Signed-off-by: Gavin Shan 
Acked-by: Jason Wang 
---
drivers/vhost/vhost.c | 12 +++-
1 file changed, 11 insertions(+), 1 deletion(-)


Thanks for fixing this!

Reviewed-by: Stefano Garzarella 



diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 29df65b2ebf2..32686c79c41d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2848,9 +2848,19 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct 
vhost_virtqueue *vq)
   &vq->avail->idx, r);
return false;
}
+
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+   if (vq->avail_idx != vq->last_avail_idx) {
+   /* Since we have updated avail_idx, the following
+* call to vhost_get_vq_desc() will read available
+* ring entries. Make sure that read happens after
+* the avail_idx read.
+*/
+   smp_rmb();
+   return true;
+   }

-   return vq->avail_idx != vq->last_avail_idx;
+   return false;
}
EXPORT_SYMBOL_GPL(vhost_enable_notify);

--
2.44.0






Re: [PATCH net v2] vsock/virtio: fix packet delivery to tap device

2024-03-29 Thread Stefano Garzarella

On Fri, Mar 29, 2024 at 05:12:59PM +0100, Marco Pinna wrote:

Commit 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") added
virtio_transport_deliver_tap_pkt() for handing packets to the
vsockmon device. However, in virtio_transport_send_pkt_work(),
the function is called before actually sending the packet (i.e.
before placing it in the virtqueue with virtqueue_add_sgs() and checking
whether it returned successfully).
Queuing the packet in the virtqueue can fail even multiple times.
However, in virtio_transport_deliver_tap_pkt() we deliver the packet
to the monitoring tap interface only the first time we call it.
This certainly avoids seeing the same packet replicated multiple times
in the monitoring interface, but it can show the packet sent with the
wrong timestamp or even before we succeed to queue it in the virtqueue.

Move virtio_transport_deliver_tap_pkt() after calling virtqueue_add_sgs()
and making sure it returned successfully.

Fixes: 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks")
Cc: sta...@vge.kernel.org
Signed-off-by: Marco Pinna 
---
net/vmw_vsock/virtio_transport.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)


Reviewed-by: Stefano Garzarella 



diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 1748268e0694..ee5d306a96d0 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -120,7 +120,6 @@ virtio_transport_send_pkt_work(struct work_struct *work)
if (!skb)
break;

-   virtio_transport_deliver_tap_pkt(skb);
reply = virtio_vsock_skb_reply(skb);
sgs = vsock->out_sgs;
sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
@@ -170,6 +169,8 @@ virtio_transport_send_pkt_work(struct work_struct *work)
break;
}

+   virtio_transport_deliver_tap_pkt(skb);
+
if (reply) {
struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
int val;
--
2.44.0






Re: [PATCH] vhost-vdpa: change ioctl # for VDPA_GET_VRING_SIZE

2024-04-03 Thread Stefano Garzarella

On Tue, Apr 02, 2024 at 05:21:39PM -0400, Michael S. Tsirkin wrote:

VDPA_GET_VRING_SIZE by mistake uses the already occupied
ioctl # 0x80 and we never noticed - it happens to work
because the direction and size are different, but confuses
tools such as perf which like to look at just the number,
and breaks the extra robustness of the ioctl numbering macros.

To fix, sort the entries and renumber the ioctl - not too late
since it wasn't in any released kernels yet.

Cc: Arnaldo Carvalho de Melo 
Reported-by: Namhyung Kim 
Fixes: 1496c47065f9 ("vhost-vdpa: uapi to support reporting per vq size")
Cc: "Zhu Lingshan" 
Signed-off-by: Michael S. Tsirkin 
---

Build tested only - userspace patches using this will have to adjust.
I will merge this in a week or so unless I hear otherwise,
and afterwards perf can update their header.


Fortunately, we haven't released any kernels with this yet, right?
(other than v6.9-rc*)

LGTM:

Reviewed-by: Stefano Garzarella 



include/uapi/linux/vhost.h | 15 ---
1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index bea697390613..b95dd84eef2d 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -179,12 +179,6 @@
/* Get the config size */
#define VHOST_VDPA_GET_CONFIG_SIZE  _IOR(VHOST_VIRTIO, 0x79, __u32)

-/* Get the count of all virtqueues */
-#define VHOST_VDPA_GET_VQS_COUNT   _IOR(VHOST_VIRTIO, 0x80, __u32)
-
-/* Get the number of virtqueue groups. */
-#define VHOST_VDPA_GET_GROUP_NUM   _IOR(VHOST_VIRTIO, 0x81, __u32)
-
/* Get the number of address spaces. */
#define VHOST_VDPA_GET_AS_NUM   _IOR(VHOST_VIRTIO, 0x7A, unsigned int)

@@ -228,10 +222,17 @@
#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F,   \
  struct vhost_vring_state)

+
+/* Get the count of all virtqueues */
+#define VHOST_VDPA_GET_VQS_COUNT   _IOR(VHOST_VIRTIO, 0x80, __u32)
+
+/* Get the number of virtqueue groups. */
+#define VHOST_VDPA_GET_GROUP_NUM   _IOR(VHOST_VIRTIO, 0x81, __u32)
+
/* Get the queue size of a specific virtqueue.
 * userspace set the vring index in vhost_vring_state.index
 * kernel set the queue size in vhost_vring_state.num
 */
-#define VHOST_VDPA_GET_VRING_SIZE  _IOWR(VHOST_VIRTIO, 0x80,   \
+#define VHOST_VDPA_GET_VRING_SIZE  _IOWR(VHOST_VIRTIO, 0x82,   \
  struct vhost_vring_state)
#endif
--
MST







Re: [PATCH] vhost/vsock: always initialize seqpacket_allow

2024-05-16 Thread Stefano Garzarella

On Wed, May 15, 2024 at 11:05:43AM GMT, Michael S. Tsirkin wrote:

There are two issues around seqpacket_allow:
1. seqpacket_allow is not initialized when socket is
  created. Thus if features are never set, it will be
  read uninitialized.
2. if VIRTIO_VSOCK_F_SEQPACKET is set and then cleared,
  then seqpacket_allow will not be cleared appropriately
  (existing apps I know about don't usually do this but
   it's legal and there's no way to be sure no one relies
   on this).

To fix:
- initialize seqpacket_allow after allocation
- set it unconditionally in set_features

Reported-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com
Reported-by: Jeongjun Park 
Fixes: ced7b713711f ("vhost/vsock: support SEQPACKET for transport").
Cc: Arseny Krasnov 
Cc: David S. Miller 
Cc: Stefan Hajnoczi 
Signed-off-by: Michael S. Tsirkin 
Acked-by: Arseniy Krasnov 
Tested-by: Arseniy Krasnov 

---


Reposting now it's been tested.

drivers/vhost/vsock.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)


Thanks for fixing this issue!

Reviewed-by: Stefano Garzarella 




Re: [RFC PATCH v1 0/2] send credit update during setting SO_RCVLOWAT

2023-11-15 Thread Stefano Garzarella

On Wed, Nov 08, 2023 at 10:20:02AM +0300, Arseniy Krasnov wrote:

Hello,

  DESCRIPTION

This patchset fixes an old problem where both the rx and tx sides hang, and
adds a test for it. The hang happens due to a non-default SO_RCVLOWAT value
and deferred credit update in virtio/vsock. Link to the previous patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

 TEST

   INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is 64KB, as defined in the kernel header (yes, it is
  hardcoded; this is just a reminder of that value).
3) SO_RCVLOWAT is the default, i.e. 1 byte.


STEPS

   SENDER  RECEIVER
1) sends 128KB + 1 byte in a
  single buffer. 128KB will
  be sent, but for 1 byte
  sender will wait for free
  space at peer. Sender goes
  to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
  only 64KB available to read.

So in step 4) the receiver also goes to sleep, waiting for enough data or a
connection shutdown message from the sender. The idea of the fix is that rx
kicks the tx side to continue transmission (and maybe close the connection)
when rx changes the number of bytes it must be woken up for (e.g. SO_RCVLOWAT)
and this value is bigger than the number of bytes available to read.

I've added a small test for this, but I'm not sure about it, as it uses a hardcoded value


Thanks for adding the test!


for maximum packet length; this value is defined in a kernel header and
used to control the deferred credit update. As it is not available to
userspace, I can't control the test parameters correctly (if one day this
define is changed, the test may become useless).


I see, I'll leave a comment in the patch!

Thanks,
Stefano



Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=ff269e2cd5adce4ae14f883fc9c8803bc43ee1e9

Arseniy Krasnov (2):
 virtio/vsock: send credit update during setting SO_RCVLOWAT
 vsock/test: SO_RCVLOWAT + deferred credit update test

drivers/vhost/vsock.c   |   2 +
include/linux/virtio_vsock.h|   1 +
net/vmw_vsock/virtio_transport.c|   2 +
net/vmw_vsock/virtio_transport_common.c |  31 ++
net/vmw_vsock/vsock_loopback.c  |   2 +
tools/testing/vsock/vsock_test.c| 131 
6 files changed, 169 insertions(+)

--
2.25.1






Re: [RFC PATCH v1 1/2] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-15 Thread Stefano Garzarella

On Wed, Nov 08, 2023 at 10:20:03AM +0300, Arseniy Krasnov wrote:

This adds sending credit update message when SO_RCVLOWAT is updated and
it is bigger than number of bytes in rx queue. It is needed, because
'poll()' will wait until number of bytes in rx queue will be not smaller
than SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual
hungup for tx/rx is possible: sender waits for free space and receiver
is waiting data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
drivers/vhost/vsock.c   |  2 ++
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport.c|  2 ++
net/vmw_vsock/virtio_transport_common.c | 31 +
net/vmw_vsock/vsock_loopback.c  |  2 ++
5 files changed, 38 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ecfa5c11f5ee 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},

.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..97dc1bebc69c 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val);
#endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..cf3431189d0c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -539,6 +539,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},

.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e22c81435ef7..88a58163046e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1676,6 +1676,37 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
}
EXPORT_SYMBOL_GPL(virtio_transport_read_skb);

+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update = false;


I'd declare this not initialized.


+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new
+* SO_RCVLOWAT value, kick sender to send more
+* data, because sender may sleep in its 'send()'
+* syscall waiting for enough space at our side.
+*/
+   if (vvs->rx_bytes < val)
+   send_update = true;


Then here just:
send_update = vvs->rx_bytes < val;


+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   WRITE_ONCE(sk_vsock(vsk)->sk_rcvlowat, val ? : 1);


Not in this patch, but what about doing this in vsock_set_rcvlowat() in 
af_vsock.c?


I mean: avoid returning early when `transport->set_rcvlowat(vsk, val)`
succeeds, so that sk_rcvlowat is set in a single place.

The rest LGTM!

Stefano


+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_rcvlowat);
+
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..388c157f6633 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},

.send_pkt = vsock_loopback_send_pkt,
-- 2.25.1






Re: [RFC PATCH v1 2/2] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-15 Thread Stefano Garzarella

On Wed, Nov 08, 2023 at 10:20:04AM +0300, Arseniy Krasnov wrote:

This adds test which checks, that updating SO_RCVLOWAT value also sends


You can avoid "This adds", and write just "Add test ...".

See 
https://docs.kernel.org/process/submitting-patches.html#describe-your-changes

Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
to do frotz", as if you are giving orders to the codebase to change
its behaviour.

Also in the other patch.


credit update message. Otherwise mutual hungup may happen when receiver
didn't send credit update and then calls 'poll()' with non default
SO_RCVLOWAT value (e.g. waiting enough bytes to read), while sender
waits for free space at receiver's side.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 131 +++
1 file changed, 131 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index c1f7bc9abd22..c71b3875fd16 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1180,6 +1180,132 @@ static void test_stream_shutrd_server(const struct 
test_opts *opts)
close(fd);
}

+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)


What about adding a comment like the one in the cover letter about
dependency with kernel values?

Please add it also in the commit description.

I'm wondering whether we should move all the defines that depend on the
kernel into some special header.


+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   control_expectln("SRVREADY");
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
*opts)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  &buf_size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   control_writeln("SRVREADY");
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+   break;
+
+   if (res <= 0) {
+   fprintf(stderr, "unexpected 'recv()' return: %zi\n", 
res);
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* There is 128KB of data in the socket's rx queue,
+* dequeue first 64KB, credit update is not sent.
+*/
+   recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+   recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+   recv_buf_size++;
+
+   /* Updating SO_RCVLOWAT will send credit update. */
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  &recv_buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+
+   memset(&fds, 0, sizeof(fds));
+   fds.fd = fd;
+   fds.events = POLLIN | POLLRDNORM | POLLERR |
+POLLRDHUP | POLLHUP;
+
+   /* This 'poll()' will return once we receive last byte
+* sent by client.
+*/
+   if (poll(&fds, 1, -1) < 0) {
+   perror("poll");
+   exit(EXIT_FAILURE);
+   }
+
+   if (fds.revents & POLLERR) {
+   fprintf(stderr, "'poll()' error\n");
+   exit(EXIT_FAILURE);
+   }
+
+   if (fds.revents & (POLLIN | POLLRDNORM)) {
+   recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+   } else {
+   /* These flags must be set, as there is at
+* least 64KB of da

Re: [RFC PATCH v1 2/2] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-17 Thread Stefano Garzarella

On Fri, Nov 17, 2023 at 10:12:38AM +0300, Arseniy Krasnov wrote:



On 15.11.2023 14:11, Stefano Garzarella wrote:

On Wed, Nov 08, 2023 at 10:20:04AM +0300, Arseniy Krasnov wrote:

This adds test which checks, that updating SO_RCVLOWAT value also sends


You can avoid "This adds", and write just "Add test ...".

See 
https://docs.kernel.org/process/submitting-patches.html#describe-your-changes

    Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
    instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
    to do frotz", as if you are giving orders to the codebase to change
    its behaviour.

Also in the other patch.


credit update message. Otherwise mutual hungup may happen when receiver
didn't send credit update and then calls 'poll()' with non default
SO_RCVLOWAT value (e.g. waiting enough bytes to read), while sender
waits for free space at receiver's side.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 131 +++
1 file changed, 131 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index c1f7bc9abd22..c71b3875fd16 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1180,6 +1180,132 @@ static void test_stream_shutrd_server(const struct 
test_opts *opts)
close(fd);
}

+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE    (1024 * 128)
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE    (1024 * 64)


What about adding a comment like the one in the cover letter about
dependency with kernel values?

Please add it also in the commit description.

I'm thinking if we should move all the defines that depends on the
kernel in some special header.


IIUC it will be a new header file in tools/testing/vsock, which includes such 
defines. At
the moment it will contain only VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. The idea is that 
such defines


So this only works on the virtio transport though, not the other
transports, right? (but maybe the others don't have this problem, so
it's fine).


are not supposed to be used by users (so do not move them to uapi headers), but 
are needed by tests
to check kernel behaviour. Please correct me if I'm wrong.


Right!
Maybe if it's just one, we can leave it there for now, but with a
comment on top explaining where it comes from.

Thanks,
Stefano




Re: [PATCH net v1] vsock/test: fix SEQPACKET message bounds test

2023-11-22 Thread Stefano Garzarella

On Wed, Nov 22, 2023 at 12:16:42AM +0300, Arseniy Krasnov wrote:

Tune message length calculation to make this test work on machines
where 'getpagesize()' returns >32KB. Now maximum message length is not
hardcoded (on machines above it was smaller than 'getpagesize()' return
value, thus we get negative value and test fails), but calculated at
runtime and always bigger than 'getpagesize()' result. Reproduced on
aarch64 with 64KB page size.


It was reported to me by Bogdan, so we can add:

Reported-by: Bogdan Marcynkov 



Fixes: 5c338112e48a ("test/vsock: rework message bounds test")
Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 19 +--
1 file changed, 13 insertions(+), 6 deletions(-)


The fix LGTM and it worked on aarch64 machine.

Reviewed-by: Stefano Garzarella 

Thanks for the fast fix!
Stefano



diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index f5623b8d76b7..691e44c746bf 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -353,11 +353,12 @@ static void test_stream_msg_peek_server(const struct 
test_opts *opts)
}

#define SOCK_BUF_SIZE (2 * 1024 * 1024)
-#define MAX_MSG_SIZE (32 * 1024)
+#define MAX_MSG_PAGES 4

static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
{
unsigned long curr_hash;
+   size_t max_msg_size;
int page_size;
int msg_count;
int fd;
@@ -373,7 +374,8 @@ static void test_seqpacket_msg_bounds_client(const struct 
test_opts *opts)

curr_hash = 0;
page_size = getpagesize();
-   msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE;
+   max_msg_size = MAX_MSG_PAGES * page_size;
+   msg_count = SOCK_BUF_SIZE / max_msg_size;

for (int i = 0; i < msg_count; i++) {
size_t buf_size;
@@ -383,7 +385,7 @@ static void test_seqpacket_msg_bounds_client(const struct 
test_opts *opts)
/* Use "small" buffers and "big" buffers. */
if (i & 1)
buf_size = page_size +
-   (rand() % (MAX_MSG_SIZE - page_size));
+   (rand() % (max_msg_size - page_size));
else
buf_size = 1 + (rand() % page_size);

@@ -429,7 +431,6 @@ static void test_seqpacket_msg_bounds_server(const struct 
test_opts *opts)
unsigned long remote_hash;
unsigned long curr_hash;
int fd;
-   char buf[MAX_MSG_SIZE];
struct msghdr msg = {0};
struct iovec iov = {0};

@@ -457,8 +458,13 @@ static void test_seqpacket_msg_bounds_server(const struct 
test_opts *opts)
control_writeln("SRVREADY");
/* Wait, until peer sends whole data. */
control_expectln("SENDDONE");
-   iov.iov_base = buf;
-   iov.iov_len = sizeof(buf);
+   iov.iov_len = MAX_MSG_PAGES * getpagesize();
+   iov.iov_base = malloc(iov.iov_len);
+   if (!iov.iov_base) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
msg.msg_iov = &iov;
msg.msg_iovlen = 1;

@@ -483,6 +489,7 @@ static void test_seqpacket_msg_bounds_server(const struct 
test_opts *opts)
curr_hash += hash_djb2(msg.msg_iov[0].iov_base, recv_size);
}

+   free(iov.iov_base);
close(fd);
remote_hash = control_readulong();

--
2.25.1






Re: [RFC PATCH v3 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-29 Thread Stefano Garzarella

On Wed, Nov 22, 2023 at 09:05:09PM +0300, Arseniy Krasnov wrote:

Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Do not initialize 'send_update' variable - set it directly during
   first usage.

drivers/vhost/vsock.c   |  2 ++
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport.c|  2 ++
net/vmw_vsock/virtio_transport_common.c | 28 +
net/vmw_vsock/vsock_loopback.c  |  2 ++
5 files changed, 35 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ecfa5c11f5ee 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat


Since now we don't set it anymore in the callback, what about following
the notify_* callbacks and renaming it to `notify_set_rcvlowat`?

Eventually I think we can rename it in the previous patch.


},

.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..97dc1bebc69c 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val);
#endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..cf3431189d0c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -539,6 +539,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},

.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..4acee21b4350 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,34 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
}
EXPORT_SYMBOL_GPL(virtio_transport_read_skb);

+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new
+* SO_RCVLOWAT value, kick sender to send more
+* data, because sender may sleep in its 'send()'
+* syscall waiting for enough space at our side.
+*/


Let's try to use at least the full 80 characters so we can reduce the
lines in this comment block.


+   send_update = vvs->rx_bytes < val;
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_rcvlowat);
+
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..388c157f6633 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},

.send_pkt = vsock_loopback_send_pkt,
--
2.25.1






Re: [RFC PATCH v3 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-29 Thread Stefano Garzarella

On Wed, Nov 22, 2023 at 09:05:10PM +0300, Arseniy Krasnov wrote:

Test which checks, that updating SO_RCVLOWAT value also sends credit
update message. Otherwise mutual hungup may happen when receiver didn't
send credit update and then calls 'poll()' with non default SO_RCVLOWAT
value (e.g. waiting enough bytes to read), while sender waits for free
space at receiver's side. Important thing is that this test relies on
kernel's define for maximum packet size for virtio transport and this
value is not exported to user: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this
define is used to control moment when to send credit update message).
If this value or its usage will be changed in kernel - this test may
become useless/broken.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Update commit message by adding details about dependency for this
   test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
 * Add comment for this dependency in 'vsock_test.c' where this define
   is duplicated.
v2 -> v3:
 * Replace synchronization based on control TCP socket with vsock
   data socket - this is needed to allow sender transmit data only
   when new buffer size of receiver is visible to sender. Otherwise
   there is race and test fails sometimes.

tools/testing/vsock/vsock_test.c | 142 +++
1 file changed, 142 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 5b0e93f9996c..773a71260fba 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1225,6 +1225,143 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
}

+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
*opts)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  &buf_size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+   break;
+
+   if (res <= 0) {
+   fprintf(stderr, "unexpected 'recv()' return: %zi\n", 
res);
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* There is 128KB of data in the socket's rx queue,
+* dequeue first 64KB, credit update is not sent.
+*/
+   recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+   recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+   recv_buf_size++;
+
+   /* Updating SO_RCVLOWAT will send credit update. */
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  &recv_buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RC

Re: [RFC PATCH v3 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-29 Thread Stefano Garzarella

On Wed, Nov 29, 2023 at 12:16:54PM +0300, Arseniy Krasnov wrote:



On 29.11.2023 12:16, Stefano Garzarella wrote:

On Wed, Nov 22, 2023 at 09:05:10PM +0300, Arseniy Krasnov wrote:

Test which checks that updating the SO_RCVLOWAT value also sends a credit
update message. Otherwise a mutual hang may happen when the receiver didn't
send a credit update and then calls 'poll()' with a non-default SO_RCVLOWAT
value (e.g. waiting for enough bytes to read), while the sender waits for
free space at the receiver's side. Note that this test relies on the
kernel's define for the maximum packet size of the virtio transport, and
this value is not exported to userspace: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this
define is used to decide when to send a credit update message). If this
value or its usage is changed in the kernel, this test may become
useless/broken.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Update commit message by adding details about dependency for this
   test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
 * Add comment for this dependency in 'vsock_test.c' where this define
   is duplicated.
v2 -> v3:
 * Replace synchronization based on control TCP socket with vsock
   data socket - this is needed to allow sender transmit data only
   when new buffer size of receiver is visible to sender. Otherwise
   there is race and test fails sometimes.

tools/testing/vsock/vsock_test.c | 142 +++
1 file changed, 142 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 5b0e93f9996c..773a71260fba 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1225,6 +1225,143 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
}

+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE    (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE    (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+    size_t buf_size;
+    void *buf;
+    int fd;
+
+    fd = vsock_stream_connect(opts->peer_cid, 1234);
+    if (fd < 0) {
+    perror("connect");
+    exit(EXIT_FAILURE);
+    }
+
+    /* Send 1 byte more than peer's buffer size. */
+    buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+    buf = malloc(buf_size);
+    if (!buf) {
+    perror("malloc");
+    exit(EXIT_FAILURE);
+    }
+
+    /* Wait until peer sets needed buffer size. */
+    recv_byte(fd, 1, 0);
+
+    if (send(fd, buf, buf_size, 0) != buf_size) {
+    perror("send failed");
+    exit(EXIT_FAILURE);
+    }
+
+    free(buf);
+    close(fd);
+}
+
+static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
*opts)
+{
+    size_t recv_buf_size;
+    struct pollfd fds;
+    size_t buf_size;
+    void *buf;
+    int fd;
+
+    fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+    if (fd < 0) {
+    perror("accept");
+    exit(EXIT_FAILURE);
+    }
+
+    buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+    if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+   &buf_size, sizeof(buf_size))) {
+    perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+    exit(EXIT_FAILURE);
+    }
+
+    /* Send one dummy byte here, because 'setsockopt()' above also
+ * sends special packet which tells sender to update our buffer
+ * size. This 'send_byte()' will serialize such packet with data
+ * reads in a loop below. Sender starts transmission only when
+ * it receives this single byte.
+ */
+    send_byte(fd, 1, 0);
+
+    buf = malloc(buf_size);
+    if (!buf) {
+    perror("malloc");
+    exit(EXIT_FAILURE);
+    }
+
+    /* Wait until there will be 128KB of data in rx queue. */
+    while (1) {
+    ssize_t res;
+
+    res = recv(fd, buf, buf_size, MSG_PEEK);
+    if (res == buf_size)
+    break;
+
+    if (res <= 0) {
+    fprintf(stderr, "unexpected 'recv()' return: %zi\n", res);
+    exit(EXIT_FAILURE);
+    }
+    }
+
+    /* There is 128KB of data in the socket's rx queue,
+ * dequeue first 64KB, credit update is not sent.
+ */
+    recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+    recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+    recv_buf_size++;
+
+    /* Updating SO_RCVLOWAT will send credit update. */
+    if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+   &recv_buf_size, sizeof(recv_buf_size))) {
+    perror("setsockopt(SO_RCVLOWA

Re: [RFC PATCH v4 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Stefano Garzarella

On Thu, Nov 30, 2023 at 12:25:18AM +0300, Arseniy Krasnov wrote:

Send a credit update message when SO_RCVLOWAT is updated and it is bigger
than the number of bytes in the rx queue. This is needed because 'poll()'
will wait until the number of bytes in the rx queue is not smaller than
SO_RCVLOWAT, so kick the sender to send more data. Otherwise a mutual hang
between tx and rx is possible: the sender waits for free space while the
receiver is waiting for data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Do not initialize 'send_update' variable - set it directly during
   first usage.
v3 -> v4:
 * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.

drivers/vhost/vsock.c   |  3 ++-
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport.c|  3 ++-
net/vmw_vsock/virtio_transport_common.c | 27 +
net/vmw_vsock/vsock_loopback.c  |  3 ++-
5 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..c5e58a60a546 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,8 +449,9 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,

-   .read_skb = virtio_transport_read_skb,
+   .read_skb = virtio_transport_read_skb


I think it is better to avoid this change, so that when we need to add
new callbacks, we don't have to edit this line again.

Please avoid it also in the other place in this patch.

The rest LGTM.

Thanks,
Stefano


},

.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
virtio_vsock_sock *vvs, u32 credit);

void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
#endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..8b7bb7ca8ea5 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,8 +537,9 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,

-   .read_skb = virtio_transport_read_skb,
+   .read_skb = virtio_transport_read_skb
},

.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..1cb556ad4597 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock 
*vsk, skb_read_actor_t recv_acto

}
EXPORT_SYMBOL_GPL(virtio_transport_read_skb);

+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side.
+*/
+   send_update = vvs->rx_bytes < val;
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..454f69838c2a 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -96,8 +96,9 @@ static struct virtio_transport loopback_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pr

Re: [RFC PATCH v4 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-30 Thread Stefano Garzarella

On Thu, Nov 30, 2023 at 12:25:19AM +0300, Arseniy Krasnov wrote:

Test which checks that updating the SO_RCVLOWAT value also sends a credit
update message. Otherwise a mutual hang may happen when the receiver didn't
send a credit update and then calls 'poll()' with a non-default SO_RCVLOWAT
value (e.g. waiting for enough bytes to read), while the sender waits for
free space at the receiver's side. Note that this test relies on the
kernel's define for the maximum packet size of the virtio transport, and
this value is not exported to userspace: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this
define is used to decide when to send a credit update message). If this
value or its usage is changed in the kernel, this test may become
useless/broken.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Update commit message by adding details about dependency for this
   test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
 * Add comment for this dependency in 'vsock_test.c' where this define
   is duplicated.
v2 -> v3:
 * Replace synchronization based on control TCP socket with vsock
   data socket - this is needed to allow sender transmit data only
   when new buffer size of receiver is visible to sender. Otherwise
   there is race and test fails sometimes.
v3 -> v4:
 * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
   in server part. This is needed to ensure that 'poll()' wake up us
   when number of bytes ready to read is equal to SO_RCVLOWAT value.

tools/testing/vsock/vsock_test.c | 149 +++
1 file changed, 149 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..68f7037834db 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,150 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
}

+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
*opts)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  &buf_size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+   break;
+
+   if (res <= 0) {
+   fprintf(stderr, "unexpected 'recv()' return: %zi\n", 
res);
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* There is 128KB of data in the socket's rx queue,
+* dequeue first 64KB, credit update is not sent.
+*/
+   recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+   recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+   recv_buf_size++;

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Stefano Garzarella

On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:

Send a credit update message when SO_RCVLOWAT is updated and it is bigger
than the number of bytes in the rx queue. This is needed because 'poll()'
will wait until the number of bytes in the rx queue is not smaller than
SO_RCVLOWAT, so kick the sender to send more data. Otherwise a mutual hang
between tx and rx is possible: the sender waits for free space while the
receiver is waiting for data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Do not initialize 'send_update' variable - set it directly during
   first usage.
v3 -> v4:
 * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
v4 -> v5:
 * Do not change callbacks order in transport structures.

drivers/vhost/vsock.c   |  1 +
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport.c|  1 +
net/vmw_vsock/virtio_transport_common.c | 27 +
net/vmw_vsock/vsock_loopback.c  |  1 +
5 files changed, 31 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..4146f80db8ac 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+   .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat


As we discussed in chat, the order used in the previous version is better,
but leave the `.read_skb` line untouched (with the trailing comma).

With that fixed in all transports, feel free to add:

Reviewed-by: Stefano Garzarella 


},

.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
#endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..8007593a3a93 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+   .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
},

.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..1cb556ad4597 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
}
EXPORT_SYMBOL_GPL(virtio_transport_read_skb);

+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side.
+*/
+   send_update = vvs->rx_bytes < val;
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..9f4b814fbbc7 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -98,6 +98,7 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,

.read_skb = virtio_transport_read_skb,
+   .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
},

.send_pkt = vsock_loopback_send_pkt,
--
2.25.1






Re: [PATCH net-next v5 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-30 Thread Stefano Garzarella

On Thu, Nov 30, 2023 at 04:08:40PM +0300, Arseniy Krasnov wrote:

Test which checks that updating the SO_RCVLOWAT value also sends a credit
update message. Otherwise a mutual hang may happen when the receiver didn't
send a credit update and then calls 'poll()' with a non-default SO_RCVLOWAT
value (e.g. waiting for enough bytes to read), while the sender waits for
free space at the receiver's side. Note that this test relies on the
kernel's define for the maximum packet size of the virtio transport, and
this value is not exported to userspace: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this
define is used to decide when to send a credit update message). If this
value or its usage is changed in the kernel, this test may become
useless/broken.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Update commit message by adding details about dependency for this
   test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
 * Add comment for this dependency in 'vsock_test.c' where this define
   is duplicated.
v2 -> v3:
 * Replace synchronization based on control TCP socket with vsock
   data socket - this is needed to allow sender transmit data only
   when new buffer size of receiver is visible to sender. Otherwise
   there is race and test fails sometimes.
v3 -> v4:
 * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
   in server part. This is needed to ensure that 'poll()' wake up us
   when number of bytes ready to read is equal to SO_RCVLOWAT value.
v4 -> v5:
 * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.

tools/testing/vsock/vsock_test.c | 142 +++
1 file changed, 142 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..d66bc4987026 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,143 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
}

+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
*opts)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  &buf_size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+   break;
+
+   if (res <= 0) {
+   fprintf(stderr, "unexpected 'recv()' return: %zi\n", 
res);
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* There is 128KB of data in the socket's rx queue,
+* dequeue first 64KB, credit update is not sent.
+*/
+   recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+   re

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-01 Thread Stefano Garzarella

On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote:

On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote:

On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote:
> On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
> >
> >
> > On 30.11.2023 16:42, Michael S. Tsirkin wrote:
> > > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
> > >> Send credit update message when SO_RCVLOWAT is updated and it is bigger
> > >> than number of bytes in rx queue. It is needed, because 'poll()' will
> > >> wait until number of bytes in rx queue will be not smaller than
> > >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
> > >> for tx/rx is possible: sender waits for free space and receiver is
> > >> waiting data in 'poll()'.
> > >>
> > >> Signed-off-by: Arseniy Krasnov 
> > >> ---
> > >>  Changelog:
> > >>  v1 -> v2:
> > >>   * Update commit message by removing 'This patch adds XXX' manner.
> > >>   * Do not initialize 'send_update' variable - set it directly during
> > >> first usage.
> > >>  v3 -> v4:
> > >>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
> > >>  v4 -> v5:
> > >>   * Do not change callbacks order in transport structures.
> > >>
> > >>  drivers/vhost/vsock.c   |  1 +
> > >>  include/linux/virtio_vsock.h|  1 +
> > >>  net/vmw_vsock/virtio_transport.c|  1 +
> > >>  net/vmw_vsock/virtio_transport_common.c | 27 +
> > >>  net/vmw_vsock/vsock_loopback.c  |  1 +
> > >>  5 files changed, 31 insertions(+)
> > >>
> > >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > >> index f75731396b7e..4146f80db8ac 100644
> > >> --- a/drivers/vhost/vsock.c
> > >> +++ b/drivers/vhost/vsock.c
> > >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = {
> > >>  .notify_buffer_size   = 
virtio_transport_notify_buffer_size,
> > >>
> > >>  .read_skb = virtio_transport_read_skb,
> > >> +.notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat
> > >>  },
> > >>
> > >>  .send_pkt = vhost_transport_send_pkt,
> > >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > >> index ebb3ce63d64d..c82089dee0c8 100644
> > >> --- a/include/linux/virtio_vsock.h
> > >> +++ b/include/linux/virtio_vsock.h
> > >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
virtio_vsock_sock *vvs, u32 credit);
> > >>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
> > >>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
> > >>  int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
> > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int 
val);
> > >>  #endif /* _LINUX_VIRTIO_VSOCK_H */
> > >> diff --git a/net/vmw_vsock/virtio_transport.c 
b/net/vmw_vsock/virtio_transport.c
> > >> index af5bab1acee1..8007593a3a93 100644
> > >> --- a/net/vmw_vsock/virtio_transport.c
> > >> +++ b/net/vmw_vsock/virtio_transport.c
> > >> @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = {
> > >>  .notify_buffer_size   = 
virtio_transport_notify_buffer_size,
> > >>
> > >>  .read_skb = virtio_transport_read_skb,
> > >> +.notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat
> > >>  },
> > >>
> > >>  .send_pkt = virtio_transport_send_pkt,
> > >> diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
> > >> index f6dc896bf44c..1cb556ad4597 100644
> > >> --- a/net/vmw_vsock/virtio_transport_common.c
> > >> +++ b/net/vmw_vsock/virtio_transport_common.c
> > >> @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock 
*vsk, skb_read_actor_t recv_acto
> > >>  }
> > >>  EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
> > >>
> > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk,
> > >> int val)
> > >> +{
> > >> +  

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-01 Thread Stefano Garzarella

On Fri, Dec 01, 2023 at 11:35:56AM +0300, Arseniy Krasnov wrote:



On 01.12.2023 11:27, Stefano Garzarella wrote:

On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote:

On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote:

On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote:
> On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
> >
> >
> > On 30.11.2023 16:42, Michael S. Tsirkin wrote:
> > > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
> > >> Send credit update message when SO_RCVLOWAT is updated and it is bigger
> > >> than number of bytes in rx queue. It is needed, because 'poll()' will
> > >> wait until number of bytes in rx queue will be not smaller than
> > >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
> > >> for tx/rx is possible: sender waits for free space and receiver is
> > >> waiting data in 'poll()'.
> > >>
> > >> Signed-off-by: Arseniy Krasnov 
> > >> ---
> > >>  Changelog:
> > >>  v1 -> v2:
> > >>   * Update commit message by removing 'This patch adds XXX' manner.
> > >>   * Do not initialize 'send_update' variable - set it directly during
> > >> first usage.
> > >>  v3 -> v4:
> > >>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
> > >>  v4 -> v5:
> > >>   * Do not change callbacks order in transport structures.
> > >>
> > >>  drivers/vhost/vsock.c   |  1 +
> > >>  include/linux/virtio_vsock.h    |  1 +
> > >>  net/vmw_vsock/virtio_transport.c    |  1 +
> > >>  net/vmw_vsock/virtio_transport_common.c | 27 +
> > >>  net/vmw_vsock/vsock_loopback.c  |  1 +
> > >>  5 files changed, 31 insertions(+)
> > >>
> > >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> > >> index f75731396b7e..4146f80db8ac 100644
> > >> --- a/drivers/vhost/vsock.c
> > >> +++ b/drivers/vhost/vsock.c
> > >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = {
> > >>  .notify_buffer_size   = virtio_transport_notify_buffer_size,
> > >>
> > >>  .read_skb = virtio_transport_read_skb,
> > >> +    .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
> > >>  },
> > >>
> > >>  .send_pkt = vhost_transport_send_pkt,
> > >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > >> index ebb3ce63d64d..c82089dee0c8 100644
> > >> --- a/include/linux/virtio_vsock.h
> > >> +++ b/include/linux/virtio_vsock.h
> > >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
virtio_vsock_sock *vvs, u32 credit);
> > >>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
> > >>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
> > >>  int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
> > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int 
val);
> > >>  #endif /* _LINUX_VIRTIO_VSOCK_H */
> > >> diff --git a/net/vmw_vsock/virtio_transport.c 
b/net/vmw_vsock/virtio_transport.c
> > >> index af5bab1acee1..8007593a3a93 100644
> > >> --- a/net/vmw_vsock/virtio_transport.c
> > >> +++ b/net/vmw_vsock/virtio_transport.c
> > >> @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = {
> > >>  .notify_buffer_size   = virtio_transport_notify_buffer_size,
> > >>
> > >>  .read_skb = virtio_transport_read_skb,
> > >> +    .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
> > >>  },
> > >>
> > >>  .send_pkt = virtio_transport_send_pkt,
> > >> diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
> > >> index f6dc896bf44c..1cb556ad4597 100644
> > >> --- a/net/vmw_vsock/virtio_transport_common.c
> > >> +++ b/net/vmw_vsock/virtio_transport_common.c
> > >> @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock 
*vsk, skb_read_actor_t recv_acto
> > >>  }
> > >>  EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
> > >>
> > >> +int virtio_transport_notify_set_rcvlowat(struc

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-04 Thread Stefano Garzarella

On Sat, Dec 02, 2023 at 03:22:39PM -0500, Michael S. Tsirkin wrote:

On Fri, Dec 01, 2023 at 01:40:41PM +0300, Arseniy Krasnov wrote:



On 01.12.2023 12:48, Stefano Garzarella wrote:
> On Fri, Dec 01, 2023 at 11:35:56AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 01.12.2023 11:27, Stefano Garzarella wrote:
>>> On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote:
>>>> On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote:
>>>>> On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote:
>>>>> > On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
>>>>> > >
>>>>> > >
>>>>> > > On 30.11.2023 16:42, Michael S. Tsirkin wrote:
>>>>> > > > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
>>>>> > > >> Send credit update message when SO_RCVLOWAT is updated and it is 
bigger
>>>>> > > >> than number of bytes in rx queue. It is needed, because 'poll()' 
will
>>>>> > > >> wait until number of bytes in rx queue will be not smaller than
>>>>> > > >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual 
hungup
>>>>> > > >> for tx/rx is possible: sender waits for free space and receiver is
>>>>> > > >> waiting data in 'poll()'.
>>>>> > > >>
>>>>> > > >> Signed-off-by: Arseniy Krasnov 
>>>>> > > >> ---
>>>>> > > >>  Changelog:
>>>>> > > >>  v1 -> v2:
>>>>> > > >>   * Update commit message by removing 'This patch adds XXX' manner.
>>>>> > > >>   * Do not initialize 'send_update' variable - set it directly 
during
>>>>> > > >> first usage.
>>>>> > > >>  v3 -> v4:
>>>>> > > >>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 
chars.
>>>>> > > >>  v4 -> v5:
>>>>> > > >>   * Do not change callbacks order in transport structures.
>>>>> > > >>
>>>>> > > >>  drivers/vhost/vsock.c   |  1 +
>>>>> > > >>  include/linux/virtio_vsock.h    |  1 +
>>>>> > > >>  net/vmw_vsock/virtio_transport.c    |  1 +
>>>>> > > >>  net/vmw_vsock/virtio_transport_common.c | 27 
+
>>>>> > > >>  net/vmw_vsock/vsock_loopback.c  |  1 +
>>>>> > > >>  5 files changed, 31 insertions(+)
>>>>> > > >>
>>>>> > > >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>>> > > >> index f75731396b7e..4146f80db8ac 100644
>>>>> > > >> --- a/drivers/vhost/vsock.c
>>>>> > > >> +++ b/drivers/vhost/vsock.c
>>>>> > > >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport 
= {
>>>>> > > >>  .notify_buffer_size   = 
virtio_transport_notify_buffer_size,
>>>>> > > >>
>>>>> > > >>  .read_skb = virtio_transport_read_skb,
>>>>> > > >> +    .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat
>>>>> > > >>  },
>>>>> > > >>
>>>>> > > >>  .send_pkt = vhost_transport_send_pkt,
>>>>> > > >> diff --git a/include/linux/virtio_vsock.h 
b/include/linux/virtio_vsock.h
>>>>> > > >> index ebb3ce63d64d..c82089dee0c8 100644
>>>>> > > >> --- a/include/linux/virtio_vsock.h
>>>>> > > >> +++ b/include/linux/virtio_vsock.h
>>>>> > > >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
virtio_vsock_sock *vvs, u32 credit);
>>>>> > > >>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>>>>> > > >>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head 
*list);
>>>>> > > >>  int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t read_actor);
>>>>> > > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, 
int val);
>>>>> > > >> 

Re: [PATCH net-next v6 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-05 Thread Stefano Garzarella

On Tue, Dec 05, 2023 at 09:48:05AM +0300, Arseniy Krasnov wrote:

Add one more condition for sending a credit update during dequeue from
a stream socket: when the number of bytes in the rx queue is smaller than
the SO_RCVLOWAT value of the socket. This is relevant for non-default values
of SO_RCVLOWAT (e.g. not 1) - the idea is to "kick" the peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up the user for reading data (in a corner case it is also
possible for both the tx and rx sides to get stuck, which is why 'Fixes' is
used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 9 +++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e137d740804e..461c89882142 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -558,6 +558,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;

@@ -602,6 +603,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}

free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));


As in the previous patch, should we avoid the update if `fwd_cnt` and 
`last_fwd_cnt` are the same?


Now I'm thinking if it is better to add that check directly in 
virtio_transport_send_credit_update().


Stefano



spin_unlock_bh(&vvs->rx_lock);

@@ -611,9 +614,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
+   low_rx_bytes)
virtio_transport_send_credit_update(vsk);

return total;
--
2.25.1






Re: [PATCH net-next v6 4/4] vsock/test: two tests to check credit update logic

2023-12-05 Thread Stefano Garzarella

On Tue, Dec 05, 2023 at 09:48:06AM +0300, Arseniy Krasnov wrote:

Both tests are almost the same and differ only in two 'if' conditions, so
they are implemented in a single function. The tests check that a credit
update message is sent:

1) During setting SO_RCVLOWAT value of the socket.
2) When the number of 'rx_bytes' becomes smaller than the SO_RCVLOWAT value.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Update commit message by adding details about dependency for this
   test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
 * Add comment for this dependency in 'vsock_test.c' where this define
   is duplicated.
v2 -> v3:
 * Replace synchronization based on control TCP socket with vsock
   data socket - this is needed to allow sender transmit data only
   when new buffer size of receiver is visible to sender. Otherwise
   there is race and test fails sometimes.
v3 -> v4:
 * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
   in server part. This is needed to ensure that 'poll()' wake up us
   when number of bytes ready to read is equal to SO_RCVLOWAT value.
v4 -> v5:
 * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.
v5 -> v6:
 * Add second test which checks, that credit update is sent during
   reading data from socket.
 * Update commit message.

tools/testing/vsock/vsock_test.c | 175 +++
1 file changed, 175 insertions(+)


Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..66246d81d654 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
}

+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_credit_update_test(const struct test_opts *opts,
+  bool low_rx_bytes_test)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  &buf_size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   if (low_rx_bytes_test) {
+   /* Set new SO_RCVLOWAT here. This enables sending credit
+* update when number of bytes if our rx queue become <
+* SO_RCVLOWAT value.
+*/
+   recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  &recv_buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_F

Re: [PATCH net-next v6 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-05 Thread Stefano Garzarella

On Tue, Dec 05, 2023 at 03:07:47PM +0300, Arseniy Krasnov wrote:



On 05.12.2023 13:54, Stefano Garzarella wrote:

On Tue, Dec 05, 2023 at 09:48:05AM +0300, Arseniy Krasnov wrote:

Add one more condition for sending a credit update during dequeue from
a stream socket: when the number of bytes in the rx queue is smaller than
the SO_RCVLOWAT value of the socket. This is relevant for non-default values
of SO_RCVLOWAT (e.g. not 1) - the idea is to "kick" the peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up the user for reading data (in a corner case it is also
possible for both the tx and rx sides to get stuck, which is why 'Fixes' is
used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/virtio_transport_common.c | 9 +++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e137d740804e..461c89882142 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -558,6 +558,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+    bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;

@@ -602,6 +603,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}

free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+    low_rx_bytes = (vvs->rx_bytes <
+    sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));


As in the previous patch, should we avoid the update if `fwd_cnt` and 
`last_fwd_cnt` are the same?

Now I'm thinking if it is better to add that check directly in 
virtio_transport_send_credit_update().


Good point, but I think, that it is better to keep this check here, because 
access to 'fwd_cnt' and 'last_fwd_cnt'
requires taking rx_lock - so I guess it is better to avoid taking this lock 
every time in 'virtio_transport_send_credit_update()'.


Yeah, I agree.


So maybe we can do something like:


fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
free_space = vvs->buf_alloc - fwd_cnt_delta;


Pre-existing issue, but should we handle the wrap (e.g. fwd_cnt wrapped, 
but last_fwd_cnt not yet)? Maybe in that case we can force the status

update.



and then, after lock is released:

if (fwd_cnt_delta && (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
   low_rx_bytes))
   virtio_transport_send_credit_update(vsk);

WDYT?


Yep, I agree.



Also, I guess the next idea for improving this optimization (in a later
patchset) is to make the threshold depend on vvs->buf_alloc. If someone
changes the minimum buffer size to, for example, 32KB and then sets the
buffer size to 32KB, then free_space will always be non-zero, so the
optimization is effectively off and a credit update is sent on every read.

But does it make sense to allow a buffer smaller than 
VIRTIO_VSOCK_MAX_PKT_BUF_SIZE?


Maybe we should fail in virtio_transport_notify_buffer_size() or use it 
as minimum.


Stefano




[PATCH net] vsock/virtio: fix "comparison of distinct pointer types lacks a cast" warning

2023-12-06 Thread Stefano Garzarella
After backporting commit 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY
flag support") in CentOS Stream 9, CI reported the following error:

In file included from ./include/linux/kernel.h:17,
 from ./include/linux/list.h:9,
 from ./include/linux/preempt.h:11,
 from ./include/linux/spinlock.h:56,
 from net/vmw_vsock/virtio_transport_common.c:9:
net/vmw_vsock/virtio_transport_common.c: In function 
‘virtio_transport_can_zcopy’:
./include/linux/minmax.h:20:35: error: comparison of distinct pointer types 
lacks a cast [-Werror]
   20 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
  |   ^~
./include/linux/minmax.h:26:18: note: in expansion of macro 
‘__typecheck’
   26 | (__typecheck(x, y) && __no_side_effects(x, y))
  |  ^~~
./include/linux/minmax.h:36:31: note: in expansion of macro ‘__safe_cmp’
   36 | __builtin_choose_expr(__safe_cmp(x, y), \
  |   ^~
./include/linux/minmax.h:45:25: note: in expansion of macro 
‘__careful_cmp’
   45 | #define min(x, y)   __careful_cmp(x, y, <)
  | ^
net/vmw_vsock/virtio_transport_common.c:63:37: note: in expansion of macro 
‘min’
   63 | int pages_to_send = min(pages_in_iov, 
MAX_SKB_FRAGS);

We could solve it by using min_t(), but this operation seems entirely
unnecessary, because we also pass MAX_SKB_FRAGS to iov_iter_npages(),
which performs almost the same check, returning at most MAX_SKB_FRAGS
elements. So, let's eliminate this unnecessary comparison.

Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support")
Cc: avkras...@salutedevices.com
Signed-off-by: Stefano Garzarella 
---
 net/vmw_vsock/virtio_transport_common.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..c8e162c9d1df 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -59,8 +59,7 @@ static bool virtio_transport_can_zcopy(const struct 
virtio_transport *t_ops,
t_ops = virtio_transport_get_ops(info->vsk);
 
if (t_ops->can_msgzerocopy) {
-   int pages_in_iov = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);
-   int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS);
+   int pages_to_send = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);
 
/* +1 is for packet header. */
return t_ops->can_msgzerocopy(pages_to_send + 1);
-- 
2.43.0




Re: [PATCH net-next v7 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-11 Thread Stefano Garzarella

On Thu, Dec 07, 2023 at 01:50:05AM +0300, Arseniy Krasnov wrote:



On 07.12.2023 01:08, Michael S. Tsirkin wrote:

On Thu, Dec 07, 2023 at 12:52:51AM +0300, Arseniy Krasnov wrote:



On 07.12.2023 00:53, Michael S. Tsirkin wrote:

On Thu, Dec 07, 2023 at 12:18:48AM +0300, Arseniy Krasnov wrote:

Add one more condition for sending a credit update during dequeue from
a stream socket: when the number of bytes in the rx queue is smaller than
the SO_RCVLOWAT value of the socket. This is relevant for non-default values
of SO_RCVLOWAT (e.g. not 1) - the idea is to "kick" the peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up the user for reading data (in a corner case it is also
possible for both the tx and rx sides to get stuck, which is why 'Fixes' is
used).
Also handle the case when 'fwd_cnt' has wrapped while 'last_fwd_cnt' has
not yet.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v6 -> v7:
  * Handle wrap of 'fwd_cnt'.
  * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.

 net/vmw_vsock/virtio_transport_common.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e137d740804e..39f8660d825d 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   u32 fwd_cnt_delta;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;

@@ -601,7 +603,15 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
}

-   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   /* Handle wrap of 'fwd_cnt'. */
+   if (vvs->fwd_cnt < vvs->last_fwd_cnt)
+   fwd_cnt_delta = vvs->fwd_cnt + (U32_MAX - vvs->last_fwd_cnt);


Are you sure there's no off by one here? for example if fwd_cnt is 0
and last_fwd_cnt is 0xffffffff then apparently delta is 0.


Seems yes, I need +1 here


And then you will get a nop, because assigning U32_MAX + 1 to u32
gives you 0. Adding () does nothing to change the result,
+ and - are commutative.


Ahh, unsigned here, yes.


Ooops, sorry I was confused here!



@Stefano, what did You mean about wrapping here?

I think Michael is right, for example


Yep, I agree!
Sorry for this wrong suggestion!

Stefano



vvs->fwd_cnt wraps and now == 5
vvs->last_fwd_cnt == 0xffffffff

now delta before this patch will be 6 - correct value

Maybe I didn't get your idea, so I implemented it very naively?

Thanks, Arseniy








+   else
+   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;


I actually don't see what is wrong with just
fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt
32 bit unsigned math will I think handle wrap around correctly.

And given buf_alloc is also u32 - I don't see where the bug is in
the original code.


I think the problem is when fwd_cnt wraps while last_fwd_cnt has not. In this
case fwd_cnt_delta will be too big, so we won't send a credit update, which
leads to a stall for the sender

Thanks, Arseniy


Care to come up with an example?






+
+   free_space = vvs->buf_alloc - fwd_cnt_delta;
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));

spin_unlock_bh(&vvs->rx_lock);

@@ -611,9 +621,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (fwd_cnt_delta &&
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
virtio_transport_send_credit_update(vsk);

return total;
--
2.25.1











Re: [PATCH net-next v7 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-11 Thread Stefano Garzarella

On Thu, Dec 07, 2023 at 12:18:47AM +0300, Arseniy Krasnov wrote:

Send a credit update message when SO_RCVLOWAT is updated and it is bigger
than the number of bytes in the rx queue. This is needed because 'poll()'
will wait until the number of bytes in the rx queue is not smaller than
SO_RCVLOWAT, so kick the sender to send more data. Otherwise a mutual hang
of tx/rx is possible: the sender waits for free space and the receiver is
waiting for data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Do not initialize 'send_update' variable - set it directly during
   first usage.
v3 -> v4:
 * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
v4 -> v5:
 * Do not change callbacks order in transport structures.
v5 -> v6:
 * Reorder callbacks in transport structures.
 * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.

drivers/vhost/vsock.c   |  1 +
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport.c|  1 +
net/vmw_vsock/virtio_transport_common.c | 30 +
net/vmw_vsock/vsock_loopback.c  |  1 +
5 files changed, 34 insertions(+)


Reviewed-by: Stefano Garzarella 



diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,

.read_skb = virtio_transport_read_skb,
},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
#endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..f495b9e5186b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,

.read_skb = virtio_transport_read_skb,
},
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..e137d740804e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
}
EXPORT_SYMBOL_GPL(virtio_transport_read_skb);

+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side. Also
+* don't send credit update when peer already knows actual value -
+* such transmission will be useless.
+*/
+   send_update = (vvs->rx_bytes < val) &&
+ (vvs->fwd_cnt != vvs->last_fwd_cnt);
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..6dea6119f5b2 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_l

Re: [PATCH] vsock/virtio: Fix unsigned integer wrap around in virtio_transport_has_space()

2023-12-11 Thread Stefano Garzarella

On Mon, Dec 11, 2023 at 05:25:05PM +0300, Nikolay Kuratov wrote:

We need to do signed arithmetic if we expect condition
`if (bytes < 0)` to be possible

Found by Linux Verification Center (linuxtesting.org) with SVACE



We should add:

Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko")


Signed-off-by: Nikolay Kuratov 
---
net/vmw_vsock/virtio_transport_common.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index c8e162c9d1df..6df246b53260 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -843,7 +843,7 @@ static s64 virtio_transport_has_space(struct vsock_sock 
*vsk)
struct virtio_vsock_sock *vvs = vsk->trans;
s64 bytes;

-   bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+   bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);


If we respect the credit, this should not happen. It can happen, though,
that the receiver changes its buffer size while we're communicating,
and if it reduces it, this could happen. So yes, we need to fix it!

Thanks!

Reviewed-by: Stefano Garzarella 


if (bytes < 0)
bytes = 0;

--
2.34.1






Re: [PATCH v2] vsock/virtio: Fix unsigned integer wrap around in virtio_transport_has_space()

2023-12-11 Thread Stefano Garzarella

On Mon, Dec 11, 2023 at 07:23:17PM +0300, Nikolay Kuratov wrote:

We need to do signed arithmetic if we expect condition
`if (bytes < 0)` to be possible

Found by Linux Verification Center (linuxtesting.org) with SVACE

Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko")
Signed-off-by: Nikolay Kuratov 
---

V1 -> V2: Added Fixes section


Please, next time carry also R-b tags.



net/vmw_vsock/virtio_transport_common.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Stefano Garzarella 

Thanks,
Stefano



diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index c8e162c9d1df..6df246b53260 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -843,7 +843,7 @@ static s64 virtio_transport_has_space(struct vsock_sock 
*vsk)
struct virtio_vsock_sock *vvs = vsk->trans;
s64 bytes;

-   bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+   bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
if (bytes < 0)
bytes = 0;

--
2.34.1







Re: [PATCH net-next v8 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-12 Thread Stefano Garzarella

On Tue, Dec 12, 2023 at 12:16:57AM +0300, Arseniy Krasnov wrote:

Add one more condition for sending a credit update during dequeue from
a stream socket: when the number of bytes in the rx queue is smaller than
the SO_RCVLOWAT value of the socket. This is relevant for non-default values
of SO_RCVLOWAT (e.g. not 1) - the idea is to "kick" the peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up the user for reading data (in a corner case it is also
possible for both the tx and rx sides to get stuck, which is why 'Fixes' is
used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
---
Changelog:
v6 -> v7:
 * Handle wrap of 'fwd_cnt'.
 * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
v7 -> v8:
 * Remove unneeded/wrong handling of wrap for 'fwd_cnt'.

net/vmw_vsock/virtio_transport_common.c | 13 ++++++---
1 file changed, 10 insertions(+), 3 deletions(-)


Reviewed-by: Stefano Garzarella 

Thanks!
Stefano



diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e137d740804e..8572f94bba88 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   u32 fwd_cnt_delta;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;

@@ -601,7 +603,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
}

-   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
+   free_space = vvs->buf_alloc - fwd_cnt_delta;
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));

spin_unlock_bh(&vvs->rx_lock);

@@ -611,9 +616,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (fwd_cnt_delta &&
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
virtio_transport_send_credit_update(vsk);

return total;
--
2.25.1






Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-13 Thread Stefano Garzarella

On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote:



On 12.12.2023 19:12, Michael S. Tsirkin wrote:

On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote:



On 12.12.2023 18:54, Michael S. Tsirkin wrote:

On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:

Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/



Patchset:

Acked-by: Michael S. Tsirkin 


Thanks!




But I worry whether we actually need 3/8 in net not in net-next.


Because of "Fixes" tag ? I think this problem is not critical and reproducible
only in special cases, but i'm not familiar with netdev process so good, so I 
don't
have strong opinion. I guess @Stefano knows better.

Thanks, Arseniy


Fixes means "if you have that other commit then you need this commit
too". I think as a minimum you need to rearrange patches to make the
fix go in first. We don't want a regression followed by a fix.


I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, because this
patch fixes problem that is not related with the new patches from this patchset.


I agree, patch 3 is for sure net material (I'm fine with both 
rearrangement or send it separately), but IMHO also patch 2 could be.
I think with the same fixes tag, since before commit b89d882dc9fc 
("vsock/virtio: reduce credit update messages") we sent a credit update
for every bytes we read, so we should not have this problem, right?

So, maybe all the series could be "net".

Thanks,
Stefano




Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-13 Thread Stefano Garzarella

On Wed, Dec 13, 2023 at 12:08:27PM +0300, Arseniy Krasnov wrote:



On 13.12.2023 11:43, Stefano Garzarella wrote:

On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote:



On 12.12.2023 19:12, Michael S. Tsirkin wrote:

On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote:



On 12.12.2023 18:54, Michael S. Tsirkin wrote:

On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:

Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/



Patchset:

Acked-by: Michael S. Tsirkin 


Thanks!




But I worry whether we actually need 3/8 in net not in net-next.


Because of "Fixes" tag ? I think this problem is not critical and reproducible
only in special cases, but i'm not familiar with netdev process so good, so I 
don't
have strong opinion. I guess @Stefano knows better.

Thanks, Arseniy


Fixes means "if you have that other commit then you need this commit
too". I think as a minimum you need to rearrange patches to make the
fix go in first. We don't want a regression followed by a fix.


I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, because this
patch fixes problem that is not related with the new patches from this patchset.


I agree, patch 3 is for sure net material (I'm fine with both rearrangement or 
send it separately), but IMHO also patch 2 could be.
I think with the same fixes tag, since before commit b89d882dc9fc ("vsock/virtio: 
reduce credit update messages") we sent a credit update
for every bytes we read, so we should not have this problem, right?


Agree for 2, so I think I can rearrange: two fixes go first, then current 0001, 
and then tests. And send it as V9 for 'net' only ?


Maybe you can add this to patch 1 if we want it on net:

Fixes: e38f22c860ed ("vsock: SO_RCVLOWAT transport set callback")

Then I think that patch should go before patch 2, so we don't need to
touch that code multiple times.

so, IMHO the order should be the actual order or 3 - 1 - 2 - 4.

Another option is to send just 2 & 3 to net, and the rest (1 & 4) to 
net-next. IMHO should be fine to send the entire series to net with the 
fixes tag also in patch 1.


Net maintainers and Michael might have a different advice.

Thanks,
Stefano




Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Stefano Garzarella

On Wed, Dec 13, 2023 at 08:11:57PM +0300, Arseniy Krasnov wrote:



On 13.12.2023 18:13, Michael S. Tsirkin wrote:

On Wed, Dec 13, 2023 at 10:05:44AM -0500, Michael S. Tsirkin wrote:

On Wed, Dec 13, 2023 at 12:08:27PM +0300, Arseniy Krasnov wrote:



On 13.12.2023 11:43, Stefano Garzarella wrote:

On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote:



On 12.12.2023 19:12, Michael S. Tsirkin wrote:

On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote:



On 12.12.2023 18:54, Michael S. Tsirkin wrote:

On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:

Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/



Patchset:

Acked-by: Michael S. Tsirkin 


Thanks!




But I worry whether we actually need 3/8 in net not in net-next.


Because of "Fixes" tag ? I think this problem is not critical and reproducible
only in special cases, but i'm not familiar with netdev process so good, so I 
don't
have strong opinion. I guess @Stefano knows better.

Thanks, Arseniy


Fixes means "if you have that other commit then you need this commit
too". I think as a minimum you need to rearrange patches to make the
fix go in first. We don't want a regression followed by a fix.


I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, because this
patch fixes problem that is not related with the new patches from this patchset.


I agree, patch 3 is for sure net material (I'm fine with both rearrangement or 
send it separately), but IMHO also patch 2 could be.
I think with the same fixes tag, since before commit b89d882dc9fc ("vsock/virtio: 
reduce credit update messages") we sent a credit update
for every bytes we read, so we should not have this problem, right?


Agree for 2, so I think I can rearrange: two fixes go first, then current 0001, 
and then tests. And send it as V9 for 'net' only ?

Thanks, Arseniy



hmm why not net-next?


Oh I missed your previous discussion. I think everything in net-next is
safer.  Having said that, I won't nack it net, either.


So, summarizing all above:
1) This patchset entirely goes to net-next as v9
2) I reorder patches like 3 - 2 - 1 - 4, e.g. two fixes goes first with Fixes 
tag
3) Add Acked-by: Michael S. Tsirkin  to each patch

@Michael, @Stefano ?


Okay, let's do that ;-)

Stefano




Re: [PATCH net-next v9 0/4] send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Stefano Garzarella

On Thu, Dec 14, 2023 at 12:19:43PM +0300, Arseniy Krasnov wrote:

Hello,

  DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

 TEST

   INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
  hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


STEPS

   SENDER  RECEIVER
1) sends 128KB + 1 byte in a
  single buffer. 128KB will
  be sent, but for 1 byte
  sender will wait for free
  space at peer. Sender goes
  to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
  only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless).

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=9bab51bd662be4c3ebb18a28879981d69f3ef15a

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
Link to v6:
https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
Link to v7:
https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/
Link to v8:
https://lore.kernel.org/netdev/20231211211658.2904268-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* New patch is added as 0001 - it removes return from SO_RCVLOWAT set
  callback in 'af_vsock.c' when transport callback is set - with that
  we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
  not copy-paste it to every transport. It was discussed in v1.
* See per-patch changelog after ---.
v2 -> v3:
* See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.
v4 -> v5:
* Change patchset tag 'RFC' -> 'net-next'.
* See per-patch changelog after ---.
v5 -> v6:
* New patch 0003 which sends credit update during reading bytes from
  socket.
* See per-patch changelog after ---.
v6 -> v7:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* See per-patch changelog after ---.
v7 -> v8:
* See per-patch changelog after ---.
v8 -> v9:
* Patchset rebased and tested on new HEAD of net-next (see hash above).
* Add 'Fixes' tag for the current 0002.
* Reorder patches by moving two fixes first.

Arseniy Krasnov (4):
 virtio/vsock: fix logic which reduces credit update messages
 virtio/vsock: send credit update during setting SO_RCVLOWAT
 vsock: update SO_RCVLOWAT setting callback
 vsock/test: two tests to check credit update logic


This order will break the bisectability, since now patch 2 will not
build if patch 3 is not applied.

So you need to implement in patch 2 `set_rcvlowat` and in patch 3
updated it to `notify_set_rcvlowat`, otherwise we always need to
backport patch 3 in stable branches, that should be applied before
patch 2.

You have 2 options:
a. move patch 3 before patch 2 without changing the code
b. change patch 2 to use `set_rcvlowat` and updated that code in patch 3

I don't have a strong opinion, but I slightly prefer option a. BTW that
forces us to backport more patches on stable branches, so I'm fine with
option b as well.

That said:
Nacked-by: Stefano Garzarella 




Re: [PATCH net-next v9 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Stefano Garzarella

On Thu, Dec 14, 2023 at 12:19:45PM +0300, Arseniy Krasnov wrote:

Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
Changelog:
v1 -> v2:
 * Update commit message by removing 'This patch adds XXX' manner.
 * Do not initialize 'send_update' variable - set it directly during
   first usage.
v3 -> v4:
 * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
v4 -> v5:
 * Do not change callbacks order in transport structures.
v5 -> v6:
 * Reorder callbacks in transport structures.
 * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
v8 -> v9:
 * Add 'Fixes' tag.

drivers/vhost/vsock.c   |  1 +
include/linux/virtio_vsock.h|  1 +
net/vmw_vsock/virtio_transport.c|  1 +
net/vmw_vsock/virtio_transport_common.c | 30 +
net/vmw_vsock/vsock_loopback.c  |  1 +
5 files changed, 34 insertions(+)


As I already mentioned in the cover letter, this patch doesn't compile
unless we apply patch 3 before this one, so:

Nacked-by: Stefano Garzarella 




Re: [RFC PATCH 1/5] vsock/virtio: Extend virtio-vsock spec with an "order" field

2024-05-23 Thread Stefano Garzarella

As Alyssa suggested, we should discuss spec changes in the virtio ML.
BTW as long as this is an RFC, it's fine. Just be sure, though, to 
remember to merge the change in the specification first versus the 
patches in Linux.
So I recommend that you don't send a non-RFC set into Linux until you 
have agreed on the changes to the specification.


On Fri, May 17, 2024 at 10:46:03PM GMT, Xuewei Niu wrote:

The "order" field determines the location of the device in the linked list,
the device with CID 4, having a smallest order, is in the first place, and
so forth.


Do we really need an order, or would it suffice to just indicate the 
device to be used by default? (as the default gateway in networking)




Rules:

* It doesn’t have to be continuous;
* It cannot exist conflicts;
* It is optional for the mode of a single device, but is required for the
 mode of multiple devices.


We should also add a feature to support this new field.



Signed-off-by: Xuewei Niu 
---
include/uapi/linux/virtio_vsock.h | 1 +
1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
index 64738838bee5..b62ec7d2ab1e 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -43,6 +43,7 @@

struct virtio_vsock_config {
__le64 guest_cid;
+   __le64 order;


Do we really need 64 bits for the order?


} __attribute__((packed));

enum virtio_vsock_event_id {
--
2.34.1






Re: [RFC PATCH 2/5] vsock/virtio: Add support for multi-devices

2024-05-23 Thread Stefano Garzarella

On Fri, May 17, 2024 at 10:46:04PM GMT, Xuewei Niu wrote:

The maximum number of devices is limited by `MAX_VSOCK_NUM`.

Extends `vsock_transport` struct with 4 methods to support multi-devices:

* `get_virtio_vsock()`: It receives a CID, and returns a struct of virtio
 vsock. This method is designed to select a vsock device by its CID.
* `get_default_cid()`: It receives nothing, returns the default CID of the
 first vsock device registered to the kernel.
* `get_local_cids()`: It returns a vector of vsock devices' CIDs.
* `compare_order()`: It receives two different CIDs, named "left" and
 "right" respectively. It returns "-1" while the "left" is behind the
 "right". Otherwise, return "1".

`get_local_cid()` is retained, but returns "-1" if the transport supports
multi-devices.

Replaces the single instance of `virtio_vsock` with a list, named
`virtio_vsock_list`. The devices are inserted into the list when probing.

The kernel will deny devices from being registered if there are conflicts
existing in CIDs or orders.

Signed-off-by: Xuewei Niu 
---
include/net/af_vsock.h  |  16 ++
include/uapi/linux/vm_sockets.h |   6 +
net/vmw_vsock/af_vsock.c|  82 ++--
net/vmw_vsock/virtio_transport.c| 246 ++--
net/vmw_vsock/virtio_transport_common.c |  10 +-
5 files changed, 293 insertions(+), 67 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 535701efc1e5..0151296a0bc5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -174,6 +174,22 @@ struct vsock_transport {

/* Addressing. */
u32 (*get_local_cid)(void);
+   /* Held rcu read lock by the caller. */


We should also explain why the rcu is needed.


+   struct virtio_vsock *(*get_virtio_vsock)(unsigned int cid);


af_vsock supports several transports (i.e. HyperV, VMCI, VIRTIO/VHOST,
loopback), so we need to be generic here.

In addition, the pointer returned by this function is never used, so
why we need this?


+   unsigned int (*get_default_cid)(void);
+   /* Get an list containing all the CIDs of registered vsock.   Return
+* the length of the list.
+*
+* Held rcu read lock by the caller.
+*/
+   int (*get_local_cids)(unsigned int *local_cids);


Why int? get_local_cid() returns an u32, we should do the same.

In addition, can we remove get_local_cid() and implement 
get_local_cids() for all the transports?



+   /* Compare the order of two devices.  Given the guest CIDs of two
+* different devices, returns -1 while the left is behind the right.
+* Otherwise, return 1.
+*
+* Held rcu read lock by the caller.
+*/
+   int (*compare_order)(unsigned int left, unsigned int right);


Please check better the type for CIDs all over the place.



/* Read a single skb */
int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index ed07181d4eff..36ca5023293a 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -189,6 +189,12 @@ struct sockaddr_vm {
   sizeof(__u8)];
};

+/* The maximum number of vsock devices.  Each vsock device has an exclusive
+ * context id.
+ */
+
+#define MAX_VSOCK_NUM 16


This is used internally in AF_VSOCK, I don't think we should expose it
in the UAPI.



+
#define IOCTL_VM_SOCKETS_GET_LOCAL_CID  _IO(7, 0xb9)

/* MSG_ZEROCOPY notifications are encoded in the standard error format,
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 54ba7316f808..da06ddc940cd 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -234,19 +234,45 @@ static void __vsock_remove_connected(struct vsock_sock 
*vsk)

static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
{
-   struct vsock_sock *vsk;
+   struct vsock_sock *vsk, *any_vsk = NULL;

+   rcu_read_lock();


Why the rcu is needed?

	list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) 
	{

+   /* The highest priority: full match. */
if (vsock_addr_equals_addr(addr, &vsk->local_addr))
-   return sk_vsock(vsk);
+   goto out;

-   if (addr->svm_port == vsk->local_addr.svm_port &&
-   (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
-addr->svm_cid == VMADDR_CID_ANY))
-   return sk_vsock(vsk);
+   /* Port match */
+   if (addr->svm_port == vsk->local_addr.svm_port) {
+   /* The second priority: local cid is VMADDR_CID_ANY. */
+   if (vsk->local_addr.svm_cid == VMADDR_CID_ANY)
+   goto out;
+
+   /* The third priority: local cid isn't VMADDR_CID_ANY. 
*/
+   if (addr->svm_cid == VMADDR_CI

Re: [RFC PATCH 3/5] vsock/virtio: can_msgzerocopy adapts to multi-devices

2024-05-23 Thread Stefano Garzarella

On Fri, May 17, 2024 at 10:46:05PM GMT, Xuewei Niu wrote:

Adds a new argument, named "cid", to let them know which `virtio_vsock` to
be selected.

Signed-off-by: Xuewei Niu 
---
include/linux/virtio_vsock.h| 2 +-
net/vmw_vsock/virtio_transport.c| 5 ++---
net/vmw_vsock/virtio_transport_common.c | 6 +++---
3 files changed, 6 insertions(+), 7 deletions(-)


Every commit in linux must be working to support bisection. So these 
changes should be made before adding multi-device support.




diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c82089dee0c8..21bfd5e0c2e7 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -168,7 +168,7 @@ struct virtio_transport {
 * extra checks and can perform zerocopy transmission by
 * default.
 */
-   bool (*can_msgzerocopy)(int bufs_num);
+   bool (*can_msgzerocopy)(u32 cid, int bufs_num);
};

ssize_t
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 93d25aeafb83..998b22e5ce36 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -521,14 +521,13 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

-static bool virtio_transport_can_msgzerocopy(int bufs_num)
+static bool virtio_transport_can_msgzerocopy(u32 cid, int bufs_num)
{
struct virtio_vsock *vsock;
bool res = false;

rcu_read_lock();
-
-   vsock = rcu_dereference(the_virtio_vsock);
+   vsock = virtio_transport_get_virtio_vsock(cid);
if (vsock) {
struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index bed75a41419e..e7315d7b9af1 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -39,7 +39,7 @@ virtio_transport_get_ops(struct vsock_sock *vsk)

static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops,
   struct virtio_vsock_pkt_info *info,
-  size_t pkt_len)
+  size_t pkt_len, unsigned int cid)
{
struct iov_iter *iov_iter;

@@ -62,7 +62,7 @@ static bool virtio_transport_can_zcopy(const struct 
virtio_transport *t_ops,
int pages_to_send = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);

/* +1 is for packet header. */
-   return t_ops->can_msgzerocopy(pages_to_send + 1);
+   return t_ops->can_msgzerocopy(cid, pages_to_send + 1);
}

return true;
@@ -375,7 +375,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock 
*vsk,
info->msg->msg_flags &= ~MSG_ZEROCOPY;

if (info->msg->msg_flags & MSG_ZEROCOPY)
-   can_zcopy = virtio_transport_can_zcopy(t_ops, info, 
pkt_len);
+   can_zcopy = virtio_transport_can_zcopy(t_ops, info, 
pkt_len, src_cid);

if (can_zcopy)
max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
--
2.34.1






Re: [RFC PATCH 4/5] vsock: seqpacket_allow adapts to multi-devices

2024-05-23 Thread Stefano Garzarella

On Fri, May 17, 2024 at 10:46:06PM GMT, Xuewei Niu wrote:

Adds a new argument, named "src_cid", to let them know which `virtio_vsock`
to be selected.

Signed-off-by: Xuewei Niu 
---
include/net/af_vsock.h   |  2 +-
net/vmw_vsock/af_vsock.c | 15 +--
net/vmw_vsock/virtio_transport.c |  4 ++--
net/vmw_vsock/vsock_loopback.c   |  4 ++--
4 files changed, 18 insertions(+), 7 deletions(-)


Same for this.



diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 0151296a0bc5..25f7dc3d602d 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -143,7 +143,7 @@ struct vsock_transport {
 int flags);
int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
 size_t len);
-   bool (*seqpacket_allow)(u32 remote_cid);
+   bool (*seqpacket_allow)(u32 src_cid, u32 remote_cid);
u32 (*seqpacket_has_data)(struct vsock_sock *vsk);

/* Notification. */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index da06ddc940cd..3b34be802bf2 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -470,10 +470,12 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct 
vsock_sock *psk)
{
const struct vsock_transport *new_transport;
struct sock *sk = sk_vsock(vsk);
-   unsigned int remote_cid = vsk->remote_addr.svm_cid;
+   unsigned int src_cid, remote_cid;
__u8 remote_flags;
int ret;

+   remote_cid = vsk->remote_addr.svm_cid;
+
/* If the packet is coming with the source and destination CIDs higher
 * than VMADDR_CID_HOST, then a vsock channel where all the packets are
 * forwarded to the host should be established. Then the host will
@@ -527,8 +529,17 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct 
vsock_sock *psk)
return -ENODEV;

if (sk->sk_type == SOCK_SEQPACKET) {
+   if (vsk->local_addr.svm_cid == VMADDR_CID_ANY) {
+   if (new_transport->get_default_cid)
+   src_cid = new_transport->get_default_cid();
+   else
+   src_cid = new_transport->get_local_cid();
+   } else {
+   src_cid = vsk->local_addr.svm_cid;
+   }
+
if (!new_transport->seqpacket_allow ||
-   !new_transport->seqpacket_allow(remote_cid)) {
+   !new_transport->seqpacket_allow(src_cid, remote_cid)) {
module_put(new_transport->module);
return -ESOCKTNOSUPPORT;
}
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 998b22e5ce36..0bddcbd906a2 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -615,14 +615,14 @@ static struct virtio_transport virtio_transport = {
.can_msgzerocopy = virtio_transport_can_msgzerocopy,
};

-static bool virtio_transport_seqpacket_allow(u32 remote_cid)
+static bool virtio_transport_seqpacket_allow(u32 src_cid, u32 remote_cid)
{
struct virtio_vsock *vsock;
bool seqpacket_allow;

seqpacket_allow = false;
rcu_read_lock();
-   vsock = rcu_dereference(the_virtio_vsock);
+   vsock = virtio_transport_get_virtio_vsock(src_cid);
if (vsock)
seqpacket_allow = vsock->seqpacket_allow;
rcu_read_unlock();
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 6dea6119f5b2..b94358f5bb2c 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -46,7 +46,7 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
return 0;
}

-static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
+static bool vsock_loopback_seqpacket_allow(u32 src_cid, u32 remote_cid);
static bool vsock_loopback_msgzerocopy_allow(void)
{
return true;
@@ -104,7 +104,7 @@ static struct virtio_transport loopback_transport = {
.send_pkt = vsock_loopback_send_pkt,
};

-static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
+static bool vsock_loopback_seqpacket_allow(u32 src_cid, u32 remote_cid)
{
return true;
}
--
2.34.1






Re: [RFC PATCH 5/5] vsock: Add an ioctl request to get all CIDs

2024-05-23 Thread Stefano Garzarella

On Fri, May 17, 2024 at 10:46:07PM GMT, Xuewei Niu wrote:

The new request is called `IOCTL_VM_SOCKETS_GET_LOCAL_CIDS`. And the old
one, `IOCTL_VM_SOCKETS_GET_LOCAL_CID` is retained.

For the transport that supports multi-devices:

* `IOCTL_VM_SOCKETS_GET_LOCAL_CID` returns "-1";


What about returning the default CID (lower prio)?

* `IOCTL_VM_SOCKETS_GET_LOCAL_CIDS` returns a vector of CIDS. The usage is
shown as following.

```
struct vsock_local_cids local_cids;
if ((ret = ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CIDS, &local_cids))) {
   perror("failed to get cids");
   exit(1);
}
for (i = 0; i
---
include/net/af_vsock.h  |  7 +++
include/uapi/linux/vm_sockets.h |  8 
net/vmw_vsock/af_vsock.c| 19 +++
3 files changed, 34 insertions(+)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 25f7dc3d602d..2febc816e388 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -264,4 +264,11 @@ static inline bool vsock_msgzerocopy_allow(const struct 
vsock_transport *t)
{
return t->msgzerocopy_allow && t->msgzerocopy_allow();
}
+
+/ IOCTL /
+/* Type of return value of IOCTL_VM_SOCKETS_GET_LOCAL_CIDS. */
+struct vsock_local_cids {
+   int nr;
+   unsigned int data[MAX_VSOCK_NUM];
+};
#endif /* __AF_VSOCK_H__ */
diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index 36ca5023293a..01f73fb7af5a 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -195,8 +195,16 @@ struct sockaddr_vm {

#define MAX_VSOCK_NUM 16


Okay, now I see why you need this in the UAPI, but please try to follow
other defines.

What about VM_SOCKETS_MAX_DEVS ?



+/* Return actual context id if the transport not support vsock
+ * multi-devices. Otherwise, return `-1U`.
+ */
+
#define IOCTL_VM_SOCKETS_GET_LOCAL_CID  _IO(7, 0xb9)

+/* Only available in transports that support multiple devices. */
+
+#define IOCTL_VM_SOCKETS_GET_LOCAL_CIDS _IOR(7, 0xba, struct 
vsock_local_cids)
+
/* MSG_ZEROCOPY notifications are encoded in the standard error format,
 * sock_extended_err. See Documentation/networking/msg_zerocopy.rst in
 * kernel source tree for more details.
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 3b34be802bf2..2ea2ff52f15b 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2454,6 +2454,7 @@ static long vsock_dev_do_ioctl(struct file *filp,
u32 __user *p = ptr;
u32 cid = VMADDR_CID_ANY;
int retval = 0;
+   struct vsock_local_cids local_cids;

switch (cmd) {
case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
@@ -2469,6 +2470,24 @@ static long vsock_dev_do_ioctl(struct file *filp,
retval = -EFAULT;
break;

+   case IOCTL_VM_SOCKETS_GET_LOCAL_CIDS:
+   if (!transport_g2h || !transport_g2h->get_local_cids)
+   goto fault;
+
+   rcu_read_lock();
+   local_cids.nr = transport_g2h->get_local_cids(local_cids.data);
+   rcu_read_unlock();
+
+   if (local_cids.nr < 0 ||
+   copy_to_user(p, &local_cids, sizeof(local_cids)))
+   goto fault;
+
+   break;
+
+fault:
+   retval = -EFAULT;
+   break;
+
default:
retval = -ENOIOCTLCMD;
}
--
2.34.1






Re: [RFC PATCH 0/5] vsock/virtio: Add support for multi-devices

2024-05-23 Thread Stefano Garzarella

Hi,
thanks for this RFC!

On Fri, May 17, 2024 at 10:46:02PM GMT, Xuewei Niu wrote:

# Motivation

Vsock is a lightweight and widely used data exchange mechanism between host
and guest. Kata Containers, a secure container runtime, leverages the
capability to exchange control data between the shim and the kata-agent.

The Linux kernel only supports one vsock device for virtio-vsock transport,
resulting in the following limitations:

* Poor performance isolation: All vsock connections share the same
virtqueue.


This might be fixed if we implement multi-queue in virtio-vsock.


* Cannot enable more than one backend: Virtio-vsock, vhost-vsock, and
vhost-user-vsock cannot be enabled simultaneously on the transport.

We’d like to transfer networking data, such as TSI (Transparent Socket
Impersonation), over vsock via the vhost-user protocol to reduce overhead.
However, by default, the vsock device is occupied by the kata-agent.

# Usages

Principle: **Supporting virtio-vsock multi-devices while also being
compatible with existing ones.**

## Connection from Guest to Host

There are two valuable questions to talk about:

1. How to be compatible with the existing usages?
2. How do we specify a virtio-vsock device?

### Question 1

Before we delve into question 1, I'd like to provide a piece of pseudocode
as an example of one of the existing use cases from the guest's
perspective.

Assuming there is one virtio-vsock device with CID 4. One of existing
usages to connect to host is shown as following.

```
fd = socket(AF_VSOCK);
connect(fd, 2, 1234);
n = write(fd, buffer);
```

The result is that a connection is established from the guest (4, ?) to the
host (2, 1234), where "?" denotes a random port.

In the context of multi-devices, there are more than two devices. If the
users don’t specify one CID explicitly, the kernel becomes confused about
which device to use. The new implementation should be compatible with the
old one.

We expanded the virtio-vsock specification to address this issue. The
specification now includes a new field called "order".

```
struct virtio_vsock_config {
 __le64 guest_cid;
 __le64 order;
} __attribute__((packed));
```

In the phase of virtio-vsock driver probing, the guest kernel reads from
VMM to get the order of each device. **We stipulate that the device with the
smallest order is regarded as the default device**(this mechanism functions
as a 'default gateway' in networking).

Assuming there are three virtio-vsock devices: device1 (CID=3), device2
(CID=4), and device3 (CID=5). The arrangement of the list is as follows
from the perspective of the guest kernel:

```
virtio_vsock_list =
virtio_vsock { cid: 4, order: 0 } -> virtio_vsock { cid: 3, order: 1 } -> 
virtio_vsock { cid: 5, order: 10 }
```

At this time, the guest kernel realizes that the device2 (CID=4) is the
default device. Execute the same code as before.

```
fd = socket(AF_VSOCK);
connect(fd, 2, 1234);
n = write(fd, buffer);
```

A connection will be established from the guest (4, ?) to the host (2, 1234).


It seems that only the one with order 0 is used here though, so what is 
the ordering for?
Wouldn't it suffice to simply indicate the default device (e.g., like 
the default gateway for networking)?




### Question 2

Now, the user wants to specify a device instead of the default one. An
explicit binding operation is required to be performed.

Use the device (CID=3), where “-1” represents any port, the kernel will


We have a macro: VMADDR_PORT_ANY (which is -1)


search an available port automatically.

```
fd = socket(AF_VSOCK);
bind(fd, 3, -1);
connect(fd, 2, 1234);
n = write(fd, buffer);
```

Use the device (CID=4).

```
fd = socket(AF_VSOCK);
bind(fd, 4, -1);
connect(fd, 2, 1234);
n = write(fd, buffer);
```

## Connection from Host to Guest

Connection from host to guest is quite similar to the existing usages. The
device’s CID is specified by the bind operation.

Listen at the device (CID=3)’s port 1.

```
fd = socket(AF_VSOCK);
bind(fd, 3, 1);
listen(fd);
new_fd = accept(fd, &host_cid, &host_port);
n = write(fd, buffer);
```

Listen at the device (CID=4)’s port 1.

```
fd = socket(AF_VSOCK);
bind(fd, 4, 1);
listen(fd);
new_fd = accept(fd, &host_cid, &host_port);
n = write(fd, buffer);
```

# Use Cases

We've completed a POC with Kata Containers, Ztunnel, which is a
purpose-built per-node proxy for Istio ambient mesh, and TSI. Please refer
to the following link for more details.

Link: https://bit.ly/4bdPJbU


Thank you for this RFC, I left several comments in the patches, we still 
have some work to do, but I think it is something we can support :-)


Here I summarize the things that I think we need to fix:
1. Avoid adding transport-specific things in af_vsock.c
   We need to have a generic API to allow other transports to implement
   the same functionality.
2. We need to add negotiation of a new feature in virtio/vhost transports
   We need to enable or disable support depending on whether t

Re: [RFC PATCH v1 1/2] virtio/vsock: rework deferred credit update logic

2024-06-25 Thread Stefano Garzarella

On Fri, Jun 21, 2024 at 10:25:40PM GMT, Arseniy Krasnov wrote:

Previous calculation of 'free_space' was wrong (but worked as expected
in most cases, see below), because it didn't account number of bytes in
rx queue. Let's rework 'free_space' calculation in the following way:
as this value is considered free space at rx side from tx point of view,
it must be equal to return value of 'virtio_transport_get_credit()' at
tx side. This function uses 'tx_cnt' counter and 'peer_fwd_cnt': first
is number of transmitted bytes (without wrap), second is last 'fwd_cnt'
value received from rx. So let's use same approach at rx side during
'free_space' calculation: add 'rx_cnt' counter which is number of
received bytes (also without wrap) and subtract 'last_fwd_cnt' from it.
Now we have:
1) 'rx_cnt' == 'tx_cnt' at both sides.
2) 'last_fwd_cnt' == 'peer_fwd_cnt' - because first is last 'fwd_cnt'
  sent to tx, while second is last 'fwd_cnt' received from rx.

Now 'free_space' is handled correctly and also we don't need


mmm, I don't know if it was wrong before, maybe we could say it was less 
accurate.


That said, could we have the same problem now if we have a lot of 
producers and the virtqueue becomes full?



'low_rx_bytes' flag - this was more like a hack.

Previous calculation of 'free_space' worked (in 99% cases), because if
we take a look on behaviour of both expressions (new and previous):

'(rx_cnt - last_fwd_cnt)' and '(fwd_cnt - last_fwd_cnt)'

Both of them always grow, at almost the same "speed": the only difference
is that 'rx_cnt' is incremented earlier, when the packet is received,
while 'fwd_cnt' is incremented when the packet is read by the user. So if 'rx_cnt'
grows "faster", then the resulting 'free_space' also becomes smaller, so we
send credit updates a little more often, but:

 * 'free_space' calculation based on 'rx_cnt' gives the same value,
   which tx sees as free space at rx side, so original idea of


Ditto, what happens if the virtqueue is full?


   'free_space' is now implemented as planned.
 * Hack with 'low_rx_bytes' now is not needed.


Yeah, so this patch should also mitigate issue reported by Alex (added 
in CC), right?


If yes, please mention that problem and add a Reported-by giving credit 
to Alex.




Also here is some performance comparison between both versions of
'free_space' calculation:

*--*--*--*
|  | 'rx_cnt' | previous |
*--*--*--*
|H -> G|   8.42   |   7.82   |
*--*--*--*
|G -> H|   11.6   |   12.1   |
*--*--*--*


How many seconds did you run it? How many repetitions? There's a little 
discrepancy anyway, but I can't tell if it's just noise.




As benchmark 'vsock-iperf' with default arguments was used. There is no
significant performance difference before and after this patch.

Signed-off-by: Arseniy Krasnov 
---
include/linux/virtio_vsock.h| 1 +
net/vmw_vsock/virtio_transport_common.c | 8 +++-
2 files changed, 4 insertions(+), 5 deletions(-)


Thanks for working on this, I'll do more tests but the approach LGTM.

Thanks,
Stefano



diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c82089dee0c8..3579491c411e 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -135,6 +135,7 @@ struct virtio_vsock_sock {
u32 peer_buf_alloc;

/* Protected by rx_lock */
+   u32 rx_cnt;
u32 fwd_cnt;
u32 last_fwd_cnt;
u32 rx_bytes;
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 16ff976a86e3..1d4e2328e06e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -441,6 +441,7 @@ static bool virtio_transport_inc_rx_pkt(struct 
virtio_vsock_sock *vvs,
return false;

vvs->rx_bytes += len;
+   vvs->rx_cnt += len;
return true;
}

@@ -558,7 +559,6 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
size_t bytes, total = 0;
struct sk_buff *skb;
u32 fwd_cnt_delta;
-   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;

@@ -603,9 +603,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}

fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
-   free_space = vvs->buf_alloc - fwd_cnt_delta;
-   low_rx_bytes = (vvs->rx_bytes <
-   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
+   free_space = vvs->buf_alloc - (vvs->rx_cnt - vvs->last_fwd_cnt);

spin_unlock_bh(&vvs->rx_lock);

@@ -619,7 +617,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * number of bytes in rx queue is not enough to wake up reader.
 */
if (fwd_cnt_delta &&
-   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE))
virtio_transport_send_credit_update(vsk);

return total;
--

Re: [PATCH net-next v3 1/3] vsock: add support for SIOCOUTQ ioctl for all vsock socket types.

2024-06-28 Thread Stefano Garzarella
nit: in theory in this patch we don't support it for any of the 
transports, so I wouldn't confuse and take that part out of the title.


WDYT about something like:

vsock: add support for SIOCOUTQ ioctl

On Wed, Jun 26, 2024 at 02:08:35PM GMT, Luigi Leonardi via B4 Relay 
wrote:

From: Luigi Leonardi 

Add support for ioctl(s) for SOCK_STREAM SOCK_SEQPACKET and SOCK_DGRAM
in AF_VSOCK.
The only ioctl available is SIOCOUTQ/TIOCOUTQ, which returns the number
of unsent bytes in the socket. This information is transport-specific
and is delegated to them using a callback.

Suggested-by: Daan De Meyer 
Signed-off-by: Luigi Leonardi 
---
include/net/af_vsock.h   |  3 +++
net/vmw_vsock/af_vsock.c | 60 +---
2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 535701efc1e5..7b5375ae7827 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -169,6 +169,9 @@ struct vsock_transport {
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);

+   /* SIOCOUTQ ioctl */
+   size_t (*unsent_bytes)(struct vsock_sock *vsk);


If you want to return also errors, maybe better returning ssize_t.
This should fix one of the errors reported by the kernel bots.


+
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 4b040285aa78..d6140d73d122 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -112,6 +112,7 @@
#include 
#include 
#include 
+#include 

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@@ -1292,6 +1293,59 @@ int vsock_dgram_recvmsg(struct socket *sock, struct 
msghdr *msg,
}
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);

+static int vsock_do_ioctl(struct socket *sock, unsigned int cmd,
+ int __user *arg)
+{
+   struct sock *sk = sock->sk;
+   struct vsock_sock *vsk;
+   int retval;
+
+   vsk = vsock_sk(sk);
+
+   switch (cmd) {
+   case SIOCOUTQ: {
+   size_t n_bytes;
+
+   if (!vsk->transport || !vsk->transport->unsent_bytes) {
+   retval = -EOPNOTSUPP;
+   break;
+   }
+
+   if (vsk->transport->unsent_bytes) {


This if is not necessary after the check we did earlier, right?

Removing it should fix the other issue reported by the bot.


+   if (sock_type_connectible(sk->sk_type) && sk->sk_state 
== TCP_LISTEN) {
+   retval = -EINVAL;
+   break;
+   }
+
+   n_bytes = vsk->transport->unsent_bytes(vsk);
+   if (n_bytes < 0) {
+   retval = n_bytes;
+   break;
+   }
+
+   retval = put_user(n_bytes, arg);
+   }
+   break;
+   }
+   default:
+   retval = -ENOIOCTLCMD;
+   }
+
+   return retval;
+}
+
+static int vsock_ioctl(struct socket *sock, unsigned int cmd,
+  unsigned long arg)
+{
+   int ret;
+
+   lock_sock(sock->sk);
+   ret = vsock_do_ioctl(sock, cmd, (int __user *)arg);
+   release_sock(sock->sk);
+
+   return ret;
+}
+
static const struct proto_ops vsock_dgram_ops = {
.family = PF_VSOCK,
.owner = THIS_MODULE,
@@ -1302,7 +1356,7 @@ static const struct proto_ops vsock_dgram_ops = {
.accept = sock_no_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = sock_no_listen,
.shutdown = vsock_shutdown,
.sendmsg = vsock_dgram_sendmsg,
@@ -2286,7 +2340,7 @@ static const struct proto_ops vsock_stream_ops = {
.accept = vsock_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = vsock_listen,
.shutdown = vsock_shutdown,
.setsockopt = vsock_connectible_setsockopt,
@@ -2308,7 +2362,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
.accept = vsock_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = vsock_listen,
.shutdown = vsock_shutdown,
.setsockopt = vsock_connectible_setsockopt,

--
2.45.2








Re: [PATCH net-next v3 2/3] vsock/virtio: add SIOCOUTQ support for all virtio based transports

2024-06-28 Thread Stefano Garzarella

On Wed, Jun 26, 2024 at 02:08:36PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Introduce support for stream_bytes_unsent and seqpacket_bytes_unsent
ioctl for virtio_transport, vhost_vsock and vsock_loopback.

For all transports the unsent bytes counter is incremented
in virtio_transport_get_credit.

In the virtio_transport (G2H) the counter is decremented each
time the host notifies the guest that it consumed the skbuffs.
In vhost-vsock (H2G) the counter is decremented after the skbuff
is queued in the virtqueue.
In vsock_loopback the counter is decremented after the skbuff is
dequeued.

Signed-off-by: Luigi Leonardi 
---
drivers/vhost/vsock.c   |  4 +++-
include/linux/virtio_vsock.h|  7 +++
net/vmw_vsock/virtio_transport.c|  4 +++-
net/vmw_vsock/virtio_transport_common.c | 35 +
net/vmw_vsock/vsock_loopback.c  |  7 +++
5 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index ec20ecff85c7..dba8b3ea37bf 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
restart_tx = true;
}

-   consume_skb(skb);
+   virtio_transport_consume_skb_sent(skb, true);
}
} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
if (added)
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
.notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,

+   .unsent_bytes = virtio_transport_bytes_unsent,


The callback is named `unsent_bytes`, I'd use something similar also
in the function name, so `virtio_transport_unsent_bytes`, or the
opposite renaming the callback, as you prefer, but I'd use the same
for both.


+
.read_skb = virtio_transport_read_skb,
},

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c82089dee0c8..e74c12878213 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -134,6 +134,8 @@ struct virtio_vsock_sock {
u32 peer_fwd_cnt;
u32 peer_buf_alloc;



Can you remove this extra empty line, so it's clear that it is
protected by tx_lock?


+   size_t bytes_unsent;
+
/* Protected by rx_lock */
u32 fwd_cnt;
u32 last_fwd_cnt;
@@ -193,6 +195,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock 
*vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk);

+size_t virtio_transport_bytes_unsent(struct vsock_sock *vsk);
+
+void virtio_transport_consume_skb_sent(struct sk_buff *skb,
+  bool consume);
+
int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 struct vsock_sock *psk);
int
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 43d405298857..fc62d2818c2c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -311,7 +311,7 @@ static void virtio_transport_tx_work(struct work_struct 
*work)

virtqueue_disable_cb(vq);
while ((skb = virtqueue_get_buf(vq, &len)) != NULL) {
-   consume_skb(skb);
+   virtio_transport_consume_skb_sent(skb, true);
added = true;
}
} while (!virtqueue_enable_cb(vq));
@@ -540,6 +540,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
.notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,

+   .unsent_bytes = virtio_transport_bytes_unsent,
+
.read_skb = virtio_transport_read_skb,
},

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 16ff976a86e3..3a7fa36f306b 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -463,6 +463,26 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock 
*vvs, struct sk_buff *
}
EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);

+void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume)
+{
+   struct sock *s = skb->sk;
+
+   if (s && skb->len) {
+   struct vsock_sock *vs = vsock_sk(s);
+   struct virtio_vsock_sock *vvs;
+
+   vvs = vs->trans;
+
+   spin_lock_bh(&vvs->tx_lock);
+   vvs->bytes_unsent -= skb->len;
+   spin_unlock_bh(&vvs->tx_lock);
+   }
+
+   if (consume)
+   consume_skb(skb);
+}
+EXPORT_SYMBOL_G

Re: [RFC PATCH v1 1/2] virtio/vsock: rework deferred credit update logic

2024-07-01 Thread Stefano Garzarella

Hi Arseniy,

On Fri, Jun 21, 2024 at 10:25:40PM GMT, Arseniy Krasnov wrote:

Previous calculation of 'free_space' was wrong (but worked as expected
in most cases, see below), because it didn't account number of bytes in
rx queue. Let's rework 'free_space' calculation in the following way:
as this value is considered free space at rx side from tx point of view,
it must be equal to return value of 'virtio_transport_get_credit()' at
tx side. This function uses 'tx_cnt' counter and 'peer_fwd_cnt': first
is number of transmitted bytes (without wrap), second is last 'fwd_cnt'
value received from rx. So let's use same approach at rx side during
'free_space' calculation: add 'rx_cnt' counter which is number of
received bytes (also without wrap) and subtract 'last_fwd_cnt' from it.
Now we have:
1) 'rx_cnt' == 'tx_cnt' at both sides.
2) 'last_fwd_cnt' == 'peer_fwd_cnt' - because first is last 'fwd_cnt'
  sent to tx, while second is last 'fwd_cnt' received from rx.

Now 'free_space' is handled correctly and also we don't need
'low_rx_bytes' flag - this was more like a hack.

Previous calculation of 'free_space' worked (in 99% cases), because if
we take a look on behaviour of both expressions (new and previous):

'(rx_cnt - last_fwd_cnt)' and '(fwd_cnt - last_fwd_cnt)'

Both of them always grow, at almost the same "speed": the only difference
is that 'rx_cnt' is incremented earlier, when the packet is received,
while 'fwd_cnt' is incremented when the packet is read by the user. So if 'rx_cnt'
grows "faster", then the resulting 'free_space' also becomes smaller, so we
send credit updates a little more often, but:

 * 'free_space' calculation based on 'rx_cnt' gives the same value,
   which tx sees as free space at rx side, so original idea of
   'free_space' is now implemented as planned.
 * Hack with 'low_rx_bytes' now is not needed.

Also here is some performance comparison between both versions of
'free_space' calculation:

*--*--*--*
|  | 'rx_cnt' | previous |
*--*--*--*
|H -> G|   8.42   |   7.82   |
*--*--*--*
|G -> H|   11.6   |   12.1   |
*--*--*--*


I did some tests on an Intel(R) Xeon(R) Silver 4410Y using iperf-vsock:
- kernel 6.9.0
pkt_size     G->H     H->G
4k            4.6      6.4
64k          13.8     11.5
128k         13.4     11.7
- kernel 6.9.0 with this series applied
pkt_size     G->H     H->G
4k            4.6     8.16
64k          12.2     8.9
128k         12.8     8.8

I see a big drop, especially on H->G with big packets. Can you try to 
replicate on your env?


I'll try to understand more, and also test on an i7, in the next few days.

Thanks,
Stefano



As benchmark 'vsock-iperf' with default arguments was used. There is no
significant performance difference before and after this patch.

Signed-off-by: Arseniy Krasnov 
---
include/linux/virtio_vsock.h| 1 +
net/vmw_vsock/virtio_transport_common.c | 8 +++-
2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c82089dee0c8..3579491c411e 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -135,6 +135,7 @@ struct virtio_vsock_sock {
u32 peer_buf_alloc;

/* Protected by rx_lock */
+   u32 rx_cnt;
u32 fwd_cnt;
u32 last_fwd_cnt;
u32 rx_bytes;
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 16ff976a86e3..1d4e2328e06e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -441,6 +441,7 @@ static bool virtio_transport_inc_rx_pkt(struct 
virtio_vsock_sock *vvs,
return false;

vvs->rx_bytes += len;
+   vvs->rx_cnt += len;
return true;
}

@@ -558,7 +559,6 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
size_t bytes, total = 0;
struct sk_buff *skb;
u32 fwd_cnt_delta;
-   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;

@@ -603,9 +603,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}

fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
-   free_space = vvs->buf_alloc - fwd_cnt_delta;
-   low_rx_bytes = (vvs->rx_bytes <
-   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
+   free_space = vvs->buf_alloc - (vvs->rx_cnt - vvs->last_fwd_cnt);

spin_unlock_bh(&vvs->rx_lock);

@@ -619,7 +617,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * number of bytes in rx queue is not enough to wake up reader.
 */
if (fwd_cnt_delta &&
-   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE))
virtio_transport_send_credit_update(vsk);

return total;
--
2.25.1







Re: [PATCH PATCH net-next v2 1/2] vsock/virtio: refactor virtio_transport_send_pkt_work

2024-07-02 Thread Stefano Garzarella

On Mon, Jul 01, 2024 at 04:28:02PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Marco Pinna 

Preliminary patch to introduce an optimization to the
enqueue system.

All the code used to enqueue a packet into the virtqueue
is removed from virtio_transport_send_pkt_work()
and moved to the new virtio_transport_send_skb() function.

Co-developed-by: Luigi Leonardi 
Signed-off-by: Luigi Leonardi 
Signed-off-by: Marco Pinna 
---
net/vmw_vsock/virtio_transport.c | 133 +--
1 file changed, 73 insertions(+), 60 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 43d405298857..a74083d28120 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -94,6 +94,77 @@ static u32 virtio_transport_get_local_cid(void)
return ret;
}

+/* Caller need to hold vsock->tx_lock on vq */
+static int virtio_transport_send_skb(struct sk_buff *skb, struct virtqueue *vq,
+struct virtio_vsock *vsock, bool 
*restart_rx)
+{
+   int ret, in_sg = 0, out_sg = 0;
+   struct scatterlist **sgs;
+   bool reply;
+
+   reply = virtio_vsock_skb_reply(skb);
+   sgs = vsock->out_sgs;
+   sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
+   sizeof(*virtio_vsock_hdr(skb)));
+   out_sg++;
+
+   if (!skb_is_nonlinear(skb)) {
+   if (skb->len > 0) {
+   sg_init_one(sgs[out_sg], skb->data, skb->len);
+   out_sg++;
+   }
+   } else {
+   struct skb_shared_info *si;
+   int i;
+
+   /* If skb is nonlinear, then its buffer must contain
+* only header and nothing more. Data is stored in
+* the fragged part.
+*/
+   WARN_ON_ONCE(skb_headroom(skb) != 
sizeof(*virtio_vsock_hdr(skb)));
+
+   si = skb_shinfo(skb);
+
+   for (i = 0; i < si->nr_frags; i++) {
+   skb_frag_t *skb_frag = &si->frags[i];
+   void *va;
+
+   /* We will use 'page_to_virt()' for the userspace page
+* here, because virtio or dma-mapping layers will call
+* 'virt_to_phys()' later to fill the buffer descriptor.
+* We don't touch memory at "virtual" address of this 
page.
+*/
+   va = page_to_virt(skb_frag_page(skb_frag));
+   sg_init_one(sgs[out_sg],
+   va + skb_frag_off(skb_frag),
+   skb_frag_size(skb_frag));
+   out_sg++;
+   }
+   }
+
+   ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL);
+   /* Usually this means that there is no more space available in
+* the vq
+*/
+   if (ret < 0)
+   return ret;
+
+   virtio_transport_deliver_tap_pkt(skb);
+
+   if (reply) {
+   struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+   int val;
+
+   val = atomic_dec_return(&vsock->queued_replies);
+
+   /* Do we now have resources to resume rx processing? */
+   if (val + 1 == virtqueue_get_vring_size(rx_vq))
+   *restart_rx = true;
+   }


Looking more closely at this patch, perhaps we can leave reply handling 
out of this refactoring, as it is only needed in the worker.


IIUC, this is to prevent the RX worker from leaving room for the TX 
worker by handling too many replies. So when we have a large enough 
number of replies (equal to the size of the RX queue) in the queue of 
the TX worker ready to be queued in the virtqueue, we stop the RX worker 
and restart it only when the TX worker has had a chance to send replies.


@Stefan can you confirm this since you were involved in the original 
implementation?


If we skip the worker, we don't need this.
Moreover, we know well that the worker has no queued elements, so we 
will only go to increment `queued_replies` and then decrement it 
immediately afterwards.


Thanks,
Stefano


+
+   return 0;
+}
+
static void
virtio_transport_send_pkt_work(struct work_struct *work)
{
@@ -111,77 +182,19 @@ virtio_transport_send_pkt_work(struct work_struct *work)
vq = vsock->vqs[VSOCK_VQ_TX];

for (;;) {
-   int ret, in_sg = 0, out_sg = 0;
-   struct scatterlist **sgs;
struct sk_buff *skb;
-   bool reply;
+   int ret;

skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);
if (!skb)
break;

-   reply = virtio_vsock_skb_reply(skb);
-   sgs = vsock->out_sgs;
-   sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
-   sizeof(*virtio_vsock_hdr(skb)));
- 

Re: [PATCH PATCH net-next v2 2/2] vsock/virtio: avoid enqueue packets when work queue is empty

2024-07-02 Thread Stefano Garzarella

On Mon, Jul 01, 2024 at 04:49:41PM GMT, Luigi Leonardi wrote:

Hi all,


+   /* Inside RCU, can't sleep! */
+   ret = mutex_trylock(&vsock->tx_lock);
+   if (unlikely(ret == 0))
+   goto out_worker;


I just realized that here I don't release the tx_lock and
that the email subject is "PATCH PATCH".
I will fix this in the next version.


What about adding a function to handle all these steps?
That way we can better handle the error path in this block of code.

IMHO to simplify the code, you can just return true or false if you 
queued it. Then if the driver is disappearing and we are still queuing 
it, it will be the release that will clean up all the queues, so we 
might not worry about this edge case.


Thanks,
Stefano


Any feedback is welcome!

Thanks,
Luigi






Re: [PATCH PATCH net-next v2 2/2] vsock/virtio: avoid enqueue packets when work queue is empty

2024-07-02 Thread Stefano Garzarella

On Mon, Jul 01, 2024 at 04:28:03PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Marco Pinna 

Introduce an optimization in virtio_transport_send_pkt:
when the work queue (send_pkt_queue) is empty the packet is
put directly in the virtqueue reducing latency.

In the following benchmark (pingpong mode) the host sends
a payload to the guest and waits for the same payload back.

All vCPUs pinned individually to pCPUs.
vhost process pinned to a pCPU
fio process pinned both inside the host and the guest system.

Host CPU: Intel i7-10700KF CPU @ 3.80GHz
Tool: Fio version 3.37-56
Env: Phys host + L1 Guest
Payload: 512
Runtime-per-test: 50s
Mode: pingpong (h-g-h)
Test runs: 50
Type: SOCK_STREAM

Before (Linux 6.8.11)
--
mean(1st percentile):380.56 ns
mean(overall):   780.83 ns
mean(99th percentile):  8300.24 ns

After
--
mean(1st percentile):   370.59 ns
mean(overall):  720.66 ns
mean(99th percentile): 7600.27 ns

Same setup, using 4K payload:

Before (Linux 6.8.11)
--
mean(1st percentile):458.84 ns
mean(overall):  1650.17 ns
mean(99th percentile): 42240.68 ns

After
--
mean(1st percentile):450.12 ns
mean(overall):  1460.84 ns
mean(99th percentile): 37632.45 ns

virtqueue.

Throughput: iperf-vsock

Before (Linux 6.8.11)
G2H 28.7 Gb/s

After
G2H 40.8 Gb/s


Cool!

I'd suggest adding the length of the buffer (-l param) used, and also
checking more lengths, like at least 4k, 64k, 128k.



The performance improvement is related to this optimization,
I checked that each packet was put directly on the vq
avoiding the work queue.


How?



Co-developed-by: Luigi Leonardi 
Signed-off-by: Luigi Leonardi 
Signed-off-by: Marco Pinna 


I think you might want to change the author of this patch, since it's 
changed a lot from Marco's original one. Obviously if you both agree on 
this.


Thanks,
Stefano


---
net/vmw_vsock/virtio_transport.c | 38 --
1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index a74083d28120..3815aa8d956b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -213,6 +213,7 @@ virtio_transport_send_pkt(struct sk_buff *skb)
{
struct virtio_vsock_hdr *hdr;
struct virtio_vsock *vsock;
+   bool use_worker = true;
int len = skb->len;

hdr = virtio_vsock_hdr(skb);
@@ -234,8 +235,41 @@ virtio_transport_send_pkt(struct sk_buff *skb)
if (virtio_vsock_skb_reply(skb))
atomic_inc(&vsock->queued_replies);

-   virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
-   queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+   /* If the workqueue (send_pkt_queue) is empty there is no need to 
enqueue the packet.
+* Just put it on the virtqueue using virtio_transport_send_skb.
+*/
+   if (skb_queue_empty_lockless(&vsock->send_pkt_queue)) {
+   bool restart_rx = false;
+   struct virtqueue *vq;
+   int ret;
+
+   /* Inside RCU, can't sleep! */
+   ret = mutex_trylock(&vsock->tx_lock);
+   if (unlikely(ret == 0))
+   goto out_worker;
+
+   /* Driver is being removed, no need to enqueue the packet */
+   if (!vsock->tx_run)
+   goto out_rcu;
+
+   vq = vsock->vqs[VSOCK_VQ_TX];
+
+   if (!virtio_transport_send_skb(skb, vq, vsock, &restart_rx)) {
+   use_worker = false;
+   virtqueue_kick(vq);
+   }
+
+   mutex_unlock(&vsock->tx_lock);
+
+   if (restart_rx)
+   queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+   }
+
+out_worker:
+   if (use_worker) {
+   virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
+   queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+   }

out_rcu:
rcu_read_unlock();

-- 2.45.2







[PATCH] vdpa_sim_blk: add `capacity` module parameter

2024-07-05 Thread Stefano Garzarella
The vDPA block simulator always allocated a 128 MiB ram-disk, but some
filesystems (e.g. XFS) may require larger minimum sizes (see
https://issues.redhat.com/browse/RHEL-45951).

So to allow us to test these filesystems, let's add a module parameter
to control the size of the simulated virtio-blk devices.
The value is mapped directly to the `capacity` field of the virtio-blk
configuration space, so it must be expressed in sector numbers of 512
bytes.

The default value (0x40000) is the same as the previous value, so the
behavior without setting `capacity` remains unchanged.

Before this patch or with this patch without setting `capacity`:
  $ modprobe vdpa-sim-blk
  $ vdpa dev add mgmtdev vdpasim_blk name blk0
  virtio_blk virtio6: 1/0/0 default/read/poll queues
  virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB)

After this patch:
  $ modprobe vdpa-sim-blk capacity=614400
  $ vdpa dev add mgmtdev vdpasim_blk name blk0
  virtio_blk virtio6: 1/0/0 default/read/poll queues
  virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB)

Signed-off-by: Stefano Garzarella 
---
 drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 25 +
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
index b137f3679343..18f390149836 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -33,7 +33,6 @@
 (1ULL << VIRTIO_BLK_F_DISCARD)  | \
 (1ULL << VIRTIO_BLK_F_WRITE_ZEROES))
 
-#define VDPASIM_BLK_CAPACITY   0x4
 #define VDPASIM_BLK_SIZE_MAX   0x1000
 #define VDPASIM_BLK_SEG_MAX32
 #define VDPASIM_BLK_DWZ_MAX_SECTORS UINT_MAX
@@ -43,6 +42,10 @@
 #define VDPASIM_BLK_AS_NUM 1
 #define VDPASIM_BLK_GROUP_NUM  1
 
+static unsigned long capacity = 0x4;
+module_param(capacity, ulong, 0444);
+MODULE_PARM_DESC(capacity, "virtio-blk device capacity (in 512-byte sectors)");
+
 struct vdpasim_blk {
struct vdpasim vdpasim;
void *buffer;
@@ -79,10 +82,10 @@ static void vdpasim_blk_buffer_unlock(struct vdpasim_blk 
*blk)
 static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector,
u64 num_sectors, u64 max_sectors)
 {
-   if (start_sector > VDPASIM_BLK_CAPACITY) {
+   if (start_sector > capacity) {
dev_dbg(&vdpasim->vdpa.dev,
-   "starting sector exceeds the capacity - start: 0x%llx 
capacity: 0x%x\n",
-   start_sector, VDPASIM_BLK_CAPACITY);
+   "starting sector exceeds the capacity - start: 0x%llx 
capacity: 0x%lx\n",
+   start_sector, capacity);
}
 
if (num_sectors > max_sectors) {
@@ -92,10 +95,10 @@ static bool vdpasim_blk_check_range(struct vdpasim 
*vdpasim, u64 start_sector,
return false;
}
 
-   if (num_sectors > VDPASIM_BLK_CAPACITY - start_sector) {
+   if (num_sectors > capacity - start_sector) {
dev_dbg(&vdpasim->vdpa.dev,
-   "request exceeds the capacity - start: 0x%llx num: 
0x%llx capacity: 0x%x\n",
-   start_sector, num_sectors, VDPASIM_BLK_CAPACITY);
+   "request exceeds the capacity - start: 0x%llx num: 
0x%llx capacity: 0x%lx\n",
+   start_sector, num_sectors, capacity);
return false;
}
 
@@ -369,7 +372,7 @@ static void vdpasim_blk_get_config(struct vdpasim *vdpasim, 
void *config)
 
memset(config, 0, sizeof(struct virtio_blk_config));
 
-   blk_config->capacity = cpu_to_vdpasim64(vdpasim, VDPASIM_BLK_CAPACITY);
+   blk_config->capacity = cpu_to_vdpasim64(vdpasim, capacity);
blk_config->size_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SIZE_MAX);
blk_config->seg_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SEG_MAX);
blk_config->num_queues = cpu_to_vdpasim16(vdpasim, VDPASIM_BLK_VQ_NUM);
@@ -437,8 +440,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, 
const char *name,
if (blk->shared_backend) {
blk->buffer = shared_buffer;
} else {
-   blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
-  GFP_KERNEL);
+   blk->buffer = kvzalloc(capacity << SECTOR_SHIFT, GFP_KERNEL);
if (!blk->buffer) {
ret = -ENOMEM;
goto put_dev;
@@ -495,8 +497,7 @@ static int __init vdpasim_blk_init(void)
goto parent_err;
 
if (shared_backend) {
-   shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
-GFP_KERNEL);
+

Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter

2024-07-05 Thread Stefano Garzarella

On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote:

On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote:

The vDPA block simulator always allocated a 128 MiB ram-disk, but some
filesystems (e.g. XFS) may require larger minimum sizes (see
https://issues.redhat.com/browse/RHEL-45951).

So to allow us to test these filesystems, let's add a module parameter
to control the size of the simulated virtio-blk devices.
The value is mapped directly to the `capacity` field of the virtio-blk
configuration space, so it must be expressed in sector numbers of 512
bytes.

The default value (0x40000, i.e. 262144 sectors = 128 MiB) is the same as
the previous value, so the behavior without setting `capacity` remains
unchanged.

Before this patch or with this patch without setting `capacity`:
  $ modprobe vdpa-sim-blk
  $ vdpa dev add mgmtdev vdpasim_blk name blk0
  virtio_blk virtio6: 1/0/0 default/read/poll queues
  virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB)

After this patch:
  $ modprobe vdpa-sim-blk capacity=614400
  $ vdpa dev add mgmtdev vdpasim_blk name blk0
  virtio_blk virtio6: 1/0/0 default/read/poll queues
  virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB)

Signed-off-by: Stefano Garzarella 


What a hack. Cindy was working on adding control over config
space, why can't that be used?


If it can be used easily with virtio-blk devices too, that would be great.
@Cindy do you plan to support those changes for virtio-blk devices too?

In the meantime, for the simulator I thought that this change was fine.
It's just used for testing and debugging...

My main question is how to use that when we have `shared_backend` set to 
true, since we use that setting to test for example live migration. In 
that case, how do we handle the size of the shared ramdisk between 
devices?


Thanks,
Stefano




Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter

2024-07-08 Thread Stefano Garzarella

Hi Cindy, Jason,

On Mon, Jul 08, 2024 at 03:59:34PM GMT, Jason Wang wrote:

On Mon, Jul 8, 2024 at 3:06 PM Cindy Lu  wrote:


On Fri, 5 Jul 2024 at 20:42, Stefano Garzarella  wrote:
>
> On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote:
> >On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote:
> >> The vDPA block simulator always allocated a 128 MiB ram-disk, but some
> >> filesystems (e.g. XFS) may require larger minimum sizes (see
> >> https://issues.redhat.com/browse/RHEL-45951).
> >>
> >> So to allow us to test these filesystems, let's add a module parameter
> >> to control the size of the simulated virtio-blk devices.
> >> The value is mapped directly to the `capacity` field of the virtio-blk
> >> configuration space, so it must be expressed in sector numbers of 512
> >> bytes.
> >>
> >> The default value (0x4) is the same as the previous value, so the
> >> behavior without setting `capacity` remains unchanged.
> >>
> >> Before this patch or with this patch without setting `capacity`:
> >>   $ modprobe vdpa-sim-blk
> >>   $ vdpa dev add mgmtdev vdpasim_blk name blk0
> >>   virtio_blk virtio6: 1/0/0 default/read/poll queues
> >>   virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB)
> >>
> >> After this patch:
> >>   $ modprobe vdpa-sim-blk capacity=614400
> >>   $ vdpa dev add mgmtdev vdpasim_blk name blk0
> >>   virtio_blk virtio6: 1/0/0 default/read/poll queues
> >>   virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB)
> >>
> >> Signed-off-by: Stefano Garzarella 
> >
> >What a hack. Cindy was working on adding control over config
> >space, why can't that be used?
>
> If it can be used easily with virtio-blk device too, it will be great.
> @Cindy do you plan to support that changes for a virtio-blk device too?
>
Hi Stefano
I plan to add support to change the vdpa device's configuration after
it is created.


I think for Stefano's case, we can just implement it via provisioning
parameters?


Yep, I think we don't need to change it after creation, but specifying 
while creating should be enough.


So, IIUC we can already do it, implementing something similar to 
vdpasim_net_setup_config() to call during vdpasim_blk_dev_add(), right?


What about when we have `shared_backend` set to true for the 
vdpa_sim_blk.ko? In this case the backend is supposed to be shared 
between all the devices to test live migration.


Maybe we can just change the size of the shared ramdisk to be reflected 
to all devices.


Suggestions?

@Cindy do you want to work on this for blk as well?
If you don't have time, I'll look at it when I can allocate some time.



Thanks


In the first step, I want to use the vdpa tool to add
support for changing the MAC address for the network device. the next
step will also add MTU settings etc
here is the link
https://lore.kernel.org/all/20240708064820.88955-1-l...@redhat.com/T/#t



I'll take a look, thanks for ccing me!

Stefano


in the device part, the device needs to implement its function of
int (*dev_set_attr)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev,
   const struct vdpa_dev_set_config *config);
the configuration will be passed by struct vdpa_dev_set_config. I'm
not sure if this kind of design is suitable for you? Really thanks and
any comments are welcome
thanks
Cindy


> In the mean time, for the simulator I thought that this change was fine.
> It's just used for testing and debugging...
>
> My main question is how to use that when we have `shared_backend` set to
> true, since we use that setting to test for example live migration. In
> that case, how do we handle the size of the shared ramdisk between
> devices?
>
> Thanks,
> Stefano
>








Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter

2024-07-09 Thread Stefano Garzarella

On Tue, Jul 09, 2024 at 10:56:16AM GMT, Jason Wang wrote:

On Mon, Jul 8, 2024 at 4:15 PM Stefano Garzarella  wrote:


Hi Cindy, Jason,

On Mon, Jul 08, 2024 at 03:59:34PM GMT, Jason Wang wrote:
>On Mon, Jul 8, 2024 at 3:06 PM Cindy Lu  wrote:
>>
>> On Fri, 5 Jul 2024 at 20:42, Stefano Garzarella  wrote:
>> >
>> > On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote:
>> > >On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote:
>> > >> The vDPA block simulator always allocated a 128 MiB ram-disk, but some
>> > >> filesystems (e.g. XFS) may require larger minimum sizes (see
>> > >> https://issues.redhat.com/browse/RHEL-45951).
>> > >>
>> > >> So to allow us to test these filesystems, let's add a module parameter
>> > >> to control the size of the simulated virtio-blk devices.
>> > >> The value is mapped directly to the `capacity` field of the virtio-blk
>> > >> configuration space, so it must be expressed in sector numbers of 512
>> > >> bytes.
>> > >>
>> > >> The default value (0x4) is the same as the previous value, so the
>> > >> behavior without setting `capacity` remains unchanged.
>> > >>
>> > >> Before this patch or with this patch without setting `capacity`:
>> > >>   $ modprobe vdpa-sim-blk
>> > >>   $ vdpa dev add mgmtdev vdpasim_blk name blk0
>> > >>   virtio_blk virtio6: 1/0/0 default/read/poll queues
>> > >>   virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 
MiB)
>> > >>
>> > >> After this patch:
>> > >>   $ modprobe vdpa-sim-blk capacity=614400
>> > >>   $ vdpa dev add mgmtdev vdpasim_blk name blk0
>> > >>   virtio_blk virtio6: 1/0/0 default/read/poll queues
>> > >>   virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 
MiB)
>> > >>
>> > >> Signed-off-by: Stefano Garzarella 
>> > >
>> > >What a hack. Cindy was working on adding control over config
>> > >space, why can't that be used?
>> >
>> > If it can be used easily with virtio-blk device too, it will be great.
>> > @Cindy do you plan to support that changes for a virtio-blk device too?
>> >
>> Hi Stefano
>> I plan to add support to change the vdpa device's configuration after
>> it is created.
>
>I think for Stefano's case, we can just implement it via provisioning
>parameters?

Yep, I think we don't need to change it after creation, but specifying
while creating should be enough.

So, IIUC we can already do it, implementing something similar to
vdpasim_net_setup_config() to call during vdpasim_blk_dev_add(), right?


Right.



What about when we have `shared_backend` set to true for the
vdpa_sim_blk.ko? In this case the backend is supposed to be shared
between all the devices to test live migration.


This seems to be another topic.


Yep, but really related. I think we need to handle that case when 
supporting the `capacity` setting.






Maybe we can just change the size of the shared ramdisk to be reflected
to all devices.

Suggestions?


Could we specify the path to tmpfs or others during provisioning
instead?  It seems more general (but more work).


Then it would almost become a real device, no longer just a simulator.
As you said, it's quite a bit of work, but at that point we'd just have
to specify the backend file to use for the device.


In that case what API would we need to use to allow the user to set the 
backend file?


Thanks,
Stefano




Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter

2024-07-10 Thread Stefano Garzarella

On Wed, Jul 10, 2024 at 11:08:48AM GMT, Jason Wang wrote:

On Tue, Jul 9, 2024 at 8:41 PM Stefano Garzarella  wrote:


On Tue, Jul 09, 2024 at 10:56:16AM GMT, Jason Wang wrote:
>On Mon, Jul 8, 2024 at 4:15 PM Stefano Garzarella  wrote:
>>
>> Hi Cindy, Jason,
>>
>> On Mon, Jul 08, 2024 at 03:59:34PM GMT, Jason Wang wrote:
>> >On Mon, Jul 8, 2024 at 3:06 PM Cindy Lu  wrote:
>> >>
>> >> On Fri, 5 Jul 2024 at 20:42, Stefano Garzarella  
wrote:
>> >> >
>> >> > On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote:
>> >> > >On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote:
>> >> > >> The vDPA block simulator always allocated a 128 MiB ram-disk, but 
some
>> >> > >> filesystems (e.g. XFS) may require larger minimum sizes (see
>> >> > >> https://issues.redhat.com/browse/RHEL-45951).
>> >> > >>
>> >> > >> So to allow us to test these filesystems, let's add a module 
parameter
>> >> > >> to control the size of the simulated virtio-blk devices.
>> >> > >> The value is mapped directly to the `capacity` field of the 
virtio-blk
>> >> > >> configuration space, so it must be expressed in sector numbers of 512
>> >> > >> bytes.
>> >> > >>
>> >> > >> The default value (0x4) is the same as the previous value, so the
>> >> > >> behavior without setting `capacity` remains unchanged.
>> >> > >>
>> >> > >> Before this patch or with this patch without setting `capacity`:
>> >> > >>   $ modprobe vdpa-sim-blk
>> >> > >>   $ vdpa dev add mgmtdev vdpasim_blk name blk0
>> >> > >>   virtio_blk virtio6: 1/0/0 default/read/poll queues
>> >> > >>   virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 
MB/128 MiB)
>> >> > >>
>> >> > >> After this patch:
>> >> > >>   $ modprobe vdpa-sim-blk capacity=614400
>> >> > >>   $ vdpa dev add mgmtdev vdpasim_blk name blk0
>> >> > >>   virtio_blk virtio6: 1/0/0 default/read/poll queues
>> >> > >>   virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 
MB/300 MiB)
>> >> > >>
>> >> > >> Signed-off-by: Stefano Garzarella 
>> >> > >
>> >> > >What a hack. Cindy was working on adding control over config
>> >> > >space, why can't that be used?
>> >> >
>> >> > If it can be used easily with virtio-blk device too, it will be great.
>> >> > @Cindy do you plan to support that changes for a virtio-blk device too?
>> >> >
>> >> Hi Stefano
>> >> I plan to add support to change the vdpa device's configuration after
>> >> it is created.
>> >
>> >I think for Stefano's case, we can just implement it via provisioning
>> >parameters?
>>
>> Yep, I think we don't need to change it after creation, but specifying
>> while creating should be enough.
>>
>> So, IIUC we can already do it, implementing something similar to
>> vdpasim_net_setup_config() to call during vdpasim_blk_dev_add(), right?
>
>Right.
>
>>
>> What about when we have `shared_backend` set to true for the
>> vdpa_sim_blk.ko? In this case the backend is supposed to be shared
>> between all the devices to test live migration.
>
>This seems to be another topic.

Yep, but really related. I think we need to handle that case when
supporting the `capacity` setting.


Ok, so if I was not wrong, the goal is to test migration.


Sorry, I was not clear; let me try to rephrase:
vdpa_sim_blk already supports a module parameter called `shared_backend` 
introduced mainly to test live migration on the same host. When that 
parameter is on, all the created devices share the same backend and so 
we can easily do migration from one to another.


With that parameter on or off, the device is always 128 MB, but now 
that's a problem for testing, because it looks like XFS requires at 
least 300 MB: https://issues.redhat.com/browse/RHEL-45951


That's why I sent this patch.

When `shared_backend` is off (default), using the provisioning 
parameters seems feasible to me, but when it's on, how do I deal with 
it?


Being a simulator, we can maybe make it so that only the first device 
can change the size for example, or that all devices control the size, 
but then we would have to handle the size change at r

Re: [PATCH] test/vsock: add install target

2024-07-10 Thread Stefano Garzarella

On Tue, Jul 09, 2024 at 09:50:51PM GMT, Peng Fan (OSS) wrote:

From: Peng Fan 

Add install target for vsock to make Yocto easy to install the images.

Signed-off-by: Peng Fan 
---
tools/testing/vsock/Makefile | 12 
1 file changed, 12 insertions(+)

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index a7f56a09ca9f..5c8442fa9460 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -8,8 +8,20 @@ vsock_perf: vsock_perf.o msg_zerocopy_common.o
vsock_uring_test: LDLIBS = -luring
vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o 
msg_zerocopy_common.o

+VSOCK_INSTALL_PATH ?= $(abspath .)
+# Avoid changing the rest of the logic here and lib.mk.
+INSTALL_PATH := $(VSOCK_INSTALL_PATH)
+
CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include 
-Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD 
-U_FORTIFY_SOURCE -D_GNU_SOURCE
.PHONY: all test clean
clean:
${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test
-include *.d
+
+install: all
+   @# Ask all targets to install their files
+   mkdir -p $(INSTALL_PATH)/vsock


why using the "vsock" subdir?

IIUC you were inspired by selftests/Makefile, but it installs the scripts
used by the main one, `run_kselftest.sh`, under $(INSTALL_PATH)/kselftest/,
while `run_kselftest.sh` itself is installed in $(INSTALL_PATH).

So in this case I would install everything in $(INSTALL_PATH).

WDYT?


+   install -m 744 vsock_test $(INSTALL_PATH)/vsock/
+   install -m 744 vsock_perf $(INSTALL_PATH)/vsock/
+   install -m 744 vsock_diag_test $(INSTALL_PATH)/vsock/
+   install -m 744 vsock_uring_test $(INSTALL_PATH)/vsock/


Also from selftests/Makefile, what about using the ifdef instead of 
using $(abspath .) as default place?


I mean this:

install: all
ifdef INSTALL_PATH
  ...
else
$(error Error: set INSTALL_PATH to use install)
endif

Thanks,
Stefano




Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter

2024-07-10 Thread Stefano Garzarella

On Wed, Jul 10, 2024 at 03:28:31PM GMT, Jason Wang wrote:

On Wed, Jul 10, 2024 at 3:19 PM Stefano Garzarella  wrote:


On Wed, Jul 10, 2024 at 11:08:48AM GMT, Jason Wang wrote:
>On Tue, Jul 9, 2024 at 8:41 PM Stefano Garzarella  wrote:
>>
>> On Tue, Jul 09, 2024 at 10:56:16AM GMT, Jason Wang wrote:
>> >On Mon, Jul 8, 2024 at 4:15 PM Stefano Garzarella  
wrote:
>> >>
>> >> Hi Cindy, Jason,
>> >>
>> >> On Mon, Jul 08, 2024 at 03:59:34PM GMT, Jason Wang wrote:
>> >> >On Mon, Jul 8, 2024 at 3:06 PM Cindy Lu  wrote:
>> >> >>
>> >> >> On Fri, 5 Jul 2024 at 20:42, Stefano Garzarella  
wrote:
>> >> >> >
>> >> >> > On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote:
>> >> >> > >On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote:
>> >> >> > >> The vDPA block simulator always allocated a 128 MiB ram-disk, but 
some
>> >> >> > >> filesystems (e.g. XFS) may require larger minimum sizes (see
>> >> >> > >> https://issues.redhat.com/browse/RHEL-45951).
>> >> >> > >>
>> >> >> > >> So to allow us to test these filesystems, let's add a module 
parameter
>> >> >> > >> to control the size of the simulated virtio-blk devices.
>> >> >> > >> The value is mapped directly to the `capacity` field of the 
virtio-blk
>> >> >> > >> configuration space, so it must be expressed in sector numbers of 
512
>> >> >> > >> bytes.
>> >> >> > >>
>> >> >> > >> The default value (0x4) is the same as the previous value, so 
the
>> >> >> > >> behavior without setting `capacity` remains unchanged.
>> >> >> > >>
>> >> >> > >> Before this patch or with this patch without setting `capacity`:
>> >> >> > >>   $ modprobe vdpa-sim-blk
>> >> >> > >>   $ vdpa dev add mgmtdev vdpasim_blk name blk0
>> >> >> > >>   virtio_blk virtio6: 1/0/0 default/read/poll queues
>> >> >> > >>   virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 
MB/128 MiB)
>> >> >> > >>
>> >> >> > >> After this patch:
>> >> >> > >>   $ modprobe vdpa-sim-blk capacity=614400
>> >> >> > >>   $ vdpa dev add mgmtdev vdpasim_blk name blk0
>> >> >> > >>   virtio_blk virtio6: 1/0/0 default/read/poll queues
>> >> >> > >>   virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 
MB/300 MiB)
>> >> >> > >>
>> >> >> > >> Signed-off-by: Stefano Garzarella 
>> >> >> > >
>> >> >> > >What a hack. Cindy was working on adding control over config
>> >> >> > >space, why can't that be used?
>> >> >> >
>> >> >> > If it can be used easily with virtio-blk device too, it will be 
great.
>> >> >> > @Cindy do you plan to support that changes for a virtio-blk device 
too?
>> >> >> >
>> >> >> Hi Stefano
>> >> >> I plan to add support to change the vdpa device's configuration after
>> >> >> it is created.
>> >> >
>> >> >I think for Stefano's case, we can just implement it via provisioning
>> >> >parameters?
>> >>
>> >> Yep, I think we don't need to change it after creation, but specifying
>> >> while creating should be enough.
>> >>
>> >> So, IIUC we can already do it, implementing something similar to
>> >> vdpasim_net_setup_config() to call during vdpasim_blk_dev_add(), right?
>> >
>> >Right.
>> >
>> >>
>> >> What about when we have `shared_backend` set to true for the
>> >> vdpa_sim_blk.ko? In this case the backend is supposed to be shared
>> >> between all the devices to test live migration.
>> >
>> >This seems to be another topic.
>>
>> Yep, but really related. I think we need to handle that case when
>> supporting the `capacity` setting.
>
>Ok, so if I was not wrong, the goal is to test migration.

Sorry, I was not clear, I try to rephrase:
vdpa_sim_blk already supports a module parameter called `shared_backend`
introduced m

Re: [PATCH] test/vsock: add install target

2024-07-10 Thread Stefano Garzarella

On Wed, Jul 10, 2024 at 08:11:32AM GMT, Peng Fan wrote:

Subject: Re: [PATCH] test/vsock: add install target

On Tue, Jul 09, 2024 at 09:50:51PM GMT, Peng Fan (OSS) wrote:
>From: Peng Fan 
>
>Add install target for vsock to make Yocto easy to install the images.
>
>Signed-off-by: Peng Fan 
>---
> tools/testing/vsock/Makefile | 12 
> 1 file changed, 12 insertions(+)
>
>diff --git a/tools/testing/vsock/Makefile
>b/tools/testing/vsock/Makefile index a7f56a09ca9f..5c8442fa9460
100644
>--- a/tools/testing/vsock/Makefile
>+++ b/tools/testing/vsock/Makefile
>@@ -8,8 +8,20 @@ vsock_perf: vsock_perf.o
msg_zerocopy_common.o
> vsock_uring_test: LDLIBS = -luring
> vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o
>msg_zerocopy_common.o
>
>+VSOCK_INSTALL_PATH ?= $(abspath .)
>+# Avoid changing the rest of the logic here and lib.mk.
>+INSTALL_PATH := $(VSOCK_INSTALL_PATH)
>+
> CFLAGS += -g -O2 -Werror -Wall -I. -I../../include
> -I../../../usr/include -Wno-pointer-sign -fno-strict-overflow
> -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -
D_GNU_SOURCE
> .PHONY: all test clean
> clean:
>${RM} *.o *.d vsock_test vsock_diag_test vsock_perf
vsock_uring_test
> -include *.d
>+
>+install: all
>+   @# Ask all targets to install their files
>+   mkdir -p $(INSTALL_PATH)/vsock

why using the "vsock" subdir?

IIUC you were inspired by selftests/Makefile, but it installs under
$(INSTALL_PATH)/kselftest/ the scripts used by the main one
`run_kselftest.sh`, which is installed in $(INSTALL_PATH) instead.
So in this case I would install everything in $(INSTALL_PATH).

WDYT?


I agree.



>+   install -m 744 vsock_test $(INSTALL_PATH)/vsock/
>+   install -m 744 vsock_perf $(INSTALL_PATH)/vsock/
>+   install -m 744 vsock_diag_test $(INSTALL_PATH)/vsock/
>+   install -m 744 vsock_uring_test $(INSTALL_PATH)/vsock/

Also from selftests/Makefile, what about using the ifdef instead of
using $(abspath .) as default place?

I mean this:

install: all
ifdef INSTALL_PATH
   ...
else
$(error Error: set INSTALL_PATH to use install)
endif


Does the following look good to you?

# Avoid conflict with INSTALL_PATH set by the main Makefile
VSOCK_INSTALL_PATH ?=
INSTALL_PATH := $(VSOCK_INSTALL_PATH)


I'm not a super Makefile expert, but why do we need both 
VSOCK_INSTALL_PATH and INSTALL_PATH?


Stefano



install: all
ifdef INSTALL_PATH
   mkdir -p $(INSTALL_PATH)
   install -m 744 vsock_test $(INSTALL_PATH)
   install -m 744 vsock_perf $(INSTALL_PATH)
   install -m 744 vsock_diag_test $(INSTALL_PATH)
   install -m 744 vsock_uring_test $(INSTALL_PATH)
else
   $(error Error: set INSTALL_PATH to use install)
endif

Thanks,
Peng.


Thanks,
Stefano







Re: [PATCH] test/vsock: add install target

2024-07-10 Thread Stefano Garzarella

On Wed, Jul 10, 2024 at 11:34:05AM GMT, Peng Fan wrote:

Subject: Re: [PATCH] test/vsock: add install target

On Wed, Jul 10, 2024 at 08:11:32AM GMT, Peng Fan wrote:
>> Subject: Re: [PATCH] test/vsock: add install target
>>
>> On Tue, Jul 09, 2024 at 09:50:51PM GMT, Peng Fan (OSS) wrote:
>> >From: Peng Fan 
>> >
>> >Add install target for vsock to make Yocto easy to install the
images.
>> >
>> >Signed-off-by: Peng Fan 
>> >---
>> > tools/testing/vsock/Makefile | 12 
>> > 1 file changed, 12 insertions(+)
>> >
>> >diff --git a/tools/testing/vsock/Makefile
>> >b/tools/testing/vsock/Makefile index a7f56a09ca9f..5c8442fa9460
>> 100644
>> >--- a/tools/testing/vsock/Makefile
>> >+++ b/tools/testing/vsock/Makefile
>> >@@ -8,8 +8,20 @@ vsock_perf: vsock_perf.o
>> msg_zerocopy_common.o
>> > vsock_uring_test: LDLIBS = -luring
>> > vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o
>> >msg_zerocopy_common.o
>> >
>> >+VSOCK_INSTALL_PATH ?= $(abspath .)
>> >+# Avoid changing the rest of the logic here and lib.mk.
>> >+INSTALL_PATH := $(VSOCK_INSTALL_PATH)
>> >+
>> > CFLAGS += -g -O2 -Werror -Wall -I. -I../../include
>> > -I../../../usr/include -Wno-pointer-sign -fno-strict-overflow
>> > -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -
>> D_GNU_SOURCE
>> > .PHONY: all test clean
>> > clean:
>> >   ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf
>> vsock_uring_test
>> > -include *.d
>> >+
>> >+install: all
>> >+  @# Ask all targets to install their files
>> >+  mkdir -p $(INSTALL_PATH)/vsock
>>
>> why using the "vsock" subdir?
>>
>> IIUC you were inspired by selftests/Makefile, but it installs under
>> $(INSTALL_PATH)/kselftest/ the scripts used by the main one
>> `run_kselftest.sh`, which is installed in $(INSTALL_PATH) instead.
>> So in this case I would install everything in $(INSTALL_PATH).
>>
>> WDYT?
>
>I agree.
>
>>
>> >+  install -m 744 vsock_test $(INSTALL_PATH)/vsock/
>> >+  install -m 744 vsock_perf $(INSTALL_PATH)/vsock/
>> >+  install -m 744 vsock_diag_test $(INSTALL_PATH)/vsock/
>> >+  install -m 744 vsock_uring_test $(INSTALL_PATH)/vsock/
>>
>> Also from selftests/Makefile, what about using the ifdef instead of
>> using $(abspath .) as default place?
>>
>> I mean this:
>>
>> install: all
>> ifdef INSTALL_PATH
>>...
>> else
>>$(error Error: set INSTALL_PATH to use install)
>> endif
>
>Is the following looks good to you?
>
># Avoid conflict with INSTALL_PATH set by the main Makefile
>VSOCK_INSTALL_PATH ?=
>INSTALL_PATH := $(VSOCK_INSTALL_PATH)

I'm not a super Makefile expert, but why do we need both
VSOCK_INSTALL_PATH and INSTALL_PATH?


INSTALL_PATH is exported by the Makefile in the kernel root directory.
So, for the user's sake, we need to avoid exporting INSTALL_PATH here.
That's why I just followed selftests/Makefile, which uses KSFT_INSTALL_PATH.


There is a comment there:

# Avoid changing the rest of the logic here and lib.mk.

Added by commit 17eac6c2db8b2cdfe33d40229bdda2acd86b304a.

IIUC they re-used INSTALL_PATH, just to avoid too many changes in that 
file and in tools/testing/selftests/lib.mk


So, IMHO we should not care about it and only use VSOCK_INSTALL_PATH if 
you don't want to conflict with INSTALL_PATH.


Stefano




Re: [PATCH V2] test/vsock: add install target

2024-07-10 Thread Stefano Garzarella

On Wed, Jul 10, 2024 at 08:27:28PM GMT, Peng Fan (OSS) wrote:

From: Peng Fan 

Add install target for vsock to make Yocto easy to install the images.

Signed-off-by: Peng Fan 
---


LGTM! This is net-next material, so next time it is better to specify it
(e.g. [PATCH net-next]).


If not queued within a week, please resend specifying net-next.

Reviewed-by: Stefano Garzarella 



V2:
Use VSOCK_INSTALL_PATH, drop INSTALL_PATH

tools/testing/vsock/Makefile | 13 +
1 file changed, 13 insertions(+)

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index a7f56a09ca9f..6e0b4e95e230 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -13,3 +13,16 @@ CFLAGS += -g -O2 -Werror -Wall -I. -I../../include 
-I../../../usr/include -Wno-p
clean:
${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test
-include *.d
+
+VSOCK_INSTALL_PATH ?=
+
+install: all
+ifdef VSOCK_INSTALL_PATH
+   mkdir -p $(VSOCK_INSTALL_PATH)
+   install -m 744 vsock_test $(VSOCK_INSTALL_PATH)
+   install -m 744 vsock_perf $(VSOCK_INSTALL_PATH)
+   install -m 744 vsock_diag_test $(VSOCK_INSTALL_PATH)
+   install -m 744 vsock_uring_test $(VSOCK_INSTALL_PATH)
+else
+   $(error Error: set VSOCK_INSTALL_PATH to use install)
+endif
--
2.37.1






Re: [PATCH] test/vsock: add install target

2024-07-11 Thread Stefano Garzarella

CCing Stefan.

On Wed, Jul 10, 2024 at 07:00:59PM GMT, Jakub Kicinski wrote:

On Wed, 10 Jul 2024 13:58:39 +0200 Stefano Garzarella wrote:

There is a comment there:

 # Avoid changing the rest of the logic here and lib.mk.

Added by commit 17eac6c2db8b2cdfe33d40229bdda2acd86b304a.

IIUC they re-used INSTALL_PATH, just to avoid too many changes in that
file and in tools/testing/selftests/lib.mk

So, IMHO we should not care about it and only use VSOCK_INSTALL_PATH if
you don't want to conflict with INSTALL_PATH.


Any reason why vsock isn't part of selftests in the first place?



Usually vsock tests test both the driver (virtio-vsock) in the guest and 
the device in the host kernel (vhost-vsock). So I usually run the tests 
in 2 nested VMs to test the latest changes for both the guest and the 
host.


I don't know selftests well enough, but do you think it is possible to
integrate them?


CCing Stefan who is the original author and may remember more reasons 
about this choice.


Thanks,
Stefano




Re: [PATCH] test/vsock: add install target

2024-07-12 Thread Stefano Garzarella

On Thu, Jul 11, 2024 at 07:14:55AM GMT, Jakub Kicinski wrote:

On Thu, 11 Jul 2024 15:38:01 +0200 Stefan Hajnoczi wrote:

> Usually vsock tests test both the driver (virtio-vsock) in the guest and the
> device in the host kernel (vhost-vsock). So I usually run the tests in 2
> nested VMs to test the latest changes for both the guest and the host.
>
> I don't know enough selftests, but do you think it is possible to integrate
> them?
>
> CCing Stefan who is the original author and may remember more reasons about
> this choice.

It's probably because of the manual steps in tools/testing/vsock/README:

  The following prerequisite steps are not automated and must be performed prior
  to running tests:

  1. Build the kernel, make headers_install, and build these tests.
  2. Install the kernel and tests on the host.
  3. Install the kernel and tests inside the guest.
  4. Boot the guest and ensure that the AF_VSOCK transport is enabled.

If you want to automate this for QEMU, VMware, and Hyper-V that would be
great. It relies on having a guest running under these hypervisors and
that's not trivial to automate (plus it involves proprietary software
for VMware and Hyper-V that may not be available without additional
license agreements and/or payment).


Not sure if there's a requirement that the full process is automated.
Or at least, if there is, we are already breaking it in networking,
because for some tests we need the user to export some env variables
to point the test to the right interfaces and even a remote machine
to generate traffic. If the env isn't set up, tests return 4 (SKIP).
I don't feel strongly that ksft + env approach is better but at
least it gives us easy access to the basic build and packaging
features from ksft. Up to you but thought I'd ask.



Yeah, I'll try to allocate some cycles to look into that. Tracking it 
here: https://gitlab.com/vsock/vsock/-/issues/13


What about this patch, can we queue it for now?

Thanks,
Stefano




Re: [PATCH net-next v3 1/2] vsock/virtio: refactor virtio_transport_send_pkt_work

2024-07-12 Thread Stefano Garzarella

On Thu, Jul 11, 2024 at 04:58:46PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Marco Pinna 

Preliminary patch to introduce an optimization to the
enqueue system.

All the code used to enqueue a packet into the virtqueue
is removed from virtio_transport_send_pkt_work()
and moved to the new virtio_transport_send_skb() function.

Co-developed-by: Luigi Leonardi 
Signed-off-by: Luigi Leonardi 
Signed-off-by: Marco Pinna 
---
net/vmw_vsock/virtio_transport.c | 105 ++-
1 file changed, 59 insertions(+), 46 deletions(-)


LGTM

Reviewed-by: Stefano Garzarella 



diff --git a/net/vmw_vsock/virtio_transport.c 
b/net/vmw_vsock/virtio_transport.c

index 43d405298857..c4205c22f40b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -94,6 +94,63 @@ static u32 virtio_transport_get_local_cid(void)
return ret;
}

+/* Caller need to hold vsock->tx_lock on vq */
+static int virtio_transport_send_skb(struct sk_buff *skb, struct virtqueue *vq,
+struct virtio_vsock *vsock)
+{
+   int ret, in_sg = 0, out_sg = 0;
+   struct scatterlist **sgs;
+
+   sgs = vsock->out_sgs;
+   sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
+   sizeof(*virtio_vsock_hdr(skb)));
+   out_sg++;
+
+   if (!skb_is_nonlinear(skb)) {
+   if (skb->len > 0) {
+   sg_init_one(sgs[out_sg], skb->data, skb->len);
+   out_sg++;
+   }
+   } else {
+   struct skb_shared_info *si;
+   int i;
+
+   /* If skb is nonlinear, then its buffer must contain
+* only header and nothing more. Data is stored in
+* the fragged part.
+*/
+   WARN_ON_ONCE(skb_headroom(skb) != 
sizeof(*virtio_vsock_hdr(skb)));
+
+   si = skb_shinfo(skb);
+
+   for (i = 0; i < si->nr_frags; i++) {
+   skb_frag_t *skb_frag = &si->frags[i];
+   void *va;
+
+   /* We will use 'page_to_virt()' for the userspace page
+* here, because virtio or dma-mapping layers will call
+* 'virt_to_phys()' later to fill the buffer descriptor.
+* We don't touch memory at "virtual" address of this 
page.
+*/
+   va = page_to_virt(skb_frag_page(skb_frag));
+   sg_init_one(sgs[out_sg],
+   va + skb_frag_off(skb_frag),
+   skb_frag_size(skb_frag));
+   out_sg++;
+   }
+   }
+
+   ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL);
+   /* Usually this means that there is no more space available in
+* the vq
+*/
+   if (ret < 0)
+   return ret;
+
+   virtio_transport_deliver_tap_pkt(skb);
+   return 0;
+}
+
static void
virtio_transport_send_pkt_work(struct work_struct *work)
{
@@ -111,66 +168,22 @@ virtio_transport_send_pkt_work(struct work_struct *work)
vq = vsock->vqs[VSOCK_VQ_TX];

for (;;) {
-   int ret, in_sg = 0, out_sg = 0;
-   struct scatterlist **sgs;
struct sk_buff *skb;
bool reply;
+   int ret;

skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);
if (!skb)
break;

reply = virtio_vsock_skb_reply(skb);
-   sgs = vsock->out_sgs;
-   sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb),
-   sizeof(*virtio_vsock_hdr(skb)));
-   out_sg++;
-
-   if (!skb_is_nonlinear(skb)) {
-   if (skb->len > 0) {
-   sg_init_one(sgs[out_sg], skb->data, skb->len);
-   out_sg++;
-   }
-   } else {
-   struct skb_shared_info *si;
-   int i;
-
-   /* If skb is nonlinear, then its buffer must contain
-* only header and nothing more. Data is stored in
-* the fragged part.
-*/
-   WARN_ON_ONCE(skb_headroom(skb) != 
sizeof(*virtio_vsock_hdr(skb)));
-
-   si = skb_shinfo(skb);

-   for (i = 0; i < si->nr_frags; i++) {
-   skb_frag_t *skb_frag = &si->frags[i];
-   void *va;
-
-   /* We will use 'page_to_virt()' for the 
userspace page
-* here, because virtio or dma-mapping layers 
will call
-   

Re: [PATCH net-next v3 2/2] vsock/virtio: avoid queuing packets when work queue is empty

2024-07-12 Thread Stefano Garzarella

On Thu, Jul 11, 2024 at 04:58:47PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Introduce an optimization in virtio_transport_send_pkt:
when the work queue (send_pkt_queue) is empty the packet is


Note: send_pkt_queue is just a queue of sk_buffs; it is not really a work
queue.



put directly in the virtqueue increasing the throughput.


Why?

I'd write something like this, but feel free to change it:

When the driver needs to send new packets to the device, it always
queues the new sk_buffs into an intermediate queue (send_pkt_queue)
and schedules a worker (send_pkt_work) to then queue them into the
virtqueue exposed to the device.

This increases the chance of batching, but also introduces a lot of
latency into the communication. So we can optimize this path by
adding a fast path to be taken when there is no element in the
intermediate queue, there is space available in the virtqueue,
and no other process that is sending packets (tx_lock held).




In the following benchmark (pingpong mode) the host sends


"fio benchmark"


a payload to the guest and waits for the same payload back.

All vCPUs pinned individually to pCPUs.
vhost process pinned to a pCPU
fio process pinned both inside the host and the guest system.

Host CPU: Intel i7-10700KF CPU @ 3.80GHz
Tool: Fio version 3.37-56
Env: Phys host + L1 Guest
Runtime-per-test: 50s
Mode: pingpong (h-g-h)
Test runs: 50
Type: SOCK_STREAM

Before: Linux 6.9.7

Payload 512B:

1st perc.   overall 99th perc.
Before  370 810.15  8656ns
After   374 780.29  8741ns

Payload 4K:

1st perc.   overall 99th perc.
Before  460 1720.23 42752   ns
After   460 1520.84 36096   ns

The performance improvement is related to this optimization,
I used ebpf to check that each packet was sent directly to the
virtqueue.

Throughput: iperf-vsock


I would reorganize the description for a moment because it's a little 
confusing. For example like this:


The following benchmarks were run to check improvements in latency and 
throughput. The test bed is a host with Intel i7-10700KF CPU @ 3.80GHz 
and L1 guest running on QEMU/KVM.


- Latency
  Tool: ...

- Throughput
  Tool: ...


The size represents the buffer length (-l) to read/write
P represents the number of parallel streams

P=1
        4K      64K     128K
Before  6.87    29.3    29.5 Gb/s
After   10.5    39.4    39.9 Gb/s

P=2
        4K      64K     128K
Before  10.5    32.8    33.2 Gb/s
After   17.8    47.7    48.5 Gb/s

P=4
        4K      64K     128K
Before  12.7    33.6    34.2 Gb/s
After   16.9    48.1    50.5 Gb/s


Wow, great! I'm a little surprised that the latency is not much 
affected, but the throughput benefits so much with that kind of 
optimization.


Maybe we can check the latency with smaller payloads like 64 bytes or 
even smaller.




Co-developed-by: Marco Pinna 
Signed-off-by: Marco Pinna 
Signed-off-by: Luigi Leonardi 
---
net/vmw_vsock/virtio_transport.c | 38 ++
1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index c4205c22f40b..d75727fdc35f 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -208,6 +208,29 @@ virtio_transport_send_pkt_work(struct work_struct *work)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

+/* Caller need to hold RCU for vsock.
+ * Returns 0 if the packet is successfully put on the vq.
+ */
+static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, 
struct sk_buff *skb)
+{
+   struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];
+   int ret;
+
+   /* Inside RCU, can't sleep! */
+   ret = mutex_trylock(&vsock->tx_lock);
+   if (unlikely(ret == 0))
+   return -EBUSY;
+
+   ret = virtio_transport_send_skb(skb, vq, vsock);
+
+   mutex_unlock(&vsock->tx_lock);
+
+   /* Kick if virtio_transport_send_skb succeeded */


Superfluous comment, we can remove it.


+   if (ret == 0)
+   virtqueue_kick(vq);


nit: I'd add a blank line here after the if block to highlight that the 
return is out.



+   return ret;
+}
+
static int
virtio_transport_send_pkt(struct sk_buff *skb)
{
@@ -231,11 +254,18 @@ virtio_transport_send_pkt(struct sk_buff *skb)
goto out_rcu;
}

-   if (virtio_vsock_skb_reply(skb))
-   atomic_inc(&vsock->queued_replies);
+   /* If the workqueue (send_pkt_queue) is empty there is no need to 
enqueue the packet.


Again, send_pkt_queue is not a workqueue.

Here I would explain more why there is no need, the fact that we are not 
doing this is clear.



+* Just put it on the virtqueue using 
virtio_transport_send_skb_fast_path.
+*/



nit: here I would instead remove the blank line to make it clear tha

Re: [PATCH net-next v3 3/3] test/vsock: add ioctl unsent bytes test

2024-07-15 Thread Stefano Garzarella

On Wed, Jun 26, 2024 at 02:08:37PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Introduce two tests, one for SOCK_STREAM and one for SOCK_SEQPACKET, which
check, after a packet is delivered, that the number of unsent bytes is zero,
using ioctl SIOCOUTQ.

Signed-off-by: Luigi Leonardi 
---
tools/testing/vsock/util.c   |  6 +--
tools/testing/vsock/util.h   |  3 ++
tools/testing/vsock/vsock_test.c | 85 
3 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 554b290fefdc..a3d448a075e3 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, 
unsigned int bind_po
}

/* Connect to <cid, port> and return the file descriptor. */
-static int vsock_connect(unsigned int cid, unsigned int port, int type)
+int vsock_connect(unsigned int cid, unsigned int port, int type)
{
union {
struct sockaddr sa;
@@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int 
port, int type)
/* Listen on <cid, port> and return the first incoming connection.  The remote
 * address is stored to clientaddrp.  clientaddrp may be NULL.
 */
-static int vsock_accept(unsigned int cid, unsigned int port,
-   struct sockaddr_vm *clientaddrp, int type)
+int vsock_accept(unsigned int cid, unsigned int port,
+struct sockaddr_vm *clientaddrp, int type)
{
union {
struct sockaddr sa;
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index e95e62485959..fff22d4a14c0 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -39,6 +39,9 @@ struct test_case {
void init_signals(void);
unsigned int parse_cid(const char *str);
unsigned int parse_port(const char *str);
+int vsock_connect(unsigned int cid, unsigned int port, int type);
+int vsock_accept(unsigned int cid, unsigned int port,
+struct sockaddr_vm *clientaddrp, int type);


I'd mention in the commit description that you need these functions to 
be more generic. Maybe in the future we can re-use them where we share 
the same test for both SEQPACKET and STREAM.


The rest LGTM.

Thanks,
Stefano


int vsock_stream_connect(unsigned int cid, unsigned int port);
int vsock_bind_connect(unsigned int cid, unsigned int port,
   unsigned int bind_port, int type);
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index f851f8961247..76bd17b4b291 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -20,6 +20,8 @@
#include 
#include 
#include 
+#include 
+#include 

#include "vsock_test_zerocopy.h"
#include "timeout.h"
@@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct 
test_opts *opts)
}
}

+#define MSG_BUF_IOCTL_LEN 64
+static void test_unsent_bytes_server(const struct test_opts *opts, int type)
+{
+   unsigned char buf[MSG_BUF_IOCTL_LEN];
+   int client_fd;
+
+   client_fd = vsock_accept(VMADDR_CID_ANY, 1234, NULL, type);
+   if (client_fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf));
+   control_writeln("RECEIVED");
+
+   close(client_fd);
+}
+
+static void test_unsent_bytes_client(const struct test_opts *opts, int type)
+{
+   unsigned char buf[MSG_BUF_IOCTL_LEN];
+   int ret, fd, sock_bytes_unsent;
+
+   fd = vsock_connect(opts->peer_cid, 1234, type);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   for (int i = 0; i < sizeof(buf); i++)
+   buf[i] = rand() & 0xFF;
+
+   send_buf(fd, buf, sizeof(buf), 0, sizeof(buf));
+   control_expectln("RECEIVED");
+
+   ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent);
+   if (ret < 0) {
+   if (errno == EOPNOTSUPP) {
+   fprintf(stderr, "Test skipped\n");
+   } else {
+   perror("ioctl");
+   exit(EXIT_FAILURE);
+   }
+   } else if (ret == 0 && sock_bytes_unsent != 0) {
+   fprintf(stderr,
+   "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n",
+   sock_bytes_unsent);
+   exit(EXIT_FAILURE);
+   }
+
+   close(fd);
+}
+
+static void test_stream_unsent_bytes_client(const struct test_opts *opts)
+{
+   test_unsent_bytes_client(opts, SOCK_STREAM);
+}
+
+static void test_stream_unsent_bytes_server(const struct test_opts *opts)
+{
+   test_unsent_bytes_server(opts, SOCK_STREAM);
+}
+
+static void test_seqpacket_unsent_bytes_client(const struct test_opts *opts)
+{
+   test_unsent_bytes_client(opts, SOCK_SEQPACKET);
+}
+
+static void test_seqpacket_unsent_bytes_server(const struct test_opt

Re: [PATCH v1] MAINTAINERS: add me as reviewer of AF_VSOCK and virtio-vsock

2024-07-29 Thread Stefano Garzarella

On Sun, Jul 28, 2024 at 09:33:25PM GMT, Arseniy Krasnov wrote:

I'm working on AF_VSOCK and virtio-vsock.


Yeah, thanks for the help!



Signed-off-by: Arseniy Krasnov 
---
MAINTAINERS | 2 ++
1 file changed, 2 insertions(+)


Reviewed-by: Stefano Garzarella 



diff --git a/MAINTAINERS b/MAINTAINERS
index c0a3d9e93689..2bf0987d87ed 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -24131,6 +24131,7 @@ F:  virt/lib/
VIRTIO AND VHOST VSOCK DRIVER
M:  Stefan Hajnoczi 
M:  Stefano Garzarella 
+R: Arseniy Krasnov 
L:  k...@vger.kernel.org
L:  virtualizat...@lists.linux.dev
L:  net...@vger.kernel.org
@@ -24370,6 +24371,7 @@ F:  drivers/media/test-drivers/vivid/*

VM SOCKETS (AF_VSOCK)
M:  Stefano Garzarella 
+R: Arseniy Krasnov 
L:  virtualizat...@lists.linux.dev
L:  net...@vger.kernel.org
S:  Maintained
--
2.35.0






Re: [PATCH net-next v4 1/3] vsock: add support for SIOCOUTQ ioctl

2024-07-31 Thread Stefano Garzarella

On Tue, Jul 30, 2024 at 09:43:06PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Add support for ioctl(s) in AF_VSOCK.
The only ioctl available is SIOCOUTQ/TIOCOUTQ, which returns the number
of unsent bytes in the socket. This information is transport-specific
and is delegated to them using a callback.

Suggested-by: Daan De Meyer 
Signed-off-by: Luigi Leonardi 
---
include/net/af_vsock.h   |  3 +++
net/vmw_vsock/af_vsock.c | 58 +---
2 files changed, 58 insertions(+), 3 deletions(-)


LGTM!

Reviewed-by: Stefano Garzarella 



diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 535701efc1e5..fc504d2da3d0 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -169,6 +169,9 @@ struct vsock_transport {
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);

+   /* SIOCOUTQ ioctl */
+   ssize_t (*unsent_bytes)(struct vsock_sock *vsk);
+
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 4b040285aa78..58e639e82942 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -112,6 +112,7 @@
#include 
#include 
#include 
+#include 

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@@ -1292,6 +1293,57 @@ int vsock_dgram_recvmsg(struct socket *sock, struct 
msghdr *msg,
}
EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);

+static int vsock_do_ioctl(struct socket *sock, unsigned int cmd,
+ int __user *arg)
+{
+   struct sock *sk = sock->sk;
+   struct vsock_sock *vsk;
+   int ret;
+
+   vsk = vsock_sk(sk);
+
+   switch (cmd) {
+   case SIOCOUTQ: {
+   ssize_t n_bytes;
+
+   if (!vsk->transport || !vsk->transport->unsent_bytes) {
+   ret = -EOPNOTSUPP;
+   break;
+   }
+
+   if (sock_type_connectible(sk->sk_type) && sk->sk_state == 
TCP_LISTEN) {
+   ret = -EINVAL;
+   break;
+   }
+
+   n_bytes = vsk->transport->unsent_bytes(vsk);
+   if (n_bytes < 0) {
+   ret = n_bytes;
+   break;
+   }
+
+   ret = put_user(n_bytes, arg);
+   break;
+   }
+   default:
+   ret = -ENOIOCTLCMD;
+   }
+
+   return ret;
+}
+
+static int vsock_ioctl(struct socket *sock, unsigned int cmd,
+  unsigned long arg)
+{
+   int ret;
+
+   lock_sock(sock->sk);
+   ret = vsock_do_ioctl(sock, cmd, (int __user *)arg);
+   release_sock(sock->sk);
+
+   return ret;
+}
+
static const struct proto_ops vsock_dgram_ops = {
.family = PF_VSOCK,
.owner = THIS_MODULE,
@@ -1302,7 +1354,7 @@ static const struct proto_ops vsock_dgram_ops = {
.accept = sock_no_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = sock_no_listen,
.shutdown = vsock_shutdown,
.sendmsg = vsock_dgram_sendmsg,
@@ -2286,7 +2338,7 @@ static const struct proto_ops vsock_stream_ops = {
.accept = vsock_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = vsock_listen,
.shutdown = vsock_shutdown,
.setsockopt = vsock_connectible_setsockopt,
@@ -2308,7 +2360,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
.accept = vsock_accept,
.getname = vsock_getname,
.poll = vsock_poll,
-   .ioctl = sock_no_ioctl,
+   .ioctl = vsock_ioctl,
.listen = vsock_listen,
.shutdown = vsock_shutdown,
.setsockopt = vsock_connectible_setsockopt,

--
2.45.2







Re: [PATCH net-next v4 2/3] vsock/virtio: add SIOCOUTQ support for all virtio based transports

2024-07-31 Thread Stefano Garzarella

On Tue, Jul 30, 2024 at 09:43:07PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Introduce support for virtio_transport_unsent_bytes
ioctl for virtio_transport, vhost_vsock and vsock_loopback.

For all transports the unsent bytes counter is incremented
in virtio_transport_get_credit.

In virtio_transport (G2H) and in vhost-vsock (H2G) the counter
is decremented when the skbuff is consumed. In vsock_loopback the
same skbuff is passed from the transmitter to the receiver, so
the counter is decremented before queuing the skbuff to the
receiver.

Signed-off-by: Luigi Leonardi 
---
drivers/vhost/vsock.c   |  4 +++-
include/linux/virtio_vsock.h|  6 ++
net/vmw_vsock/virtio_transport.c|  4 +++-
net/vmw_vsock/virtio_transport_common.c | 35 +
net/vmw_vsock/vsock_loopback.c  |  6 ++
5 files changed, 53 insertions(+), 2 deletions(-)



Reviewed-by: Stefano Garzarella 




diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index bf664ec9341b..802153e23073 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
restart_tx = true;
}

-   consume_skb(skb);
+   virtio_transport_consume_skb_sent(skb, true);
}
} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
if (added)
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
.notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,

+   .unsent_bytes = virtio_transport_unsent_bytes,
+
.read_skb = virtio_transport_read_skb,
},

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index c82089dee0c8..0387d64e2c66 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -133,6 +133,7 @@ struct virtio_vsock_sock {
u32 tx_cnt;
u32 peer_fwd_cnt;
u32 peer_buf_alloc;
+   size_t bytes_unsent;

/* Protected by rx_lock */
u32 fwd_cnt;
@@ -193,6 +194,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock 
*vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk);

+ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk);
+
+void virtio_transport_consume_skb_sent(struct sk_buff *skb,
+  bool consume);
+
int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 struct vsock_sock *psk);
int
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 64a07acfef12..e0160da4ef43 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -311,7 +311,7 @@ static void virtio_transport_tx_work(struct work_struct 
*work)

virtqueue_disable_cb(vq);
while ((skb = virtqueue_get_buf(vq, &len)) != NULL) {
-   consume_skb(skb);
+   virtio_transport_consume_skb_sent(skb, true);
added = true;
}
} while (!virtqueue_enable_cb(vq));
@@ -540,6 +540,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
.notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,

+   .unsent_bytes = virtio_transport_unsent_bytes,
+
.read_skb = virtio_transport_read_skb,
},

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 16ff976a86e3..884ee128851e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -463,6 +463,26 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock 
*vvs, struct sk_buff *
}
EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);

+void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume)
+{
+   struct sock *s = skb->sk;
+
+   if (s && skb->len) {
+   struct vsock_sock *vs = vsock_sk(s);
+   struct virtio_vsock_sock *vvs;
+
+   vvs = vs->trans;
+
+   spin_lock_bh(&vvs->tx_lock);
+   vvs->bytes_unsent -= skb->len;
+   spin_unlock_bh(&vvs->tx_lock);
+   }
+
+   if (consume)
+   consume_skb(skb);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent);
+
u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
{
u32 ret;
@@ -475,6 +495,7 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock 
*vvs, u32 credit)
if (ret > credit)

Re: [PATCH net-next v4 3/3] test/vsock: add ioctl unsent bytes test

2024-07-31 Thread Stefano Garzarella

On Tue, Jul 30, 2024 at 09:43:08PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

Introduce two tests, one for SOCK_STREAM and one for SOCK_SEQPACKET,
which use SIOCOUTQ ioctl to check that the number of unsent bytes is
zero after delivering a packet.

vsock_connect and vsock_accept are no longer static: this is to
create more generic tests, allowing code to be reused for SEQPACKET
and STREAM.


Yeah, good idea. We should use them for other tests as well.
(for the future)



Signed-off-by: Luigi Leonardi 
---
tools/testing/vsock/util.c   |  6 +--
tools/testing/vsock/util.h   |  3 ++
tools/testing/vsock/vsock_test.c | 85 
3 files changed, 91 insertions(+), 3 deletions(-)


LGTM and I ran them. All good :-)

Reviewed-by: Stefano Garzarella 



diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 554b290fefdc..a3d448a075e3 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, 
unsigned int bind_po
}

/* Connect to <cid, port> and return the file descriptor. */
-static int vsock_connect(unsigned int cid, unsigned int port, int type)
+int vsock_connect(unsigned int cid, unsigned int port, int type)
{
union {
struct sockaddr sa;
@@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int 
port, int type)
/* Listen on <cid, port> and return the first incoming connection.  The remote
 * address is stored to clientaddrp.  clientaddrp may be NULL.
 */
-static int vsock_accept(unsigned int cid, unsigned int port,
-   struct sockaddr_vm *clientaddrp, int type)
+int vsock_accept(unsigned int cid, unsigned int port,
+struct sockaddr_vm *clientaddrp, int type)
{
union {
struct sockaddr sa;
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index e95e62485959..fff22d4a14c0 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -39,6 +39,9 @@ struct test_case {
void init_signals(void);
unsigned int parse_cid(const char *str);
unsigned int parse_port(const char *str);
+int vsock_connect(unsigned int cid, unsigned int port, int type);
+int vsock_accept(unsigned int cid, unsigned int port,
+struct sockaddr_vm *clientaddrp, int type);
int vsock_stream_connect(unsigned int cid, unsigned int port);
int vsock_bind_connect(unsigned int cid, unsigned int port,
   unsigned int bind_port, int type);
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index f851f8961247..8d38dbf8f41f 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -20,6 +20,8 @@
#include 
#include 
#include 
+#include 
+#include 

#include "vsock_test_zerocopy.h"
#include "timeout.h"
@@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct 
test_opts *opts)
}
}

+#define MSG_BUF_IOCTL_LEN 64
+static void test_unsent_bytes_server(const struct test_opts *opts, int type)
+{
+   unsigned char buf[MSG_BUF_IOCTL_LEN];
+   int client_fd;
+
+   client_fd = vsock_accept(VMADDR_CID_ANY, opts->peer_port, NULL, type);
+   if (client_fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf));
+   control_writeln("RECEIVED");
+
+   close(client_fd);
+}
+
+static void test_unsent_bytes_client(const struct test_opts *opts, int type)
+{
+   unsigned char buf[MSG_BUF_IOCTL_LEN];
+   int ret, fd, sock_bytes_unsent;
+
+   fd = vsock_connect(opts->peer_cid, opts->peer_port, type);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   for (int i = 0; i < sizeof(buf); i++)
+   buf[i] = rand() & 0xFF;
+
+   send_buf(fd, buf, sizeof(buf), 0, sizeof(buf));
+   control_expectln("RECEIVED");
+
+   ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent);
+   if (ret < 0) {
+   if (errno == EOPNOTSUPP) {
+   fprintf(stderr, "Test skipped, SIOCOUTQ not 
supported.\n");
+   } else {
+   perror("ioctl");
+   exit(EXIT_FAILURE);
+   }
+   } else if (ret == 0 && sock_bytes_unsent != 0) {
+   fprintf(stderr,
+   "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n",
+   sock_bytes_unsent);
+   exit(EXIT_FAILURE);
+   }
+
+   close(fd);
+}
+
+static void test_stream_unsent_bytes_client(const struct test_opts *opts)
+{
+   test_unsent_bytes_client(opts, SOCK_STREAM);
+}
+
+static void test_stream_unsent_bytes_server(const struct test_opts *opts)

Re: [PATCH net-next v4 2/2] vsock/virtio: avoid queuing packets when intermediate queue is empty

2024-07-31 Thread Stefano Garzarella

On Tue, Jul 30, 2024 at 09:47:32PM GMT, Luigi Leonardi via B4 Relay wrote:

From: Luigi Leonardi 

When the driver needs to send new packets to the device, it always
queues the new sk_buffs into an intermediate queue (send_pkt_queue)
and schedules a worker (send_pkt_work) to then queue them into the
virtqueue exposed to the device.

This increases the chance of batching, but also introduces a lot of
latency into the communication. So we can optimize this path by
adding a fast path to be taken when there is no element in the
intermediate queue, there is space available in the virtqueue,
and no other process that is sending packets (tx_lock held).

The following benchmarks were run to check improvements in latency and
throughput. The test bed is a host with Intel i7-10700KF CPU @ 3.80GHz
and L1 guest running on QEMU/KVM with vhost process and all vCPUs
pinned individually to pCPUs.

- Latency
  Tool: Fio version 3.37-56
  Mode: pingpong (h-g-h)
  Test runs: 50
  Runtime-per-test: 50s
  Type: SOCK_STREAM

In the following fio benchmark (pingpong mode) the host sends
a payload to the guest and waits for the same payload back.

fio process pinned both inside the host and the guest system.

Before: Linux 6.9.8

Payload 64B:

1st perc.   overall 99th perc.
Before  12.91   16.78   42.24   us
After   9.77    13.57   39.17   us

Payload 512B:

1st perc.   overall 99th perc.
Before  13.35   17.35   41.52   us
After   10.25   14.11   39.58   us

Payload 4K:

1st perc.   overall 99th perc.
Before  14.71   19.87   41.52   us
After   10.51   14.96   40.81   us

- Throughput
  Tool: iperf-vsock

The size represents the buffer length (-l) to read/write
P represents the number of parallel streams

P=1
        4K      64K     128K
Before  6.87    29.3    29.5 Gb/s
After   10.5    39.4    39.9 Gb/s

P=2
        4K      64K     128K
Before  10.5    32.8    33.2 Gb/s
After   17.8    47.7    48.5 Gb/s

P=4
        4K      64K     128K
Before  12.7    33.6    34.2 Gb/s
After   16.9    48.1    50.5 Gb/s


Great improvement! Thanks again for this work!



The performance improvement is related to this optimization:
I used an eBPF kretprobe on virtio_transport_send_skb to check
that each packet was sent directly to the virtqueue.

Co-developed-by: Marco Pinna 
Signed-off-by: Marco Pinna 
Signed-off-by: Luigi Leonardi 
---
net/vmw_vsock/virtio_transport.c | 39 +++
1 file changed, 35 insertions(+), 4 deletions(-)


All my comments have been resolved. I let iperf run bidirectionally for 
a long time and saw no problems, so:


Reviewed-by: Stefano Garzarella 




diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index f641e906f351..f992f9a216f0 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -208,6 +208,28 @@ virtio_transport_send_pkt_work(struct work_struct *work)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

+/* Caller need to hold RCU for vsock.
+ * Returns 0 if the packet is successfully put on the vq.
+ */
+static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, 
struct sk_buff *skb)
+{
+   struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX];
+   int ret;
+
+   /* Inside RCU, can't sleep! */
+   ret = mutex_trylock(&vsock->tx_lock);
+   if (unlikely(ret == 0))
+   return -EBUSY;
+
+   ret = virtio_transport_send_skb(skb, vq, vsock);
+   if (ret == 0)
+   virtqueue_kick(vq);
+
+   mutex_unlock(&vsock->tx_lock);
+
+   return ret;
+}
+
static int
virtio_transport_send_pkt(struct sk_buff *skb)
{
@@ -231,11 +253,20 @@ virtio_transport_send_pkt(struct sk_buff *skb)
goto out_rcu;
}

-   if (virtio_vsock_skb_reply(skb))
-   atomic_inc(&vsock->queued_replies);
+   /* If send_pkt_queue is empty, we can safely bypass this queue
+* because packet order is maintained and (try) to put the packet
+* on the virtqueue using virtio_transport_send_skb_fast_path.
+* If this fails we simply put the packet on the intermediate
+* queue and schedule the worker.
+*/
+   if (!skb_queue_empty_lockless(&vsock->send_pkt_queue) ||
+   virtio_transport_send_skb_fast_path(vsock, skb)) {
+   if (virtio_vsock_skb_reply(skb))
+   atomic_inc(&vsock->queued_replies);

-   virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
-   queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+   virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
+   queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+   }

out_rcu:
rcu_read_unlock();

--
2.45.2







Re: [PATCH net-next v4 0/2] vsock: avoid queuing on intermediate queue if possible

2024-08-05 Thread Stefano Garzarella

Hi Michael,
this series is marked as "Not Applicable" for the net-next tree:
https://patchwork.kernel.org/project/netdevbpf/patch/20240730-pinna-v4-2-5c9179164...@outlook.com/

Actually this is more about the virtio-vsock driver, so can you queue 
this on your tree?


Thanks,
Stefano

On Tue, Jul 30, 2024 at 09:47:30PM GMT, Luigi Leonardi via B4 Relay wrote:

This series introduces an optimization for vsock/virtio to reduce latency
and increase the throughput: When the guest sends a packet to the host,
and the intermediate queue (send_pkt_queue) is empty, if there is enough
space, the packet is put directly in the virtqueue.

v3->v4
While running experiments on fio with 64B payload, I realized that there
was a mistake in my fio configuration, so I re-ran all the experiments
and now the latency numbers are indeed lower with the patch applied.
I also noticed that I was kicking the host without the lock.

- Fixed a configuration mistake on fio and re-ran all experiments.
- Fio latency measurement using 64B payload.
- virtio_transport_send_skb_fast_path sends kick with the tx_lock acquired
- Addressed all minor style changes requested by maintainer.
- Rebased on latest net-next
- Link to v3: 
https://lore.kernel.org/r/20240711-pinna-v3-0-697d4164f...@outlook.com

v2->v3
- Performed more experiments using iperf3 using multiple streams
- Handling of reply packets removed from virtio_transport_send_skb,
 as is needed just by the worker.
- Removed atomic_inc/atomic_sub when queuing directly to the vq.
- Introduced virtio_transport_send_skb_fast_path that handles the
 steps for sending on the vq.
- Fixed a missing mutex_unlock in error path.
- Changed authorship of the second commit
- Rebased on latest net-next

v1->v2
In this v2 I replaced a mutex_lock with a mutex_trylock because it was
inside an RCU critical section. I also added a check on tx_run, so if the
module is being removed the packet is not queued. I'd like to thank Stefano
for reporting the tx_run issue.

Applied all Stefano's suggestions:
   - Minor code style changes
   - Minor commit text rewrite
Performed more experiments:
- Check if all the packets go directly to the vq (Matias' suggestion)
- Used iperf3 to see if there is any improvement in overall throughput
 from guest to host
- Pinned the vhost process to a pCPU.
- Run fio using 512B payload
Rebased on latest net-next

---
Luigi Leonardi (1):
 vsock/virtio: avoid queuing packets when intermediate queue is empty

Marco Pinna (1):
 vsock/virtio: refactor virtio_transport_send_pkt_work

net/vmw_vsock/virtio_transport.c | 144 +--
1 file changed, 94 insertions(+), 50 deletions(-)
---
base-commit: 1722389b0d863056d78287a120a1d6cadb8d4f7b
change-id: 20240730-pinna-db8cc1b8b037

Best regards,
--
Luigi Leonardi 








Re: BUG: stack guard page was hit in vsock_connectible_recvmsg

2024-08-05 Thread Stefano Garzarella

Hi,

On Mon, Aug 05, 2024 at 08:44:11AM GMT, Ubisectech Sirius wrote:


Hello.
We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. 
Recently, our team has discovered an issue in Linux kernel 6.8. Attached to the 
email was a PoC file of the issue.


Thanks for the report!

It looks like this is related to the net/vmw_vsock/vsock_bpf.c, so I'm 
CCing Bobby who developed that.


@Bobby if you have time, please take a look.

I'm trying to replicate on a VM with 6.8 kernel, but for now I can't 
reproduce it.


How reproducible is it in your system?

I see that the reproducer was generated by syzkaller.
Is that internal or public instance?
In the second case, do you have a link to the report?

From the report I see that you're using 6.8.0.
Is it the upstream version (commit 
e8f897f4afef0031fe618a8e94127a0934896aba)?

Can you replicate this with more recent versions as well?

Thanks,
Stefano



Stack dump:
BUG: TASK stack guard page was hit at c90001b27f88 (stack is 
c90001b28000..c90001b3)

stack guard page:  [#1] PREEMPT SMP KASAN NOPTI
CPU: 0 PID: 8069 Comm: syz-executor293 Not tainted 6.8.0 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:mark_lock+0x25/0xd60 kernel/locking/lockdep.c:4639
Code: 90 90 90 90 90 55 48 89 e5 41 57 41 56 41 55 41 54 41 89 d4 48 ba 00 00 00 00 
00 fc ff df 53 48 83 e4 f0 48 81 ec 10 01 00 00 <48> c7 44 24 30 b3 8a b5 41 48 
8d 5c 24 30 48 c7 44 24 38 00 88 b9
RSP: 0018:c90001b27f90 EFLAGS: 00010086
RAX: 0004 RBX: 888042cd2fa2 RCX: 888042cd2f64
RDX: dc00 RSI: 888042cd2f80 RDI: 888042cd24c0
RBP: c90001b280c8 R08: 0001 R09: fbfff2711214
R10: 938890a7 R11:  R12: 0002
R13:  R14: 888042cd24c0 R15: 0004073c
FS:  558f13c0() GS:88802c60() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: c90001b27f88 CR3: 48cb8000 CR4: 00750ef0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
PKRU: 5554
Call Trace:
<#DF>


mark_usage kernel/locking/lockdep.c:4587 [inline]
__lock_acquire+0x91e/0x3bc0 kernel/locking/lockdep.c:5091
lock_acquire kernel/locking/lockdep.c:5754 [inline]
lock_acquire+0x1b1/0x530 kernel/locking/lockdep.c:5719
lock_sock_nested+0x3a/0xf0 net/core/sock.c:3523
lock_sock include/net/sock.h:1691 [inline]
vsock_connectible_recvmsg+0xdd/0xba0 net/vmw_vsock/af_vsock.c:2196
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline]
vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105
vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240
__vsock_recvmsg 

Re: [PATCH net-next v4 0/2] vsock: avoid queuing on intermediate queue if possible

2024-08-06 Thread Stefano Garzarella

On Tue, Aug 06, 2024 at 09:02:57AM GMT, Jakub Kicinski wrote:

On Mon, 5 Aug 2024 10:39:23 +0200 Stefano Garzarella wrote:

this series is marked as "Not Applicable" for the net-next tree:
https://patchwork.kernel.org/project/netdevbpf/patch/20240730-pinna-v4-2-5c9179164...@outlook.com/

Actually this is more about the virtio-vsock driver, so can you queue
this on your tree?


We can revive it in our patchwork, too, if that's easier.


That's perfectly fine with me, if Michael hasn't already queued it.


Not entirely sure why it was discarded, seems borderline.



Yes, even to me it's not super clear when to expect net and when virtio.
Usually the other vsock transports (VMCI and HyperV) go with net, so 
virtio-vsock is a bit of an exception.


I don't have any particular preferences, so whatever works best for you 
and Michael is fine with me.


Thanks,
Stefano




Re: [PATCH net-next v4 0/2] vsock: avoid queuing on intermediate queue if possible

2024-08-29 Thread Stefano Garzarella

On Thu, Aug 29, 2024 at 08:19:31AM GMT, Michael S. Tsirkin wrote:

On Thu, Aug 29, 2024 at 01:00:37PM +0200, Luigi Leonardi wrote:

Hi All,

It has been a while since the last email and this patch has not been merged yet.
This is just a gentle ping :)

Thanks,
Luigi



ok I can queue it for next. Next time pls remember to CC all
maintainers. Thanks!


Thanks for queueing it!

BTW, it looks like the virtio-vsock driver is listed in
"VIRTIO AND VHOST VSOCK DRIVER" but not listed under
"VIRTIO CORE AND NET DRIVERS", so running get_maintainer.pl I have this
list:

$ ./scripts/get_maintainer.pl -f net/vmw_vsock/virtio_transport.c
Stefan Hajnoczi  (maintainer:VIRTIO AND VHOST VSOCK DRIVER)
Stefano Garzarella  (maintainer:VIRTIO AND VHOST VSOCK 
DRIVER)
"David S. Miller"  (maintainer:NETWORKING [GENERAL])
Eric Dumazet  (maintainer:NETWORKING [GENERAL])
Jakub Kicinski  (maintainer:NETWORKING [GENERAL])
Paolo Abeni  (maintainer:NETWORKING [GENERAL])
k...@vger.kernel.org (open list:VIRTIO AND VHOST VSOCK DRIVER)
virtualizat...@lists.linux.dev (open list:VIRTIO AND VHOST VSOCK DRIVER)
net...@vger.kernel.org (open list:VIRTIO AND VHOST VSOCK DRIVER)
linux-kernel@vger.kernel.org (open list)

Should we add net/vmw_vsock/virtio_transport.c and related files also 
under "VIRTIO CORE AND NET DRIVERS" ?


Thanks,
Stefano





>Hi Michael,
>this series is marked as "Not Applicable" for the net-next tree:
>https://patchwork.kernel.org/project/netdevbpf/patch/20240730-pinna-v4-2-5c9179164...@outlook.com/

>Actually this is more about the virtio-vsock driver, so can you queue
>this on your tree?

>Thanks,
>Stefano







[PATCH] MAINTAINERS: add virtio-vsock driver in the VIRTIO CORE section

2024-08-29 Thread Stefano Garzarella
The virtio-vsock driver is already under VM SOCKETS (AF_VSOCK),
managed principally with the net tree, and VIRTIO AND VHOST
VSOCK DRIVER. However, changes that only affect the virtio part
usually go with Michael's tree, so let's also put the driver in
the VIRTIO CORE section to have its maintainers in CC for changes
to the virtio-vsock driver.

Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Signed-off-by: Stefano Garzarella 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 878dcd23b331..6dcea63f396e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -24218,6 +24218,7 @@ F:  include/linux/vdpa.h
 F: include/linux/virtio*.h
 F: include/linux/vringh.h
 F: include/uapi/linux/virtio_*.h
+F: net/vmw_vsock/virtio*
 F: tools/virtio/
 F: tools/testing/selftests/drivers/net/virtio_net/
 
-- 
2.46.0




Re: [PATCH for 5.10] vdpa_sim: fix param validation in vdpasim_get_config()

2021-02-15 Thread Stefano Garzarella

On Mon, Feb 15, 2021 at 03:32:19PM +0100, Greg KH wrote:

On Thu, Feb 11, 2021 at 05:25:19PM +0100, Stefano Garzarella wrote:

Commit 65b709586e222fa6ffd4166ac7fdb5d5dad113ee upstream.


No, this really is not that commit, so please do not say it is.


Oops, sorry.




Before this patch, if 'offset + len' was equal to
sizeof(struct virtio_net_config), the entire buffer wasn't filled,
returning incorrect values to the caller.

Since 'vdpasim->config' type is 'struct virtio_net_config', we can
safely copy its content under this condition.

Commit 65b709586e22 ("vdpa_sim: add get_config callback in
vdpasim_dev_attr") unintentionally solved it upstream while
refactoring vdpa_sim.c to support multiple devices. But we don't want
to backport it to stable branches as it contains many changes.

Fixes: 2c53d0f64c06 ("vdpasim: vDPA device simulator")
Cc:  # 5.10.x
Signed-off-by: Stefano Garzarella 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 6a90fdb9cbfc..8ca178d7b02f 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -572,7 +572,7 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, 
unsigned int offset,
 {
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);

-   if (offset + len < sizeof(struct virtio_net_config))
+   if (offset + len <= sizeof(struct virtio_net_config))
memcpy(buf, (u8 *)&vdpasim->config + offset, len);
 }


I'll be glad to take a one-off patch, but why can't we take the real
upstream patch?  That is always the better long-term solution, right?


Because that patch depends on the following patches merged in v5.11-rc1 
while refactoring vdpa_sim:

  f37cbbc65178 vdpa_sim: make 'config' generic and usable for any device type
  cf1a3b35382c vdpa_sim: store parsed MAC address in a buffer
  a13b5918fdd0 vdpa_sim: add work_fn in vdpasim_dev_attr
  011c35bac5ef vdpa_sim: add supported_features field in vdpasim_dev_attr
  2f8f46188805 vdpa_sim: add device id field in vdpasim_dev_attr
  6c6e28fe4579 vdpa_sim: add struct vdpasim_dev_attr for device attributes
  36a9c3063025 vdpa_sim: rename vdpasim_config_ops variables
  423248d60d2b vdpa_sim: remove hard-coded virtq count

Maybe we can skip some of them, but IMHO it should be less risky to apply 
only this change.


If you want I can try to figure out the minimum sub-set of patches 
needed for 65b709586e22 ("vdpa_sim: add get_config callback in 
vdpasim_dev_attr").


Thanks,
Stefano



[RFC PATCH 01/10] vdpa: add get_config_size callback in vdpa_config_ops

2021-02-16 Thread Stefano Garzarella
This new callback is used to get the size of the configuration space
of vDPA devices.

Signed-off-by: Stefano Garzarella 
---
 include/linux/vdpa.h  | 4 
 drivers/vdpa/ifcvf/ifcvf_main.c   | 6 ++
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 ++
 drivers/vdpa/vdpa_sim/vdpa_sim.c  | 9 +
 4 files changed, 25 insertions(+)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 4ab5494503a8..fddf42b17573 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -150,6 +150,9 @@ struct vdpa_iova_range {
  * @set_status:Set the device status
  * @vdev: vdpa device
  * @status: virtio device status
+ * @get_config_size:   Get the size of the configuration space
+ * @vdev: vdpa device
+ * Returns size_t: configuration size
  * @get_config:Read from device specific configuration 
space
  * @vdev: vdpa device
  * @offset: offset from the beginning of
@@ -231,6 +234,7 @@ struct vdpa_config_ops {
u32 (*get_vendor_id)(struct vdpa_device *vdev);
u8 (*get_status)(struct vdpa_device *vdev);
void (*set_status)(struct vdpa_device *vdev, u8 status);
+   size_t (*get_config_size)(struct vdpa_device *vdev);
void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
   void *buf, unsigned int len);
void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 7c8bbfcf6c3e..2443271e17d2 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -332,6 +332,11 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device 
*vdpa_dev)
return IFCVF_QUEUE_ALIGNMENT;
 }
 
+static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
+{
+   return sizeof(struct virtio_net_config);
+}
+
 static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
  unsigned int offset,
  void *buf, unsigned int len)
@@ -392,6 +397,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
.get_device_id  = ifcvf_vdpa_get_device_id,
.get_vendor_id  = ifcvf_vdpa_get_vendor_id,
.get_vq_align   = ifcvf_vdpa_get_vq_align,
+   .get_config_size= ifcvf_vdpa_get_config_size,
.get_config = ifcvf_vdpa_get_config,
.set_config = ifcvf_vdpa_set_config,
.set_config_cb  = ifcvf_vdpa_set_config_cb,
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 10e9b09932eb..78043ee567b6 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1814,6 +1814,11 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
 }
 
+static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
+{
+   return sizeof(struct virtio_net_config);
+}
+
 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int 
offset, void *buf,
 unsigned int len)
 {
@@ -1900,6 +1905,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
.get_vendor_id = mlx5_vdpa_get_vendor_id,
.get_status = mlx5_vdpa_get_status,
.set_status = mlx5_vdpa_set_status,
+   .get_config_size = mlx5_vdpa_get_config_size,
.get_config = mlx5_vdpa_get_config,
.set_config = mlx5_vdpa_set_config,
.get_generation = mlx5_vdpa_get_generation,
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index d5942842432d..779ae6c144d7 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -439,6 +439,13 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, 
u8 status)
spin_unlock(&vdpasim->lock);
 }
 
+static size_t vdpasim_get_config_size(struct vdpa_device *vdpa)
+{
+   struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+   return vdpasim->dev_attr.config_size;
+}
+
 static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
 void *buf, unsigned int len)
 {
@@ -566,6 +573,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = {
.get_vendor_id  = vdpasim_get_vendor_id,
.get_status = vdpasim_get_status,
.set_status = vdpasim_set_status,
+   .get_config_size= vdpasim_get_config_size,
.get_config = vdpasim_get_config,
.set_config = vdpasim_set_config,
.get_generation = vdpasim_get_generation,
@@ -593,6 +601,7 @@ static const struct vdpa_config_ops 
vdpasim_batch_config_ops = {
.get_vendor_id  = vdpasim_get_vendor_id,
  

[RFC PATCH 02/10] vdpa: check vdpa_get_config() parameters and return bytes read

2021-02-16 Thread Stefano Garzarella
Now we have the 'get_config_size()' callback available, so we can
check that 'offset' and 'len' parameters are valid.

When these exceed boundaries, we limit the reading to the available
configuration space in the device, and we return the amount of bytes
read.

We also move vdpa_get_config() implementation in drivers/vdpa/vdpa.c,
since the function is growing.

Signed-off-by: Stefano Garzarella 
---
 include/linux/vdpa.h | 16 ++--
 drivers/vdpa/vdpa.c  | 35 +++
 2 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index fddf42b17573..8a679c98f8b1 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -332,20 +332,8 @@ static inline int vdpa_set_features(struct vdpa_device 
*vdev, u64 features)
 return ops->set_features(vdev, features);
 }
 
-
-static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset,
-  void *buf, unsigned int len)
-{
-const struct vdpa_config_ops *ops = vdev->config;
-
-   /*
-* Config accesses aren't supposed to trigger before features are set.
-* If it does happen we assume a legacy guest.
-*/
-   if (!vdev->features_valid)
-   vdpa_set_features(vdev, 0);
-   ops->get_config(vdev, offset, buf, len);
-}
+int vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
+   void *buf, unsigned int len);
 
 /**
  * vdpa_mgmtdev_ops - vdpa device ops
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 3d997b389345..9ed6c779c63c 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -51,6 +51,41 @@ static struct bus_type vdpa_bus = {
.remove = vdpa_dev_remove,
 };
 
+static int vdpa_config_size_wrap(struct vdpa_device *vdev, unsigned int offset,
+unsigned int len)
+{
+   const struct vdpa_config_ops *ops = vdev->config;
+   unsigned int config_size = ops->get_config_size(vdev);
+
+   if (offset > config_size || len > config_size)
+   return -1;
+
+   return min(len, config_size - offset);
+}
+
+int vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
+   void *buf, unsigned int len)
+{
+   const struct vdpa_config_ops *ops = vdev->config;
+   int bytes_get;
+
+   bytes_get = vdpa_config_size_wrap(vdev, offset, len);
+   if (bytes_get <= 0)
+   return bytes_get;
+
+   /*
+* Config accesses aren't supposed to trigger before features are set.
+* If it does happen we assume a legacy guest.
+*/
+   if (!vdev->features_valid)
+   vdpa_set_features(vdev, 0);
+
+   ops->get_config(vdev, offset, buf, bytes_get);
+
+   return bytes_get;
+}
+EXPORT_SYMBOL_GPL(vdpa_get_config);
+
 static void vdpa_release_dev(struct device *d)
 {
struct vdpa_device *vdev = dev_to_vdpa(d);
-- 
2.29.2



[RFC PATCH 00/10] vdpa: get/set_config() rework

2021-02-16 Thread Stefano Garzarella
Following the discussion with Michael and Jason [1], I reworked a bit
get/set_config() in vdpa.

I changed vdpa_get_config() to check the boundaries and added vdpa_set_config().
When 'offset' or 'len' parameters exceed boundaries, we limit the reading to
the available configuration space in the device, and we return the amount of
bytes read/written.

In this way the user space can pass buffers bigger than config space.
I also returned the amount of bytes read and written to user space.

Patches also available here:
https://github.com/stefano-garzarella/linux/tree/vdpa-get-set-config-refactoring

Thanks for your comments,
Stefano

[1] https://lkml.org/lkml/2021/2/10/350

Stefano Garzarella (10):
  vdpa: add get_config_size callback in vdpa_config_ops
  vdpa: check vdpa_get_config() parameters and return bytes read
  vdpa: add vdpa_set_config() helper
  vdpa: remove param checks in the get/set_config callbacks
  vdpa: remove WARN_ON() in the get/set_config callbacks
  virtio_vdpa: use vdpa_set_config()
  vhost/vdpa: use vdpa_set_config()
  vhost/vdpa: allow user space to pass buffers bigger than config space
  vhost/vdpa: use get_config_size callback in
vhost_vdpa_config_validate()
  vhost/vdpa: return configuration bytes read and written to user space

 include/linux/vdpa.h  | 22 ---
 drivers/vdpa/ifcvf/ifcvf_base.c   |  3 +-
 drivers/vdpa/ifcvf/ifcvf_main.c   |  8 +++-
 drivers/vdpa/mlx5/net/mlx5_vnet.c |  9 -
 drivers/vdpa/vdpa.c   | 51 
 drivers/vdpa/vdpa_sim/vdpa_sim.c  | 15 +---
 drivers/vhost/vdpa.c  | 64 ---
 drivers/virtio/virtio_vdpa.c  |  3 +-
 8 files changed, 116 insertions(+), 59 deletions(-)

-- 
2.29.2



[RFC PATCH 03/10] vdpa: add vdpa_set_config() helper

2021-02-16 Thread Stefano Garzarella
Let's add a function similar to vdpa_get_config() to check the
'offset' and 'len' parameters, call the set_config() device callback,
and return the amount of bytes written.

Signed-off-by: Stefano Garzarella 
---
 include/linux/vdpa.h |  2 ++
 drivers/vdpa/vdpa.c  | 16 
 2 files changed, 18 insertions(+)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 8a679c98f8b1..562fcd14f4b5 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -334,6 +334,8 @@ static inline int vdpa_set_features(struct vdpa_device 
*vdev, u64 features)
 
 int vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
void *buf, unsigned int len);
+int vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
+   const void *buf, unsigned int len);
 
 /**
  * vdpa_mgmtdev_ops - vdpa device ops
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 9ed6c779c63c..825afc690a7e 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -86,6 +86,22 @@ int vdpa_get_config(struct vdpa_device *vdev, unsigned int 
offset,
 }
 EXPORT_SYMBOL_GPL(vdpa_get_config);
 
+int vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
+   const void *buf, unsigned int len)
+{
+   const struct vdpa_config_ops *ops = vdev->config;
+   int bytes_set;
+
+   bytes_set = vdpa_config_size_wrap(vdev, offset, len);
+   if (bytes_set <= 0)
+   return bytes_set;
+
+   ops->set_config(vdev, offset, buf, bytes_set);
+
+   return bytes_set;
+}
+EXPORT_SYMBOL_GPL(vdpa_set_config);
+
 static void vdpa_release_dev(struct device *d)
 {
struct vdpa_device *vdev = dev_to_vdpa(d);
-- 
2.29.2



[RFC PATCH 04/10] vdpa: remove param checks in the get/set_config callbacks

2021-02-16 Thread Stefano Garzarella
vdpa_get_config() and vdpa_set_config() now check parameters before
calling callbacks, so we can remove these redundant checks.

Signed-off-by: Stefano Garzarella 
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +--
 drivers/vdpa/vdpa_sim/vdpa_sim.c  | 6 --
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 78043ee567b6..ab63dc9b8432 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1825,8 +1825,7 @@ static void mlx5_vdpa_get_config(struct vdpa_device 
*vdev, unsigned int offset,
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 
-   if (offset + len <= sizeof(struct virtio_net_config))
-   memcpy(buf, (u8 *)&ndev->config + offset, len);
+   memcpy(buf, (u8 *)&ndev->config + offset, len);
 }
 
 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int 
offset, const void *buf,
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 779ae6c144d7..392180c6f2cf 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -451,9 +451,6 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, 
unsigned int offset,
 {
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
 
-   if (offset + len > vdpasim->dev_attr.config_size)
-   return;
-
if (vdpasim->dev_attr.get_config)
vdpasim->dev_attr.get_config(vdpasim, vdpasim->config);
 
@@ -465,9 +462,6 @@ static void vdpasim_set_config(struct vdpa_device *vdpa, 
unsigned int offset,
 {
struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
 
-   if (offset + len > vdpasim->dev_attr.config_size)
-   return;
-
memcpy(vdpasim->config + offset, buf, len);
 
if (vdpasim->dev_attr.set_config)
-- 
2.29.2



[RFC PATCH 05/10] vdpa: remove WARN_ON() in the get/set_config callbacks

2021-02-16 Thread Stefano Garzarella
vdpa_get_config() and vdpa_set_config() now check parameters before
calling callbacks, so we can remove these warnings.

Signed-off-by: Stefano Garzarella 
---
Maybe we can skip this patch and leave the WARN_ONs in place.
What do you recommend?
---
 drivers/vdpa/ifcvf/ifcvf_base.c | 3 +--
 drivers/vdpa/ifcvf/ifcvf_main.c | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
index f2a128e56de5..5941ecf934d0 100644
--- a/drivers/vdpa/ifcvf/ifcvf_base.c
+++ b/drivers/vdpa/ifcvf/ifcvf_base.c
@@ -222,7 +222,6 @@ void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset,
u8 old_gen, new_gen, *p;
int i;
 
-   WARN_ON(offset + length > sizeof(struct virtio_net_config));
do {
old_gen = ifc_ioread8(&hw->common_cfg->config_generation);
p = dst;
@@ -240,7 +239,7 @@ void ifcvf_write_net_config(struct ifcvf_hw *hw, u64 offset,
int i;
 
p = src;
-   WARN_ON(offset + length > sizeof(struct virtio_net_config));
+
for (i = 0; i < length; i++)
ifc_iowrite8(*p++, hw->net_cfg + offset + i);
 }
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 2443271e17d2..e55f88c57461 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -343,7 +343,6 @@ static void ifcvf_vdpa_get_config(struct vdpa_device 
*vdpa_dev,
 {
struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
 
-   WARN_ON(offset + len > sizeof(struct virtio_net_config));
ifcvf_read_net_config(vf, offset, buf, len);
 }
 
@@ -353,7 +352,6 @@ static void ifcvf_vdpa_set_config(struct vdpa_device 
*vdpa_dev,
 {
struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
 
-   WARN_ON(offset + len > sizeof(struct virtio_net_config));
ifcvf_write_net_config(vf, offset, buf, len);
 }
 
-- 
2.29.2



[RFC PATCH 06/10] virtio_vdpa: use vdpa_set_config()

2021-02-16 Thread Stefano Garzarella
Instead of calling the 'set_config' callback directly, we call the
new vdpa_set_config() helper which also checks the parameters.

Signed-off-by: Stefano Garzarella 
---
 drivers/virtio/virtio_vdpa.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index e28acf482e0c..2f1c4a2dd241 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -65,9 +65,8 @@ static void virtio_vdpa_set(struct virtio_device *vdev, 
unsigned offset,
const void *buf, unsigned len)
 {
struct vdpa_device *vdpa = vd_get_vdpa(vdev);
-   const struct vdpa_config_ops *ops = vdpa->config;
 
-   ops->set_config(vdpa, offset, buf, len);
+   vdpa_set_config(vdpa, offset, buf, len);
 }
 
 static u32 virtio_vdpa_generation(struct virtio_device *vdev)
-- 
2.29.2



[RFC PATCH 07/10] vhost/vdpa: use vdpa_set_config()

2021-02-16 Thread Stefano Garzarella
Instead of calling the 'set_config' callback directly, we call the
new vdpa_set_config() helper which also checks the parameters.

Signed-off-by: Stefano Garzarella 
---
 drivers/vhost/vdpa.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ef688c8c0e0e..cdd8f24168b2 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -236,7 +236,6 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
  struct vhost_vdpa_config __user *c)
 {
struct vdpa_device *vdpa = v->vdpa;
-   const struct vdpa_config_ops *ops = vdpa->config;
struct vhost_vdpa_config config;
unsigned long size = offsetof(struct vhost_vdpa_config, buf);
u8 *buf;
@@ -250,7 +249,7 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
if (IS_ERR(buf))
return PTR_ERR(buf);
 
-   ops->set_config(vdpa, config.off, buf, config.len);
+   vdpa_set_config(vdpa, config.off, buf, config.len);
 
kvfree(buf);
return 0;
-- 
2.29.2



[RFC PATCH 08/10] vhost/vdpa: allow user space to pass buffers bigger than config space

2021-02-16 Thread Stefano Garzarella
vdpa_get_config() and vdpa_set_config() now are able to read/write
only the bytes available in the device configuration space, even if
the buffer provided is bigger than that.

Let's use this feature to allow the user space application to pass any
buffer. We limit the size of the internal bounce buffer allocated with
the device config size.

Signed-off-by: Stefano Garzarella 
---
 drivers/vhost/vdpa.c | 36 
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index cdd8f24168b2..544f8582a42b 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -185,10 +185,10 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, 
u8 __user *statusp)
return 0;
 }
 
-static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
- struct vhost_vdpa_config *c)
+static ssize_t vhost_vdpa_config_validate(struct vhost_vdpa *v,
+ struct vhost_vdpa_config *c)
 {
-   long size = 0;
+   u32 size = 0;
 
switch (v->virtio_id) {
case VIRTIO_ID_NET:
@@ -199,10 +199,7 @@ static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
if (c->len == 0)
return -EINVAL;
 
-   if (c->len > size - c->off)
-   return -E2BIG;
-
-   return 0;
+   return min(c->len, size);
 }
 
 static long vhost_vdpa_get_config(struct vhost_vdpa *v,
@@ -211,19 +208,23 @@ static long vhost_vdpa_get_config(struct vhost_vdpa *v,
struct vdpa_device *vdpa = v->vdpa;
struct vhost_vdpa_config config;
unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+   ssize_t config_size;
u8 *buf;
 
if (copy_from_user(&config, c, size))
return -EFAULT;
-   if (vhost_vdpa_config_validate(v, &config))
-   return -EINVAL;
-   buf = kvzalloc(config.len, GFP_KERNEL);
+
+   config_size = vhost_vdpa_config_validate(v, &config);
+   if (config_size <= 0)
+   return config_size;
+
+   buf = kvzalloc(config_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
 
-   vdpa_get_config(vdpa, config.off, buf, config.len);
+   vdpa_get_config(vdpa, config.off, buf, config_size);
 
-   if (copy_to_user(c->buf, buf, config.len)) {
+   if (copy_to_user(c->buf, buf, config_size)) {
kvfree(buf);
return -EFAULT;
}
@@ -238,18 +239,21 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
struct vdpa_device *vdpa = v->vdpa;
struct vhost_vdpa_config config;
unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+   ssize_t config_size;
u8 *buf;
 
if (copy_from_user(&config, c, size))
return -EFAULT;
-   if (vhost_vdpa_config_validate(v, &config))
-   return -EINVAL;
 
-   buf = vmemdup_user(c->buf, config.len);
+   config_size = vhost_vdpa_config_validate(v, &config);
+   if (config_size <= 0)
+   return config_size;
+
+   buf = vmemdup_user(c->buf, config_size);
if (IS_ERR(buf))
return PTR_ERR(buf);
 
-   vdpa_set_config(vdpa, config.off, buf, config.len);
+   vdpa_set_config(vdpa, config.off, buf, config_size);
 
kvfree(buf);
return 0;
-- 
2.29.2



[RFC PATCH 09/10] vhost/vdpa: use get_config_size callback in vhost_vdpa_config_validate()

2021-02-16 Thread Stefano Garzarella
Let's use the new 'get_config_size()' callback available instead of
using the 'virtio_id' to get the size of the device config space.

Signed-off-by: Stefano Garzarella 
---
 drivers/vhost/vdpa.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 544f8582a42b..21eea2be5afa 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -188,13 +188,8 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 
__user *statusp)
 static ssize_t vhost_vdpa_config_validate(struct vhost_vdpa *v,
  struct vhost_vdpa_config *c)
 {
-   u32 size = 0;
-
-   switch (v->virtio_id) {
-   case VIRTIO_ID_NET:
-   size = sizeof(struct virtio_net_config);
-   break;
-   }
+   struct vdpa_device *vdpa = v->vdpa;
+   u32 size = vdpa->config->get_config_size(vdpa);
 
if (c->len == 0)
return -EINVAL;
-- 
2.29.2



[RFC PATCH 10/10] vhost/vdpa: return configuration bytes read and written to user space

2021-02-16 Thread Stefano Garzarella
vdpa_get_config() and vdpa_set_config() now return the number
of bytes read and written, so let's return them to user space.

We also modify vhost_vdpa_config_validate() to return 0 (bytes read
or written) instead of an error, when the buffer length is 0.

Signed-off-by: Stefano Garzarella 
---
 drivers/vhost/vdpa.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 21eea2be5afa..b754c53171a7 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -191,9 +191,6 @@ static ssize_t vhost_vdpa_config_validate(struct vhost_vdpa 
*v,
struct vdpa_device *vdpa = v->vdpa;
u32 size = vdpa->config->get_config_size(vdpa);
 
-   if (c->len == 0)
-   return -EINVAL;
-
return min(c->len, size);
 }
 
@@ -204,6 +201,7 @@ static long vhost_vdpa_get_config(struct vhost_vdpa *v,
struct vhost_vdpa_config config;
unsigned long size = offsetof(struct vhost_vdpa_config, buf);
ssize_t config_size;
+   long ret;
u8 *buf;
 
if (copy_from_user(&config, c, size))
@@ -217,15 +215,18 @@ static long vhost_vdpa_get_config(struct vhost_vdpa *v,
if (!buf)
return -ENOMEM;
 
-   vdpa_get_config(vdpa, config.off, buf, config_size);
-
-   if (copy_to_user(c->buf, buf, config_size)) {
-   kvfree(buf);
-   return -EFAULT;
+   ret = vdpa_get_config(vdpa, config.off, buf, config_size);
+   if (ret < 0) {
+   ret = -EFAULT;
+   goto out;
}
 
+   if (copy_to_user(c->buf, buf, config_size))
+   ret = -EFAULT;
+
+out:
kvfree(buf);
-   return 0;
+   return ret;
 }
 
 static long vhost_vdpa_set_config(struct vhost_vdpa *v,
@@ -235,6 +236,7 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
struct vhost_vdpa_config config;
unsigned long size = offsetof(struct vhost_vdpa_config, buf);
ssize_t config_size;
+   long ret;
u8 *buf;
 
if (copy_from_user(&config, c, size))
@@ -248,10 +250,12 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
if (IS_ERR(buf))
return PTR_ERR(buf);
 
-   vdpa_set_config(vdpa, config.off, buf, config_size);
+   ret = vdpa_set_config(vdpa, config.off, buf, config_size);
+   if (ret < 0)
+   ret = -EFAULT;
 
kvfree(buf);
-   return 0;
+   return ret;
 }
 
 static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
-- 
2.29.2



Re: [PATCH for 5.10] vdpa_sim: fix param validation in vdpasim_get_config()

2021-02-16 Thread Stefano Garzarella

On Mon, Feb 15, 2021 at 04:23:54PM +0100, Greg KH wrote:

On Mon, Feb 15, 2021 at 04:03:21PM +0100, Stefano Garzarella wrote:

On Mon, Feb 15, 2021 at 03:32:19PM +0100, Greg KH wrote:
> On Thu, Feb 11, 2021 at 05:25:19PM +0100, Stefano Garzarella wrote:
> > Commit 65b709586e222fa6ffd4166ac7fdb5d5dad113ee upstream.
>
> No, this really is not that commit, so please do not say it is.

Oops, sorry.

>
> > Before this patch, if 'offset + len' was equal to
> > sizeof(struct virtio_net_config), the entire buffer wasn't filled,
> > returning incorrect values to the caller.
> >
> > Since 'vdpasim->config' type is 'struct virtio_net_config', we can
> > safely copy its content under this condition.
> >
> > Commit 65b709586e22 ("vdpa_sim: add get_config callback in
> > vdpasim_dev_attr") unintentionally solved it upstream while
> > refactoring vdpa_sim.c to support multiple devices. But we don't want
> > to backport it to stable branches as it contains many changes.
> >
> > Fixes: 2c53d0f64c06 ("vdpasim: vDPA device simulator")
> > Cc:  # 5.10.x
> > Signed-off-by: Stefano Garzarella 
> > ---
> >  drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim.c
> > index 6a90fdb9cbfc..8ca178d7b02f 100644
> > --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
> > +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
> > @@ -572,7 +572,7 @@ static void vdpasim_get_config(struct vdpa_device 
*vdpa, unsigned int offset,
> >  {
> >   struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
> >
> > - if (offset + len < sizeof(struct virtio_net_config))
> > + if (offset + len <= sizeof(struct virtio_net_config))
> >   memcpy(buf, (u8 *)&vdpasim->config + offset, len);
> >  }
>
> I'll be glad to take a one-off patch, but why can't we take the real
> upstream patch?  That is always the better long-term solution, right?

Because that patch depends on the following patches merged in v5.11-rc1
while refactoring vdpa_sim:
  f37cbbc65178 vdpa_sim: make 'config' generic and usable for any device type
  cf1a3b35382c vdpa_sim: store parsed MAC address in a buffer
  a13b5918fdd0 vdpa_sim: add work_fn in vdpasim_dev_attr
  011c35bac5ef vdpa_sim: add supported_features field in vdpasim_dev_attr
  2f8f46188805 vdpa_sim: add device id field in vdpasim_dev_attr
  6c6e28fe4579 vdpa_sim: add struct vdpasim_dev_attr for device attributes
  36a9c3063025 vdpa_sim: rename vdpasim_config_ops variables
  423248d60d2b vdpa_sim: remove hard-coded virtq count

Maybe we can skip some of them, but IMHO it should be less risky to apply only
this change.

If you want I can try to figure out the minimum sub-set of patches needed
for 65b709586e22 ("vdpa_sim: add get_config callback in vdpasim_dev_attr").


The minimum is always nice :)



The minimum set, including the patch that fixes the issue, is the 
following:


  65b709586e22 vdpa_sim: add get_config callback in vdpasim_dev_attr
  f37cbbc65178 vdpa_sim: make 'config' generic and usable for any device type
  cf1a3b35382c vdpa_sim: store parsed MAC address in a buffer
  6c6e28fe4579 vdpa_sim: add struct vdpasim_dev_attr for device attributes
  423248d60d2b vdpa_sim: remove hard-coded virtq count

The patches apply fairly cleanly. There are a few contextual differences 
due to the lack of the other patches:


  $ git backport-diff -u master -r linux-5.10.y..HEAD
  Key:
  [----] : patches are identical
  [####] : number of functional differences between upstream/downstream patch
  [down] : patch is downstream-only
  The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively

  001/5:[----] [--] 'vdpa_sim: remove hard-coded virtq count'
  002/5:[----] [-C] 'vdpa_sim: add struct vdpasim_dev_attr for device attributes'
  003/5:[----] [--] 'vdpa_sim: store parsed MAC address in a buffer'
  004/5:[----] [-C] 'vdpa_sim: make 'config' generic and usable for any device type'
  005/5:[----] [-C] 'vdpa_sim: add get_config callback in vdpasim_dev_attr'


If it's just too much churn for no good reason, then yes, the one-line
change above will be ok, but you need to document the heck out of why
this is not upstream and that it is a one-off thing.



Shortly I'll send the series to sta...@vger.kernel.org so you can judge 
whether it's okay or whether it would be better to resend this patch with a
better description.


Thanks
Stefano



[PATCH for 5.10 v2 0/5] vdpa_sim: fix param validation in vdpasim_get_config()

2021-02-16 Thread Stefano Garzarella
v1: https://lore.kernel.org/stable/20210211162519.215418-1-sgarz...@redhat.com/

v2:
- backport the upstream patch and related patches needed

Commit 65b709586e22 ("vdpa_sim: add get_config callback in
vdpasim_dev_attr") unintentionally solved an issue in vdpasim_get_config()
upstream while refactoring vdpa_sim.c to support multiple devices.

Before that patch, if 'offset + len' was equal to
sizeof(struct virtio_net_config), the entire buffer wasn't filled,
returning incorrect values to the caller.

Since 'vdpasim->config' type is 'struct virtio_net_config', we can
safely copy its content under this condition.

The minimum set of patches needed to backport the fix is the following:

   423248d60d2b vdpa_sim: remove hard-coded virtq count
   6c6e28fe4579 vdpa_sim: add struct vdpasim_dev_attr for device attributes
   cf1a3b35382c vdpa_sim: store parsed MAC address in a buffer
   f37cbbc65178 vdpa_sim: make 'config' generic and usable for any device type
   65b709586e22 vdpa_sim: add get_config callback in vdpasim_dev_attr

The patches apply fairly cleanly. There are a few contextual differences
due to the lack of the other patches:

   $ git backport-diff -u master -r linux-5.10.y..HEAD
   Key:
   [----] : patches are identical
   [####] : number of functional differences between upstream/downstream patch
   [down] : patch is downstream-only
   The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively

   001/5:[----] [--] 'vdpa_sim: remove hard-coded virtq count'
   002/5:[----] [-C] 'vdpa_sim: add struct vdpasim_dev_attr for device attributes'
   003/5:[----] [--] 'vdpa_sim: store parsed MAC address in a buffer'
   004/5:[----] [-C] 'vdpa_sim: make 'config' generic and usable for any device type'
   005/5:[----] [-C] 'vdpa_sim: add get_config callback in vdpasim_dev_attr'

Thanks,
Stefano

Max Gurtovoy (1):
  vdpa_sim: remove hard-coded virtq count

Stefano Garzarella (4):
  vdpa_sim: add struct vdpasim_dev_attr for device attributes
  vdpa_sim: store parsed MAC address in a buffer
  vdpa_sim: make 'config' generic and usable for any device type
  vdpa_sim: add get_config callback in vdpasim_dev_attr

 drivers/vdpa/vdpa_sim/vdpa_sim.c | 83 +++-
 1 file changed, 60 insertions(+), 23 deletions(-)

-- 
2.29.2



[PATCH for 5.10 v2 1/5] vdpa_sim: remove hard-coded virtq count

2021-02-16 Thread Stefano Garzarella
From: Max Gurtovoy 

commit 423248d60d2b655321fc49eca1545f95a1bc9d6c upstream.

Add a new attribute that will define the number of virt queues to be
created for the vdpasim device.

Signed-off-by: Max Gurtovoy 
[sgarzare: replace kmalloc_array() with kcalloc()]
Acked-by: Jason Wang 
Signed-off-by: Stefano Garzarella 
Link: https://lore.kernel.org/r/20201215144256.155342-4-sgarz...@redhat.com
Signed-off-by: Michael S. Tsirkin 
Cc:  # 5.10.x
Signed-off-by: Stefano Garzarella 
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 6a90fdb9cbfc..ee8f24a4643b 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -70,7 +70,7 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
 /* State of each vdpasim device */
 struct vdpasim {
struct vdpa_device vdpa;
-   struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM];
+   struct vdpasim_virtqueue *vqs;
struct work_struct work;
/* spinlock to synchronize virtqueue state */
spinlock_t lock;
@@ -80,6 +80,7 @@ struct vdpasim {
u32 status;
u32 generation;
u64 features;
+   int nvqs;
/* spinlock to synchronize iommu table */
spinlock_t iommu_lock;
 };
@@ -144,7 +145,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim)
 {
int i;
 
-   for (i = 0; i < VDPASIM_VQ_NUM; i++)
+   for (i = 0; i < vdpasim->nvqs; i++)
vdpasim_vq_reset(&vdpasim->vqs[i]);
 
spin_lock(&vdpasim->iommu_lock);
@@ -350,7 +351,7 @@ static struct vdpasim *vdpasim_create(void)
const struct vdpa_config_ops *ops;
struct vdpasim *vdpasim;
struct device *dev;
-   int ret = -ENOMEM;
+   int i, ret = -ENOMEM;
 
if (batch_mapping)
ops = &vdpasim_net_batch_config_ops;
@@ -361,6 +362,7 @@ static struct vdpasim *vdpasim_create(void)
if (!vdpasim)
goto err_alloc;
 
+   vdpasim->nvqs = VDPASIM_VQ_NUM;
INIT_WORK(&vdpasim->work, vdpasim_work);
spin_lock_init(&vdpasim->lock);
spin_lock_init(&vdpasim->iommu_lock);
@@ -371,6 +373,11 @@ static struct vdpasim *vdpasim_create(void)
goto err_iommu;
set_dma_ops(dev, &vdpasim_dma_ops);
 
+   vdpasim->vqs = kcalloc(vdpasim->nvqs, sizeof(struct vdpasim_virtqueue),
+  GFP_KERNEL);
+   if (!vdpasim->vqs)
+   goto err_iommu;
+
vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
if (!vdpasim->iommu)
goto err_iommu;
@@ -389,8 +396,8 @@ static struct vdpasim *vdpasim_create(void)
eth_random_addr(vdpasim->config.mac);
}
 
-   vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu);
-   vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu);
+   for (i = 0; i < vdpasim->nvqs; i++)
+   vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);
 
vdpasim->vdpa.dma_dev = dev;
ret = vdpa_register_device(&vdpasim->vdpa);
@@ -659,6 +666,7 @@ static void vdpasim_free(struct vdpa_device *vdpa)
kfree(vdpasim->buffer);
if (vdpasim->iommu)
vhost_iotlb_free(vdpasim->iommu);
+   kfree(vdpasim->vqs);
 }
 
 static const struct vdpa_config_ops vdpasim_net_config_ops = {
-- 
2.29.2



  1   2   3   4   5   6   7   8   9   10   >