Re: [PATCH net-next v3 1/3] vsock/virtio: use skb_frag_*() helpers
On Wed, Dec 20, 2023 at 01:45:00PM -0800, Mina Almasry wrote: Minor fix for virtio: code wanting to access the fields inside an skb frag should use the skb_frag_*() helpers, instead of accessing the fields directly. This allows for extensions where the underlying memory is not a page. Signed-off-by: Mina Almasry --- v2: - Also fix skb_frag_off() + skb_frag_size() (David) - Did not apply the reviewed-by from Stefano since the patch changed relatively much. Sorry for the delay, I was off. LGTM! Acked-by: Stefano Garzarella Possibly we can also send this patch alone if the series is still under discussion because it's definitely an improvement to the current code. Thanks, Stefano --- net/vmw_vsock/virtio_transport.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index f495b9e5186b..1748268e0694 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -153,10 +153,10 @@ virtio_transport_send_pkt_work(struct work_struct *work) * 'virt_to_phys()' later to fill the buffer descriptor. * We don't touch memory at "virtual" address of this page. */ - va = page_to_virt(skb_frag->bv_page); + va = page_to_virt(skb_frag_page(skb_frag)); sg_init_one(sgs[out_sg], - va + skb_frag->bv_offset, - skb_frag->bv_len); + va + skb_frag_off(skb_frag), + skb_frag_size(skb_frag)); out_sg++; } } -- 2.43.0.472.g3155946c3a-goog
Re: [RFC PATCH v1] vsock/test: add '--peer-port' input argument
Hi Arseniy, thanks for this patch! On Sat, Jan 13, 2024 at 12:21:10AM +0300, Arseniy Krasnov wrote: Implement port for given CID as input argument instead of using hardcoded value '1234'. This allows to run different test instances on a single CID. Port argument is not required parameter and if it is not set, then default value will be '1234' - thus we preserve previous behaviour. Signed-off-by: Arseniy Krasnov --- tools/testing/vsock/util.c| 17 +++- tools/testing/vsock/util.h| 4 + tools/testing/vsock/vsock_diag_test.c | 18 - tools/testing/vsock/vsock_test.c | 96 +-- tools/testing/vsock/vsock_test_zerocopy.c | 12 +-- tools/testing/vsock/vsock_uring_test.c| 16 +++- 6 files changed, 107 insertions(+), 56 deletions(-) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index ae2b33c21c45..554b290fefdc 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -33,8 +33,7 @@ void init_signals(void) signal(SIGPIPE, SIG_IGN); } -/* Parse a CID in string representation */ -unsigned int parse_cid(const char *str) +static unsigned int parse_uint(const char *str, const char *err_str) { char *endptr = NULL; unsigned long n; @@ -42,12 +41,24 @@ unsigned int parse_cid(const char *str) errno = 0; n = strtoul(str, &endptr, 10); if (errno || *endptr != '\0') { - fprintf(stderr, "malformed CID \"%s\"\n", str); + fprintf(stderr, "malformed %s \"%s\"\n", err_str, str); exit(EXIT_FAILURE); } return n; } +/* Parse a CID in string representation */ +unsigned int parse_cid(const char *str) +{ + return parse_uint(str, "CID"); +} + +/* Parse a port in string representation */ +unsigned int parse_port(const char *str) +{ + return parse_uint(str, "port"); +} + /* Wait for the remote to close the connection */ void vsock_wait_remote_close(int fd) { diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index 03c88d0cb861..e95e62485959 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -12,10 +12,13 @@ enum test_mode { 
TEST_MODE_SERVER }; +#define DEFAULT_PEER_PORT 1234 + /* Test runner options */ struct test_opts { enum test_mode mode; unsigned int peer_cid; + unsigned int peer_port; }; /* A test case definition. Test functions must print failures to stderr and @@ -35,6 +38,7 @@ struct test_case { void init_signals(void); unsigned int parse_cid(const char *str); +unsigned int parse_port(const char *str); int vsock_stream_connect(unsigned int cid, unsigned int port); int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type); diff --git a/tools/testing/vsock/vsock_diag_test.c b/tools/testing/vsock/vsock_diag_test.c index fa927ad16f8a..5e6049226b77 100644 --- a/tools/testing/vsock/vsock_diag_test.c +++ b/tools/testing/vsock/vsock_diag_test.c @@ -342,7 +342,7 @@ static void test_listen_socket_server(const struct test_opts *opts) } addr = { .svm = { .svm_family = AF_VSOCK, - .svm_port = 1234, + .svm_port = opts->peer_port, .svm_cid = VMADDR_CID_ANY, }, }; @@ -378,7 +378,7 @@ static void test_connect_client(const struct test_opts *opts) LIST_HEAD(sockets); struct vsock_stat *st; - fd = vsock_stream_connect(opts->peer_cid, 1234); + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); if (fd < 0) { perror("connect"); exit(EXIT_FAILURE); @@ -403,7 +403,7 @@ static void test_connect_server(const struct test_opts *opts) LIST_HEAD(sockets); int client_fd; - client_fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + client_fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); if (client_fd < 0) { perror("accept"); exit(EXIT_FAILURE); @@ -461,6 +461,11 @@ static const struct option longopts[] = { .has_arg = required_argument, .val = 'p', }, + { + .name = "peer-port", + .has_arg = required_argument, + .val = 'q', + }, { .name = "list", .has_arg = no_argument, @@ -481,7 +486,7 @@ static const struct option longopts[] = { static void usage(void) { - fprintf(stderr, "Usage: vsock_diag_test [--help] [--control-host=] --control-port= 
--mode=client|server --peer-cid= [--list] [--skip=]\n" + fprintf(stderr, "Usage: vsock_diag_test [--help] [--control-host=] --control-port= --mode=client|server --peer-cid= [--peer-port=] [--list] [--skip=]\n" "\n" " Server: vsock_diag_test --control-port=1234 --mode=server --peer
Re: [PATCH V1] vdpa_sim: reset must not run
On Wed, Jan 17, 2024 at 11:23:23AM -0800, Steve Sistare wrote: vdpasim_do_reset sets running to true, which is wrong, as it allows vdpasim_kick_vq to post work requests before the device has been configured. To fix, do not set running until VIRTIO_CONFIG_S_FEATURES_OK is set. Fixes: 0c89e2a3a9d0 ("vdpa_sim: Implement suspend vdpa op") Signed-off-by: Steve Sistare Reviewed-by: Eugenio Pérez --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index be2925d0d283..6304cb0b4770 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -160,7 +160,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim, u32 flags) } } - vdpasim->running = true; + vdpasim->running = false; spin_unlock(&vdpasim->iommu_lock); vdpasim->features = 0; @@ -483,6 +483,7 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status) mutex_lock(&vdpasim->mutex); vdpasim->status = status; + vdpasim->running = (status & VIRTIO_CONFIG_S_FEATURES_OK) != 0; mutex_unlock(&vdpasim->mutex); Should we do something similar also in vdpasim_resume() ? I mean something like this: diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index be2925d0d283..55e4633d5442 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -520,7 +520,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa) int i; mutex_lock(&vdpasim->mutex); - vdpasim->running = true; + vdpasim->running = (vdpasim->status & VIRTIO_CONFIG_S_FEATURES_OK) != 0; if (vdpasim->pending_kick) { /* Process pending descriptors */ Thanks, Stefano
Re: Re: [PATCH V1] vdpa_sim: reset must not run
On Mon, Jan 22, 2024 at 11:47:22AM +0100, Eugenio Perez Martin wrote: On Mon, Jan 22, 2024 at 11:22 AM Stefano Garzarella wrote: On Wed, Jan 17, 2024 at 11:23:23AM -0800, Steve Sistare wrote: >vdpasim_do_reset sets running to true, which is wrong, as it allows >vdpasim_kick_vq to post work requests before the device has been >configured. To fix, do not set running until VIRTIO_CONFIG_S_FEATURES_OK >is set. > >Fixes: 0c89e2a3a9d0 ("vdpa_sim: Implement suspend vdpa op") >Signed-off-by: Steve Sistare >Reviewed-by: Eugenio Pérez >--- > drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > >diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c >index be2925d0d283..6304cb0b4770 100644 >--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c >+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c >@@ -160,7 +160,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim, u32 flags) > } > } > >- vdpasim->running = true; >+ vdpasim->running = false; > spin_unlock(&vdpasim->iommu_lock); > > vdpasim->features = 0; >@@ -483,6 +483,7 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status) > > mutex_lock(&vdpasim->mutex); > vdpasim->status = status; >+ vdpasim->running = (status & VIRTIO_CONFIG_S_FEATURES_OK) != 0; > mutex_unlock(&vdpasim->mutex); Should we do something similar also in vdpasim_resume() ? 
I mean something like this: diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index be2925d0d283..55e4633d5442 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -520,7 +520,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa) int i; mutex_lock(&vdpasim->mutex); - vdpasim->running = true; + vdpasim->running = (vdpasim->status & VIRTIO_CONFIG_S_FEATURES_OK) != 0; if (vdpasim->pending_kick) { /* Process pending descriptors */ Thanks, Stefano The suspend and resume operations should not be called before DRIVER_OK, so maybe we should add that protection in drivers/vhost/vdpa.c actually? Yeah, I think so! Anyway, IMHO we should at least return an error in vdpa_sim if vdpasim_suspend/resume are called before DRIVER_OK (in another patch of course). Stefano
Re: [PATCH net-next v2] vsock/test: add '--peer-port' input argument
On Tue, Jan 23, 2024 at 10:27:50AM +0300, Arseniy Krasnov wrote: Implement port for given CID as input argument instead of using hardcoded value '1234'. This allows to run different test instances on a single CID. Port argument is not required parameter and if it is not set, then default value will be '1234' - thus we preserve previous behaviour. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Reword usage message. * Add commas after last field in 'opts' declaration. * 'RFC' -> 'net-next'. Thanks for the changes, LGTM! Reviewed-by: Stefano Garzarella tools/testing/vsock/util.c| 17 +++- tools/testing/vsock/util.h| 4 + tools/testing/vsock/vsock_diag_test.c | 21 +++-- tools/testing/vsock/vsock_test.c | 102 +- tools/testing/vsock/vsock_test_zerocopy.c | 12 +-- tools/testing/vsock/vsock_uring_test.c| 17 +++- 6 files changed, 115 insertions(+), 58 deletions(-) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index ae2b33c21c45..554b290fefdc 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -33,8 +33,7 @@ void init_signals(void) signal(SIGPIPE, SIG_IGN); } -/* Parse a CID in string representation */ -unsigned int parse_cid(const char *str) +static unsigned int parse_uint(const char *str, const char *err_str) { char *endptr = NULL; unsigned long n; @@ -42,12 +41,24 @@ unsigned int parse_cid(const char *str) errno = 0; n = strtoul(str, &endptr, 10); if (errno || *endptr != '\0') { - fprintf(stderr, "malformed CID \"%s\"\n", str); + fprintf(stderr, "malformed %s \"%s\"\n", err_str, str); exit(EXIT_FAILURE); } return n; } +/* Parse a CID in string representation */ +unsigned int parse_cid(const char *str) +{ + return parse_uint(str, "CID"); +} + +/* Parse a port in string representation */ +unsigned int parse_port(const char *str) +{ + return parse_uint(str, "port"); +} + /* Wait for the remote to close the connection */ void vsock_wait_remote_close(int fd) { diff --git a/tools/testing/vsock/util.h 
b/tools/testing/vsock/util.h index 03c88d0cb861..e95e62485959 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -12,10 +12,13 @@ enum test_mode { TEST_MODE_SERVER }; +#define DEFAULT_PEER_PORT 1234 + /* Test runner options */ struct test_opts { enum test_mode mode; unsigned int peer_cid; + unsigned int peer_port; }; /* A test case definition. Test functions must print failures to stderr and @@ -35,6 +38,7 @@ struct test_case { void init_signals(void); unsigned int parse_cid(const char *str); +unsigned int parse_port(const char *str); int vsock_stream_connect(unsigned int cid, unsigned int port); int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type); diff --git a/tools/testing/vsock/vsock_diag_test.c b/tools/testing/vsock/vsock_diag_test.c index fa927ad16f8a..9d61b1f1c4c3 100644 --- a/tools/testing/vsock/vsock_diag_test.c +++ b/tools/testing/vsock/vsock_diag_test.c @@ -342,7 +342,7 @@ static void test_listen_socket_server(const struct test_opts *opts) } addr = { .svm = { .svm_family = AF_VSOCK, - .svm_port = 1234, + .svm_port = opts->peer_port, .svm_cid = VMADDR_CID_ANY, }, }; @@ -378,7 +378,7 @@ static void test_connect_client(const struct test_opts *opts) LIST_HEAD(sockets); struct vsock_stat *st; - fd = vsock_stream_connect(opts->peer_cid, 1234); + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); if (fd < 0) { perror("connect"); exit(EXIT_FAILURE); @@ -403,7 +403,7 @@ static void test_connect_server(const struct test_opts *opts) LIST_HEAD(sockets); int client_fd; - client_fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + client_fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); if (client_fd < 0) { perror("accept"); exit(EXIT_FAILURE); @@ -461,6 +461,11 @@ static const struct option longopts[] = { .has_arg = required_argument, .val = 'p', }, + { + .name = "peer-port", + .has_arg = required_argument, + .val = 'q', + }, { .name = "list", .has_arg = no_argument, @@ 
-481,7 +486,7 @@ static const struct option longopts[] = { static void usage(void) { - fprintf(stderr, "Usag
Re: [PATCH net-next v1] vsock/test: print type for SOCK_SEQPACKET
On Wed, Jan 24, 2024 at 10:32:55PM +0300, Arseniy Krasnov wrote: SOCK_SEQPACKET is supported for virtio transport, so do not interpret such type of socket as unknown. Signed-off-by: Arseniy Krasnov --- tools/testing/vsock/vsock_diag_test.c | 2 ++ 1 file changed, 2 insertions(+) Yeah, LGTM! Reviewed-by: Stefano Garzarella diff --git a/tools/testing/vsock/vsock_diag_test.c b/tools/testing/vsock/vsock_diag_test.c index 5e6049226b77..17aeba7cbd14 100644 --- a/tools/testing/vsock/vsock_diag_test.c +++ b/tools/testing/vsock/vsock_diag_test.c @@ -39,6 +39,8 @@ static const char *sock_type_str(int type) return "DGRAM"; case SOCK_STREAM: return "STREAM"; + case SOCK_SEQPACKET: + return "SEQPACKET"; default: return "INVALID TYPE"; } -- 2.25.1
[PATCH] vhost-vdpa: fail enabling virtqueue in certain conditions
If VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is not negotiated, we expect the driver to enable virtqueue before setting DRIVER_OK. If the driver tries anyway, better to fail right away as soon as we get the ioctl. Let's also update the documentation to make it clearer. We had a problem in QEMU for not meeting this requirement, see https://lore.kernel.org/qemu-devel/20240202132521.32714-1-kw...@redhat.com/ Fixes: 9f09fd6171fe ("vdpa: accept VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK backend feature") Cc: epere...@redhat.com Signed-off-by: Stefano Garzarella --- include/uapi/linux/vhost_types.h | 3 ++- drivers/vhost/vdpa.c | 4 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index d7656908f730..5df49b6021a7 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -182,7 +182,8 @@ struct vhost_vdpa_iova_range { /* Device can be resumed */ #define VHOST_BACKEND_F_RESUME 0x5 /* Device supports the driver enabling virtqueues both before and after - * DRIVER_OK + * DRIVER_OK. If this feature is not negotiated, the virtqueues must be + * enabled before setting DRIVER_OK. */ #define VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK 0x6 /* Device may expose the virtqueue's descriptor area, driver area and diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index bc4a51e4638b..1fba305ba8c1 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -651,6 +651,10 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, case VHOST_VDPA_SET_VRING_ENABLE: if (copy_from_user(&s, argp, sizeof(s))) return -EFAULT; + if (!vhost_backend_has_feature(vq, + VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK) && + (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK)) + return -EINVAL; ops->set_vq_ready(vdpa, idx, s.num); return 0; case VHOST_VDPA_GET_VRING_GROUP: -- 2.43.0
Re: Re: [PATCH] vhost-vdpa: fail enabling virtqueue in certain conditions
On Tue, Feb 06, 2024 at 10:56:50AM -0500, Michael S. Tsirkin wrote: better @subj: try late vq enable only if negotiated I rewrote it 3/4 times, and before sending it I was not happy with the result. Thank you, much better! I'll change it in v2. Stefano
Re: Re: [PATCH] vhost-vdpa: fail enabling virtqueue in certain conditions
On Wed, Feb 07, 2024 at 11:27:14AM +0800, Jason Wang wrote: On Tue, Feb 6, 2024 at 10:52 PM Stefano Garzarella wrote: If VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is not negotiated, we expect the driver to enable virtqueue before setting DRIVER_OK. If the driver tries anyway, better to fail right away as soon as we get the ioctl. Let's also update the documentation to make it clearer. We had a problem in QEMU for not meeting this requirement, see https://lore.kernel.org/qemu-devel/20240202132521.32714-1-kw...@redhat.com/ Maybe it's better to only enable cvq when the backend supports VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK. Eugenio, any comment on this? Fixes: 9f09fd6171fe ("vdpa: accept VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK backend feature") Cc: epere...@redhat.com Signed-off-by: Stefano Garzarella --- include/uapi/linux/vhost_types.h | 3 ++- drivers/vhost/vdpa.c | 4 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index d7656908f730..5df49b6021a7 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -182,7 +182,8 @@ struct vhost_vdpa_iova_range { /* Device can be resumed */ #define VHOST_BACKEND_F_RESUME 0x5 /* Device supports the driver enabling virtqueues both before and after - * DRIVER_OK + * DRIVER_OK. If this feature is not negotiated, the virtqueues must be + * enabled before setting DRIVER_OK. 
*/ #define VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK 0x6 /* Device may expose the virtqueue's descriptor area, driver area and diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index bc4a51e4638b..1fba305ba8c1 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -651,6 +651,10 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, case VHOST_VDPA_SET_VRING_ENABLE: if (copy_from_user(&s, argp, sizeof(s))) return -EFAULT; + if (!vhost_backend_has_feature(vq, + VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK) && + (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK)) + return -EINVAL; As discussed, without VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK, we don't know if parents can do vq_ready after driver_ok. So maybe we need to keep this behaviour to unbreak some "legacy" userspace? I'm not sure it's a good idea, since "legacy" userspace is currently broken if used with a VDUSE device. So we need to fix userspace in any case, and IMHO it is better if we start to return an error, so the user understands what went wrong, because the problem in QEMU took us quite some time to figure out that we couldn't enable vq after DRIVER_OK. Since userspace is unable to understand if a vhost-vdpa device is VDUSE or not, I think we have only 2 options: either merge this patch or fix VDUSE somehow. But I think the latter is more complicated/intrusive. Thanks, Stefano For example ifcvf did: static void ifcvf_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev, u16 qid, bool ready) { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); ifcvf_set_vq_ready(vf, qid, ready); } And it did: void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready) { struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; vp_iowrite16(qid, &cfg->queue_select); vp_iowrite16(ready, &cfg->queue_enable); } Though it didn't advertise VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK? Adding LingShan for more thought.
Thanks ops->set_vq_ready(vdpa, idx, s.num); return 0; case VHOST_VDPA_GET_VRING_GROUP: -- 2.43.0
Re: [PATCH net-next 1/2] net/vsockmon: Leverage core stats allocator
On Fri, Feb 23, 2024 at 03:58:37AM -0800, Breno Leitao wrote: With commit 34d21de99cea9 ("net: Move {l,t,d}stats allocation to core and convert veth & vrf"), stats allocation could be done on net core instead of this driver. With this new approach, the driver doesn't have to bother with error handling (allocation failure checking, making sure free happens in the right spot, etc). This is core responsibility now. Remove the allocation in the vsockmon driver and leverage the network core allocation instead. Signed-off-by: Breno Leitao --- drivers/net/vsockmon.c | 16 +--- 1 file changed, 1 insertion(+), 15 deletions(-) Thanks for this patch! Reviewed-by: Stefano Garzarella diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c index b1bb1b04b664..a0b4dca36baf 100644 --- a/drivers/net/vsockmon.c +++ b/drivers/net/vsockmon.c @@ -13,19 +13,6 @@ #define DEFAULT_MTU (VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + \ sizeof(struct af_vsockmon_hdr)) -static int vsockmon_dev_init(struct net_device *dev) -{ - dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); - if (!dev->lstats) - return -ENOMEM; - return 0; -} - -static void vsockmon_dev_uninit(struct net_device *dev) -{ - free_percpu(dev->lstats); -} - struct vsockmon { struct vsock_tap vt; }; @@ -79,8 +66,6 @@ static int vsockmon_change_mtu(struct net_device *dev, int new_mtu) } static const struct net_device_ops vsockmon_ops = { - .ndo_init = vsockmon_dev_init, - .ndo_uninit = vsockmon_dev_uninit, .ndo_open = vsockmon_open, .ndo_stop = vsockmon_close, .ndo_start_xmit = vsockmon_xmit, @@ -112,6 +97,7 @@ static void vsockmon_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->mtu = DEFAULT_MTU; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; } static struct rtnl_link_ops vsockmon_link_ops __read_mostly = { -- 2.39.3
Re: [PATCH net-next 2/2] net/vsockmon: Do not set zeroed statistics
On Fri, Feb 23, 2024 at 03:58:38AM -0800, Breno Leitao wrote: Do not set rtnl_link_stats64 fields to zero, since they are zeroed before ops->ndo_get_stats64 is called in core dev_get_stats() function. Signed-off-by: Breno Leitao --- drivers/net/vsockmon.c | 3 --- 1 file changed, 3 deletions(-) Reviewed-by: Stefano Garzarella diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c index a0b4dca36baf..a1ba5169ed5d 100644 --- a/drivers/net/vsockmon.c +++ b/drivers/net/vsockmon.c @@ -46,9 +46,6 @@ static void vsockmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes); - - stats->tx_packets = 0; - stats->tx_bytes = 0; } static int vsockmon_is_valid_mtu(int new_mtu) -- 2.39.3
Re: [PATCH v3] vhost/vdpa: Add MSI translation tables to iommu for software-managed MSI
On Wed, Mar 20, 2024 at 06:19:12PM +0800, Wang Rong wrote: From: Rong Wang Once enable iommu domain for one device, the MSI translation tables have to be there for software-managed MSI. Otherwise, platform with software-managed MSI without an irq bypass function, can not get a correct memory write event from pcie, will not get irqs. The solution is to obtain the MSI phy base address from iommu reserved region, and set it to iommu MSI cookie, then translation tables will be created while request irq. Change log -- v1->v2: - add resv iotlb to avoid overlap mapping. v2->v3: - there is no need to export the iommu symbol anymore. Signed-off-by: Rong Wang --- drivers/vhost/vdpa.c | 59 +--- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index ba52d128aeb7..28b56b10372b 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -49,6 +49,7 @@ struct vhost_vdpa { struct completion completion; struct vdpa_device *vdpa; struct hlist_head as[VHOST_VDPA_IOTLB_BUCKETS]; + struct vhost_iotlb resv_iotlb; struct device dev; struct cdev cdev; atomic_t opened; @@ -247,6 +248,7 @@ static int _compat_vdpa_reset(struct vhost_vdpa *v) static int vhost_vdpa_reset(struct vhost_vdpa *v) { v->in_batch = 0; + vhost_iotlb_reset(&v->resv_iotlb); return _compat_vdpa_reset(v); } @@ -1219,10 +1221,15 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, msg->iova + msg->size - 1 > v->range.last) return -EINVAL; + if (vhost_iotlb_itree_first(&v->resv_iotlb, msg->iova, + msg->iova + msg->size - 1)) + return -EINVAL; + if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + Unnecessary new line here. 
if (vdpa->use_va) return vhost_vdpa_va_map(v, iotlb, msg->iova, msg->size, msg->uaddr, msg->perm); @@ -1307,6 +1314,45 @@ static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb, return vhost_chr_write_iter(dev, from); } +static int vhost_vdpa_resv_iommu_region(struct iommu_domain *domain, struct device *dma_dev, + struct vhost_iotlb *resv_iotlb) +{ + struct list_head dev_resv_regions; + phys_addr_t resv_msi_base = 0; + struct iommu_resv_region *region; + int ret = 0; + bool with_sw_msi = false; + bool with_hw_msi = false; + + INIT_LIST_HEAD(&dev_resv_regions); + iommu_get_resv_regions(dma_dev, &dev_resv_regions); + + list_for_each_entry(region, &dev_resv_regions, list) { + ret = vhost_iotlb_add_range_ctx(resv_iotlb, region->start, + region->start + region->length - 1, + 0, 0, NULL); + if (ret) { + vhost_iotlb_reset(resv_iotlb); + break; + } + + if (region->type == IOMMU_RESV_MSI) + with_hw_msi = true; + + if (region->type == IOMMU_RESV_SW_MSI) { + resv_msi_base = region->start; Can it happen that there are multiple regions of the IOMMU_RESV_SW_MSI type? In this case, is it correct to overwrite `resv_msi_base`? + with_sw_msi = true; + } + } + + if (!ret && !with_hw_msi && with_sw_msi) + ret = iommu_get_msi_cookie(domain, resv_msi_base); If `iommu_get_msi_cookie()` fails: - Should we avoid calling iommu_put_resv_regions()? - Should we also call `vhost_iotlb_reset(resv_iotlb)` like for the vhost_iotlb_add_range_ctx() failure ? If it is the case, maybe it's better to add an error label where do the cleanup. 
+ + iommu_put_resv_regions(dma_dev, &dev_resv_regions); + + return ret; +} + static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; @@ -1335,11 +1381,16 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) ret = iommu_attach_device(v->domain, dma_dev); if (ret) - goto err_attach; + goto err_alloc_domain; - return 0; + ret = vhost_vdpa_resv_iommu_region(v->domain, dma_dev, &v->resv_iotlb); + if (ret) + goto err_attach_device; -err_attach: + return 0; I suggest to add a new line here to separate the error path for the success path. +err_attach_device: + iommu_detach_device(v->domain, dma_dev); +err_alloc_domain: iommu_domain_free(v->domain); v->domain = NULL; return ret; @@ -1595,6 +1646,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) goto err; } + vhost_iotlb_init(&v->resv_iotlb, 0, 0); + IIUC t
Re: [PATCH] vsock/virtio: fix packet delivery to tap device
On Mon, Mar 25, 2024 at 06:12:38PM +0100, Marco Pinna wrote: Commit 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") added virtio_transport_deliver_tap_pkt() for handing packets to the vsockmon device. However, in virtio_transport_send_pkt_work(), the function is called before actually sending the packet (i.e. before placing it in the virtqueue with virtqueue_add_sgs() and checking whether it returned successfully). From here... This may cause timing issues since the sending of the packet may fail, causing it to be re-queued (possibly multiple times), while the tap device would show the packet being sent correctly. to here... This is a bit unclear, I would rephrase with something like this: Queuing the packet in the virtqueue can fail even multiple times. However, in virtio_transport_deliver_tap_pkt() we deliver the packet to the monitoring tap interface only the first time we call it. This certainly avoids seeing the same packet replicated multiple times in the monitoring interface, but it can show the packet sent with the wrong timestamp or even before we succeed in queuing it in the virtqueue. Move virtio_transport_deliver_tap_pkt() after calling virtqueue_add_sgs() and making sure it returned successfully.
Fixes: 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") Signed-off-by: Marco Pinna --- net/vmw_vsock/virtio_transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 1748268e0694..ee5d306a96d0 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -120,7 +120,6 @@ virtio_transport_send_pkt_work(struct work_struct *work) if (!skb) break; - virtio_transport_deliver_tap_pkt(skb); reply = virtio_vsock_skb_reply(skb); sgs = vsock->out_sgs; sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), @@ -170,6 +169,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) break; } + virtio_transport_deliver_tap_pkt(skb); + I was just worried that consume_skb(), called in virtio_transport_tx_work() when the host sends an interrupt to the guest after it has consumed the packet, might be called before this point, but both run with `vsock->tx_lock` held, so we are protected from this case. So, the patch LGTM, I would just clarify the commit message. Thanks, Stefano if (reply) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; int val; -- 2.44.0
Re: [PATCH v3 1/3] vhost: Add smp_rmb() in vhost_vq_avail_empty()
On Thu, Mar 28, 2024 at 10:21:47AM +1000, Gavin Shan wrote: A smp_rmb() has been missed in vhost_vq_avail_empty(), spotted by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_vq_avail_empty(). When tx_can_batch() returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit 275bf960ac697 ("vhost: better detection of available buffers"). 
Fixes: 275bf960ac69 ("vhost: better detection of available buffers") Cc: # v4.11+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang --- drivers/vhost/vhost.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) Reviewed-by: Stefano Garzarella diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 045f666b4f12..29df65b2ebf2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2799,9 +2799,19 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) r = vhost_get_avail_idx(vq, &avail_idx); if (unlikely(r)) return false; + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following +* call to vhost_get_vq_desc() will read available +* ring entries. Make sure that read happens after +* the avail_idx read. +*/ + smp_rmb(); + return false; + } - return vq->avail_idx == vq->last_avail_idx; + return true; } EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); -- 2.44.0
Re: [PATCH v3 2/3] vhost: Add smp_rmb() in vhost_enable_notify()
On Thu, Mar 28, 2024 at 10:21:48AM +1000, Gavin Shan wrote: A smp_rmb() has been missed in vhost_enable_notify(), inspired by Will. Otherwise, it's not ensured the available ring entries pushed by guest can be observed by vhost in time, leading to stale available ring entries fetched by vhost in vhost_get_vq_desc(), as reported by Yihuang Yu on NVidia's grace-hopper (ARM64) platform. /home/gavin/sandbox/qemu.main/build/qemu-system-aarch64 \ -accel kvm -machine virt,gic-version=host -cpu host \ -smp maxcpus=1,cpus=1,sockets=1,clusters=1,cores=1,threads=1 \ -m 4096M,slots=16,maxmem=64G \ -object memory-backend-ram,id=mem0,size=4096M\ : \ -netdev tap,id=vnet0,vhost=true \ -device virtio-net-pci,bus=pcie.8,netdev=vnet0,mac=52:54:00:f1:26:b0 : guest# netperf -H 10.26.1.81 -l 60 -C -c -t UDP_STREAM virtio_net virtio0: output.0:id 100 is not a head! Add the missed smp_rmb() in vhost_enable_notify(). When it returns true, it means there's still pending tx buffers. Since it might read indices, so it still can bypass the smp_rmb() in vhost_get_vq_desc(). Note that it should be safe until vq->avail_idx is changed by commit d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()"). Fixes: d3bb267bbdcb ("vhost: cache avail index in vhost_enable_notify()") Cc: # v5.18+ Reported-by: Yihuang Yu Suggested-by: Will Deacon Signed-off-by: Gavin Shan Acked-by: Jason Wang --- drivers/vhost/vhost.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) Thanks for fixing this! 
Reviewed-by: Stefano Garzarella diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 29df65b2ebf2..32686c79c41d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2848,9 +2848,19 @@ bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) &vq->avail->idx, r); return false; } + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); + if (vq->avail_idx != vq->last_avail_idx) { + /* Since we have updated avail_idx, the following +* call to vhost_get_vq_desc() will read available +* ring entries. Make sure that read happens after +* the avail_idx read. +*/ + smp_rmb(); + return true; + } - return vq->avail_idx != vq->last_avail_idx; + return false; } EXPORT_SYMBOL_GPL(vhost_enable_notify); -- 2.44.0
Re: [PATCH net v2] vsock/virtio: fix packet delivery to tap device
On Fri, Mar 29, 2024 at 05:12:59PM +0100, Marco Pinna wrote: Commit 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") added virtio_transport_deliver_tap_pkt() for handing packets to the vsockmon device. However, in virtio_transport_send_pkt_work(), the function is called before actually sending the packet (i.e. before placing it in the virtqueue with virtqueue_add_sgs() and checking whether it returned successfully). Queuing the packet in the virtqueue can fail even multiple times. However, in virtio_transport_deliver_tap_pkt() we deliver the packet to the monitoring tap interface only the first time we call it. This certainly avoids seeing the same packet replicated multiple times in the monitoring interface, but it can show the packet sent with the wrong timestamp or even before we succeed to queue it in the virtqueue. Move virtio_transport_deliver_tap_pkt() after calling virtqueue_add_sgs() and making sure it returned successfully. Fixes: 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") Cc: sta...@vge.kernel.org Signed-off-by: Marco Pinna --- net/vmw_vsock/virtio_transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Reviewed-by: Stefano Garzarella diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 1748268e0694..ee5d306a96d0 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -120,7 +120,6 @@ virtio_transport_send_pkt_work(struct work_struct *work) if (!skb) break; - virtio_transport_deliver_tap_pkt(skb); reply = virtio_vsock_skb_reply(skb); sgs = vsock->out_sgs; sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), @@ -170,6 +169,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) break; } + virtio_transport_deliver_tap_pkt(skb); + if (reply) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; int val; -- 2.44.0
Re: [PATCH] vhost-vdpa: change ioctl # for VDPA_GET_VRING_SIZE
On Tue, Apr 02, 2024 at 05:21:39PM -0400, Michael S. Tsirkin wrote: VDPA_GET_VRING_SIZE by mistake uses the already occupied ioctl # 0x80 and we never noticed - it happens to work because the direction and size are different, but confuses tools such as perf which like to look at just the number, and breaks the extra robustness of the ioctl numbering macros. To fix, sort the entries and renumber the ioctl - not too late since it wasn't in any released kernels yet. Cc: Arnaldo Carvalho de Melo Reported-by: Namhyung Kim Fixes: 1496c47065f9 ("vhost-vdpa: uapi to support reporting per vq size") Cc: "Zhu Lingshan" Signed-off-by: Michael S. Tsirkin --- Build tested only - userspace patches using this will have to adjust. I will merge this in a week or so unless I hear otherwise, and afterwards perf can update their header. Fortunately, we haven't released any kernels with this yet, right? (other than v6.9-rc*) LGTM: Reviewed-by: Stefano Garzarella include/uapi/linux/vhost.h | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index bea697390613..b95dd84eef2d 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -179,12 +179,6 @@ /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) @@ -228,10 +222,17 @@ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. 
*/ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + /* Get the queue size of a specific virtqueue. * userspace set the vring index in vhost_vring_state.index * kernel set the queue size in vhost_vring_state.num */ -#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x80, \ +#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) #endif -- MST
Re: [PATCH] vhost/vsock: always initialize seqpacket_allow
On Wed, May 15, 2024 at 11:05:43AM GMT, Michael S. Tsirkin wrote: There are two issues around seqpacket_allow: 1. seqpacket_allow is not initialized when the socket is created. Thus if features are never set, it will be read uninitialized. 2. If VIRTIO_VSOCK_F_SEQPACKET is set and then cleared, then seqpacket_allow will not be cleared appropriately (existing apps I know about don't usually do this but it's legal and there's no way to be sure no one relies on this). To fix: - initialize seqpacket_allow after allocation - set it unconditionally in set_features Reported-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com Reported-by: Jeongjun Park Fixes: ced7b713711f ("vhost/vsock: support SEQPACKET for transport") Cc: Arseny Krasnov Cc: David S. Miller Cc: Stefan Hajnoczi Signed-off-by: Michael S. Tsirkin Acked-by: Arseniy Krasnov Tested-by: Arseniy Krasnov --- Reposting now that it's been tested. drivers/vhost/vsock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Thanks for fixing this issue! Reviewed-by: Stefano Garzarella
Re: [RFC PATCH v1 0/2] send credit update during setting SO_RCVLOWAT
On Wed, Nov 08, 2023 at 10:20:02AM +0300, Arseniy Krasnov wrote: Hello, DESCRIPTION This patchset fixes old problem with hungup of both rx/tx sides and adds test for it. This happens due to non-default SO_RCVLOWAT value and deferred credit update in virtio/vsock. Link to previous old patchset: https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/ Here is what happens step by step: TEST INITIAL CONDITIONS 1) Vsock buffer size is 128KB. 2) Maximum packet size is also 64KB as defined in header (yes it is hardcoded, just to remind about that value). 3) SO_RCVLOWAT is default, e.g. 1 byte. STEPS SENDER RECEIVER 1) sends 128KB + 1 byte in a single buffer. 128KB will be sent, but for 1 byte sender will wait for free space at peer. Sender goes to sleep. 2) reads 64KB, credit update not sent 3) sets SO_RCVLOWAT to 64KB + 1 4) poll() -> wait forever, there is only 64KB available to read. So in step 4) receiver also goes to sleep, waiting for enough data or connection shutdown message from the sender. Idea to fix it is that rx kicks tx side to continue transmission (and may be close connection) when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and this value is bigger than number of available bytes to read. I've added small test for this, but not sure as it uses hardcoded value Thanks for adding the test! for maximum packet length, this value is defined in kernel header and used to control deferred credit update. And as this is not available to userspace, I can't control test parameters correctly (if one day this define will be changed - test may become useless). I see, I'll leave a comment in the patch! 
Thanks, Stefano Head for this patchset is: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=ff269e2cd5adce4ae14f883fc9c8803bc43ee1e9 Arseniy Krasnov (2): virtio/vsock: send credit update during setting SO_RCVLOWAT vsock/test: SO_RCVLOWAT + deferred credit update test drivers/vhost/vsock.c | 2 + include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport.c| 2 + net/vmw_vsock/virtio_transport_common.c | 31 ++ net/vmw_vsock/vsock_loopback.c | 2 + tools/testing/vsock/vsock_test.c| 131 6 files changed, 169 insertions(+) -- 2.25.1
Re: [RFC PATCH v1 1/2] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Wed, Nov 08, 2023 at 10:20:03AM +0300, Arseniy Krasnov wrote: This adds sending credit update message when SO_RCVLOWAT is updated and it is bigger than number of bytes in rx queue. It is needed, because 'poll()' will wait until number of bytes in rx queue will be not smaller than SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup for tx/rx is possible: sender waits for free space and receiver is waiting data in 'poll()'. Signed-off-by: Arseniy Krasnov --- drivers/vhost/vsock.c | 2 ++ include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport.c| 2 ++ net/vmw_vsock/virtio_transport_common.c | 31 + net/vmw_vsock/vsock_loopback.c | 2 ++ 5 files changed, 38 insertions(+) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f75731396b7e..ecfa5c11f5ee 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + + .set_rcvlowat = virtio_transport_set_rcvlowat }, .send_pkt = vhost_transport_send_pkt, diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index ebb3ce63d64d..97dc1bebc69c 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); +int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index af5bab1acee1..cf3431189d0c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -539,6 +539,8 @@ static struct virtio_transport virtio_transport = { 
.notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + + .set_rcvlowat = virtio_transport_set_rcvlowat }, .send_pkt = virtio_transport_send_pkt, diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e22c81435ef7..88a58163046e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1676,6 +1676,37 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto } EXPORT_SYMBOL_GPL(virtio_transport_read_skb); +int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + bool send_update = false; I'd declare this not initialized. + + spin_lock_bh(&vvs->rx_lock); + + /* If number of available bytes is less than new +* SO_RCVLOWAT value, kick sender to send more +* data, because sender may sleep in its 'send()' +* syscall waiting for enough space at our side. +*/ + if (vvs->rx_bytes < val) + send_update = true; Then here just: send_update = vvs->rx_bytes < val; + + spin_unlock_bh(&vvs->rx_lock); + + if (send_update) { + int err; + + err = virtio_transport_send_credit_update(vsk); + if (err < 0) + return err; + } + + WRITE_ONCE(sk_vsock(vsk)->sk_rcvlowat, val ? : 1); Not in this patch, but what about doing this in vsock_set_rcvlowat() in af_vsock.c? I mean avoid to return if `transport->set_rcvlowat(vsk, val)` is successfully, so set sk_rcvlowat in a single point. The rest LGTM! 
Stefano + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_rcvlowat); + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("common code for virtio vsock"); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 048640167411..388c157f6633 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + + .set_rcvlowat = virtio_transport_set_rcvlowat }, .send_pkt = vsock_loopback_send_pkt, -- 2.25.1
Re: [RFC PATCH v1 2/2] vsock/test: SO_RCVLOWAT + deferred credit update test
On Wed, Nov 08, 2023 at 10:20:04AM +0300, Arseniy Krasnov wrote: This adds test which checks, that updating SO_RCVLOWAT value also sends You can avoid "This adds", and write just "Add test ...". See https://docs.kernel.org/process/submitting-patches.html#describe-your-changes Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour. Also in the other patch. credit update message. Otherwise mutual hungup may happen when receiver didn't send credit update and then calls 'poll()' with non default SO_RCVLOWAT value (e.g. waiting enough bytes to read), while sender waits for free space at receiver's side. Signed-off-by: Arseniy Krasnov --- tools/testing/vsock/vsock_test.c | 131 +++ 1 file changed, 131 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index c1f7bc9abd22..c71b3875fd16 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1180,6 +1180,132 @@ static void test_stream_shutrd_server(const struct test_opts *opts) close(fd); } +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) What about adding a comment like the one in the cover letter about dependency with kernel values? Please add it also in the commit description. I'm thinking if we should move all the defines that depends on the kernel in some special header. + +static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts *opts) +{ + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, 1234); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + /* Send 1 byte more than peer's buffer size. 
*/ + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1; + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until peer sets needed buffer size. */ + control_expectln("SRVREADY"); + + if (send(fd, buf, buf_size, 0) != buf_size) { + perror("send failed"); + exit(EXIT_FAILURE); + } + + free(buf); + close(fd); +} + +static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts *opts) +{ + size_t recv_buf_size; + struct pollfd fds; + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE; + + if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + &buf_size, sizeof(buf_size))) { + perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + exit(EXIT_FAILURE); + } + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + control_writeln("SRVREADY"); + + /* Wait until there will be 128KB of data in rx queue. */ + while (1) { + ssize_t res; + + res = recv(fd, buf, buf_size, MSG_PEEK); + if (res == buf_size) + break; + + if (res <= 0) { + fprintf(stderr, "unexpected 'recv()' return: %zi\n", res); + exit(EXIT_FAILURE); + } + } + + /* There is 128KB of data in the socket's rx queue, +* dequeue first 64KB, credit update is not sent. +*/ + recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; + recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size); + recv_buf_size++; + + /* Updating SO_RCVLOWAT will send credit update. */ + if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, + &recv_buf_size, sizeof(recv_buf_size))) { + perror("setsockopt(SO_RCVLOWAT)"); + exit(EXIT_FAILURE); + } + + memset(&fds, 0, sizeof(fds)); + fds.fd = fd; + fds.events = POLLIN | POLLRDNORM | POLLERR | +POLLRDHUP | POLLHUP; + + /* This 'poll()' will return once we receive last byte +* sent by client. 
+*/ + if (poll(&fds, 1, -1) < 0) { + perror("poll"); + exit(EXIT_FAILURE); + } + + if (fds.revents & POLLERR) { + fprintf(stderr, "'poll()' error\n"); + exit(EXIT_FAILURE); + } + + if (fds.revents & (POLLIN | POLLRDNORM)) { + recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size); + } else { + /* These flags must be set, as there is at +* least 64KB of da
Re: [RFC PATCH v1 2/2] vsock/test: SO_RCVLOWAT + deferred credit update test
On Fri, Nov 17, 2023 at 10:12:38AM +0300, Arseniy Krasnov wrote: On 15.11.2023 14:11, Stefano Garzarella wrote: On Wed, Nov 08, 2023 at 10:20:04AM +0300, Arseniy Krasnov wrote: This adds test which checks, that updating SO_RCVLOWAT value also sends You can avoid "This adds", and write just "Add test ...". See https://docs.kernel.org/process/submitting-patches.html#describe-your-changes Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour. Also in the other patch. credit update message. Otherwise mutual hungup may happen when receiver didn't send credit update and then calls 'poll()' with non default SO_RCVLOWAT value (e.g. waiting enough bytes to read), while sender waits for free space at receiver's side. Signed-off-by: Arseniy Krasnov --- tools/testing/vsock/vsock_test.c | 131 +++ 1 file changed, 131 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index c1f7bc9abd22..c71b3875fd16 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1180,6 +1180,132 @@ static void test_stream_shutrd_server(const struct test_opts *opts) close(fd); } +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) What about adding a comment like the one in the cover letter about dependency with kernel values? Please add it also in the commit description. I'm thinking if we should move all the defines that depends on the kernel in some special header. IIUC it will be new header file in tools/testing/vsock, which includes such defines. At this moment in will contain only VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. Idea is that such defines So this only works on the virtio transport though, not the other transports, right? (but maybe the others don't have this problem, so it's fine). 
are not supposed to be used by the user (so do not move them to uapi headers), but are needed by tests to check kernel behaviour. Please correct me if I'm wrong. Right! Maybe if it's just one, we can leave it there for now, but with a comment on top explaining where it comes from. Thanks, Stefano
Re: [PATCH net v1] vsock/test: fix SEQPACKET message bounds test
On Wed, Nov 22, 2023 at 12:16:42AM +0300, Arseniy Krasnov wrote: Tune message length calculation to make this test work on machines where 'getpagesize()' returns >32KB. Now maximum message length is not hardcoded (on machines above it was smaller than 'getpagesize()' return value, thus we get negative value and test fails), but calculated at runtime and always bigger than 'getpagesize()' result. Reproduced on aarch64 with 64KB page size. It was reported to me by Bogdan, so we can add: Reported-by: Bogdan Marcynkov Fixes: 5c338112e48a ("test/vsock: rework message bounds test") Signed-off-by: Arseniy Krasnov --- tools/testing/vsock/vsock_test.c | 19 +-- 1 file changed, 13 insertions(+), 6 deletions(-) The fix LGTM and it worked on aarch64 machine. Reviewed-by: Stefano Garzarella Thanks for the fast fix! Stefano diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index f5623b8d76b7..691e44c746bf 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -353,11 +353,12 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) } #define SOCK_BUF_SIZE (2 * 1024 * 1024) -#define MAX_MSG_SIZE (32 * 1024) +#define MAX_MSG_PAGES 4 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) { unsigned long curr_hash; + size_t max_msg_size; int page_size; int msg_count; int fd; @@ -373,7 +374,8 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) curr_hash = 0; page_size = getpagesize(); - msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE; + max_msg_size = MAX_MSG_PAGES * page_size; + msg_count = SOCK_BUF_SIZE / max_msg_size; for (int i = 0; i < msg_count; i++) { size_t buf_size; @@ -383,7 +385,7 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) /* Use "small" buffers and "big" buffers. 
*/ if (i & 1) buf_size = page_size + - (rand() % (MAX_MSG_SIZE - page_size)); + (rand() % (max_msg_size - page_size)); else buf_size = 1 + (rand() % page_size); @@ -429,7 +431,6 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) unsigned long remote_hash; unsigned long curr_hash; int fd; - char buf[MAX_MSG_SIZE]; struct msghdr msg = {0}; struct iovec iov = {0}; @@ -457,8 +458,13 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) control_writeln("SRVREADY"); /* Wait, until peer sends whole data. */ control_expectln("SENDDONE"); - iov.iov_base = buf; - iov.iov_len = sizeof(buf); + iov.iov_len = MAX_MSG_PAGES * getpagesize(); + iov.iov_base = malloc(iov.iov_len); + if (!iov.iov_base) { + perror("malloc"); + exit(EXIT_FAILURE); + } + msg.msg_iov = &iov; msg.msg_iovlen = 1; @@ -483,6 +489,7 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) curr_hash += hash_djb2(msg.msg_iov[0].iov_base, recv_size); } + free(iov.iov_base); close(fd); remote_hash = control_readulong(); -- 2.25.1
Re: [RFC PATCH v3 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Wed, Nov 22, 2023 at 09:05:09PM +0300, Arseniy Krasnov wrote: Send credit update message when SO_RCVLOWAT is updated and it is bigger than number of bytes in rx queue. It is needed, because 'poll()' will wait until number of bytes in rx queue will be not smaller than SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup for tx/rx is possible: sender waits for free space and receiver is waiting data in 'poll()'. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing 'This patch adds XXX' manner. * Do not initialize 'send_update' variable - set it directly during first usage. drivers/vhost/vsock.c | 2 ++ include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport.c| 2 ++ net/vmw_vsock/virtio_transport_common.c | 28 + net/vmw_vsock/vsock_loopback.c | 2 ++ 5 files changed, 35 insertions(+) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f75731396b7e..ecfa5c11f5ee 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + + .set_rcvlowat = virtio_transport_set_rcvlowat Since now we don't set it anymore in the callback, what about following the notify_* callbacks and rename it in `notify_set_rcvlowat`? Eventually I think we can rename it in the previous patch. 
}, .send_pkt = vhost_transport_send_pkt, diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index ebb3ce63d64d..97dc1bebc69c 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); +int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index af5bab1acee1..cf3431189d0c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -539,6 +539,8 @@ static struct virtio_transport virtio_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + + .set_rcvlowat = virtio_transport_set_rcvlowat }, .send_pkt = virtio_transport_send_pkt, diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index f6dc896bf44c..4acee21b4350 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1684,6 +1684,34 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto } EXPORT_SYMBOL_GPL(virtio_transport_read_skb); +int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + bool send_update; + + spin_lock_bh(&vvs->rx_lock); + + /* If number of available bytes is less than new +* SO_RCVLOWAT value, kick sender to send more +* data, because sender may sleep in its 'send()' +* syscall waiting for enough space at our side. +*/ Let's try to use at least the full 80 characters so we can reduce the lines in this comment block. 
+ send_update = vvs->rx_bytes < val; + + spin_unlock_bh(&vvs->rx_lock); + + if (send_update) { + int err; + + err = virtio_transport_send_credit_update(vsk); + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_rcvlowat); + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("common code for virtio vsock"); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 048640167411..388c157f6633 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + + .set_rcvlowat = virtio_transport_set_rcvlowat }, .send_pkt = vsock_loopback_send_pkt, -- 2.25.1
Re: [RFC PATCH v3 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test
On Wed, Nov 22, 2023 at 09:05:10PM +0300, Arseniy Krasnov wrote: Test which checks, that updating SO_RCVLOWAT value also sends credit update message. Otherwise mutual hungup may happen when receiver didn't send credit update and then calls 'poll()' with non default SO_RCVLOWAT value (e.g. waiting enough bytes to read), while sender waits for free space at receiver's side. Important thing is that this test relies on kernel's define for maximum packet size for virtio transport and this value is not exported to user: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this define is used to control moment when to send credit update message). If this value or its usage will be changed in kernel - this test may become useless/broken. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing 'This patch adds XXX' manner. * Update commit message by adding details about dependency for this test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. * Add comment for this dependency in 'vsock_test.c' where this define is duplicated. v2 -> v3: * Replace synchronization based on control TCP socket with vsock data socket - this is needed to allow sender transmit data only when new buffer size of receiver is visible to sender. Otherwise there is race and test fails sometimes. tools/testing/vsock/vsock_test.c | 142 +++ 1 file changed, 142 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 5b0e93f9996c..773a71260fba 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1225,6 +1225,143 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) +/* This define is the same as in 'include/linux/virtio_vsock.h': + * it is used to decide when to send credit update message during + * reading from rx queue of a socket. Value and its usage in + * kernel is important for this test. 
+ */ +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) + +static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts *opts) +{ + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, 1234); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + /* Send 1 byte more than peer's buffer size. */ + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1; + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until peer sets needed buffer size. */ + recv_byte(fd, 1, 0); + + if (send(fd, buf, buf_size, 0) != buf_size) { + perror("send failed"); + exit(EXIT_FAILURE); + } + + free(buf); + close(fd); +} + +static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts *opts) +{ + size_t recv_buf_size; + struct pollfd fds; + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE; + + if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + &buf_size, sizeof(buf_size))) { + perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + exit(EXIT_FAILURE); + } + + /* Send one dummy byte here, because 'setsockopt()' above also +* sends special packet which tells sender to update our buffer +* size. This 'send_byte()' will serialize such packet with data +* reads in a loop below. Sender starts transmission only when +* it receives this single byte. +*/ + send_byte(fd, 1, 0); + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until there will be 128KB of data in rx queue. */ + while (1) { + ssize_t res; + + res = recv(fd, buf, buf_size, MSG_PEEK); + if (res == buf_size) + break; + + if (res <= 0) { + fprintf(stderr, "unexpected 'recv()' return: %zi\n", res); + exit(EXIT_FAILURE); + } + } + + /* There is 128KB of data in the socket's rx queue, +* dequeue first 64KB, credit update is not sent. 
+*/ + recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; + recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size); + recv_buf_size++; + + /* Updating SO_RCVLOWAT will send credit update. */ + if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, + &recv_buf_size, sizeof(recv_buf_size))) { + perror("setsockopt(SO_RC
Re: [RFC PATCH v3 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test
On Wed, Nov 29, 2023 at 12:16:54PM +0300, Arseniy Krasnov wrote: On 29.11.2023 12:16, Stefano Garzarella wrote: On Wed, Nov 22, 2023 at 09:05:10PM +0300, Arseniy Krasnov wrote: Test which checks that updating the SO_RCVLOWAT value also sends a credit update message. Otherwise a mutual hang may happen when the receiver didn't send a credit update and then calls 'poll()' with a non-default SO_RCVLOWAT value (e.g. waiting for enough bytes to read), while the sender waits for free space at the receiver's side. An important thing is that this test relies on the kernel's define for the maximum packet size for the virtio transport, and this value is not exported to userspace: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this define is used to control the moment when to send a credit update message). If this value or its usage is changed in the kernel, this test may become useless/broken. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing the 'This patch adds XXX' manner. * Update commit message by adding details about this test's dependency on the kernel-internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. * Add a comment about this dependency in 'vsock_test.c' where this define is duplicated. v2 -> v3: * Replace synchronization based on the control TCP socket with the vsock data socket - this is needed to allow the sender to transmit data only when the receiver's new buffer size is visible to the sender. Otherwise there is a race and the test sometimes fails. 
tools/testing/vsock/vsock_test.c | 142 +++ 1 file changed, 142 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 5b0e93f9996c..773a71260fba 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1225,6 +1225,143 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) +/* This define is the same as in 'include/linux/virtio_vsock.h': + * it is used to decide when to send credit update message during + * reading from rx queue of a socket. Value and its usage in + * kernel is important for this test. + */ +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) + +static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts *opts) +{ + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, 1234); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + /* Send 1 byte more than peer's buffer size. */ + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1; + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until peer sets needed buffer size. 
*/ + recv_byte(fd, 1, 0); + + if (send(fd, buf, buf_size, 0) != buf_size) { + perror("send failed"); + exit(EXIT_FAILURE); + } + + free(buf); + close(fd); +} + +static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts *opts) +{ + size_t recv_buf_size; + struct pollfd fds; + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE; + + if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + &buf_size, sizeof(buf_size))) { + perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + exit(EXIT_FAILURE); + } + + /* Send one dummy byte here, because 'setsockopt()' above also + * sends special packet which tells sender to update our buffer + * size. This 'send_byte()' will serialize such packet with data + * reads in a loop below. Sender starts transmission only when + * it receives this single byte. + */ + send_byte(fd, 1, 0); + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until there will be 128KB of data in rx queue. */ + while (1) { + ssize_t res; + + res = recv(fd, buf, buf_size, MSG_PEEK); + if (res == buf_size) + break; + + if (res <= 0) { + fprintf(stderr, "unexpected 'recv()' return: %zi\n", res); + exit(EXIT_FAILURE); + } + } + + /* There is 128KB of data in the socket's rx queue, + * dequeue first 64KB, credit update is not sent. + */ + recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; + recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size); + recv_buf_size++; + + /* Updating SO_RCVLOWAT will send credit update. */ + if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, + &recv_buf_size, sizeof(recv_buf_size))) { + perror("setsockopt(SO_RCVLOWA
Re: [RFC PATCH v4 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Thu, Nov 30, 2023 at 12:25:18AM +0300, Arseniy Krasnov wrote: Send credit update message when SO_RCVLOWAT is updated and it is bigger than number of bytes in rx queue. It is needed, because 'poll()' will wait until number of bytes in rx queue will be not smaller than SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup for tx/rx is possible: sender waits for free space and receiver is waiting data in 'poll()'. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing 'This patch adds XXX' manner. * Do not initialize 'send_update' variable - set it directly during first usage. v3 -> v4: * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars. drivers/vhost/vsock.c | 3 ++- include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport.c| 3 ++- net/vmw_vsock/virtio_transport_common.c | 27 + net/vmw_vsock/vsock_loopback.c | 3 ++- 5 files changed, 34 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f75731396b7e..c5e58a60a546 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -449,8 +449,9 @@ static struct virtio_transport vhost_transport = { .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, - .read_skb = virtio_transport_read_skb, + .read_skb = virtio_transport_read_skb I think it is better to avoid this change, so when we will need to add new callbacks, we don't need to edit this line again. Please avoid it also in the other place in this patch. The rest LGTM. 
Thanks, Stefano }, .send_pkt = vhost_transport_send_pkt, diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index ebb3ce63d64d..c82089dee0c8 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index af5bab1acee1..8b7bb7ca8ea5 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -537,8 +537,9 @@ static struct virtio_transport virtio_transport = { .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, - .read_skb = virtio_transport_read_skb, + .read_skb = virtio_transport_read_skb }, .send_pkt = virtio_transport_send_pkt, diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index f6dc896bf44c..1cb556ad4597 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto } EXPORT_SYMBOL_GPL(virtio_transport_read_skb); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + bool send_update; + + spin_lock_bh(&vvs->rx_lock); + + /* If number of available bytes is less than new SO_RCVLOWAT value, +* kick sender to send more data, because sender may 
sleep in its +* 'send()' syscall waiting for enough space at our side. +*/ + send_update = vvs->rx_bytes < val; + + spin_unlock_bh(&vvs->rx_lock); + + if (send_update) { + int err; + + err = virtio_transport_send_credit_update(vsk); + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat); + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("common code for virtio vsock"); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 048640167411..454f69838c2a 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -96,8 +96,9 @@ static struct virtio_transport loopback_transport = { .notify_send_pre_enqueue = virtio_transport_notify_send_pr
Re: [RFC PATCH v4 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test
On Thu, Nov 30, 2023 at 12:25:19AM +0300, Arseniy Krasnov wrote: Test which checks that updating the SO_RCVLOWAT value also sends a credit update message. Otherwise a mutual hang may happen when the receiver didn't send a credit update and then calls 'poll()' with a non-default SO_RCVLOWAT value (e.g. waiting for enough bytes to read), while the sender waits for free space at the receiver's side. An important thing is that this test relies on the kernel's define for the maximum packet size for the virtio transport, and this value is not exported to userspace: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this define is used to control the moment when to send a credit update message). If this value or its usage is changed in the kernel, this test may become useless/broken. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing the 'This patch adds XXX' manner. * Update commit message by adding details about this test's dependency on the kernel-internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. * Add a comment about this dependency in 'vsock_test.c' where this define is duplicated. v2 -> v3: * Replace synchronization based on the control TCP socket with the vsock data socket - this is needed to allow the sender to transmit data only when the receiver's new buffer size is visible to the sender. Otherwise there is a race and the test sometimes fails. v3 -> v4: * Replace 'recv_buf()' with 'recv(MSG_DONTWAIT)' in the last read operation in the server part. This is needed to ensure that 'poll()' wakes us up when the number of bytes ready to read is equal to the SO_RCVLOWAT value. 
tools/testing/vsock/vsock_test.c | 149 +++ 1 file changed, 149 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 01fa816868bc..68f7037834db 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1232,6 +1232,150 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) +/* This define is the same as in 'include/linux/virtio_vsock.h': + * it is used to decide when to send credit update message during + * reading from rx queue of a socket. Value and its usage in + * kernel is important for this test. + */ +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) + +static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts *opts) +{ + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, 1234); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + /* Send 1 byte more than peer's buffer size. */ + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1; + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until peer sets needed buffer size. 
*/ + recv_byte(fd, 1, 0); + + if (send(fd, buf, buf_size, 0) != buf_size) { + perror("send failed"); + exit(EXIT_FAILURE); + } + + free(buf); + close(fd); +} + +static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts *opts) +{ + size_t recv_buf_size; + struct pollfd fds; + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE; + + if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + &buf_size, sizeof(buf_size))) { + perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + exit(EXIT_FAILURE); + } + + /* Send one dummy byte here, because 'setsockopt()' above also +* sends special packet which tells sender to update our buffer +* size. This 'send_byte()' will serialize such packet with data +* reads in a loop below. Sender starts transmission only when +* it receives this single byte. +*/ + send_byte(fd, 1, 0); + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until there will be 128KB of data in rx queue. */ + while (1) { + ssize_t res; + + res = recv(fd, buf, buf_size, MSG_PEEK); + if (res == buf_size) + break; + + if (res <= 0) { + fprintf(stderr, "unexpected 'recv()' return: %zi\n", res); + exit(EXIT_FAILURE); + } + } + + /* There is 128KB of data in the socket's rx queue, +* dequeue first 64KB, credit update is not sent. +*/ + recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; + recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size); + recv_buf_size++;
Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote: Send credit update message when SO_RCVLOWAT is updated and it is bigger than number of bytes in rx queue. It is needed, because 'poll()' will wait until number of bytes in rx queue will be not smaller than SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup for tx/rx is possible: sender waits for free space and receiver is waiting data in 'poll()'. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing 'This patch adds XXX' manner. * Do not initialize 'send_update' variable - set it directly during first usage. v3 -> v4: * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars. v4 -> v5: * Do not change callbacks order in transport structures. drivers/vhost/vsock.c | 1 + include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport.c| 1 + net/vmw_vsock/virtio_transport_common.c | 27 + net/vmw_vsock/vsock_loopback.c | 1 + 5 files changed, 31 insertions(+) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f75731396b7e..4146f80db8ac 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat As we discussed in chat, better the order of the previous version, but leaving the line of `.read_skb` untouched (with the final comma). 
With that fixed in all transports, feel free to add: Reviewed-by: Stefano Garzarella }, .send_pkt = vhost_transport_send_pkt, diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index ebb3ce63d64d..c82089dee0c8 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index af5bab1acee1..8007593a3a93 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat }, .send_pkt = virtio_transport_send_pkt, diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index f6dc896bf44c..1cb556ad4597 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto } EXPORT_SYMBOL_GPL(virtio_transport_read_skb); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + bool send_update; + + spin_lock_bh(&vvs->rx_lock); + + /* If number of available bytes is less than new SO_RCVLOWAT value, +* kick sender to send more data, because sender may sleep in its +* 'send()' syscall waiting for enough space at our side. 
+*/ + send_update = vvs->rx_bytes < val; + + spin_unlock_bh(&vvs->rx_lock); + + if (send_update) { + int err; + + err = virtio_transport_send_credit_update(vsk); + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat); + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("common code for virtio vsock"); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 048640167411..9f4b814fbbc7 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -98,6 +98,7 @@ static struct virtio_transport loopback_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .read_skb = virtio_transport_read_skb, + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat }, .send_pkt = vsock_loopback_send_pkt, -- 2.25.1
Re: [PATCH net-next v5 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test
On Thu, Nov 30, 2023 at 04:08:40PM +0300, Arseniy Krasnov wrote: Test which checks that updating the SO_RCVLOWAT value also sends a credit update message. Otherwise a mutual hang may happen when the receiver didn't send a credit update and then calls 'poll()' with a non-default SO_RCVLOWAT value (e.g. waiting for enough bytes to read), while the sender waits for free space at the receiver's side. An important thing is that this test relies on the kernel's define for the maximum packet size for the virtio transport, and this value is not exported to userspace: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this define is used to control the moment when to send a credit update message). If this value or its usage is changed in the kernel, this test may become useless/broken. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing the 'This patch adds XXX' manner. * Update commit message by adding details about this test's dependency on the kernel-internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. * Add a comment about this dependency in 'vsock_test.c' where this define is duplicated. v2 -> v3: * Replace synchronization based on the control TCP socket with the vsock data socket - this is needed to allow the sender to transmit data only when the receiver's new buffer size is visible to the sender. Otherwise there is a race and the test sometimes fails. v3 -> v4: * Replace 'recv_buf()' with 'recv(MSG_DONTWAIT)' in the last read operation in the server part. This is needed to ensure that 'poll()' wakes us up when the number of bytes ready to read is equal to the SO_RCVLOWAT value. v4 -> v5: * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'. 
tools/testing/vsock/vsock_test.c | 142 +++ 1 file changed, 142 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 01fa816868bc..d66bc4987026 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1232,6 +1232,143 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) +/* This define is the same as in 'include/linux/virtio_vsock.h': + * it is used to decide when to send credit update message during + * reading from rx queue of a socket. Value and its usage in + * kernel is important for this test. + */ +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) + +static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts *opts) +{ + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, 1234); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + /* Send 1 byte more than peer's buffer size. */ + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1; + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until peer sets needed buffer size. 
*/ + recv_byte(fd, 1, 0); + + if (send(fd, buf, buf_size, 0) != buf_size) { + perror("send failed"); + exit(EXIT_FAILURE); + } + + free(buf); + close(fd); +} + +static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts *opts) +{ + size_t recv_buf_size; + struct pollfd fds; + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE; + + if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + &buf_size, sizeof(buf_size))) { + perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + exit(EXIT_FAILURE); + } + + /* Send one dummy byte here, because 'setsockopt()' above also +* sends special packet which tells sender to update our buffer +* size. This 'send_byte()' will serialize such packet with data +* reads in a loop below. Sender starts transmission only when +* it receives this single byte. +*/ + send_byte(fd, 1, 0); + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until there will be 128KB of data in rx queue. */ + while (1) { + ssize_t res; + + res = recv(fd, buf, buf_size, MSG_PEEK); + if (res == buf_size) + break; + + if (res <= 0) { + fprintf(stderr, "unexpected 'recv()' return: %zi\n", res); + exit(EXIT_FAILURE); + } + } + + /* There is 128KB of data in the socket's rx queue, +* dequeue first 64KB, credit update is not sent. +*/ + recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; + re
Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote: On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote: On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote: > On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote: > > > > > > On 30.11.2023 16:42, Michael S. Tsirkin wrote: > > > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote: > > >> Send credit update message when SO_RCVLOWAT is updated and it is bigger > > >> than number of bytes in rx queue. It is needed, because 'poll()' will > > >> wait until number of bytes in rx queue will be not smaller than > > >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup > > >> for tx/rx is possible: sender waits for free space and receiver is > > >> waiting data in 'poll()'. > > >> > > >> Signed-off-by: Arseniy Krasnov > > >> --- > > >> Changelog: > > >> v1 -> v2: > > >> * Update commit message by removing 'This patch adds XXX' manner. > > >> * Do not initialize 'send_update' variable - set it directly during > > >> first usage. > > >> v3 -> v4: > > >> * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars. > > >> v4 -> v5: > > >> * Do not change callbacks order in transport structures. 
> > >> > > >> drivers/vhost/vsock.c | 1 + > > >> include/linux/virtio_vsock.h| 1 + > > >> net/vmw_vsock/virtio_transport.c| 1 + > > >> net/vmw_vsock/virtio_transport_common.c | 27 + > > >> net/vmw_vsock/vsock_loopback.c | 1 + > > >> 5 files changed, 31 insertions(+) > > >> > > >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c > > >> index f75731396b7e..4146f80db8ac 100644 > > >> --- a/drivers/vhost/vsock.c > > >> +++ b/drivers/vhost/vsock.c > > >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = { > > >> .notify_buffer_size = virtio_transport_notify_buffer_size, > > >> > > >> .read_skb = virtio_transport_read_skb, > > >> +.notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat > > >> }, > > >> > > >> .send_pkt = vhost_transport_send_pkt, > > >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h > > >> index ebb3ce63d64d..c82089dee0c8 100644 > > >> --- a/include/linux/virtio_vsock.h > > >> +++ b/include/linux/virtio_vsock.h > > >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); > > >> void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); > > >> int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); > > >> int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); > > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); > > >> #endif /* _LINUX_VIRTIO_VSOCK_H */ > > >> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c > > >> index af5bab1acee1..8007593a3a93 100644 > > >> --- a/net/vmw_vsock/virtio_transport.c > > >> +++ b/net/vmw_vsock/virtio_transport.c > > >> @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = { > > >> .notify_buffer_size = virtio_transport_notify_buffer_size, > > >> > > >> .read_skb = virtio_transport_read_skb, > > >> +.notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat > > >> }, > > >> > > >> .send_pkt = 
virtio_transport_send_pkt, > > >> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c > > >> index f6dc896bf44c..1cb556ad4597 100644 > > >> --- a/net/vmw_vsock/virtio_transport_common.c > > >> +++ b/net/vmw_vsock/virtio_transport_common.c > > >> @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto > > >> } > > >> EXPORT_SYMBOL_GPL(virtio_transport_read_skb); > > >> > > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, > > >> int val) > > >> +{ > > >> +
Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Fri, Dec 01, 2023 at 11:35:56AM +0300, Arseniy Krasnov wrote: On 01.12.2023 11:27, Stefano Garzarella wrote: On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote: On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote: On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote: > On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote: > > > > > > On 30.11.2023 16:42, Michael S. Tsirkin wrote: > > > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote: > > >> Send credit update message when SO_RCVLOWAT is updated and it is bigger > > >> than number of bytes in rx queue. It is needed, because 'poll()' will > > >> wait until number of bytes in rx queue will be not smaller than > > >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup > > >> for tx/rx is possible: sender waits for free space and receiver is > > >> waiting data in 'poll()'. > > >> > > >> Signed-off-by: Arseniy Krasnov > > >> --- > > >> Changelog: > > >> v1 -> v2: > > >> * Update commit message by removing 'This patch adds XXX' manner. > > >> * Do not initialize 'send_update' variable - set it directly during > > >> first usage. > > >> v3 -> v4: > > >> * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars. > > >> v4 -> v5: > > >> * Do not change callbacks order in transport structures. 
> > >> > > >> drivers/vhost/vsock.c | 1 + > > >> include/linux/virtio_vsock.h | 1 + > > >> net/vmw_vsock/virtio_transport.c | 1 + > > >> net/vmw_vsock/virtio_transport_common.c | 27 + > > >> net/vmw_vsock/vsock_loopback.c | 1 + > > >> 5 files changed, 31 insertions(+) > > >> > > >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c > > >> index f75731396b7e..4146f80db8ac 100644 > > >> --- a/drivers/vhost/vsock.c > > >> +++ b/drivers/vhost/vsock.c > > >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = { > > >> .notify_buffer_size = virtio_transport_notify_buffer_size, > > >> > > >> .read_skb = virtio_transport_read_skb, > > >> + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat > > >> }, > > >> > > >> .send_pkt = vhost_transport_send_pkt, > > >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h > > >> index ebb3ce63d64d..c82089dee0c8 100644 > > >> --- a/include/linux/virtio_vsock.h > > >> +++ b/include/linux/virtio_vsock.h > > >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); > > >> void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); > > >> int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); > > >> int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); > > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); > > >> #endif /* _LINUX_VIRTIO_VSOCK_H */ > > >> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c > > >> index af5bab1acee1..8007593a3a93 100644 > > >> --- a/net/vmw_vsock/virtio_transport.c > > >> +++ b/net/vmw_vsock/virtio_transport.c > > >> @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = { > > >> .notify_buffer_size = virtio_transport_notify_buffer_size, > > >> > > >> .read_skb = virtio_transport_read_skb, > > >> + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat > > >> }, > > >> > > >> .send_pkt = 
virtio_transport_send_pkt, > > >> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c > > >> index f6dc896bf44c..1cb556ad4597 100644 > > >> --- a/net/vmw_vsock/virtio_transport_common.c > > >> +++ b/net/vmw_vsock/virtio_transport_common.c > > >> @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto > > >> } > > >> EXPORT_SYMBOL_GPL(virtio_transport_read_skb); > > >> > > >> +int virtio_transport_notify_set_rcvlowat(struc
Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Sat, Dec 02, 2023 at 03:22:39PM -0500, Michael S. Tsirkin wrote: On Fri, Dec 01, 2023 at 01:40:41PM +0300, Arseniy Krasnov wrote: On 01.12.2023 12:48, Stefano Garzarella wrote: > On Fri, Dec 01, 2023 at 11:35:56AM +0300, Arseniy Krasnov wrote: >> >> >> On 01.12.2023 11:27, Stefano Garzarella wrote: >>> On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote: >>>> On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote: >>>>> On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote: >>>>> > On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote: >>>>> > > >>>>> > > >>>>> > > On 30.11.2023 16:42, Michael S. Tsirkin wrote: >>>>> > > > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote: >>>>> > > >> Send credit update message when SO_RCVLOWAT is updated and it is bigger >>>>> > > >> than number of bytes in rx queue. It is needed, because 'poll()' will >>>>> > > >> wait until number of bytes in rx queue will be not smaller than >>>>> > > >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup >>>>> > > >> for tx/rx is possible: sender waits for free space and receiver is >>>>> > > >> waiting data in 'poll()'. >>>>> > > >> >>>>> > > >> Signed-off-by: Arseniy Krasnov >>>>> > > >> --- >>>>> > > >> Changelog: >>>>> > > >> v1 -> v2: >>>>> > > >> * Update commit message by removing 'This patch adds XXX' manner. >>>>> > > >> * Do not initialize 'send_update' variable - set it directly during >>>>> > > >> first usage. >>>>> > > >> v3 -> v4: >>>>> > > >> * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars. >>>>> > > >> v4 -> v5: >>>>> > > >> * Do not change callbacks order in transport structures. 
>>>>> > > >> >>>>> > > >> drivers/vhost/vsock.c | 1 + >>>>> > > >> include/linux/virtio_vsock.h | 1 + >>>>> > > >> net/vmw_vsock/virtio_transport.c | 1 + >>>>> > > >> net/vmw_vsock/virtio_transport_common.c | 27 + >>>>> > > >> net/vmw_vsock/vsock_loopback.c | 1 + >>>>> > > >> 5 files changed, 31 insertions(+) >>>>> > > >> >>>>> > > >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c >>>>> > > >> index f75731396b7e..4146f80db8ac 100644 >>>>> > > >> --- a/drivers/vhost/vsock.c >>>>> > > >> +++ b/drivers/vhost/vsock.c >>>>> > > >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = { >>>>> > > >> .notify_buffer_size = virtio_transport_notify_buffer_size, >>>>> > > >> >>>>> > > >> .read_skb = virtio_transport_read_skb, >>>>> > > >> + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat >>>>> > > >> }, >>>>> > > >> >>>>> > > >> .send_pkt = vhost_transport_send_pkt, >>>>> > > >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h >>>>> > > >> index ebb3ce63d64d..c82089dee0c8 100644 >>>>> > > >> --- a/include/linux/virtio_vsock.h >>>>> > > >> +++ b/include/linux/virtio_vsock.h >>>>> > > >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); >>>>> > > >> void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); >>>>> > > >> int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); >>>>> > > >> int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); >>>>> > > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); >>>>> > > >>
Re: [PATCH net-next v6 3/4] virtio/vsock: fix logic which reduces credit update messages
On Tue, Dec 05, 2023 at 09:48:05AM +0300, Arseniy Krasnov wrote: Add one more condition for sending credit update during dequeue from stream socket: when number of bytes in the rx queue is smaller than SO_RCVLOWAT value of the socket. This is actual for non-default value of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data transmission, because we need at least SO_RCVLOWAT bytes in our rx queue to wake up user for reading data (in corner case it is also possible to stuck both tx and rx sides, this is why 'Fixes' is used). Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages") Signed-off-by: Arseniy Krasnov --- net/vmw_vsock/virtio_transport_common.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e137d740804e..461c89882142 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -558,6 +558,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct virtio_vsock_sock *vvs = vsk->trans; size_t bytes, total = 0; struct sk_buff *skb; + bool low_rx_bytes; int err = -EFAULT; u32 free_space; @@ -602,6 +603,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt); + low_rx_bytes = (vvs->rx_bytes < + sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX)); As in the previous patch, should we avoid the update it if `fwd_cnt` and `last_fwd_cnt` are the same? Now I'm thinking if it is better to add that check directly in virtio_transport_send_credit_update(). Stefano spin_unlock_bh(&vvs->rx_lock); @@ -611,9 +614,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * too high causes extra messages. Too low causes transmitter * stalls. As stalls are in theory more expensive than extra * messages, we set the limit to a high value. TODO: experiment -* with different values. +* with different values. 
Also send credit update message when +* number of bytes in rx queue is not enough to wake up reader. */ - if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) + if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || + low_rx_bytes) virtio_transport_send_credit_update(vsk); return total; -- 2.25.1
Re: [PATCH net-next v6 4/4] vsock/test: two tests to check credit update logic
On Tue, Dec 05, 2023 at 09:48:06AM +0300, Arseniy Krasnov wrote: Both tests are almost same, only differs in two 'if' conditions, so implemented in a single function. Tests check, that credit update message is sent: 1) During setting SO_RCVLOWAT value of the socket. 2) When number of 'rx_bytes' become smaller than SO_RCVLOWAT value. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing 'This patch adds XXX' manner. * Update commit message by adding details about dependency for this test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. * Add comment for this dependency in 'vsock_test.c' where this define is duplicated. v2 -> v3: * Replace synchronization based on control TCP socket with vsock data socket - this is needed to allow sender transmit data only when new buffer size of receiver is visible to sender. Otherwise there is race and test fails sometimes. v3 -> v4: * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation in server part. This is needed to ensure that 'poll()' wake up us when number of bytes ready to read is equal to SO_RCVLOWAT value. v4 -> v5: * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'. v5 -> v6: * Add second test which checks, that credit update is sent during reading data from socket. * Update commit message. tools/testing/vsock/vsock_test.c | 175 +++ 1 file changed, 175 insertions(+) Reviewed-by: Stefano Garzarella diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 01fa816868bc..66246d81d654 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) +/* This define is the same as in 'include/linux/virtio_vsock.h': + * it is used to decide when to send credit update message during + * reading from rx queue of a socket. 
Value and its usage in + * kernel is important for this test. + */ +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) + +static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts *opts) +{ + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, 1234); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + /* Send 1 byte more than peer's buffer size. */ + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1; + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + /* Wait until peer sets needed buffer size. */ + recv_byte(fd, 1, 0); + + if (send(fd, buf, buf_size, 0) != buf_size) { + perror("send failed"); + exit(EXIT_FAILURE); + } + + free(buf); + close(fd); +} + +static void test_stream_credit_update_test(const struct test_opts *opts, + bool low_rx_bytes_test) +{ + size_t recv_buf_size; + struct pollfd fds; + size_t buf_size; + void *buf; + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE; + + if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE, + &buf_size, sizeof(buf_size))) { + perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)"); + exit(EXIT_FAILURE); + } + + if (low_rx_bytes_test) { + /* Set new SO_RCVLOWAT here. This enables sending credit +* update when number of bytes if our rx queue become < +* SO_RCVLOWAT value. +*/ + recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE; + + if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, + &recv_buf_size, sizeof(recv_buf_size))) { + perror("setsockopt(SO_RCVLOWAT)"); + exit(EXIT_FAILURE); + } + } + + /* Send one dummy byte here, because 'setsockopt()' above also +* sends special packet which tells sender to update our buffer +* size. This 'send_byte()' will serialize such packet with data +* reads in a loop below. Sender starts transmission only when +* it receives this single byte. 
+*/ + send_byte(fd, 1, 0); + + buf = malloc(buf_size); + if (!buf) { + perror("malloc"); + exit(EXIT_F
Re: [PATCH net-next v6 3/4] virtio/vsock: fix logic which reduces credit update messages
On Tue, Dec 05, 2023 at 03:07:47PM +0300, Arseniy Krasnov wrote: On 05.12.2023 13:54, Stefano Garzarella wrote: On Tue, Dec 05, 2023 at 09:48:05AM +0300, Arseniy Krasnov wrote: Add one more condition for sending credit update during dequeue from stream socket: when number of bytes in the rx queue is smaller than SO_RCVLOWAT value of the socket. This is actual for non-default value of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data transmission, because we need at least SO_RCVLOWAT bytes in our rx queue to wake up user for reading data (in corner case it is also possible to stuck both tx and rx sides, this is why 'Fixes' is used). Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages") Signed-off-by: Arseniy Krasnov --- net/vmw_vsock/virtio_transport_common.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e137d740804e..461c89882142 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -558,6 +558,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct virtio_vsock_sock *vvs = vsk->trans; size_t bytes, total = 0; struct sk_buff *skb; + bool low_rx_bytes; int err = -EFAULT; u32 free_space; @@ -602,6 +603,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt); + low_rx_bytes = (vvs->rx_bytes < + sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX)); As in the previous patch, should we avoid the update it if `fwd_cnt` and `last_fwd_cnt` are the same? Now I'm thinking if it is better to add that check directly in virtio_transport_send_credit_update(). Good point, but I think, that it is better to keep this check here, because access to 'fwd_cnt' and 'last_fwd_cnt' requires taking rx_lock - so I guess it is better to avoid taking this lock every time in 'virtio_transport_send_credit_update()'. 
Yeah, I agree. So maybe we can do something like: fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; free_space = vvs->buf_alloc - fwd_cnt_delta; Pre-existing issue, but should we handle the wrap (e.g. fwd_cnt wrapped, but last_fwd_cnt not yet?). Maybe in that case we can force the status update. and then, after lock is released: if (fwd_cnt_delta && (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes)) virtio_transport_send_credit_update(vsk); WDYT? Yep, I agree. Also, I guess that next idea to update this optimization(in next patchset), is to make threshold depends on vvs->buf_alloc. Because if someone changes minimum buffer size to for example 32KB, and then sets buffer size to 32KB, then free_space will be always non-zero, thus optimization is off now and credit update is sent on every read. But does it make sense to allow a buffer smaller than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE? Maybe we should fail in virtio_transport_notify_buffer_size() or use it as minimum. Stefano
[PATCH net] vsock/virtio: fix "comparison of distinct pointer types lacks a cast" warning
After backporting commit 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support") in CentOS Stream 9, CI reported the following error: In file included from ./include/linux/kernel.h:17, from ./include/linux/list.h:9, from ./include/linux/preempt.h:11, from ./include/linux/spinlock.h:56, from net/vmw_vsock/virtio_transport_common.c:9: net/vmw_vsock/virtio_transport_common.c: In function 'virtio_transport_can_zcopy': ./include/linux/minmax.h:20:35: error: comparison of distinct pointer types lacks a cast [-Werror] 20 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) | ^~ ./include/linux/minmax.h:26:18: note: in expansion of macro '__typecheck' 26 | (__typecheck(x, y) && __no_side_effects(x, y)) | ^~~ ./include/linux/minmax.h:36:31: note: in expansion of macro '__safe_cmp' 36 | __builtin_choose_expr(__safe_cmp(x, y), \ | ^~ ./include/linux/minmax.h:45:25: note: in expansion of macro '__careful_cmp' 45 | #define min(x, y) __careful_cmp(x, y, <) | ^ net/vmw_vsock/virtio_transport_common.c:63:37: note: in expansion of macro 'min' 63 | int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS); We could solve it by using min_t(), but this operation seems entirely unnecessary, because we also pass MAX_SKB_FRAGS to iov_iter_npages(), which performs almost the same check, returning at most MAX_SKB_FRAGS elements. So, let's eliminate this unnecessary comparison.
Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support") Cc: avkras...@salutedevices.com Signed-off-by: Stefano Garzarella --- net/vmw_vsock/virtio_transport_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index f6dc896bf44c..c8e162c9d1df 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -59,8 +59,7 @@ static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops, t_ops = virtio_transport_get_ops(info->vsk); if (t_ops->can_msgzerocopy) { - int pages_in_iov = iov_iter_npages(iov_iter, MAX_SKB_FRAGS); - int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS); + int pages_to_send = iov_iter_npages(iov_iter, MAX_SKB_FRAGS); /* +1 is for packet header. */ return t_ops->can_msgzerocopy(pages_to_send + 1); -- 2.43.0
Re: [PATCH net-next v7 3/4] virtio/vsock: fix logic which reduces credit update messages
On Thu, Dec 07, 2023 at 01:50:05AM +0300, Arseniy Krasnov wrote: On 07.12.2023 01:08, Michael S. Tsirkin wrote: On Thu, Dec 07, 2023 at 12:52:51AM +0300, Arseniy Krasnov wrote: On 07.12.2023 00:53, Michael S. Tsirkin wrote: On Thu, Dec 07, 2023 at 12:18:48AM +0300, Arseniy Krasnov wrote: Add one more condition for sending credit update during dequeue from stream socket: when number of bytes in the rx queue is smaller than SO_RCVLOWAT value of the socket. This is actual for non-default value of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data transmission, because we need at least SO_RCVLOWAT bytes in our rx queue to wake up user for reading data (in corner case it is also possible to stuck both tx and rx sides, this is why 'Fixes' is used). Also handle case when 'fwd_cnt' wraps, while 'last_fwd_cnt' is still not. Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages") Signed-off-by: Arseniy Krasnov --- Changelog: v6 -> v7: * Handle wrap of 'fwd_cnt'. * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'. net/vmw_vsock/virtio_transport_common.c | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e137d740804e..39f8660d825d 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct virtio_vsock_sock *vvs = vsk->trans; size_t bytes, total = 0; struct sk_buff *skb; + u32 fwd_cnt_delta; + bool low_rx_bytes; int err = -EFAULT; u32 free_space; @@ -601,7 +603,15 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } } - free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt); + /* Handle wrap of 'fwd_cnt'. */ + if (vvs->fwd_cnt < vvs->last_fwd_cnt) + fwd_cnt_delta = vvs->fwd_cnt + (U32_MAX - vvs->last_fwd_cnt); Are you sure there's no off by one here? 
for example if fwd_cnt is 0 and last_fwd_cnt is 0xffffffff then apparently delta is 0. Seems yes, I need +1 here And then you will get a nop, because assigning U32_MAX + 1 to u32 gives you 0. Adding () does nothing to change the result, + and - are commutative. Ahh, unsigned here, yes. Ooops, sorry I was confused here! @Stefano, what did You mean about wrapping here? I think Michael is right, for example Yep, I agree! Sorry for this wrong suggestion! Stefano vvs->fwd_cnt wraps and now == 5 vvs->last_fwd_cnt == 0xffffffff now delta before this patch will be 6 - correct value May be I didn't get your idea, so implement it very naive? Thanks, Arseniy + else + fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; I actually don't see what is wrong with just fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt 32 bit unsigned math will I think handle wrap around correctly. And given buf_alloc is also u32 - I don't see where the bug is in the original code. I think problem is when fwd_cnt wraps, while last_fwd_cnt is not. In this case fwd_cnt_delta will be too big, so we won't send credit update which leads to stall for sender Thanks, Arseniy Care coming up with an example? + + free_space = vvs->buf_alloc - fwd_cnt_delta; + low_rx_bytes = (vvs->rx_bytes < + sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX)); spin_unlock_bh(&vvs->rx_lock); @@ -611,9 +621,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * too high causes extra messages. Too low causes transmitter * stalls. As stalls are in theory more expensive than extra * messages, we set the limit to a high value. TODO: experiment -* with different values. +* with different values. Also send credit update message when +* number of bytes in rx queue is not enough to wake up reader. */ - if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) + if (fwd_cnt_delta && + (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes)) virtio_transport_send_credit_update(vsk); return total; -- 2.25.1
Re: [PATCH net-next v7 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Thu, Dec 07, 2023 at 12:18:47AM +0300, Arseniy Krasnov wrote: Send credit update message when SO_RCVLOWAT is updated and it is bigger than number of bytes in rx queue. It is needed, because 'poll()' will wait until number of bytes in rx queue will be not smaller than SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup for tx/rx is possible: sender waits for free space and receiver is waiting data in 'poll()'. Signed-off-by: Arseniy Krasnov --- Changelog: v1 -> v2: * Update commit message by removing 'This patch adds XXX' manner. * Do not initialize 'send_update' variable - set it directly during first usage. v3 -> v4: * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars. v4 -> v5: * Do not change callbacks order in transport structures. v5 -> v6: * Reorder callbacks in transport structures. * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'. drivers/vhost/vsock.c | 1 + include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport.c| 1 + net/vmw_vsock/virtio_transport_common.c | 30 + net/vmw_vsock/vsock_loopback.c | 1 + 5 files changed, 34 insertions(+) Reviewed-by: Stefano Garzarella diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index f75731396b7e..ec20ecff85c7 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = { .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .read_skb = virtio_transport_read_skb, }, diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index ebb3ce63d64d..c82089dee0c8 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void 
virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index af5bab1acee1..f495b9e5186b 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = { .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, + .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .read_skb = virtio_transport_read_skb, }, diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index f6dc896bf44c..e137d740804e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1684,6 +1684,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto } EXPORT_SYMBOL_GPL(virtio_transport_read_skb); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + bool send_update; + + spin_lock_bh(&vvs->rx_lock); + + /* If number of available bytes is less than new SO_RCVLOWAT value, +* kick sender to send more data, because sender may sleep in its +* 'send()' syscall waiting for enough space at our side. Also +* don't send credit update when peer already knows actual value - +* such transmission will be useless. 
+*/ + send_update = (vvs->rx_bytes < val) && + (vvs->fwd_cnt != vvs->last_fwd_cnt); + + spin_unlock_bh(&vvs->rx_lock); + + if (send_update) { + int err; + + err = virtio_transport_send_credit_update(vsk); + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat); + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("common code for virtio vsock"); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 048640167411..6dea6119f5b2 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_l
Re: [PATCH] vsock/virtio: Fix unsigned integer wrap around in virtio_transport_has_space()
On Mon, Dec 11, 2023 at 05:25:05PM +0300, Nikolay Kuratov wrote: We need to do signed arithmetic if we expect condition `if (bytes < 0)` to be possible Found by Linux Verification Center (linuxtesting.org) with SVACE We should add: Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") Signed-off-by: Nikolay Kuratov --- net/vmw_vsock/virtio_transport_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index c8e162c9d1df..6df246b53260 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -843,7 +843,7 @@ static s64 virtio_transport_has_space(struct vsock_sock *vsk) struct virtio_vsock_sock *vvs = vsk->trans; s64 bytes; - bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); If we respect the credit, this should not happen. It can happen, though, that the receiver changes its buffer size while we're communicating, and if it reduces it, this could happen. So yes, we need to fix it! Thanks! Reviewed-by: Stefano Garzarella if (bytes < 0) bytes = 0; -- 2.34.1
Re: [PATCH v2] vsock/virtio: Fix unsigned integer wrap around in virtio_transport_has_space()
On Mon, Dec 11, 2023 at 07:23:17PM +0300, Nikolay Kuratov wrote: We need to do signed arithmetic if we expect condition `if (bytes < 0)` to be possible Found by Linux Verification Center (linuxtesting.org) with SVACE Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") Signed-off-by: Nikolay Kuratov --- V1 -> V2: Added Fixes section Please, next time carry also R-b tags. net/vmw_vsock/virtio_transport_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Reviewed-by: Stefano Garzarella Thanks, Stefano diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index c8e162c9d1df..6df246b53260 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -843,7 +843,7 @@ static s64 virtio_transport_has_space(struct vsock_sock *vsk) struct virtio_vsock_sock *vvs = vsk->trans; s64 bytes; - bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + bytes = (s64)vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); if (bytes < 0) bytes = 0; -- 2.34.1
Re: [PATCH net-next v8 3/4] virtio/vsock: fix logic which reduces credit update messages
On Tue, Dec 12, 2023 at 12:16:57AM +0300, Arseniy Krasnov wrote: Add one more condition for sending credit update during dequeue from stream socket: when number of bytes in the rx queue is smaller than SO_RCVLOWAT value of the socket. This is actual for non-default value of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data transmission, because we need at least SO_RCVLOWAT bytes in our rx queue to wake up user for reading data (in corner case it is also possible to stuck both tx and rx sides, this is why 'Fixes' is used). Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages") Signed-off-by: Arseniy Krasnov --- Changelog: v6 -> v7: * Handle wrap of 'fwd_cnt'. * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'. v7 -> v8: * Remove unneeded/wrong handling of wrap for 'fwd_cnt'. net/vmw_vsock/virtio_transport_common.c | 13 ++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) Reviewed-by: Stefano Garzarella Thanks! Stefano diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e137d740804e..8572f94bba88 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, struct virtio_vsock_sock *vvs = vsk->trans; size_t bytes, total = 0; struct sk_buff *skb; + u32 fwd_cnt_delta; + bool low_rx_bytes; int err = -EFAULT; u32 free_space; @@ -601,7 +603,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } } - free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt); + fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; + free_space = vvs->buf_alloc - fwd_cnt_delta; + low_rx_bytes = (vvs->rx_bytes < + sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX)); spin_unlock_bh(&vvs->rx_lock); @@ -611,9 +616,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * too high causes extra messages. Too low causes transmitter * stalls. 
As stalls are in theory more expensive than extra * messages, we set the limit to a high value. TODO: experiment -* with different values. +* with different values. Also send credit update message when +* number of bytes in rx queue is not enough to wake up reader. */ - if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) + if (fwd_cnt_delta && + (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes)) virtio_transport_send_credit_update(vsk); return total; -- 2.25.1
Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT
On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote: On 12.12.2023 19:12, Michael S. Tsirkin wrote: On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote: On 12.12.2023 18:54, Michael S. Tsirkin wrote: On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote: Hello, DESCRIPTION This patchset fixes old problem with hungup of both rx/tx sides and adds test for it. This happens due to non-default SO_RCVLOWAT value and deferred credit update in virtio/vsock. Link to previous old patchset: https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/ Patchset: Acked-by: Michael S. Tsirkin Thanks! But I worry whether we actually need 3/8 in net not in net-next. Because of "Fixes" tag ? I think this problem is not critical and reproducible only in special cases, but i'm not familiar with netdev process so good, so I don't have strong opinion. I guess @Stefano knows better. Thanks, Arseniy Fixes means "if you have that other commit then you need this commit too". I think as a minimum you need to rearrange patches to make the fix go in first. We don't want a regression followed by a fix. I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, because this patch fixes problem that is not related with the new patches from this patchset. I agree, patch 3 is for sure net material (I'm fine with both rearrangement or send it separately), but IMHO also patch 2 could be. I think with the same fixes tag, since before commit b89d882dc9fc ("vsock/virtio: reduce credit update messages") we sent a credit update for every bytes we read, so we should not have this problem, right? So, maybe all the series could be "net". Thanks, Stefano
Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT
On Wed, Dec 13, 2023 at 12:08:27PM +0300, Arseniy Krasnov wrote: On 13.12.2023 11:43, Stefano Garzarella wrote: On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote: On 12.12.2023 19:12, Michael S. Tsirkin wrote: On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote: On 12.12.2023 18:54, Michael S. Tsirkin wrote: On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote: Hello, DESCRIPTION This patchset fixes old problem with hungup of both rx/tx sides and adds test for it. This happens due to non-default SO_RCVLOWAT value and deferred credit update in virtio/vsock. Link to previous old patchset: https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/ Patchset: Acked-by: Michael S. Tsirkin Thanks! But I worry whether we actually need 3/8 in net not in net-next. Because of "Fixes" tag ? I think this problem is not critical and reproducible only in special cases, but i'm not familiar with netdev process so good, so I don't have strong opinion. I guess @Stefano knows better. Thanks, Arseniy Fixes means "if you have that other commit then you need this commit too". I think as a minimum you need to rearrange patches to make the fix go in first. We don't want a regression followed by a fix. I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, because this patch fixes problem that is not related with the new patches from this patchset. I agree, patch 3 is for sure net material (I'm fine with both rearrangement or send it separately), but IMHO also patch 2 could be. I think with the same fixes tag, since before commit b89d882dc9fc ("vsock/virtio: reduce credit update messages") we sent a credit update for every bytes we read, so we should not have this problem, right? Agree for 2, so I think I can rearrange: two fixes go first, then current 0001, and then tests. And send it as V9 for 'net' only ? 
Maybe you can add this to patch 1 if we want it on net: Fixes: e38f22c860ed ("vsock: SO_RCVLOWAT transport set callback") Then I think that patch should go before patch 2, so we don't need to touch that code multiple times. so, IMHO the order should be the actual order or 3 - 1 - 2 - 4. Another option is to send just 2 & 3 to net, and the rest (1 & 4) to net-next. IMHO should be fine to send the entire series to net with the fixes tag also in patch 1. Net maintainers and Michael might have a different advice. Thanks, Stefano
Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT
On Wed, Dec 13, 2023 at 08:11:57PM +0300, Arseniy Krasnov wrote: On 13.12.2023 18:13, Michael S. Tsirkin wrote: On Wed, Dec 13, 2023 at 10:05:44AM -0500, Michael S. Tsirkin wrote: On Wed, Dec 13, 2023 at 12:08:27PM +0300, Arseniy Krasnov wrote: On 13.12.2023 11:43, Stefano Garzarella wrote: On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote: On 12.12.2023 19:12, Michael S. Tsirkin wrote: On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote: On 12.12.2023 18:54, Michael S. Tsirkin wrote: On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote: Hello, DESCRIPTION This patchset fixes old problem with hungup of both rx/tx sides and adds test for it. This happens due to non-default SO_RCVLOWAT value and deferred credit update in virtio/vsock. Link to previous old patchset: https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/ Patchset: Acked-by: Michael S. Tsirkin Thanks! But I worry whether we actually need 3/8 in net not in net-next. Because of "Fixes" tag ? I think this problem is not critical and reproducible only in special cases, but i'm not familiar with netdev process so good, so I don't have strong opinion. I guess @Stefano knows better. Thanks, Arseniy Fixes means "if you have that other commit then you need this commit too". I think as a minimum you need to rearrange patches to make the fix go in first. We don't want a regression followed by a fix. I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, because this patch fixes problem that is not related with the new patches from this patchset. I agree, patch 3 is for sure net material (I'm fine with both rearrangement or send it separately), but IMHO also patch 2 could be. I think with the same fixes tag, since before commit b89d882dc9fc ("vsock/virtio: reduce credit update messages") we sent a credit update for every bytes we read, so we should not have this problem, right? 
Agree for 2, so I think I can rearrange: two fixes go first, then current 0001, and then tests. And send it as V9 for 'net' only ? Thanks, Arseniy hmm why not net-next? Oh I missed your previous discussion. I think everything in net-next is safer. Having said that, I won't nack it net, either. So, summarizing all above: 1) This patchset entirely goes to net-next as v9 2) I reorder patches like 3 - 2 - 1 - 4, e.g. two fixes goes first with Fixes tag 3) Add Acked-by: Michael S. Tsirkin to each patch @Michael, @Stefano ? Okay, let's do that ;-) Stefano
Re: [PATCH net-next v9 0/4] send credit update during setting SO_RCVLOWAT
On Thu, Dec 14, 2023 at 12:19:43PM +0300, Arseniy Krasnov wrote: Hello, DESCRIPTION This patchset fixes old problem with hungup of both rx/tx sides and adds test for it. This happens due to non-default SO_RCVLOWAT value and deferred credit update in virtio/vsock. Link to previous old patchset: https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/ Here is what happens step by step: TEST INITIAL CONDITIONS 1) Vsock buffer size is 128KB. 2) Maximum packet size is also 64KB as defined in header (yes it is hardcoded, just to remind about that value). 3) SO_RCVLOWAT is default, e.g. 1 byte. STEPS SENDER RECEIVER 1) sends 128KB + 1 byte in a single buffer. 128KB will be sent, but for 1 byte sender will wait for free space at peer. Sender goes to sleep. 2) reads 64KB, credit update not sent 3) sets SO_RCVLOWAT to 64KB + 1 4) poll() -> wait forever, there is only 64KB available to read. So in step 4) receiver also goes to sleep, waiting for enough data or connection shutdown message from the sender. Idea to fix it is that rx kicks tx side to continue transmission (and may be close connection) when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and this value is bigger than number of available bytes to read. I've added small test for this, but not sure as it uses hardcoded value for maximum packet length, this value is defined in kernel header and used to control deferred credit update. And as this is not available to userspace, I can't control test parameters correctly (if one day this define will be changed - test may become useless). 
Head for this patchset is: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=9bab51bd662be4c3ebb18a28879981d69f3ef15a Link to v1: https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/ Link to v2: https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/ Link to v3: https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/ Link to v4: https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/ Link to v5: https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/ Link to v6: https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/ Link to v7: https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/ Link to v8: https://lore.kernel.org/netdev/20231211211658.2904268-1-avkras...@salutedevices.com/ Changelog: v1 -> v2: * Patchset rebased and tested on new HEAD of net-next (see hash above). * New patch is added as 0001 - it removes return from SO_RCVLOWAT set callback in 'af_vsock.c' when transport callback is set - with that we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do not copy-paste it to every transport. It was discussed in v1. * See per-patch changelog after ---. v2 -> v3: * See changelog after --- in 0003 only (0001 and 0002 still same). v3 -> v4: * Patchset rebased and tested on new HEAD of net-next (see hash above). * See per-patch changelog after ---. v4 -> v5: * Change patchset tag 'RFC' -> 'net-next'. * See per-patch changelog after ---. v5 -> v6: * New patch 0003 which sends credit update during reading bytes from socket. * See per-patch changelog after ---. v6 -> v7: * Patchset rebased and tested on new HEAD of net-next (see hash above). * See per-patch changelog after ---. v7 -> v8: * See per-patch changelog after ---. v8 -> v9: * Patchset rebased and tested on new HEAD of net-next (see hash above). 
* Add 'Fixes' tag for the current 0002. * Reorder patches by moving two fixes first. Arseniy Krasnov (4): virtio/vsock: fix logic which reduces credit update messages virtio/vsock: send credit update during setting SO_RCVLOWAT vsock: update SO_RCVLOWAT setting callback vsock/test: two tests to check credit update logic This order will break the bisectability, since now patch 2 will not build if patch 3 is not applied. So you need to implement in patch 2 `set_rcvlowat` and in patch 3 update it to `notify_set_rcvlowat`, otherwise we always need to backport patch 3 in stable branches, that should be applied before patch 2. You have 2 options: a. move patch 3 before patch 2 without changing the code b. change patch 2 to use `set_rcvlowat` and update that code in patch 3 I don't have a strong opinion, but I slightly prefer option a. BTW that forces us to backport more patches on stable branches, so I'm fine with option b as well. That said: Nacked-by: Stefano Garzarella
Re: [PATCH net-next v9 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT
On Thu, Dec 14, 2023 at 12:19:45PM +0300, Arseniy Krasnov wrote: Send credit update message when SO_RCVLOWAT is updated and it is bigger than number of bytes in rx queue. It is needed, because 'poll()' will wait until number of bytes in rx queue will be not smaller than SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup for tx/rx is possible: sender waits for free space and receiver is waiting data in 'poll()'. Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages") Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin --- Changelog: v1 -> v2: * Update commit message by removing 'This patch adds XXX' manner. * Do not initialize 'send_update' variable - set it directly during first usage. v3 -> v4: * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars. v4 -> v5: * Do not change callbacks order in transport structures. v5 -> v6: * Reorder callbacks in transport structures. * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'. v8 -> v9: * Add 'Fixes' tag. drivers/vhost/vsock.c | 1 + include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport.c| 1 + net/vmw_vsock/virtio_transport_common.c | 30 + net/vmw_vsock/vsock_loopback.c | 1 + 5 files changed, 34 insertions(+) As I already mentioned in the cover letter, this patch doesn't compile unless we apply patch 3 before this one, so: Nacked-by: Stefano Garzarella
Re: [RFC PATCH 1/5] vsock/virtio: Extend virtio-vsock spec with an "order" field
As Alyssa suggested, we should discuss spec changes in the virtio ML. BTW as long as this is an RFC, it's fine. Just be sure, though, to remember to merge the change in the specification first versus the patches in Linux. So I recommend that you don't send a non-RFC set into Linux until you have agreed on the changes to the specification. On Fri, May 17, 2024 at 10:46:03PM GMT, Xuewei Niu wrote: The "order" field determines the location of the device in the linked list, the device with CID 4, having a smallest order, is in the first place, and so forth. Do we really need an order, or would it suffice to just indicate the device to be used by default? (as the default gateway in networking) Rules: * It doesn’t have to be continuous; * It cannot exist conflicts; * It is optional for the mode of a single device, but is required for the mode of multiple devices. We should also add a feature to support this new field. Signed-off-by: Xuewei Niu --- include/uapi/linux/virtio_vsock.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h index 64738838bee5..b62ec7d2ab1e 100644 --- a/include/uapi/linux/virtio_vsock.h +++ b/include/uapi/linux/virtio_vsock.h @@ -43,6 +43,7 @@ struct virtio_vsock_config { __le64 guest_cid; + __le64 order; Do we really need 64 bits for the order? } __attribute__((packed)); enum virtio_vsock_event_id { -- 2.34.1
Re: [RFC PATCH 2/5] vsock/virtio: Add support for multi-devices
On Fri, May 17, 2024 at 10:46:04PM GMT, Xuewei Niu wrote: The maximum number of devices is limited by `MAX_VSOCK_NUM`. Extends `vsock_transport` struct with 4 methods to support multi-devices: * `get_virtio_vsock()`: It receives a CID, and returns a struct of virtio vsock. This method is designed to select a vsock device by its CID. * `get_default_cid()`: It receives nothing, returns the default CID of the first vsock device registered to the kernel. * `get_local_cids()`: It returns a vector of vsock devices' CIDs. * `compare_order()`: It receives two different CIDs, named "left" and "right" respectively. It returns "-1" while the "left" is behind the "right". Otherwise, return "1". `get_local_cid()` is retained, but returns "-1" if the transport supports multi-devices. Replaces the single instance of `virtio_vsock` with a list, named `virtio_vsock_list`. The devices are inserted into the list when probing. The kernel will deny devices from being registered if there are conflicts existing in CIDs or orders. Signed-off-by: Xuewei Niu --- include/net/af_vsock.h | 16 ++ include/uapi/linux/vm_sockets.h | 6 + net/vmw_vsock/af_vsock.c| 82 ++-- net/vmw_vsock/virtio_transport.c| 246 ++-- net/vmw_vsock/virtio_transport_common.c | 10 +- 5 files changed, 293 insertions(+), 67 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 535701efc1e5..0151296a0bc5 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -174,6 +174,22 @@ struct vsock_transport { /* Addressing. */ u32 (*get_local_cid)(void); + /* Held rcu read lock by the caller. */ We should also explain why the rcu is needed. + struct virtio_vsock *(*get_virtio_vsock)(unsigned int cid); af_vsock supports several transports (i.e. HyperV, VMCI, VIRTIO/VHOST, loopback), so we need to be generic here. In addition, the pointer returned by this function is never used, so why do we need this? 
+ unsigned int (*get_default_cid)(void); + /* Get an list containing all the CIDs of registered vsock. Return +* the length of the list. +* +* Held rcu read lock by the caller. +*/ + int (*get_local_cids)(unsigned int *local_cids); Why int? get_local_cid() returns an u32, we should do the same. In addition, can we remove get_local_cid() and implement get_local_cids() for all the transports? + /* Compare the order of two devices. Given the guest CIDs of two +* different devices, returns -1 while the left is behind the right. +* Otherwise, return 1. +* +* Held rcu read lock by the caller. +*/ + int (*compare_order)(unsigned int left, unsigned int right); Please check better the type for CIDs all over the place. /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index ed07181d4eff..36ca5023293a 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -189,6 +189,12 @@ struct sockaddr_vm { sizeof(__u8)]; }; +/* The maximum number of vsock devices. Each vsock device has an exclusive + * context id. + */ + +#define MAX_VSOCK_NUM 16 This is used internally in AF_VSOCK, I don't think we should expose it in the UAPI. + #define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9) /* MSG_ZEROCOPY notifications are encoded in the standard error format, diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 54ba7316f808..da06ddc940cd 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -234,19 +234,45 @@ static void __vsock_remove_connected(struct vsock_sock *vsk) static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) { - struct vsock_sock *vsk; + struct vsock_sock *vsk, *any_vsk = NULL; + rcu_read_lock(); Why the rcu is needed? list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { + /* The highest priority: full match. 
*/ if (vsock_addr_equals_addr(addr, &vsk->local_addr)) - return sk_vsock(vsk); + goto out; - if (addr->svm_port == vsk->local_addr.svm_port && - (vsk->local_addr.svm_cid == VMADDR_CID_ANY || -addr->svm_cid == VMADDR_CID_ANY)) - return sk_vsock(vsk); + /* Port match */ + if (addr->svm_port == vsk->local_addr.svm_port) { + /* The second priority: local cid is VMADDR_CID_ANY. */ + if (vsk->local_addr.svm_cid == VMADDR_CID_ANY) + goto out; + + /* The third priority: local cid isn't VMADDR_CID_ANY. */ + if (addr->svm_cid == VMADDR_CI
Re: [RFC PATCH 3/5] vsock/virtio: can_msgzerocopy adapts to multi-devices
On Fri, May 17, 2024 at 10:46:05PM GMT, Xuewei Niu wrote: Adds a new argument, named "cid", to let them know which `virtio_vsock` to be selected. Signed-off-by: Xuewei Niu --- include/linux/virtio_vsock.h| 2 +- net/vmw_vsock/virtio_transport.c| 5 ++--- net/vmw_vsock/virtio_transport_common.c | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) Every commit in linux must be working to support bisection. So these changes should be made before adding multi-device support. diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index c82089dee0c8..21bfd5e0c2e7 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -168,7 +168,7 @@ struct virtio_transport { * extra checks and can perform zerocopy transmission by * default. */ - bool (*can_msgzerocopy)(int bufs_num); + bool (*can_msgzerocopy)(u32 cid, int bufs_num); }; ssize_t diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 93d25aeafb83..998b22e5ce36 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -521,14 +521,13 @@ static void virtio_vsock_rx_done(struct virtqueue *vq) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } -static bool virtio_transport_can_msgzerocopy(int bufs_num) +static bool virtio_transport_can_msgzerocopy(u32 cid, int bufs_num) { struct virtio_vsock *vsock; bool res = false; rcu_read_lock(); - - vsock = rcu_dereference(the_virtio_vsock); + vsock = virtio_transport_get_virtio_vsock(cid); if (vsock) { struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX]; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index bed75a41419e..e7315d7b9af1 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -39,7 +39,7 @@ virtio_transport_get_ops(struct vsock_sock *vsk) static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops, struct virtio_vsock_pkt_info *info, - size_t pkt_len) + size_t 
pkt_len, unsigned int cid) { struct iov_iter *iov_iter; @@ -62,7 +62,7 @@ static bool virtio_transport_can_zcopy(const struct virtio_transport *t_ops, int pages_to_send = iov_iter_npages(iov_iter, MAX_SKB_FRAGS); /* +1 is for packet header. */ - return t_ops->can_msgzerocopy(pages_to_send + 1); + return t_ops->can_msgzerocopy(cid, pages_to_send + 1); } return true; @@ -375,7 +375,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, info->msg->msg_flags &= ~MSG_ZEROCOPY; if (info->msg->msg_flags & MSG_ZEROCOPY) - can_zcopy = virtio_transport_can_zcopy(t_ops, info, pkt_len); + can_zcopy = virtio_transport_can_zcopy(t_ops, info, pkt_len, src_cid); if (can_zcopy) max_skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, -- 2.34.1
Re: [RFC PATCH 4/5] vsock: seqpacket_allow adapts to multi-devices
On Fri, May 17, 2024 at 10:46:06PM GMT, Xuewei Niu wrote: Adds a new argument, named "src_cid", to let them know which `virtio_vsock` to be selected. Signed-off-by: Xuewei Niu --- include/net/af_vsock.h | 2 +- net/vmw_vsock/af_vsock.c | 15 +-- net/vmw_vsock/virtio_transport.c | 4 ++-- net/vmw_vsock/vsock_loopback.c | 4 ++-- 4 files changed, 18 insertions(+), 7 deletions(-) Same for this. diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 0151296a0bc5..25f7dc3d602d 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -143,7 +143,7 @@ struct vsock_transport { int flags); int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len); - bool (*seqpacket_allow)(u32 remote_cid); + bool (*seqpacket_allow)(u32 src_cid, u32 remote_cid); u32 (*seqpacket_has_data)(struct vsock_sock *vsk); /* Notification. */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index da06ddc940cd..3b34be802bf2 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -470,10 +470,12 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) { const struct vsock_transport *new_transport; struct sock *sk = sk_vsock(vsk); - unsigned int remote_cid = vsk->remote_addr.svm_cid; + unsigned int src_cid, remote_cid; __u8 remote_flags; int ret; + remote_cid = vsk->remote_addr.svm_cid; + /* If the packet is coming with the source and destination CIDs higher * than VMADDR_CID_HOST, then a vsock channel where all the packets are * forwarded to the host should be established. 
Then the host will @@ -527,8 +529,17 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) return -ENODEV; if (sk->sk_type == SOCK_SEQPACKET) { + if (vsk->local_addr.svm_cid == VMADDR_CID_ANY) { + if (new_transport->get_default_cid) + src_cid = new_transport->get_default_cid(); + else + src_cid = new_transport->get_local_cid(); + } else { + src_cid = vsk->local_addr.svm_cid; + } + if (!new_transport->seqpacket_allow || - !new_transport->seqpacket_allow(remote_cid)) { + !new_transport->seqpacket_allow(src_cid, remote_cid)) { module_put(new_transport->module); return -ESOCKTNOSUPPORT; } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 998b22e5ce36..0bddcbd906a2 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -615,14 +615,14 @@ static struct virtio_transport virtio_transport = { .can_msgzerocopy = virtio_transport_can_msgzerocopy, }; -static bool virtio_transport_seqpacket_allow(u32 remote_cid) +static bool virtio_transport_seqpacket_allow(u32 src_cid, u32 remote_cid) { struct virtio_vsock *vsock; bool seqpacket_allow; seqpacket_allow = false; rcu_read_lock(); - vsock = rcu_dereference(the_virtio_vsock); + vsock = virtio_transport_get_virtio_vsock(src_cid); if (vsock) seqpacket_allow = vsock->seqpacket_allow; rcu_read_unlock(); diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 6dea6119f5b2..b94358f5bb2c 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -46,7 +46,7 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) return 0; } -static bool vsock_loopback_seqpacket_allow(u32 remote_cid); +static bool vsock_loopback_seqpacket_allow(u32 src_cid, u32 remote_cid); static bool vsock_loopback_msgzerocopy_allow(void) { return true; @@ -104,7 +104,7 @@ static struct virtio_transport loopback_transport = { .send_pkt = vsock_loopback_send_pkt, }; -static bool vsock_loopback_seqpacket_allow(u32 
remote_cid) +static bool vsock_loopback_seqpacket_allow(u32 src_cid, u32 remote_cid) { return true; } -- 2.34.1
Re: [RFC PATCH 5/5] vsock: Add an ioctl request to get all CIDs
On Fri, May 17, 2024 at 10:46:07PM GMT, Xuewei Niu wrote: The new request is called `IOCTL_VM_SOCKETS_GET_LOCAL_CIDS`. And the old one, `IOCTL_VM_SOCKETS_GET_LOCAL_CID` is retained. For the transport that supports multi-devices: * `IOCTL_VM_SOCKETS_GET_LOCAL_CID` returns "-1"; What about returning the default CID (lower prio)? * `IOCTL_VM_SOCKETS_GET_LOCAL_CIDS` returns a vector of CIDS. The usage is shown as following. ``` struct vsock_local_cids local_cids; if ((ret = ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CIDS, &local_cids))) { perror("failed to get cids"); exit(1); } for (i = 0; i --- include/net/af_vsock.h | 7 +++ include/uapi/linux/vm_sockets.h | 8 net/vmw_vsock/af_vsock.c| 19 +++ 3 files changed, 34 insertions(+) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 25f7dc3d602d..2febc816e388 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -264,4 +264,11 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t) { return t->msgzerocopy_allow && t->msgzerocopy_allow(); } + +/ IOCTL / +/* Type of return value of IOCTL_VM_SOCKETS_GET_LOCAL_CIDS. */ +struct vsock_local_cids { + int nr; + unsigned int data[MAX_VSOCK_NUM]; +}; #endif /* __AF_VSOCK_H__ */ diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index 36ca5023293a..01f73fb7af5a 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -195,8 +195,16 @@ struct sockaddr_vm { #define MAX_VSOCK_NUM 16 Okay, now I see why you need this in the UAPI, but please try to follow other defines. What about VM_SOCKETS_MAX_DEVS ? +/* Return actual context id if the transport not support vsock + * multi-devices. Otherwise, return `-1U`. + */ + #define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9) +/* Only available in transports that support multiple devices. 
*/ + +#define IOCTL_VM_SOCKETS_GET_LOCAL_CIDS _IOR(7, 0xba, struct vsock_local_cids) + /* MSG_ZEROCOPY notifications are encoded in the standard error format, * sock_extended_err. See Documentation/networking/msg_zerocopy.rst in * kernel source tree for more details. diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 3b34be802bf2..2ea2ff52f15b 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -2454,6 +2454,7 @@ static long vsock_dev_do_ioctl(struct file *filp, u32 __user *p = ptr; u32 cid = VMADDR_CID_ANY; int retval = 0; + struct vsock_local_cids local_cids; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: @@ -2469,6 +2470,24 @@ static long vsock_dev_do_ioctl(struct file *filp, retval = -EFAULT; break; + case IOCTL_VM_SOCKETS_GET_LOCAL_CIDS: + if (!transport_g2h || !transport_g2h->get_local_cids) + goto fault; + + rcu_read_lock(); + local_cids.nr = transport_g2h->get_local_cids(local_cids.data); + rcu_read_unlock(); + + if (local_cids.nr < 0 || + copy_to_user(p, &local_cids, sizeof(local_cids))) + goto fault; + + break; + +fault: + retval = -EFAULT; + break; + default: retval = -ENOIOCTLCMD; } -- 2.34.1
Re: [RFC PATCH 0/5] vsock/virtio: Add support for multi-devices
Hi, thanks for this RFC! On Fri, May 17, 2024 at 10:46:02PM GMT, Xuewei Niu wrote: # Motivition Vsock is a lightweight and widely used data exchange mechanism between host and guest. Kata Containers, a secure container runtime, leverages the capability to exchange control data between the shim and the kata-agent. The Linux kernel only supports one vsock device for virtio-vsock transport, resulting in the following limitations: * Poor performance isolation: All vsock connections share the same virtqueue. This might be fixed if we implement multi-queue in virtio-vsock. * Cannot enable more than one backend: Virtio-vsock, vhost-vsock, and vhost-user-vsock cannot be enabled simultaneously on the transport. We’d like to transfer networking data, such as TSI (Transparent Socket Impersonation), over vsock via the vhost-user protocol to reduce overhead. However, by default, the vsock device is occupied by the kata-agent. # Usages Principle: **Supporting virtio-vsock multi-devices while also being compatible with existing ones.** ## Connection from Guest to Host There are two valuable questions to take about: 1. How to be compatible with the existing usages? 2. How do we specify a virtio-vsock device? ### Question 1 Before we delve into question 1, I'd like to provide a piece of pseudocode as an example of one of the existing use cases from the guest's perspective. Assuming there is one virtio-vsock device with CID 4. One of existing usages to connect to host is shown as following. ``` fd = socket(AF_VSOCK); connect(fd, 2, 1234); n = write(fd, buffer); ``` The result is that a connection is established from the guest (4, ?) to the host (2, 1234), where "?" denotes a random port. In the context of multi-devices, there are more than two devices. If the users don’t specify one CID explicitly, the kernel becomes confused about which device to use. The new implementation should be compatible with the old one. We expanded the virtio-vsock specification to address this issue. 
The specification now includes a new field called "order". ``` struct virtio_vsock_config { __le64 guest_cid; __le64 order; } _attribute_((packed)); ``` In the phase of virtio-vsock driver probing, the guest kernel reads from VMM to get the order of each device. **We stipulate that the device with the smallest order is regarded as the default device**(this mechanism functions as a 'default gateway' in networking). Assuming there are three virtio-vsock devices: device1 (CID=3), device2 (CID=4), and device3 (CID=5). The arrangement of the list is as follows from the perspective of the guest kernel: ``` virtio_vsock_list = virtio_vsock { cid: 4, order: 0 } -> virtio_vsock { cid: 3, order: 1 } -> virtio_vsock { cid: 5, order: 10 } ``` At this time, the guest kernel realizes that the device2 (CID=4) is the default device. Execute the same code as before. ``` fd = socket(AF_VSOCK); connect(fd, 2, 1234); n = write(fd, buffer); ``` A connection will be established from the guest (4, ?) to the host (2, 1234). It seems that only the one with order 0 is used here though, so what is the ordering for? Wouldn't it suffice to simply indicate the default device (e.g., like the default gateway for networking)? ### Question 2 Now, the user wants to specify a device instead of the default one. An explicit binding operation is required to be performed. Use the device (CID=3), where “-1” represents any port, the kernel will We have a macro: VMADDR_PORT_ANY (which is -1) search an available port automatically. ``` fd = socket(AF_VSOCK); bind(fd, 3, -1); connect(fd, 2, 1234);) n = write(fd, buffer); ``` Use the device (CID=4). ``` fd = socket(AF_VSOCK); bind(fd, 4, -1); connect(fd, 2, 1234); n = write(fd, buffer); ``` ## Connection from Host to Guest Connection from host to guest is quite similar to the existing usages. The device’s CID is specified by the bind operation. Listen at the device (CID=3)’s port 1. 
``` fd = socket(AF_VSOCK); bind(fd, 3, 1); listen(fd); new_fd = accept(fd, &host_cid, &host_port); n = write(fd, buffer); ``` Listen at the device (CID=4)’s port 1. ``` fd = socket(AF_VSOCK); bind(fd, 4, 1); listen(fd); new_fd = accept(fd, &host_cid, &host_port); n = write(fd, buffer); ``` # Use Cases We've completed a POC with Kata Containers, Ztunnel, which is a purpose-built per-node proxy for Istio ambient mesh, and TSI. Please refer to the following link for more details. Link: https://bit.ly/4bdPJbU Thank you for this RFC, I left several comments in the patches, we still have some work to do, but I think it is something we can support :-) Here I summarize the things that I think we need to fix: 1. Avoid adding transport-specific things in af_vsock.c We need to have a generic API to allow other transports to implement the same functionality. 2. We need to add negotiation of a new feature in virtio/vhost transports We need to enable or disable support depending on whether t
Re: [RFC PATCH v1 1/2] virtio/vsock: rework deferred credit update logic
On Fri, Jun 21, 2024 at 10:25:40PM GMT, Arseniy Krasnov wrote: Previous calculation of 'free_space' was wrong (but worked as expected in most cases, see below), because it didn't account number of bytes in rx queue. Let's rework 'free_space' calculation in the following way: as this value is considered free space at rx side from tx point of view, it must be equal to return value of 'virtio_transport_get_credit()' at tx side. This function uses 'tx_cnt' counter and 'peer_fwd_cnt': first is number of transmitted bytes (without wrap), second is last 'fwd_cnt' value received from rx. So let's use same approach at rx side during 'free_space' calculation: add 'rx_cnt' counter which is number of received bytes (also without wrap) and subtract 'last_fwd_cnt' from it. Now we have: 1) 'rx_cnt' == 'tx_cnt' at both sides. 2) 'last_fwd_cnt' == 'peer_fwd_cnt' - because first is last 'fwd_cnt' sent to tx, while second is last 'fwd_cnt' received from rx. Now 'free_space' is handled correctly and also we don't need mmm, I don't know if it was wrong before, maybe we could say it was less accurate. That said, could we have the same problem now if we have a lot of producers and the virtqueue becomes full? 'low_rx_bytes' flag - this was more like a hack. Previous calculation of 'free_space' worked (in 99% cases), because if we take a look on behaviour of both expressions (new and previous): '(rx_cnt - last_fwd_cnt)' and '(fwd_cnt - last_fwd_cnt)' Both of them always grows up, with almost same "speed": only difference is that 'rx_cnt' is incremented earlier during packet is received, while 'fwd_cnt' in incremented when packet is read by user. So if 'rx_cnt' grows "faster", then resulting 'free_space' become smaller also, so we send credit updates a little bit more, but: * 'free_space' calculation based on 'rx_cnt' gives the same value, which tx sees as free space at rx side, so original idea of Ditto, what happen if the virtqueue is full? 'free_space' is now implemented as planned. 
* Hack with 'low_rx_bytes' now is not needed. Yeah, so this patch should also mitigate issue reported by Alex (added in CC), right? If yes, please mention that problem and add a Reported-by giving credit to Alex. Also here is some performance comparison between both versions of 'free_space' calculation: *--*--*--* | | 'rx_cnt' | previous | *--*--*--* |H -> G| 8.42 | 7.82 | *--*--*--* |G -> H| 11.6 | 12.1 | *--*--*--* How many seconds did you run it? How many repetitions? There's a little discrepancy anyway, but I can't tell if it's just noise. As benchmark 'vsock-iperf' with default arguments was used. There is no significant performance difference before and after this patch. Signed-off-by: Arseniy Krasnov --- include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport_common.c | 8 +++- 2 files changed, 4 insertions(+), 5 deletions(-) Thanks for working on this, I'll do more tests but the approach LGTM. Thanks, Stefano diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index c82089dee0c8..3579491c411e 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -135,6 +135,7 @@ struct virtio_vsock_sock { u32 peer_buf_alloc; /* Protected by rx_lock */ + u32 rx_cnt; u32 fwd_cnt; u32 last_fwd_cnt; u32 rx_bytes; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 16ff976a86e3..1d4e2328e06e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -441,6 +441,7 @@ static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, return false; vvs->rx_bytes += len; + vvs->rx_cnt += len; return true; } @@ -558,7 +559,6 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t bytes, total = 0; struct sk_buff *skb; u32 fwd_cnt_delta; - bool low_rx_bytes; int err = -EFAULT; u32 free_space; @@ -603,9 +603,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; - 
free_space = vvs->buf_alloc - fwd_cnt_delta; - low_rx_bytes = (vvs->rx_bytes < - sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX)); + free_space = vvs->buf_alloc - (vvs->rx_cnt - vvs->last_fwd_cnt); spin_unlock_bh(&vvs->rx_lock); @@ -619,7 +617,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * number of bytes in rx queue is not enough to wake up reader. */ if (fwd_cnt_delta && - (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes)) + (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)) virtio_transport_send_credit_update(vsk); return total; --
Re: [PATCH net-next v3 1/3] vsock: add support for SIOCOUTQ ioctl for all vsock socket types.
nit: in theory in this patch we don't support it for any of the transports, so I wouldn't confuse and take that part out of the title. WDYT with something like: vsock: add support for SIOCOUTQ ioctl On Wed, Jun 26, 2024 at 02:08:35PM GMT, Luigi Leonardi via B4 Relay wrote: From: Luigi Leonardi Add support for ioctl(s) for SOCK_STREAM SOCK_SEQPACKET and SOCK_DGRAM in AF_VSOCK. The only ioctl available is SIOCOUTQ/TIOCOUTQ, which returns the number of unsent bytes in the socket. This information is transport-specific and is delegated to them using a callback. Suggested-by: Daan De Meyer Signed-off-by: Luigi Leonardi --- include/net/af_vsock.h | 3 +++ net/vmw_vsock/af_vsock.c | 60 +--- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 535701efc1e5..7b5375ae7827 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -169,6 +169,9 @@ struct vsock_transport { void (*notify_buffer_size)(struct vsock_sock *, u64 *); int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); + /* SIOCOUTQ ioctl */ + size_t (*unsent_bytes)(struct vsock_sock *vsk); If you want to return also errors, maybe better returning ssize_t. This should fix one of the errors reported by kernel bots. + /* Shutdown. 
*/ int (*shutdown)(struct vsock_sock *, int); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4b040285aa78..d6140d73d122 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -112,6 +112,7 @@ #include #include #include +#include static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); @@ -1292,6 +1293,59 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); +static int vsock_do_ioctl(struct socket *sock, unsigned int cmd, + int __user *arg) +{ + struct sock *sk = sock->sk; + struct vsock_sock *vsk; + int retval; + + vsk = vsock_sk(sk); + + switch (cmd) { + case SIOCOUTQ: { + size_t n_bytes; + + if (!vsk->transport || !vsk->transport->unsent_bytes) { + retval = -EOPNOTSUPP; + break; + } + + if (vsk->transport->unsent_bytes) { This if is not necessary after the check we did earlier, right? Removing it should fix the other issue reported by the bot. 
+ if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { + retval = -EINVAL; + break; + } + + n_bytes = vsk->transport->unsent_bytes(vsk); + if (n_bytes < 0) { + retval = n_bytes; + break; + } + + retval = put_user(n_bytes, arg); + } + break; + } + default: + retval = -ENOIOCTLCMD; + } + + return retval; +} + +static int vsock_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + int ret; + + lock_sock(sock->sk); + ret = vsock_do_ioctl(sock, cmd, (int __user *)arg); + release_sock(sock->sk); + + return ret; +} + static const struct proto_ops vsock_dgram_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, @@ -1302,7 +1356,7 @@ static const struct proto_ops vsock_dgram_ops = { .accept = sock_no_accept, .getname = vsock_getname, .poll = vsock_poll, - .ioctl = sock_no_ioctl, + .ioctl = vsock_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, .sendmsg = vsock_dgram_sendmsg, @@ -2286,7 +2340,7 @@ static const struct proto_ops vsock_stream_ops = { .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, - .ioctl = sock_no_ioctl, + .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, @@ -2308,7 +2362,7 @@ static const struct proto_ops vsock_seqpacket_ops = { .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, - .ioctl = sock_no_ioctl, + .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, -- 2.45.2
Re: [PATCH net-next v3 2/3] vsock/virtio: add SIOCOUTQ support for all virtio based transports
On Wed, Jun 26, 2024 at 02:08:36PM GMT, Luigi Leonardi via B4 Relay wrote: From: Luigi Leonardi Introduce support for stream_bytes_unsent and seqpacket_bytes_unsent ioctl for virtio_transport, vhost_vsock and vsock_loopback. For all transports the unsent bytes counter is incremented in virtio_transport_get_credit. In the virtio_transport (G2H) the counter is decremented each time the host notifies the guest that it consumed the skbuffs. In vhost-vsock (H2G) the counter is decremented after the skbuff is queued in the virtqueue. In vsock_loopback the counter is decremented after the skbuff is dequeued. Signed-off-by: Luigi Leonardi --- drivers/vhost/vsock.c | 4 +++- include/linux/virtio_vsock.h| 7 +++ net/vmw_vsock/virtio_transport.c| 4 +++- net/vmw_vsock/virtio_transport_common.c | 35 + net/vmw_vsock/vsock_loopback.c | 7 +++ 5 files changed, 55 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index ec20ecff85c7..dba8b3ea37bf 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, restart_tx = true; } - consume_skb(skb); + virtio_transport_consume_skb_sent(skb, true); } } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); if (added) @@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, + .unsent_bytes = virtio_transport_bytes_unsent, The callback is named `unsent_bytes`, I'd use something similar also in the function name, so `virtio_transport_unsent_bytes`, or the opposite renaming the callback, as you prefer, but I'd use the same for both. 
+ .read_skb = virtio_transport_read_skb, }, diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index c82089dee0c8..e74c12878213 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -134,6 +134,8 @@ struct virtio_vsock_sock { u32 peer_fwd_cnt; u32 peer_buf_alloc; Can you remove this extra empty line, so it's clear that it is protected by tx_lock? + size_t bytes_unsent; + /* Protected by rx_lock */ u32 fwd_cnt; u32 last_fwd_cnt; @@ -193,6 +195,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk); +size_t virtio_transport_bytes_unsent(struct vsock_sock *vsk); + +void virtio_transport_consume_skb_sent(struct sk_buff *skb, + bool consume); + int virtio_transport_do_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk); int diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 43d405298857..fc62d2818c2c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -311,7 +311,7 @@ static void virtio_transport_tx_work(struct work_struct *work) virtqueue_disable_cb(vq); while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { - consume_skb(skb); + virtio_transport_consume_skb_sent(skb, true); added = true; } } while (!virtqueue_enable_cb(vq)); @@ -540,6 +540,8 @@ static struct virtio_transport virtio_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, + .unsent_bytes = virtio_transport_bytes_unsent, + .read_skb = virtio_transport_read_skb, }, diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 16ff976a86e3..3a7fa36f306b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -463,6 +463,26 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock 
*vvs, struct sk_buff * } EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); +void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume) +{ + struct sock *s = skb->sk; + + if (s && skb->len) { + struct vsock_sock *vs = vsock_sk(s); + struct virtio_vsock_sock *vvs; + + vvs = vs->trans; + + spin_lock_bh(&vvs->tx_lock); + vvs->bytes_unsent -= skb->len; + spin_unlock_bh(&vvs->tx_lock); + } + + if (consume) + consume_skb(skb); +} +EXPORT_SYMBOL_G
Re: [RFC PATCH v1 1/2] virtio/vsock: rework deferred credit update logic
Hi Arseniy, On Fri, Jun 21, 2024 at 10:25:40PM GMT, Arseniy Krasnov wrote: Previous calculation of 'free_space' was wrong (but worked as expected in most cases, see below), because it didn't account number of bytes in rx queue. Let's rework 'free_space' calculation in the following way: as this value is considered free space at rx side from tx point of view, it must be equal to return value of 'virtio_transport_get_credit()' at tx side. This function uses 'tx_cnt' counter and 'peer_fwd_cnt': first is number of transmitted bytes (without wrap), second is last 'fwd_cnt' value received from rx. So let's use same approach at rx side during 'free_space' calculation: add 'rx_cnt' counter which is number of received bytes (also without wrap) and subtract 'last_fwd_cnt' from it. Now we have: 1) 'rx_cnt' == 'tx_cnt' at both sides. 2) 'last_fwd_cnt' == 'peer_fwd_cnt' - because first is last 'fwd_cnt' sent to tx, while second is last 'fwd_cnt' received from rx. Now 'free_space' is handled correctly and also we don't need 'low_rx_bytes' flag - this was more like a hack. Previous calculation of 'free_space' worked (in 99% cases), because if we take a look on behaviour of both expressions (new and previous): '(rx_cnt - last_fwd_cnt)' and '(fwd_cnt - last_fwd_cnt)' Both of them always grows up, with almost same "speed": only difference is that 'rx_cnt' is incremented earlier during packet is received, while 'fwd_cnt' in incremented when packet is read by user. So if 'rx_cnt' grows "faster", then resulting 'free_space' become smaller also, so we send credit updates a little bit more, but: * 'free_space' calculation based on 'rx_cnt' gives the same value, which tx sees as free space at rx side, so original idea of 'free_space' is now implemented as planned. * Hack with 'low_rx_bytes' now is not needed. 
Also here is some performance comparison between both versions of 'free_space' calculation: *--*--*--* | | 'rx_cnt' | previous | *--*--*--* |H -> G| 8.42 | 7.82 | *--*--*--* |G -> H| 11.6 | 12.1 | *--*--*--* I did some tests on an Intel(R) Xeon(R) Silver 4410Y using iperf-vsock: - kernel 6.9.0 pkt_size G->H H->G 4k4.6 6.4 64k 13.8 11.5 128k 13.4 11.7 - kernel 6.9.0 with this series applied pkt_size G->H H->G 4k 4.6 8.16 64k 12.2 8.9 128k 12.8 8.8 I see a big drop, especially on H->G with big packets. Can you try to replicate on your env? I'll try to understand more and also an i7 on the next days. Thanks, Stefano As benchmark 'vsock-iperf' with default arguments was used. There is no significant performance difference before and after this patch. Signed-off-by: Arseniy Krasnov --- include/linux/virtio_vsock.h| 1 + net/vmw_vsock/virtio_transport_common.c | 8 +++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index c82089dee0c8..3579491c411e 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -135,6 +135,7 @@ struct virtio_vsock_sock { u32 peer_buf_alloc; /* Protected by rx_lock */ + u32 rx_cnt; u32 fwd_cnt; u32 last_fwd_cnt; u32 rx_bytes; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 16ff976a86e3..1d4e2328e06e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -441,6 +441,7 @@ static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, return false; vvs->rx_bytes += len; + vvs->rx_cnt += len; return true; } @@ -558,7 +559,6 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t bytes, total = 0; struct sk_buff *skb; u32 fwd_cnt_delta; - bool low_rx_bytes; int err = -EFAULT; u32 free_space; @@ -603,9 +603,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; - free_space = 
vvs->buf_alloc - fwd_cnt_delta; - low_rx_bytes = (vvs->rx_bytes < - sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX)); + free_space = vvs->buf_alloc - (vvs->rx_cnt - vvs->last_fwd_cnt); spin_unlock_bh(&vvs->rx_lock); @@ -619,7 +617,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, * number of bytes in rx queue is not enough to wake up reader. */ if (fwd_cnt_delta && - (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes)) + (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)) virtio_transport_send_credit_update(vsk); return total; -- 2.25.1
Re: [PATCH PATCH net-next v2 1/2] vsock/virtio: refactor virtio_transport_send_pkt_work
On Mon, Jul 01, 2024 at 04:28:02PM GMT, Luigi Leonardi via B4 Relay wrote: From: Marco Pinna Preliminary patch to introduce an optimization to the enqueue system. All the code used to enqueue a packet into the virtqueue is removed from virtio_transport_send_pkt_work() and moved to the new virtio_transport_send_skb() function. Co-developed-by: Luigi Leonardi Signed-off-by: Luigi Leonardi Signed-off-by: Marco Pinna --- net/vmw_vsock/virtio_transport.c | 133 +-- 1 file changed, 73 insertions(+), 60 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 43d405298857..a74083d28120 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -94,6 +94,77 @@ static u32 virtio_transport_get_local_cid(void) return ret; } +/* Caller need to hold vsock->tx_lock on vq */ +static int virtio_transport_send_skb(struct sk_buff *skb, struct virtqueue *vq, +struct virtio_vsock *vsock, bool *restart_rx) +{ + int ret, in_sg = 0, out_sg = 0; + struct scatterlist **sgs; + bool reply; + + reply = virtio_vsock_skb_reply(skb); + sgs = vsock->out_sgs; + sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), + sizeof(*virtio_vsock_hdr(skb))); + out_sg++; + + if (!skb_is_nonlinear(skb)) { + if (skb->len > 0) { + sg_init_one(sgs[out_sg], skb->data, skb->len); + out_sg++; + } + } else { + struct skb_shared_info *si; + int i; + + /* If skb is nonlinear, then its buffer must contain +* only header and nothing more. Data is stored in +* the fragged part. +*/ + WARN_ON_ONCE(skb_headroom(skb) != sizeof(*virtio_vsock_hdr(skb))); + + si = skb_shinfo(skb); + + for (i = 0; i < si->nr_frags; i++) { + skb_frag_t *skb_frag = &si->frags[i]; + void *va; + + /* We will use 'page_to_virt()' for the userspace page +* here, because virtio or dma-mapping layers will call +* 'virt_to_phys()' later to fill the buffer descriptor. +* We don't touch memory at "virtual" address of this page. 
+*/ + va = page_to_virt(skb_frag_page(skb_frag)); + sg_init_one(sgs[out_sg], + va + skb_frag_off(skb_frag), + skb_frag_size(skb_frag)); + out_sg++; + } + } + + ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL); + /* Usually this means that there is no more space available in +* the vq +*/ + if (ret < 0) + return ret; + + virtio_transport_deliver_tap_pkt(skb); + + if (reply) { + struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; + int val; + + val = atomic_dec_return(&vsock->queued_replies); + + /* Do we now have resources to resume rx processing? */ + if (val + 1 == virtqueue_get_vring_size(rx_vq)) + *restart_rx = true; + } Looking more closely at this patch, perhaps we can leave reply handling out of this refactoring, as it is only needed in the worker. IIUC, this is to prevent the RX worker from leaving room for the TX worker by handling too many replies. So when we have a large enough number of replies (equal to the size of the RX queue) in the queue of the TX worker ready to be queued in the virtqueue, we stop the RX worker and restart it only when the TX worker has had a chance to send replies. @Stefan can you confirm this since you were involved in the original implementation? If we skip the worker, we don't need this. Moreover, we know well that the worker has no queued elements, so we will only go to increment `queued_replies` and then decrement it immediately afterwards. Thanks, Stefano + + return 0; +} + static void virtio_transport_send_pkt_work(struct work_struct *work) { @@ -111,77 +182,19 @@ virtio_transport_send_pkt_work(struct work_struct *work) vq = vsock->vqs[VSOCK_VQ_TX]; for (;;) { - int ret, in_sg = 0, out_sg = 0; - struct scatterlist **sgs; struct sk_buff *skb; - bool reply; + int ret; skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); if (!skb) break; - reply = virtio_vsock_skb_reply(skb); - sgs = vsock->out_sgs; - sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), - sizeof(*virtio_vsock_hdr(skb))); -
Re: [PATCH PATCH net-next v2 2/2] vsock/virtio: avoid enqueue packets when work queue is empty
On Mon, Jul 01, 2024 at 04:49:41PM GMT, Luigi Leonardi wrote: Hi all, + /* Inside RCU, can't sleep! */ + ret = mutex_trylock(&vsock->tx_lock); + if (unlikely(ret == 0)) + goto out_worker; I just realized that here I don't release the tx_lock and that the email subject is "PATCH PATCH". I will fix this in the next version. What about adding a function to handle all these steps? That way we can better handle the error path in this block of code. IMHO to simplify the code, you can just return true or false if you queued it. Then if the driver is disappearing and we are still queuing it, it will be the release that will clean up all the queues, so we may not need to worry about this edge case. Thanks, Stefano Any feedback is welcome! Thanks, Luigi
Re: [PATCH PATCH net-next v2 2/2] vsock/virtio: avoid enqueue packets when work queue is empty
On Mon, Jul 01, 2024 at 04:28:03PM GMT, Luigi Leonardi via B4 Relay wrote: From: Marco Pinna Introduce an optimization in virtio_transport_send_pkt: when the work queue (send_pkt_queue) is empty the packet is put directly in the virtqueue reducing latency. In the following benchmark (pingpong mode) the host sends a payload to the guest and waits for the same payload back. All vCPUs pinned individually to pCPUs. vhost process pinned to a pCPU fio process pinned both inside the host and the guest system. Host CPU: Intel i7-10700KF CPU @ 3.80GHz Tool: Fio version 3.37-56 Env: Phys host + L1 Guest Payload: 512 Runtime-per-test: 50s Mode: pingpong (h-g-h) Test runs: 50 Type: SOCK_STREAM Before (Linux 6.8.11) -- mean(1st percentile):380.56 ns mean(overall): 780.83 ns mean(99th percentile): 8300.24 ns After -- mean(1st percentile): 370.59 ns mean(overall): 720.66 ns mean(99th percentile): 7600.27 ns Same setup, using 4K payload: Before (Linux 6.8.11) -- mean(1st percentile):458.84 ns mean(overall): 1650.17 ns mean(99th percentile): 42240.68 ns After -- mean(1st percentile):450.12 ns mean(overall): 1460.84 ns mean(99th percentile): 37632.45 ns virtqueue. Throughput: iperf-vsock Before (Linux 6.8.11) G2H 28.7 Gb/s After G2H 40.8 Gb/s Cool! I'd suggest adding the buffer length (-l param) used, and also checking more lengths, like at least 4k, 64k, 128k. The performance improvement is related to this optimization, I checked that each packet was put directly on the vq avoiding the work queue. How? Co-developed-by: Luigi Leonardi Signed-off-by: Luigi Leonardi Signed-off-by: Marco Pinna I think you might want to change the author of this patch, since it's changed a lot from Marco's original one. Obviously if you both agree on this. 
Thanks, Stefano --- net/vmw_vsock/virtio_transport.c | 38 -- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index a74083d28120..3815aa8d956b 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -213,6 +213,7 @@ virtio_transport_send_pkt(struct sk_buff *skb) { struct virtio_vsock_hdr *hdr; struct virtio_vsock *vsock; + bool use_worker = true; int len = skb->len; hdr = virtio_vsock_hdr(skb); @@ -234,8 +235,41 @@ virtio_transport_send_pkt(struct sk_buff *skb) if (virtio_vsock_skb_reply(skb)) atomic_inc(&vsock->queued_replies); - virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); - queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); + /* If the workqueue (send_pkt_queue) is empty there is no need to enqueue the packet. +* Just put it on the virtqueue using virtio_transport_send_skb. +*/ + if (skb_queue_empty_lockless(&vsock->send_pkt_queue)) { + bool restart_rx = false; + struct virtqueue *vq; + int ret; + + /* Inside RCU, can't sleep! */ + ret = mutex_trylock(&vsock->tx_lock); + if (unlikely(ret == 0)) + goto out_worker; + + /* Driver is being removed, no need to enqueue the packet */ + if (!vsock->tx_run) + goto out_rcu; + + vq = vsock->vqs[VSOCK_VQ_TX]; + + if (!virtio_transport_send_skb(skb, vq, vsock, &restart_rx)) { + use_worker = false; + virtqueue_kick(vq); + } + + mutex_unlock(&vsock->tx_lock); + + if (restart_rx) + queue_work(virtio_vsock_workqueue, &vsock->rx_work); + } + +out_worker: + if (use_worker) { + virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); + queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); + } out_rcu: rcu_read_unlock(); -- 2.45.2
[PATCH] vdpa_sim_blk: add `capacity` module parameter
The vDPA block simulator always allocated a 128 MiB ram-disk, but some filesystems (e.g. XFS) may require larger minimum sizes (see https://issues.redhat.com/browse/RHEL-45951). So to allow us to test these filesystems, let's add a module parameter to control the size of the simulated virtio-blk devices. The value is mapped directly to the `capacity` field of the virtio-blk configuration space, so it must be expressed in sector numbers of 512 bytes. The default value (0x40000) is the same as the previous value, so the behavior without setting `capacity` remains unchanged. Before this patch or with this patch without setting `capacity`: $ modprobe vdpa-sim-blk $ vdpa dev add mgmtdev vdpasim_blk name blk0 virtio_blk virtio6: 1/0/0 default/read/poll queues virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB) After this patch: $ modprobe vdpa-sim-blk capacity=614400 $ vdpa dev add mgmtdev vdpasim_blk name blk0 virtio_blk virtio6: 1/0/0 default/read/poll queues virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB) Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 25 + 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index b137f3679343..18f390149836 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -33,7 +33,6 @@ (1ULL << VIRTIO_BLK_F_DISCARD) | \ (1ULL << VIRTIO_BLK_F_WRITE_ZEROES)) -#define VDPASIM_BLK_CAPACITY 0x40000 #define VDPASIM_BLK_SIZE_MAX 0x1000 #define VDPASIM_BLK_SEG_MAX 32 #define VDPASIM_BLK_DWZ_MAX_SECTORS UINT_MAX @@ -43,6 +42,10 @@ #define VDPASIM_BLK_AS_NUM 1 #define VDPASIM_BLK_GROUP_NUM 1 +static unsigned long capacity = 0x40000; +module_param(capacity, ulong, 0444); +MODULE_PARM_DESC(capacity, "virtio-blk device capacity (in 512-byte sectors)"); + struct vdpasim_blk { struct vdpasim vdpasim; void *buffer; @@ -79,10 +82,10 @@ static void 
vdpasim_blk_buffer_unlock(struct vdpasim_blk *blk) static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, u64 num_sectors, u64 max_sectors) { - if (start_sector > VDPASIM_BLK_CAPACITY) { + if (start_sector > capacity) { dev_dbg(&vdpasim->vdpa.dev, - "starting sector exceeds the capacity - start: 0x%llx capacity: 0x%x\n", - start_sector, VDPASIM_BLK_CAPACITY); + "starting sector exceeds the capacity - start: 0x%llx capacity: 0x%lx\n", + start_sector, capacity); } if (num_sectors > max_sectors) { @@ -92,10 +95,10 @@ static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, return false; } - if (num_sectors > VDPASIM_BLK_CAPACITY - start_sector) { + if (num_sectors > capacity - start_sector) { dev_dbg(&vdpasim->vdpa.dev, - "request exceeds the capacity - start: 0x%llx num: 0x%llx capacity: 0x%x\n", - start_sector, num_sectors, VDPASIM_BLK_CAPACITY); + "request exceeds the capacity - start: 0x%llx num: 0x%llx capacity: 0x%lx\n", + start_sector, num_sectors, capacity); return false; } @@ -369,7 +372,7 @@ static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) memset(config, 0, sizeof(struct virtio_blk_config)); - blk_config->capacity = cpu_to_vdpasim64(vdpasim, VDPASIM_BLK_CAPACITY); + blk_config->capacity = cpu_to_vdpasim64(vdpasim, capacity); blk_config->size_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SIZE_MAX); blk_config->seg_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SEG_MAX); blk_config->num_queues = cpu_to_vdpasim16(vdpasim, VDPASIM_BLK_VQ_NUM); @@ -437,8 +440,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, if (blk->shared_backend) { blk->buffer = shared_buffer; } else { - blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, - GFP_KERNEL); + blk->buffer = kvzalloc(capacity << SECTOR_SHIFT, GFP_KERNEL); if (!blk->buffer) { ret = -ENOMEM; goto put_dev; @@ -495,8 +497,7 @@ static int __init vdpasim_blk_init(void) goto parent_err; if (shared_backend) { - 
shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, -GFP_KERNEL); +
Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter
On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote: On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote: The vDPA block simulator always allocated a 128 MiB ram-disk, but some filesystems (e.g. XFS) may require larger minimum sizes (see https://issues.redhat.com/browse/RHEL-45951). So to allow us to test these filesystems, let's add a module parameter to control the size of the simulated virtio-blk devices. The value is mapped directly to the `capacity` field of the virtio-blk configuration space, so it must be expressed in sector numbers of 512 bytes. The default value (0x40000) is the same as the previous value, so the behavior without setting `capacity` remains unchanged. Before this patch or with this patch without setting `capacity`: $ modprobe vdpa-sim-blk $ vdpa dev add mgmtdev vdpasim_blk name blk0 virtio_blk virtio6: 1/0/0 default/read/poll queues virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB) After this patch: $ modprobe vdpa-sim-blk capacity=614400 $ vdpa dev add mgmtdev vdpasim_blk name blk0 virtio_blk virtio6: 1/0/0 default/read/poll queues virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB) Signed-off-by: Stefano Garzarella What a hack. Cindy was working on adding control over config space, why can't that be used? If it can be used easily with virtio-blk device too, it will be great. @Cindy do you plan to support those changes for a virtio-blk device too? In the meantime, for the simulator I thought that this change was fine. It's just used for testing and debugging... My main question is how to use that when we have `shared_backend` set to true, since we use that setting to test for example live migration. In that case, how do we handle the size of the shared ramdisk between devices? Thanks, Stefano
Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter
Hi Cindy, Jason, On Mon, Jul 08, 2024 at 03:59:34PM GMT, Jason Wang wrote: On Mon, Jul 8, 2024 at 3:06 PM Cindy Lu wrote: On Fri, 5 Jul 2024 at 20:42, Stefano Garzarella wrote: > > On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote: > >On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote: > >> The vDPA block simulator always allocated a 128 MiB ram-disk, but some > >> filesystems (e.g. XFS) may require larger minimum sizes (see > >> https://issues.redhat.com/browse/RHEL-45951). > >> > >> So to allow us to test these filesystems, let's add a module parameter > >> to control the size of the simulated virtio-blk devices. > >> The value is mapped directly to the `capacity` field of the virtio-blk > >> configuration space, so it must be expressed in sector numbers of 512 > >> bytes. > >> > >> The default value (0x4) is the same as the previous value, so the > >> behavior without setting `capacity` remains unchanged. > >> > >> Before this patch or with this patch without setting `capacity`: > >> $ modprobe vdpa-sim-blk > >> $ vdpa dev add mgmtdev vdpasim_blk name blk0 > >> virtio_blk virtio6: 1/0/0 default/read/poll queues > >> virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB) > >> > >> After this patch: > >> $ modprobe vdpa-sim-blk capacity=614400 > >> $ vdpa dev add mgmtdev vdpasim_blk name blk0 > >> virtio_blk virtio6: 1/0/0 default/read/poll queues > >> virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB) > >> > >> Signed-off-by: Stefano Garzarella > > > >What a hack. Cindy was working on adding control over config > >space, why can't that be used? > > If it can be used easily with virtio-blk device too, it will be great. > @Cindy do you plan to support that changes for a virtio-blk device too? > Hi Stefano I plan to add support to change the vdpa device's configuration after it is created. I think for Stefano's case, we can just implement it via provisioning parameters? 
Yep, I think we don't need to change it after creation, but specifying while creating should be enough. So, IIUC we can already do it, implementing something similar to vdpasim_net_setup_config() to call during vdpasim_blk_dev_add(), right? What about when we have `shared_backend` set to true for the vdpa_sim_blk.ko? In this case the backend is supposed to be shared between all the devices to test live migration. Maybe we can just change the size of the shared ramdisk to be reflected to all devices. Suggestions? @Cindy do you want to work on this for blk as well? If you don't have time, I'll look at it when I can allocate some time. Thanks In the first step, I want to use the vdpa tool to add support for changing the MAC address for the network device. the next step will also add MTU settings etc here is the link https://lore.kernel.org/all/20240708064820.88955-1-l...@redhat.com/T/#t I'll take a look, thanks for ccing me! Stefano in the device part, the device needs to implement its function of int (*dev_set_attr)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev, const struct vdpa_dev_set_config *config); the configuration will be passed by struct vdpa_dev_set_config. I'm not sure if this kind of design is suitable for you? Really thanks and any comments are welcome thanks Cindy > In the mean time, for the simulator I thought that this change was fine. > It's just used for testing and debugging... > > My main question is how to use that when we have `shared_backend` set to > true, since we use that setting to test for example live migration. In > that case, how do we handle the size of the shared ramdisk between > devices? > > Thanks, > Stefano >
Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter
On Tue, Jul 09, 2024 at 10:56:16AM GMT, Jason Wang wrote: On Mon, Jul 8, 2024 at 4:15 PM Stefano Garzarella wrote: Hi Cindy, Jason, On Mon, Jul 08, 2024 at 03:59:34PM GMT, Jason Wang wrote: >On Mon, Jul 8, 2024 at 3:06 PM Cindy Lu wrote: >> >> On Fri, 5 Jul 2024 at 20:42, Stefano Garzarella wrote: >> > >> > On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote: >> > >On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote: >> > >> The vDPA block simulator always allocated a 128 MiB ram-disk, but some >> > >> filesystems (e.g. XFS) may require larger minimum sizes (see >> > >> https://issues.redhat.com/browse/RHEL-45951). >> > >> >> > >> So to allow us to test these filesystems, let's add a module parameter >> > >> to control the size of the simulated virtio-blk devices. >> > >> The value is mapped directly to the `capacity` field of the virtio-blk >> > >> configuration space, so it must be expressed in sector numbers of 512 >> > >> bytes. >> > >> >> > >> The default value (0x4) is the same as the previous value, so the >> > >> behavior without setting `capacity` remains unchanged. >> > >> >> > >> Before this patch or with this patch without setting `capacity`: >> > >> $ modprobe vdpa-sim-blk >> > >> $ vdpa dev add mgmtdev vdpasim_blk name blk0 >> > >> virtio_blk virtio6: 1/0/0 default/read/poll queues >> > >> virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB) >> > >> >> > >> After this patch: >> > >> $ modprobe vdpa-sim-blk capacity=614400 >> > >> $ vdpa dev add mgmtdev vdpasim_blk name blk0 >> > >> virtio_blk virtio6: 1/0/0 default/read/poll queues >> > >> virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB) >> > >> >> > >> Signed-off-by: Stefano Garzarella >> > > >> > >What a hack. Cindy was working on adding control over config >> > >space, why can't that be used? >> > >> > If it can be used easily with virtio-blk device too, it will be great. 
>> > @Cindy do you plan to support that changes for a virtio-blk device too? >> > >> Hi Stefano >> I plan to add support to change the vdpa device's configuration after >> it is created. > >I think for Stefano's case, we can just implement it via provisioning >parameters? Yep, I think we don't need to change it after creation, but specifying while creating should be enough. So, IIUC we can already do it, implementing something similar to vdpasim_net_setup_config() to call during vdpasim_blk_dev_add(), right? Right. What about when we have `shared_backend` set to true for the vdpa_sim_blk.ko? In this case the backend is supposed to be shared between all the devices to test live migration. This seems to be another topic. Yep, but really related. I think we need to handle that case when supporting the `capacity` setting. Maybe we can just change the size of the shared ramdisk to be reflected to all devices. Suggestions? Could we specify the path to tmpfs or others during provisioning instead? It seems more general (but more work). Then it would almost become a real device, no longer just a simulator. It's enough work, though, as you said, but at that point we'd just have to specify the backend file to use for the device. In that case what API would we need to use to allow the user to set the backend file? Thanks, Stefano
Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter
On Wed, Jul 10, 2024 at 11:08:48AM GMT, Jason Wang wrote: On Tue, Jul 9, 2024 at 8:41 PM Stefano Garzarella wrote: On Tue, Jul 09, 2024 at 10:56:16AM GMT, Jason Wang wrote: >On Mon, Jul 8, 2024 at 4:15 PM Stefano Garzarella wrote: >> >> Hi Cindy, Jason, >> >> On Mon, Jul 08, 2024 at 03:59:34PM GMT, Jason Wang wrote: >> >On Mon, Jul 8, 2024 at 3:06 PM Cindy Lu wrote: >> >> >> >> On Fri, 5 Jul 2024 at 20:42, Stefano Garzarella wrote: >> >> > >> >> > On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote: >> >> > >On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote: >> >> > >> The vDPA block simulator always allocated a 128 MiB ram-disk, but some >> >> > >> filesystems (e.g. XFS) may require larger minimum sizes (see >> >> > >> https://issues.redhat.com/browse/RHEL-45951). >> >> > >> >> >> > >> So to allow us to test these filesystems, let's add a module parameter >> >> > >> to control the size of the simulated virtio-blk devices. >> >> > >> The value is mapped directly to the `capacity` field of the virtio-blk >> >> > >> configuration space, so it must be expressed in sector numbers of 512 >> >> > >> bytes. >> >> > >> >> >> > >> The default value (0x40000) is the same as the previous value, so the >> >> > >> behavior without setting `capacity` remains unchanged. 
>> >> > >> >> >> > >> Before this patch or with this patch without setting `capacity`: >> >> > >> $ modprobe vdpa-sim-blk >> >> > >> $ vdpa dev add mgmtdev vdpasim_blk name blk0 >> >> > >> virtio_blk virtio6: 1/0/0 default/read/poll queues >> >> > >> virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB) >> >> > >> >> >> > >> After this patch: >> >> > >> $ modprobe vdpa-sim-blk capacity=614400 >> >> > >> $ vdpa dev add mgmtdev vdpasim_blk name blk0 >> >> > >> virtio_blk virtio6: 1/0/0 default/read/poll queues >> >> > >> virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB) >> >> > >> >> >> > >> Signed-off-by: Stefano Garzarella >> >> > > >> >> > >What a hack. Cindy was working on adding control over config >> >> > >space, why can't that be used? >> >> > >> >> > If it can be used easily with virtio-blk device too, it will be great. >> >> > @Cindy do you plan to support that changes for a virtio-blk device too? >> >> > >> >> Hi Stefano >> >> I plan to add support to change the vdpa device's configuration after >> >> it is created. >> > >> >I think for Stefano's case, we can just implement it via provisioning >> >parameters? >> >> Yep, I think we don't need to change it after creation, but specifying >> while creating should be enough. >> >> So, IIUC we can already do it, implementing something similar to >> vdpasim_net_setup_config() to call during vdpasim_blk_dev_add(), right? > >Right. > >> >> What about when we have `shared_backend` set to true for the >> vdpa_sim_blk.ko? In this case the backend is supposed to be shared >> between all the devices to test live migration. > >This seems to be another topic. Yep, but really related. I think we need to handle that case when supporting the `capacity` setting. Ok, so if I was not wrong, the goal is to test migration. 
Sorry, I was not clear, I try to rephrase: vdpa_sim_blk already supports a module parameter called `shared_backend` introduced mainly to test live migration on the same host. When that parameter is on, all the created devices share the same backend and so we can easily do migration from one to another. With that parameter on or off, the device is always 128 MB, but now that's a problem for testing, because it looks like XFS requires at least 300 MB: https://issues.redhat.com/browse/RHEL-45951 That's why I sent this patch. When `shared_backend` is off (default), using the provisioning parameters seems feasible to me, but when it's on, how do I deal with it? Being a simulator, we can maybe make it so that only the first device can change the size for example, or that all devices control the size, but then we would have to handle the size change at r
Re: [PATCH] test/vsock: add install target
On Tue, Jul 09, 2024 at 09:50:51PM GMT, Peng Fan (OSS) wrote: From: Peng Fan Add install target for vsock to make Yocto easy to install the images. Signed-off-by: Peng Fan --- tools/testing/vsock/Makefile | 12 1 file changed, 12 insertions(+) diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile index a7f56a09ca9f..5c8442fa9460 100644 --- a/tools/testing/vsock/Makefile +++ b/tools/testing/vsock/Makefile @@ -8,8 +8,20 @@ vsock_perf: vsock_perf.o msg_zerocopy_common.o vsock_uring_test: LDLIBS = -luring vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o msg_zerocopy_common.o +VSOCK_INSTALL_PATH ?= $(abspath .) +# Avoid changing the rest of the logic here and lib.mk. +INSTALL_PATH := $(VSOCK_INSTALL_PATH) + CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -D_GNU_SOURCE .PHONY: all test clean clean: ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test -include *.d + +install: all + @# Ask all targets to install their files + mkdir -p $(INSTALL_PATH)/vsock why using the "vsock" subdir? IIUC you were inspired by selftests/Makefile, but it installs under $(INSTALL_PATH)/kselftest/ the scripts used by the main one `run_kselftest.sh`, which is installed in $(INSTALL_PATH) instead. So in this case I would install everything in $(INSTALL_PATH). WDYT? + install -m 744 vsock_test $(INSTALL_PATH)/vsock/ + install -m 744 vsock_perf $(INSTALL_PATH)/vsock/ + install -m 744 vsock_diag_test $(INSTALL_PATH)/vsock/ + install -m 744 vsock_uring_test $(INSTALL_PATH)/vsock/ Also from selftests/Makefile, what about using the ifdef instead of using $(abspath .) as default place? I mean this: install: all ifdef INSTALL_PATH ... else $(error Error: set INSTALL_PATH to use install) endif Thanks, Stefano
Re: [PATCH] vdpa_sim_blk: add `capacity` module parameter
On Wed, Jul 10, 2024 at 03:28:31PM GMT, Jason Wang wrote: On Wed, Jul 10, 2024 at 3:19 PM Stefano Garzarella wrote: On Wed, Jul 10, 2024 at 11:08:48AM GMT, Jason Wang wrote: >On Tue, Jul 9, 2024 at 8:41 PM Stefano Garzarella wrote: >> >> On Tue, Jul 09, 2024 at 10:56:16AM GMT, Jason Wang wrote: >> >On Mon, Jul 8, 2024 at 4:15 PM Stefano Garzarella wrote: >> >> >> >> Hi Cindy, Jason, >> >> >> >> On Mon, Jul 08, 2024 at 03:59:34PM GMT, Jason Wang wrote: >> >> >On Mon, Jul 8, 2024 at 3:06 PM Cindy Lu wrote: >> >> >> >> >> >> On Fri, 5 Jul 2024 at 20:42, Stefano Garzarella wrote: >> >> >> > >> >> >> > On Fri, Jul 05, 2024 at 07:30:51AM GMT, Michael S. Tsirkin wrote: >> >> >> > >On Fri, Jul 05, 2024 at 01:28:21PM +0200, Stefano Garzarella wrote: >> >> >> > >> The vDPA block simulator always allocated a 128 MiB ram-disk, but some >> >> >> > >> filesystems (e.g. XFS) may require larger minimum sizes (see >> >> >> > >> https://issues.redhat.com/browse/RHEL-45951). >> >> >> > >> >> >> >> > >> So to allow us to test these filesystems, let's add a module parameter >> >> >> > >> to control the size of the simulated virtio-blk devices. >> >> >> > >> The value is mapped directly to the `capacity` field of the virtio-blk >> >> >> > >> configuration space, so it must be expressed in sector numbers of 512 >> >> >> > >> bytes. >> >> >> > >> >> >> >> > >> The default value (0x40000) is the same as the previous value, so the >> >> >> > >> behavior without setting `capacity` remains unchanged. 
>> >> >> > >> >> >> >> > >> Before this patch or with this patch without setting `capacity`: >> >> >> > >> $ modprobe vdpa-sim-blk >> >> >> > >> $ vdpa dev add mgmtdev vdpasim_blk name blk0 >> >> >> > >> virtio_blk virtio6: 1/0/0 default/read/poll queues >> >> >> > >> virtio_blk virtio6: [vdb] 262144 512-byte logical blocks (134 MB/128 MiB) >> >> >> > >> >> >> >> > >> After this patch: >> >> >> > >> $ modprobe vdpa-sim-blk capacity=614400 >> >> >> > >> $ vdpa dev add mgmtdev vdpasim_blk name blk0 >> >> >> > >> virtio_blk virtio6: 1/0/0 default/read/poll queues >> >> >> > >> virtio_blk virtio6: [vdb] 614400 512-byte logical blocks (315 MB/300 MiB) >> >> >> > >> >> >> >> > >> Signed-off-by: Stefano Garzarella >> >> >> > > >> >> >> > >What a hack. Cindy was working on adding control over config >> >> >> > >space, why can't that be used? >> >> >> > >> >> >> > If it can be used easily with virtio-blk device too, it will be great. >> >> >> > @Cindy do you plan to support that changes for a virtio-blk device too? >> >> >> > >> >> >> Hi Stefano >> >> >> I plan to add support to change the vdpa device's configuration after >> >> >> it is created. >> >> > >> >> >I think for Stefano's case, we can just implement it via provisioning >> >> >parameters? >> >> >> >> Yep, I think we don't need to change it after creation, but specifying >> >> while creating should be enough. >> >> >> >> So, IIUC we can already do it, implementing something similar to >> >> vdpasim_net_setup_config() to call during vdpasim_blk_dev_add(), right? >> > >> >Right. >> > >> >> >> >> What about when we have `shared_backend` set to true for the >> >> vdpa_sim_blk.ko? In this case the backend is supposed to be shared >> >> between all the devices to test live migration. >> > >> >This seems to be another topic. >> >> Yep, but really related. I think we need to handle that case when >> supporting the `capacity` setting. > >Ok, so if I was not wrong, the goal is to test migration. 
Sorry, I was not clear, I try to rephrase: vdpa_sim_blk already supports a module parameter called `shared_backend` introduced m
Re: [PATCH] test/vsock: add install target
On Wed, Jul 10, 2024 at 08:11:32AM GMT, Peng Fan wrote: Subject: Re: [PATCH] test/vsock: add install target On Tue, Jul 09, 2024 at 09:50:51PM GMT, Peng Fan (OSS) wrote: >From: Peng Fan > >Add install target for vsock to make Yocto easy to install the images. > >Signed-off-by: Peng Fan >--- > tools/testing/vsock/Makefile | 12 > 1 file changed, 12 insertions(+) > >diff --git a/tools/testing/vsock/Makefile >b/tools/testing/vsock/Makefile index a7f56a09ca9f..5c8442fa9460 100644 >--- a/tools/testing/vsock/Makefile >+++ b/tools/testing/vsock/Makefile >@@ -8,8 +8,20 @@ vsock_perf: vsock_perf.o msg_zerocopy_common.o > vsock_uring_test: LDLIBS = -luring > vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o >msg_zerocopy_common.o > >+VSOCK_INSTALL_PATH ?= $(abspath .) >+# Avoid changing the rest of the logic here and lib.mk. >+INSTALL_PATH := $(VSOCK_INSTALL_PATH) >+ > CFLAGS += -g -O2 -Werror -Wall -I. -I../../include > -I../../../usr/include -Wno-pointer-sign -fno-strict-overflow > -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE - D_GNU_SOURCE > .PHONY: all test clean > clean: >${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test > -include *.d >+ >+install: all >+ @# Ask all targets to install their files >+ mkdir -p $(INSTALL_PATH)/vsock why using the "vsock" subdir? IIUC you were inspired by selftests/Makefile, but it installs under $(INSTALL_PATH)/kselftest/ the scripts used by the main one `run_kselftest.sh`, which is installed in $(INSTALL_PATH instead. So in this case I would install everything in $(INSTALL_PATH). WDYT? I agree. >+ install -m 744 vsock_test $(INSTALL_PATH)/vsock/ >+ install -m 744 vsock_perf $(INSTALL_PATH)/vsock/ >+ install -m 744 vsock_diag_test $(INSTALL_PATH)/vsock/ >+ install -m 744 vsock_uring_test $(INSTALL_PATH)/vsock/ Also from selftests/Makefile, what about using the ifdef instead of using $(abspath .) as default place? I mean this: install: all ifdef INSTALL_PATH ... 
else $(error Error: set INSTALL_PATH to use install) endif Does the following look good to you? # Avoid conflict with INSTALL_PATH set by the main Makefile VSOCK_INSTALL_PATH ?= INSTALL_PATH := $(VSOCK_INSTALL_PATH) I'm not a super Makefile expert, but why do we need both VSOCK_INSTALL_PATH and INSTALL_PATH? Stefano install: all ifdef INSTALL_PATH mkdir -p $(INSTALL_PATH) install -m 744 vsock_test $(INSTALL_PATH) install -m 744 vsock_perf $(INSTALL_PATH) install -m 744 vsock_diag_test $(INSTALL_PATH) install -m 744 vsock_uring_test $(INSTALL_PATH) else $(error Error: set INSTALL_PATH to use install) endif Thanks, Peng. Thanks, Stefano
Re: [PATCH] test/vsock: add install target
On Wed, Jul 10, 2024 at 11:34:05AM GMT, Peng Fan wrote: Subject: Re: [PATCH] test/vsock: add install target On Wed, Jul 10, 2024 at 08:11:32AM GMT, Peng Fan wrote: >> Subject: Re: [PATCH] test/vsock: add install target >> >> On Tue, Jul 09, 2024 at 09:50:51PM GMT, Peng Fan (OSS) wrote: >> >From: Peng Fan >> > >> >Add install target for vsock to make Yocto easy to install the images. >> > >> >Signed-off-by: Peng Fan >> >--- >> > tools/testing/vsock/Makefile | 12 >> > 1 file changed, 12 insertions(+) >> > >> >diff --git a/tools/testing/vsock/Makefile >> >b/tools/testing/vsock/Makefile index a7f56a09ca9f..5c8442fa9460 >> 100644 >> >--- a/tools/testing/vsock/Makefile >> >+++ b/tools/testing/vsock/Makefile >> >@@ -8,8 +8,20 @@ vsock_perf: vsock_perf.o >> msg_zerocopy_common.o >> > vsock_uring_test: LDLIBS = -luring >> > vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o >> >msg_zerocopy_common.o >> > >> >+VSOCK_INSTALL_PATH ?= $(abspath .) >> >+# Avoid changing the rest of the logic here and lib.mk. >> >+INSTALL_PATH := $(VSOCK_INSTALL_PATH) >> >+ >> > CFLAGS += -g -O2 -Werror -Wall -I. -I../../include >> > -I../../../usr/include -Wno-pointer-sign -fno-strict-overflow >> > -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE - >> D_GNU_SOURCE >> > .PHONY: all test clean >> > clean: >> > ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf >> vsock_uring_test >> > -include *.d >> >+ >> >+install: all >> >+ @# Ask all targets to install their files >> >+ mkdir -p $(INSTALL_PATH)/vsock >> >> why using the "vsock" subdir? >> >> IIUC you were inspired by selftests/Makefile, but it installs under >> $(INSTALL_PATH)/kselftest/ the scripts used by the main one >> `run_kselftest.sh`, which is installed in $(INSTALL_PATH instead. >> So in this case I would install everything in $(INSTALL_PATH). >> >> WDYT? > >I agree. 
> >> >> >+ install -m 744 vsock_test $(INSTALL_PATH)/vsock/ >> >+ install -m 744 vsock_perf $(INSTALL_PATH)/vsock/ >> >+ install -m 744 vsock_diag_test $(INSTALL_PATH)/vsock/ >> >+ install -m 744 vsock_uring_test $(INSTALL_PATH)/vsock/ >> >> Also from selftests/Makefile, what about using the ifdef instead of >> using $(abspath .) as default place? >> >> I mean this: >> >> install: all >> ifdef INSTALL_PATH >>... >> else >>$(error Error: set INSTALL_PATH to use install) endif > >Is the following looks good to you? > ># Avoid conflict with INSTALL_PATH set by the main Makefile >VSOCK_INSTALL_PATH ?= INSTALL_PATH := $(VSOCK_INSTALL_PATH) I'm not a super Makefile expert, but why do we need both VSOCK_INSTALL_PATH and INSTALL_PATH? INSTALL_PATH is exported by kernel root directory makefile. So to user, we need to avoid export INSTALL_PATH here. So I just follow selftests/Makefile using KSFT_INSTALL_PATH There is a comment there: # Avoid changing the rest of the logic here and lib.mk. Added by commit 17eac6c2db8b2cdfe33d40229bdda2acd86b304a. IIUC they re-used INSTALL_PATH, just to avoid too many changes in that file and in tools/testing/selftests/lib.mk So, IMHO we should not care about it and only use VSOCK_INSTALL_PATH if you don't want to conflict with INSTALL_PATH. Stefano
Re: [PATCH V2] test/vsock: add install target
On Wed, Jul 10, 2024 at 08:27:28PM GMT, Peng Fan (OSS) wrote: From: Peng Fan Add install target for vsock to make Yocto easy to install the images. Signed-off-by: Peng Fan --- LGTM! This is a net-next material, so next time better to specify it (e.g. [PATCH net-next]). If not queued within a week, please resend specifying net-next. Reviewed-by: Stefano Garzarella V2: Use VSOCK_INSTALL_PATH, drop INSTALL_PATH tools/testing/vsock/Makefile | 13 + 1 file changed, 13 insertions(+) diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile index a7f56a09ca9f..6e0b4e95e230 100644 --- a/tools/testing/vsock/Makefile +++ b/tools/testing/vsock/Makefile @@ -13,3 +13,16 @@ CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include -Wno-p clean: ${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test -include *.d + +VSOCK_INSTALL_PATH ?= + +install: all +ifdef VSOCK_INSTALL_PATH + mkdir -p $(VSOCK_INSTALL_PATH) + install -m 744 vsock_test $(VSOCK_INSTALL_PATH) + install -m 744 vsock_perf $(VSOCK_INSTALL_PATH) + install -m 744 vsock_diag_test $(VSOCK_INSTALL_PATH) + install -m 744 vsock_uring_test $(VSOCK_INSTALL_PATH) +else + $(error Error: set VSOCK_INSTALL_PATH to use install) +endif -- 2.37.1
Re: [PATCH] test/vsock: add install target
CCing Stefan. On Wed, Jul 10, 2024 at 07:00:59PM GMT, Jakub Kicinski wrote: On Wed, 10 Jul 2024 13:58:39 +0200 Stefano Garzarella wrote: There is a comment there: # Avoid changing the rest of the logic here and lib.mk. Added by commit 17eac6c2db8b2cdfe33d40229bdda2acd86b304a. IIUC they re-used INSTALL_PATH, just to avoid too many changes in that file and in tools/testing/selftests/lib.mk So, IMHO we should not care about it and only use VSOCK_INSTALL_PATH if you don't want to conflict with INSTALL_PATH. Any reason why vsock isn't part of selftests in the first place? Usually vsock tests test both the driver (virtio-vsock) in the guest and the device in the host kernel (vhost-vsock). So I usually run the tests in 2 nested VMs to test the latest changes for both the guest and the host. I don't know enough selftests, but do you think it is possible to integrate them? CCing Stefan who is the original author and may remember more reasons about this choice. Thanks, Stefano
Re: [PATCH] test/vsock: add install target
On Thu, Jul 11, 2024 at 07:14:55AM GMT, Jakub Kicinski wrote: On Thu, 11 Jul 2024 15:38:01 +0200 Stefan Hajnoczi wrote: > Usually vsock tests test both the driver (virtio-vsock) in the guest and the > device in the host kernel (vhost-vsock). So I usually run the tests in 2 > nested VMs to test the latest changes for both the guest and the host. > > I don't know enough selftests, but do you think it is possible to integrate > them? > > CCing Stefan who is the original author and may remember more reasons about > this choice. It's probably because of the manual steps in tools/testing/vsock/README: The following prerequisite steps are not automated and must be performed prior to running tests: 1. Build the kernel, make headers_install, and build these tests. 2. Install the kernel and tests on the host. 3. Install the kernel and tests inside the guest. 4. Boot the guest and ensure that the AF_VSOCK transport is enabled. If you want to automate this for QEMU, VMware, and Hyper-V that would be great. It relies on having a guest running under these hypervisors and that's not trivial to automate (plus it involves proprietary software for VMware and Hyper-V that may not be available without additional license agreements and/or payment). Not sure if there's a requirement that full process is automated. Or at least if there is we are already breaking it in networking because for some tests we need user to export some env variables to point the test to the right interfaces and even a remote machine to generate traffic. If the env isn't set up tests return 4 (SKIP). I don't feel strongly that ksft + env approach is better but at least it gives us easy access to the basic build and packaging features from ksft. Up to you but thought I'd ask. Yeah, I'll try to allocate some cycles to look into that. Tracking it here: https://gitlab.com/vsock/vsock/-/issues/13 What about this patch, can we queue it for now? Thanks, Stefano
Re: [PATCH net-next v3 1/2] vsock/virtio: refactor virtio_transport_send_pkt_work
On Thu, Jul 11, 2024 at 04:58:46PM GMT, Luigi Leonardi via B4 Relay wrote: From: Marco Pinna Preliminary patch to introduce an optimization to the enqueue system. All the code used to enqueue a packet into the virtqueue is removed from virtio_transport_send_pkt_work() and moved to the new virtio_transport_send_skb() function. Co-developed-by: Luigi Leonardi Signed-off-by: Luigi Leonardi Signed-off-by: Marco Pinna --- net/vmw_vsock/virtio_transport.c | 105 ++- 1 file changed, 59 insertions(+), 46 deletions(-) LGTM Reviewed-by: Stefano Garzarella diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 43d405298857..c4205c22f40b 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -94,6 +94,63 @@ static u32 virtio_transport_get_local_cid(void) return ret; } +/* Caller need to hold vsock->tx_lock on vq */ +static int virtio_transport_send_skb(struct sk_buff *skb, struct virtqueue *vq, +struct virtio_vsock *vsock) +{ + int ret, in_sg = 0, out_sg = 0; + struct scatterlist **sgs; + + sgs = vsock->out_sgs; + sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), + sizeof(*virtio_vsock_hdr(skb))); + out_sg++; + + if (!skb_is_nonlinear(skb)) { + if (skb->len > 0) { + sg_init_one(sgs[out_sg], skb->data, skb->len); + out_sg++; + } + } else { + struct skb_shared_info *si; + int i; + + /* If skb is nonlinear, then its buffer must contain +* only header and nothing more. Data is stored in +* the fragged part. +*/ + WARN_ON_ONCE(skb_headroom(skb) != sizeof(*virtio_vsock_hdr(skb))); + + si = skb_shinfo(skb); + + for (i = 0; i < si->nr_frags; i++) { + skb_frag_t *skb_frag = &si->frags[i]; + void *va; + + /* We will use 'page_to_virt()' for the userspace page +* here, because virtio or dma-mapping layers will call +* 'virt_to_phys()' later to fill the buffer descriptor. +* We don't touch memory at "virtual" address of this page. 
+*/ + va = page_to_virt(skb_frag_page(skb_frag)); + sg_init_one(sgs[out_sg], + va + skb_frag_off(skb_frag), + skb_frag_size(skb_frag)); + out_sg++; + } + } + + ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL); + /* Usually this means that there is no more space available in +* the vq +*/ + if (ret < 0) + return ret; + + virtio_transport_deliver_tap_pkt(skb); + return 0; +} + static void virtio_transport_send_pkt_work(struct work_struct *work) { @@ -111,66 +168,22 @@ virtio_transport_send_pkt_work(struct work_struct *work) vq = vsock->vqs[VSOCK_VQ_TX]; for (;;) { - int ret, in_sg = 0, out_sg = 0; - struct scatterlist **sgs; struct sk_buff *skb; bool reply; + int ret; skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); if (!skb) break; reply = virtio_vsock_skb_reply(skb); - sgs = vsock->out_sgs; - sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), - sizeof(*virtio_vsock_hdr(skb))); - out_sg++; - - if (!skb_is_nonlinear(skb)) { - if (skb->len > 0) { - sg_init_one(sgs[out_sg], skb->data, skb->len); - out_sg++; - } - } else { - struct skb_shared_info *si; - int i; - - /* If skb is nonlinear, then its buffer must contain -* only header and nothing more. Data is stored in -* the fragged part. -*/ - WARN_ON_ONCE(skb_headroom(skb) != sizeof(*virtio_vsock_hdr(skb))); - - si = skb_shinfo(skb); - for (i = 0; i < si->nr_frags; i++) { - skb_frag_t *skb_frag = &si->frags[i]; - void *va; - - /* We will use 'page_to_virt()' for the userspace page -* here, because virtio or dma-mapping layers will call -
Re: [PATCH net-next v3 2/2] vsock/virtio: avoid queuing packets when work queue is empty
On Thu, Jul 11, 2024 at 04:58:47PM GMT, Luigi Leonardi via B4 Relay wrote: From: Luigi Leonardi Introduce an optimization in virtio_transport_send_pkt: when the work queue (send_pkt_queue) is empty the packet is Note: send_pkt_queue is just a queue of sk_buff, is not really a work queue. put directly in the virtqueue increasing the throughput. Why? I'd write something like this, but feel free to change it: When the driver needs to send new packets to the device, it always queues the new sk_buffs into an intermediate queue (send_pkt_queue) and schedules a worker (send_pkt_work) to then queue them into the virtqueue exposed to the device. This increases the chance of batching, but also introduces a lot of latency into the communication. So we can optimize this path by adding a fast path to be taken when there is no element in the intermediate queue, there is space available in the virtqueue, and no other process that is sending packets (tx_lock held). In the following benchmark (pingpong mode) the host sends "fio benchmark" a payload to the guest and waits for the same payload back. All vCPUs pinned individually to pCPUs. vhost process pinned to a pCPU fio process pinned both inside the host and the guest system. Host CPU: Intel i7-10700KF CPU @ 3.80GHz Tool: Fio version 3.37-56 Env: Phys host + L1 Guest Runtime-per-test: 50s Mode: pingpong (h-g-h) Test runs: 50 Type: SOCK_STREAM Before: Linux 6.9.7 Payload 512B: 1st perc. overall 99th perc. Before 370 810.15 8656ns After 374 780.29 8741ns Payload 4K: 1st perc. overall 99th perc. Before 460 1720.23 42752 ns After 460 1520.84 36096 ns The performance improvement is related to this optimization, I used ebpf to check that each packet was sent directly to the virtqueue. Throughput: iperf-vsock I would reorganize the description for a moment because it's a little confusing. For example like this: The following benchmarks were run to check improvements in latency and throughput. 
The test bed is a host with Intel i7-10700KF CPU @ 3.80GHz and L1 guest running on QEMU/KVM. - Latency Tool: ... - Throughput Tool: ... The size represents the buffer length (-l) to read/write P represents the number parallel streams P=1 4K 64K 128K Before 6.87 29.3 29.5 Gb/s After 10.5 39.4 39.9 Gb/s P=2 4K 64K 128K Before 10.5 32.8 33.2 Gb/s After 17.8 47.7 48.5 Gb/s P=4 4K 64K 128K Before 12.7 33.6 34.2 Gb/s After 16.9 48.1 50.5 Gb/s Wow, great! I'm a little surprised that the latency is not much affected, but the throughput benefits so much with that kind of optimization. Maybe we can check the latency with smaller payloads like 64 bytes or even smaller. Co-developed-by: Marco Pinna Signed-off-by: Marco Pinna Signed-off-by: Luigi Leonardi --- net/vmw_vsock/virtio_transport.c | 38 ++ 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index c4205c22f40b..d75727fdc35f 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -208,6 +208,29 @@ virtio_transport_send_pkt_work(struct work_struct *work) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } +/* Caller need to hold RCU for vsock. + * Returns 0 if the packet is successfully put on the vq. + */ +static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struct sk_buff *skb) +{ + struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX]; + int ret; + + /* Inside RCU, can't sleep! */ + ret = mutex_trylock(&vsock->tx_lock); + if (unlikely(ret == 0)) + return -EBUSY; + + ret = virtio_transport_send_skb(skb, vq, vsock); + + mutex_unlock(&vsock->tx_lock); + + /* Kick if virtio_transport_send_skb succeeded */ Superfluous comment, we can remove it. + if (ret == 0) + virtqueue_kick(vq); nit: I'd add a blank line here after the if block to highlight that the return is out. 
+ return ret; +} + static int virtio_transport_send_pkt(struct sk_buff *skb) { @@ -231,11 +254,18 @@ virtio_transport_send_pkt(struct sk_buff *skb) goto out_rcu; } - if (virtio_vsock_skb_reply(skb)) - atomic_inc(&vsock->queued_replies); + /* If the workqueue (send_pkt_queue) is empty there is no need to enqueue the packet. Again, send_pkt_queue is not a workqueue. Here I would explain more why there is no need, the fact that we are not doing this is clear. +* Just put it on the virtqueue using virtio_transport_send_skb_fast_path. +*/ nit: here I would instead remove the blank line to make it clear tha
Re: [PATCH net-next v3 3/3] test/vsock: add ioctl unsent bytes test
On Wed, Jun 26, 2024 at 02:08:37PM GMT, Luigi Leonardi via B4 Relay wrote: From: Luigi Leonardi Introduce two tests, one for SOCK_STREAM and one for SOCK_SEQPACKET, which checks after a packet is delivered, that the number of unsent bytes is zero, using ioctl SIOCOUTQ. Signed-off-by: Luigi Leonardi --- tools/testing/vsock/util.c | 6 +-- tools/testing/vsock/util.h | 3 ++ tools/testing/vsock/vsock_test.c | 85 3 files changed, 91 insertions(+), 3 deletions(-) diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 554b290fefdc..a3d448a075e3 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po } /* Connect to and return the file descriptor. */ -static int vsock_connect(unsigned int cid, unsigned int port, int type) +int vsock_connect(unsigned int cid, unsigned int port, int type) { union { struct sockaddr sa; @@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int port, int type) /* Listen on and return the first incoming connection. The remote * address is stored to clientaddrp. clientaddrp may be NULL. */ -static int vsock_accept(unsigned int cid, unsigned int port, - struct sockaddr_vm *clientaddrp, int type) +int vsock_accept(unsigned int cid, unsigned int port, +struct sockaddr_vm *clientaddrp, int type) { union { struct sockaddr sa; diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index e95e62485959..fff22d4a14c0 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -39,6 +39,9 @@ struct test_case { void init_signals(void); unsigned int parse_cid(const char *str); unsigned int parse_port(const char *str); +int vsock_connect(unsigned int cid, unsigned int port, int type); +int vsock_accept(unsigned int cid, unsigned int port, +struct sockaddr_vm *clientaddrp, int type); I'd mention in the commit description that you need these functions to be more generic. 
Maybe in the future we can re-use them where we share the same test for both SEQPACKET and STREAM. The rest LGTM. Thanks, Stefano int vsock_stream_connect(unsigned int cid, unsigned int port); int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index f851f8961247..76bd17b4b291 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "vsock_test_zerocopy.h" #include "timeout.h" @@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define MSG_BUF_IOCTL_LEN 64 +static void test_unsent_bytes_server(const struct test_opts *opts, int type) +{ + unsigned char buf[MSG_BUF_IOCTL_LEN]; + int client_fd; + + client_fd = vsock_accept(VMADDR_CID_ANY, 1234, NULL, type); + if (client_fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf)); + control_writeln("RECEIVED"); + + close(client_fd); +} + +static void test_unsent_bytes_client(const struct test_opts *opts, int type) +{ + unsigned char buf[MSG_BUF_IOCTL_LEN]; + int ret, fd, sock_bytes_unsent; + + fd = vsock_connect(opts->peer_cid, 1234, type); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + for (int i = 0; i < sizeof(buf); i++) + buf[i] = rand() & 0xFF; + + send_buf(fd, buf, sizeof(buf), 0, sizeof(buf)); + control_expectln("RECEIVED"); + + ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + fprintf(stderr, "Test skipped\n"); + } else { + perror("ioctl"); + exit(EXIT_FAILURE); + } + } else if (ret == 0 && sock_bytes_unsent != 0) { + fprintf(stderr, + "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n", + sock_bytes_unsent); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_unsent_bytes_client(const struct test_opts 
*opts) +{ + test_unsent_bytes_client(opts, SOCK_STREAM); +} + +static void test_stream_unsent_bytes_server(const struct test_opts *opts) +{ + test_unsent_bytes_server(opts, SOCK_STREAM); +} + +static void test_seqpacket_unsent_bytes_client(const struct test_opts *opts) +{ + test_unsent_bytes_client(opts, SOCK_SEQPACKET); +} + +static void test_seqpacket_unsent_bytes_server(const struct test_opt
Re: [PATCH v1] MAINTAINERS: add me as reviewer of AF_VSOCK and virtio-vsock
On Sun, Jul 28, 2024 at 09:33:25PM GMT, Arseniy Krasnov wrote: I'm working on AF_VSOCK and virtio-vsock. Yeah, thanks for the help! Signed-off-by: Arseniy Krasnov --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) Reviewed-by: Stefano Garzarella diff --git a/MAINTAINERS b/MAINTAINERS index c0a3d9e93689..2bf0987d87ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24131,6 +24131,7 @@ F: virt/lib/ VIRTIO AND VHOST VSOCK DRIVER M: Stefan Hajnoczi M: Stefano Garzarella +R: Arseniy Krasnov L: k...@vger.kernel.org L: virtualizat...@lists.linux.dev L: net...@vger.kernel.org @@ -24370,6 +24371,7 @@ F: drivers/media/test-drivers/vivid/* VM SOCKETS (AF_VSOCK) M: Stefano Garzarella +R: Arseniy Krasnov L: virtualizat...@lists.linux.dev L: net...@vger.kernel.org S: Maintained -- 2.35.0
Re: [PATCH net-next v4 1/3] vsock: add support for SIOCOUTQ ioctl
On Tue, Jul 30, 2024 at 09:43:06PM GMT, Luigi Leonardi via B4 Relay wrote: From: Luigi Leonardi Add support for ioctl(s) in AF_VSOCK. The only ioctl available is SIOCOUTQ/TIOCOUTQ, which returns the number of unsent bytes in the socket. This information is transport-specific and is delegated to them using a callback. Suggested-by: Daan De Meyer Signed-off-by: Luigi Leonardi --- include/net/af_vsock.h | 3 +++ net/vmw_vsock/af_vsock.c | 58 +--- 2 files changed, 58 insertions(+), 3 deletions(-) LGTM! Reviewed-by: Stefano Garzarella diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 535701efc1e5..fc504d2da3d0 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -169,6 +169,9 @@ struct vsock_transport { void (*notify_buffer_size)(struct vsock_sock *, u64 *); int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); + /* SIOCOUTQ ioctl */ + ssize_t (*unsent_bytes)(struct vsock_sock *vsk); + /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4b040285aa78..58e639e82942 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -112,6 +112,7 @@ #include #include #include +#include static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); @@ -1292,6 +1293,57 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); +static int vsock_do_ioctl(struct socket *sock, unsigned int cmd, + int __user *arg) +{ + struct sock *sk = sock->sk; + struct vsock_sock *vsk; + int ret; + + vsk = vsock_sk(sk); + + switch (cmd) { + case SIOCOUTQ: { + ssize_t n_bytes; + + if (!vsk->transport || !vsk->transport->unsent_bytes) { + ret = -EOPNOTSUPP; + break; + } + + if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { + ret = -EINVAL; + break; + } + + n_bytes = vsk->transport->unsent_bytes(vsk); + if (n_bytes < 0) { + ret = n_bytes; + break; + } + 
+ ret = put_user(n_bytes, arg); + break; + } + default: + ret = -ENOIOCTLCMD; + } + + return ret; +} + +static int vsock_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + int ret; + + lock_sock(sock->sk); + ret = vsock_do_ioctl(sock, cmd, (int __user *)arg); + release_sock(sock->sk); + + return ret; +} + static const struct proto_ops vsock_dgram_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, @@ -1302,7 +1354,7 @@ static const struct proto_ops vsock_dgram_ops = { .accept = sock_no_accept, .getname = vsock_getname, .poll = vsock_poll, - .ioctl = sock_no_ioctl, + .ioctl = vsock_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, .sendmsg = vsock_dgram_sendmsg, @@ -2286,7 +2338,7 @@ static const struct proto_ops vsock_stream_ops = { .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, - .ioctl = sock_no_ioctl, + .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, @@ -2308,7 +2360,7 @@ static const struct proto_ops vsock_seqpacket_ops = { .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, - .ioctl = sock_no_ioctl, + .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, -- 2.45.2
Re: [PATCH net-next v4 2/3] vsock/virtio: add SIOCOUTQ support for all virtio based transports
On Tue, Jul 30, 2024 at 09:43:07PM GMT, Luigi Leonardi via B4 Relay wrote: From: Luigi Leonardi Introduce support for virtio_transport_unsent_bytes ioctl for virtio_transport, vhost_vsock and vsock_loopback. For all transports the unsent bytes counter is incremented in virtio_transport_get_credit. In virtio_transport (G2H) and in vhost-vsock (H2G) the counter is decremented when the skbuff is consumed. In vsock_loopback the same skbuff is passed from the transmitter to the receiver, so the counter is decremented before queuing the skbuff to the receiver. Signed-off-by: Luigi Leonardi --- drivers/vhost/vsock.c | 4 +++- include/linux/virtio_vsock.h| 6 ++ net/vmw_vsock/virtio_transport.c| 4 +++- net/vmw_vsock/virtio_transport_common.c | 35 + net/vmw_vsock/vsock_loopback.c | 6 ++ 5 files changed, 53 insertions(+), 2 deletions(-) Reviewed-by: Stefano Garzarella diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index bf664ec9341b..802153e23073 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -244,7 +244,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, restart_tx = true; } - consume_skb(skb); + virtio_transport_consume_skb_sent(skb, true); } } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); if (added) @@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, + .unsent_bytes = virtio_transport_unsent_bytes, + .read_skb = virtio_transport_read_skb, }, diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index c82089dee0c8..0387d64e2c66 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -133,6 +133,7 @@ struct virtio_vsock_sock { u32 tx_cnt; u32 peer_fwd_cnt; u32 peer_buf_alloc; + size_t bytes_unsent; /* Protected by rx_lock */ u32 fwd_cnt; @@ -193,6 +194,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); s64 
virtio_transport_stream_has_space(struct vsock_sock *vsk); u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk); +ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk); + +void virtio_transport_consume_skb_sent(struct sk_buff *skb, + bool consume); + int virtio_transport_do_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk); int diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 64a07acfef12..e0160da4ef43 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -311,7 +311,7 @@ static void virtio_transport_tx_work(struct work_struct *work) virtqueue_disable_cb(vq); while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { - consume_skb(skb); + virtio_transport_consume_skb_sent(skb, true); added = true; } } while (!virtqueue_enable_cb(vq)); @@ -540,6 +540,8 @@ static struct virtio_transport virtio_transport = { .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, + .unsent_bytes = virtio_transport_unsent_bytes, + .read_skb = virtio_transport_read_skb, }, diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 16ff976a86e3..884ee128851e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -463,6 +463,26 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff * } EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); +void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume) +{ + struct sock *s = skb->sk; + + if (s && skb->len) { + struct vsock_sock *vs = vsock_sk(s); + struct virtio_vsock_sock *vvs; + + vvs = vs->trans; + + spin_lock_bh(&vvs->tx_lock); + vvs->bytes_unsent -= skb->len; + spin_unlock_bh(&vvs->tx_lock); + } + + if (consume) + consume_skb(skb); +} +EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent); + u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 
credit) { u32 ret; @@ -475,6 +495,7 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) if (ret > credit)
Re: [PATCH net-next v4 3/3] test/vsock: add ioctl unsent bytes test
On Tue, Jul 30, 2024 at 09:43:08PM GMT, Luigi Leonardi via B4 Relay wrote: From: Luigi Leonardi Introduce two tests, one for SOCK_STREAM and one for SOCK_SEQPACKET, which use SIOCOUTQ ioctl to check that the number of unsent bytes is zero after delivering a packet. vsock_connect and vsock_accept are no longer static: this is to create more generic tests, allowing code to be reused for SEQPACKET and STREAM. Yeah, good idea. We should use them for other tests as well. (for the future) Signed-off-by: Luigi Leonardi --- tools/testing/vsock/util.c | 6 +-- tools/testing/vsock/util.h | 3 ++ tools/testing/vsock/vsock_test.c | 85 3 files changed, 91 insertions(+), 3 deletions(-) LGTM and I ran them. All good :-) Reviewed-by: Stefano Garzarella diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c index 554b290fefdc..a3d448a075e3 100644 --- a/tools/testing/vsock/util.c +++ b/tools/testing/vsock/util.c @@ -139,7 +139,7 @@ int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_po } /* Connect to and return the file descriptor. */ -static int vsock_connect(unsigned int cid, unsigned int port, int type) +int vsock_connect(unsigned int cid, unsigned int port, int type) { union { struct sockaddr sa; @@ -226,8 +226,8 @@ static int vsock_listen(unsigned int cid, unsigned int port, int type) /* Listen on and return the first incoming connection. The remote * address is stored to clientaddrp. clientaddrp may be NULL. 
*/ -static int vsock_accept(unsigned int cid, unsigned int port, - struct sockaddr_vm *clientaddrp, int type) +int vsock_accept(unsigned int cid, unsigned int port, +struct sockaddr_vm *clientaddrp, int type) { union { struct sockaddr sa; diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h index e95e62485959..fff22d4a14c0 100644 --- a/tools/testing/vsock/util.h +++ b/tools/testing/vsock/util.h @@ -39,6 +39,9 @@ struct test_case { void init_signals(void); unsigned int parse_cid(const char *str); unsigned int parse_port(const char *str); +int vsock_connect(unsigned int cid, unsigned int port, int type); +int vsock_accept(unsigned int cid, unsigned int port, +struct sockaddr_vm *clientaddrp, int type); int vsock_stream_connect(unsigned int cid, unsigned int port); int vsock_bind_connect(unsigned int cid, unsigned int port, unsigned int bind_port, int type); diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index f851f8961247..8d38dbf8f41f 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "vsock_test_zerocopy.h" #include "timeout.h" @@ -1238,6 +1240,79 @@ static void test_double_bind_connect_client(const struct test_opts *opts) } } +#define MSG_BUF_IOCTL_LEN 64 +static void test_unsent_bytes_server(const struct test_opts *opts, int type) +{ + unsigned char buf[MSG_BUF_IOCTL_LEN]; + int client_fd; + + client_fd = vsock_accept(VMADDR_CID_ANY, opts->peer_port, NULL, type); + if (client_fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf)); + control_writeln("RECEIVED"); + + close(client_fd); +} + +static void test_unsent_bytes_client(const struct test_opts *opts, int type) +{ + unsigned char buf[MSG_BUF_IOCTL_LEN]; + int ret, fd, sock_bytes_unsent; + + fd = vsock_connect(opts->peer_cid, opts->peer_port, type); + if (fd < 0) { + perror("connect"); + 
exit(EXIT_FAILURE); + } + + for (int i = 0; i < sizeof(buf); i++) + buf[i] = rand() & 0xFF; + + send_buf(fd, buf, sizeof(buf), 0, sizeof(buf)); + control_expectln("RECEIVED"); + + ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n"); + } else { + perror("ioctl"); + exit(EXIT_FAILURE); + } + } else if (ret == 0 && sock_bytes_unsent != 0) { + fprintf(stderr, + "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n", + sock_bytes_unsent); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_unsent_bytes_client(const struct test_opts *opts) +{ + test_unsent_bytes_client(opts, SOCK_STREAM); +} + +static void test_stream_unsent_bytes_server(const struct test_opts *opts)
Re: [PATCH net-next v4 2/2] vsock/virtio: avoid queuing packets when intermediate queue is empty
On Tue, Jul 30, 2024 at 09:47:32PM GMT, Luigi Leonardi via B4 Relay wrote: From: Luigi Leonardi When the driver needs to send new packets to the device, it always queues the new sk_buffs into an intermediate queue (send_pkt_queue) and schedules a worker (send_pkt_work) to then queue them into the virtqueue exposed to the device. This increases the chance of batching, but also introduces a lot of latency into the communication. So we can optimize this path by adding a fast path to be taken when there is no element in the intermediate queue, there is space available in the virtqueue, and no other process that is sending packets (tx_lock held). The following benchmarks were run to check improvements in latency and throughput. The test bed is a host with Intel i7-10700KF CPU @ 3.80GHz and L1 guest running on QEMU/KVM with vhost process and all vCPUs pinned individually to pCPUs. - Latency Tool: Fio version 3.37-56 Mode: pingpong (h-g-h) Test runs: 50 Runtime-per-test: 50s Type: SOCK_STREAM In the following fio benchmark (pingpong mode) the host sends a payload to the guest and waits for the same payload back. fio process pinned both inside the host and the guest system. Before: Linux 6.9.8 Payload 64B: 1st perc. overall 99th perc. Before 12.91 16.78 42.24 us After 9.77 13.57 39.17 us Payload 512B: 1st perc. overall 99th perc. Before 13.35 17.35 41.52 us After 10.25 14.11 39.58 us Payload 4K: 1st perc. overall 99th perc. Before 14.71 19.87 41.52 us After 10.51 14.96 40.81 us - Throughput Tool: iperf-vsock The size represents the buffer length (-l) to read/write P represents the number of parallel streams P=1 4K 64K 128K Before 6.87 29.3 29.5 Gb/s After 10.5 39.4 39.9 Gb/s P=2 4K 64K 128K Before 10.5 32.8 33.2 Gb/s After 17.8 47.7 48.5 Gb/s P=4 4K 64K 128K Before 12.7 33.6 34.2 Gb/s After 16.9 48.1 50.5 Gb/s Great improvement! Thanks again for this work! 
The performance improvement is related to this optimization: I used an eBPF kretprobe on virtio_transport_send_skb to check that each packet was sent directly to the virtqueue. Co-developed-by: Marco Pinna Signed-off-by: Marco Pinna Signed-off-by: Luigi Leonardi --- net/vmw_vsock/virtio_transport.c | 39 +++ 1 file changed, 35 insertions(+), 4 deletions(-) All my comments have been resolved. I let iperf run bidirectionally for a long time and saw no problems, so: Reviewed-by: Stefano Garzarella diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index f641e906f351..f992f9a216f0 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -208,6 +208,28 @@ virtio_transport_send_pkt_work(struct work_struct *work) queue_work(virtio_vsock_workqueue, &vsock->rx_work); } +/* Caller need to hold RCU for vsock. + * Returns 0 if the packet is successfully put on the vq. + */ +static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struct sk_buff *skb) +{ + struct virtqueue *vq = vsock->vqs[VSOCK_VQ_TX]; + int ret; + + /* Inside RCU, can't sleep! */ + ret = mutex_trylock(&vsock->tx_lock); + if (unlikely(ret == 0)) + return -EBUSY; + + ret = virtio_transport_send_skb(skb, vq, vsock); + if (ret == 0) + virtqueue_kick(vq); + + mutex_unlock(&vsock->tx_lock); + + return ret; +} + static int virtio_transport_send_pkt(struct sk_buff *skb) { @@ -231,11 +253,20 @@ virtio_transport_send_pkt(struct sk_buff *skb) goto out_rcu; } - if (virtio_vsock_skb_reply(skb)) - atomic_inc(&vsock->queued_replies); + /* If send_pkt_queue is empty, we can safely bypass this queue +* because packet order is maintained and (try) to put the packet +* on the virtqueue using virtio_transport_send_skb_fast_path. +* If this fails we simply put the packet on the intermediate +* queue and schedule the worker. 
+*/ + if (!skb_queue_empty_lockless(&vsock->send_pkt_queue) || + virtio_transport_send_skb_fast_path(vsock, skb)) { + if (virtio_vsock_skb_reply(skb)) + atomic_inc(&vsock->queued_replies); - virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); - queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); + virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); + queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); + } out_rcu: rcu_read_unlock(); -- 2.45.2
Re: [PATCH net-next v4 0/2] vsock: avoid queuing on intermediate queue if possible
Hi Michael, this series is marked as "Not Applicable" for the net-next tree: https://patchwork.kernel.org/project/netdevbpf/patch/20240730-pinna-v4-2-5c9179164...@outlook.com/ Actually this is more about the virtio-vsock driver, so can you queue this on your tree? Thanks, Stefano On Tue, Jul 30, 2024 at 09:47:30PM GMT, Luigi Leonardi via B4 Relay wrote: This series introduces an optimization for vsock/virtio to reduce latency and increase the throughput: When the guest sends a packet to the host, and the intermediate queue (send_pkt_queue) is empty, if there is enough space, the packet is put directly in the virtqueue. v3->v4 While running experiments on fio with 64B payload, I realized that there was a mistake in my fio configuration, so I re-ran all the experiments and now the latency numbers are indeed lower with the patch applied. I also noticed that I was kicking the host without the lock. - Fixed a configuration mistake on fio and re-ran all experiments. - Fio latency measurement using 64B payload. - virtio_transport_send_skb_fast_path sends kick with the tx_lock acquired - Addressed all minor style changes requested by maintainer. - Rebased on latest net-next - Link to v3: https://lore.kernel.org/r/20240711-pinna-v3-0-697d4164f...@outlook.com v2->v3 - Performed more experiments using iperf3 using multiple streams - Handling of reply packets removed from virtio_transport_send_skb, as is needed just by the worker. - Removed atomic_inc/atomic_sub when queuing directly to the vq. - Introduced virtio_transport_send_skb_fast_path that handles the steps for sending on the vq. - Fixed a missing mutex_unlock in error path. - Changed authorship of the second commit - Rebased on latest net-next v1->v2 In this v2 I replaced a mutex_lock with a mutex_trylock because it was inside an RCU critical section. I also added a check on tx_run, so if the module is being removed the packet is not queued. I'd like to thank Stefano for reporting the tx_run issue. 
Applied all Stefano's suggestions: - Minor code style changes - Minor commit text rewrite Performed more experiments: - Check if all the packets go directly to the vq (Matias' suggestion) - Used iperf3 to see if there is any improvement in overall throughput from guest to host - Pinned the vhost process to a pCPU. - Run fio using 512B payload Rebased on latest net-next --- Luigi Leonardi (1): vsock/virtio: avoid queuing packets when intermediate queue is empty Marco Pinna (1): vsock/virtio: refactor virtio_transport_send_pkt_work net/vmw_vsock/virtio_transport.c | 144 +-- 1 file changed, 94 insertions(+), 50 deletions(-) --- base-commit: 1722389b0d863056d78287a120a1d6cadb8d4f7b change-id: 20240730-pinna-db8cc1b8b037 Best regards, -- Luigi Leonardi
Re: BUG: stack guard page was hit in vsock_connectible_recvmsg
Hi, On Mon, Aug 05, 2024 at 08:44:11AM GMT, Ubisectech Sirius wrote: Hello. We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. Recently, our team has discovered an issue in Linux kernel 6.8. Attached to the email was a PoC file of the issue. Thanks for the report! It looks like this is related to net/vmw_vsock/vsock_bpf.c, so I'm CCing Bobby who developed that. @Bobby if you have time, please take a look. I'm trying to replicate on a VM with 6.8 kernel, but for now I can't reproduce it. How reproducible is it in your system? I see that the reproducer was generated by syzkaller. Is that an internal or a public instance? In the second case, do you have a link to the report? From the report I see that you're using 6.8.0. Is it the upstream version (commit e8f897f4afef0031fe618a8e94127a0934896aba)? Can you replicate this with more recent versions as well? Thanks, Stefano Stack dump: BUG: TASK stack guard page was hit at c90001b27f88 (stack is c90001b28000..c90001b3) stack guard page: [#1] PREEMPT SMP KASAN NOPTI CPU: 0 PID: 8069 Comm: syz-executor293 Not tainted 6.8.0 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:mark_lock+0x25/0xd60 kernel/locking/lockdep.c:4639 Code: 90 90 90 90 90 55 48 89 e5 41 57 41 56 41 55 41 54 41 89 d4 48 ba 00 00 00 00 00 fc ff df 53 48 83 e4 f0 48 81 ec 10 01 00 00 <48> c7 44 24 30 b3 8a b5 41 48 8d 5c 24 30 48 c7 44 24 38 00 88 b9 RSP: 0018:c90001b27f90 EFLAGS: 00010086 RAX: 0004 RBX: 888042cd2fa2 RCX: 888042cd2f64 RDX: dc00 RSI: 888042cd2f80 RDI: 888042cd24c0 RBP: c90001b280c8 R08: 0001 R09: fbfff2711214 R10: 938890a7 R11: R12: 0002 R13: R14: 888042cd24c0 R15: 0004073c FS: 558f13c0() GS:88802c60() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: c90001b27f88 CR3: 48cb8000 CR4: 00750ef0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 PKRU: 5554 Call Trace: <#DF> mark_usage kernel/locking/lockdep.c:4587 [inline] __lock_acquire+0x91e/0x3bc0 kernel/locking/lockdep.c:5091 lock_acquire 
kernel/locking/lockdep.c:5754 [inline] lock_acquire+0x1b1/0x530 kernel/locking/lockdep.c:5719 lock_sock_nested+0x3a/0xf0 net/core/sock.c:3523 lock_sock include/net/sock.h:1691 [inline] vsock_connectible_recvmsg+0xdd/0xba0 net/vmw_vsock/af_vsock.c:2196 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 
net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg net/vmw_vsock/vsock_bpf.c:67 [inline] vsock_bpf_recvmsg+0xb41/0x11a0 net/vmw_vsock/vsock_bpf.c:105 vsock_connectible_recvmsg+0x92b/0xba0 net/vmw_vsock/af_vsock.c:2240 __vsock_recvmsg
Re: [PATCH net-next v4 0/2] vsock: avoid queuing on intermediate queue if possible
On Tue, Aug 06, 2024 at 09:02:57AM GMT, Jakub Kicinski wrote: On Mon, 5 Aug 2024 10:39:23 +0200 Stefano Garzarella wrote: this series is marked as "Not Applicable" for the net-next tree: https://patchwork.kernel.org/project/netdevbpf/patch/20240730-pinna-v4-2-5c9179164...@outlook.com/ Actually this is more about the virtio-vsock driver, so can you queue this on your tree? We can revive it in our patchwork, too, if that's easier. That's perfectly fine with me, if Michael hasn't already queued it. Not entirely sure why it was discarded, seems borderline. Yes, even to me it's not super clear when to expect net and when virtio. Usually the other vsock transports (VMCI and HyperV) go with net, so virtio-vsock is a bit of an exception. I don't have any particular preferences, so whatever works best for you and Michael is fine with me. Thanks, Stefano
Re: [PATCH net-next v4 0/2] vsock: avoid queuing on intermediate queue if possible
On Thu, Aug 29, 2024 at 08:19:31AM GMT, Michael S. Tsirkin wrote: On Thu, Aug 29, 2024 at 01:00:37PM +0200, Luigi Leonardi wrote: Hi All, It has been a while since the last email and this patch has not been merged yet. This is just a gentle ping :) Thanks, Luigi ok I can queue it for next. Next time pls remember to CC all maintainers. Thanks! Thanks for queueing it! BTW, it looks like the virtio-vsock driver is listed in "VIRTIO AND VHOST VSOCK DRIVER" but not listed under "VIRTIO CORE AND NET DRIVERS", so running get_maintainer.pl I have this list: $ ./scripts/get_maintainer.pl -f net/vmw_vsock/virtio_transport.c Stefan Hajnoczi (maintainer:VIRTIO AND VHOST VSOCK DRIVER) Stefano Garzarella (maintainer:VIRTIO AND VHOST VSOCK DRIVER) "David S. Miller" (maintainer:NETWORKING [GENERAL]) Eric Dumazet (maintainer:NETWORKING [GENERAL]) Jakub Kicinski (maintainer:NETWORKING [GENERAL]) Paolo Abeni (maintainer:NETWORKING [GENERAL]) k...@vger.kernel.org (open list:VIRTIO AND VHOST VSOCK DRIVER) virtualizat...@lists.linux.dev (open list:VIRTIO AND VHOST VSOCK DRIVER) net...@vger.kernel.org (open list:VIRTIO AND VHOST VSOCK DRIVER) linux-kernel@vger.kernel.org (open list) Should we add net/vmw_vsock/virtio_transport.c and related files also under "VIRTIO CORE AND NET DRIVERS" ? Thanks, Stefano >Hi Michael, >this series is marked as "Not Applicable" for the net-next tree: >https://patchwork.kernel.org/project/netdevbpf/patch/20240730-pinna-v4-2-5c9179164...@outlook.com/ >Actually this is more about the virtio-vsock driver, so can you queue >this on your tree? >Thanks, >Stefano
[PATCH] MAINTAINERS: add virtio-vsock driver in the VIRTIO CORE section
The virtio-vsock driver is already under VM SOCKETS (AF_VSOCK), managed principally with the net tree, and VIRTIO AND VHOST VSOCK DRIVER. However, changes that only affect the virtio part usually go with Michael's tree, so let's also put the driver in the VIRTIO CORE section to have its maintainers in CC for changes to the virtio-vsock driver. Cc: "Michael S. Tsirkin" Cc: Jason Wang Signed-off-by: Stefano Garzarella --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 878dcd23b331..6dcea63f396e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24218,6 +24218,7 @@ F: include/linux/vdpa.h F: include/linux/virtio*.h F: include/linux/vringh.h F: include/uapi/linux/virtio_*.h +F: net/vmw_vsock/virtio* F: tools/virtio/ F: tools/testing/selftests/drivers/net/virtio_net/ -- 2.46.0
Re: [PATCH for 5.10] vdpa_sim: fix param validation in vdpasim_get_config()
On Mon, Feb 15, 2021 at 03:32:19PM +0100, Greg KH wrote: On Thu, Feb 11, 2021 at 05:25:19PM +0100, Stefano Garzarella wrote: Commit 65b709586e222fa6ffd4166ac7fdb5d5dad113ee upstream. No, this really is not that commit, so please do not say it is. Oops, sorry. Before this patch, if 'offset + len' was equal to sizeof(struct virtio_net_config), the entire buffer wasn't filled, returning incorrect values to the caller. Since 'vdpasim->config' type is 'struct virtio_net_config', we can safely copy its content under this condition. Commit 65b709586e22 ("vdpa_sim: add get_config callback in vdpasim_dev_attr") unintentionally solved it upstream while refactoring vdpa_sim.c to support multiple devices. But we don't want to backport it to stable branches as it contains many changes. Fixes: 2c53d0f64c06 ("vdpasim: vDPA device simulator") Cc: # 5.10.x Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 6a90fdb9cbfc..8ca178d7b02f 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -572,7 +572,7 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - if (offset + len < sizeof(struct virtio_net_config)) + if (offset + len <= sizeof(struct virtio_net_config)) memcpy(buf, (u8 *)&vdpasim->config + offset, len); } I'll be glad to take a one-off patch, but why can't we take the real upstream patch? That is always the better long-term solution, right? 
Because that patch depends on the following patches merged in v5.11-rc1 while refactoring vdpa_sim: f37cbbc65178 vdpa_sim: make 'config' generic and usable for any device type cf1a3b35382c vdpa_sim: store parsed MAC address in a buffer a13b5918fdd0 vdpa_sim: add work_fn in vdpasim_dev_attr 011c35bac5ef vdpa_sim: add supported_features field in vdpasim_dev_attr 2f8f46188805 vdpa_sim: add device id field in vdpasim_dev_attr 6c6e28fe4579 vdpa_sim: add struct vdpasim_dev_attr for device attributes 36a9c3063025 vdpa_sim: rename vdpasim_config_ops variables 423248d60d2b vdpa_sim: remove hard-coded virtq count Maybe we can skip some of them, but IMHO it should be less risky to apply only this change. If you want I can try to figure out the minimum subset of patches needed for 65b709586e22 ("vdpa_sim: add get_config callback in vdpasim_dev_attr"). Thanks, Stefano
[RFC PATCH 01/10] vdpa: add get_config_size callback in vdpa_config_ops
This new callback is used to get the size of the configuration space of vDPA devices. Signed-off-by: Stefano Garzarella --- include/linux/vdpa.h | 4 drivers/vdpa/ifcvf/ifcvf_main.c | 6 ++ drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 ++ drivers/vdpa/vdpa_sim/vdpa_sim.c | 9 + 4 files changed, 25 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 4ab5494503a8..fddf42b17573 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -150,6 +150,9 @@ struct vdpa_iova_range { * @set_status:Set the device status * @vdev: vdpa device * @status: virtio device status + * @get_config_size: Get the size of the configuration space + * @vdev: vdpa device + * Returns size_t: configuration size * @get_config:Read from device specific configuration space * @vdev: vdpa device * @offset: offset from the beginning of @@ -231,6 +234,7 @@ struct vdpa_config_ops { u32 (*get_vendor_id)(struct vdpa_device *vdev); u8 (*get_status)(struct vdpa_device *vdev); void (*set_status)(struct vdpa_device *vdev, u8 status); + size_t (*get_config_size)(struct vdpa_device *vdev); void (*get_config)(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); void (*set_config)(struct vdpa_device *vdev, unsigned int offset, diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 7c8bbfcf6c3e..2443271e17d2 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -332,6 +332,11 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) return IFCVF_QUEUE_ALIGNMENT; } +static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev) +{ + return sizeof(struct virtio_net_config); +} + static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf, unsigned int len) @@ -392,6 +397,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .get_device_id = ifcvf_vdpa_get_device_id, .get_vendor_id = ifcvf_vdpa_get_vendor_id, .get_vq_align = ifcvf_vdpa_get_vq_align, + 
.get_config_size= ifcvf_vdpa_get_config_size, .get_config = ifcvf_vdpa_get_config, .set_config = ifcvf_vdpa_set_config, .set_config_cb = ifcvf_vdpa_set_config_cb, diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 10e9b09932eb..78043ee567b6 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1814,6 +1814,11 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; } +static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev) +{ + return sizeof(struct virtio_net_config); +} + static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { @@ -1900,6 +1905,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_vendor_id = mlx5_vdpa_get_vendor_id, .get_status = mlx5_vdpa_get_status, .set_status = mlx5_vdpa_set_status, + .get_config_size = mlx5_vdpa_get_config_size, .get_config = mlx5_vdpa_get_config, .set_config = mlx5_vdpa_set_config, .get_generation = mlx5_vdpa_get_generation, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index d5942842432d..779ae6c144d7 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -439,6 +439,13 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status) spin_unlock(&vdpasim->lock); } +static size_t vdpasim_get_config_size(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.config_size; +} + static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, void *buf, unsigned int len) { @@ -566,6 +573,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .get_vendor_id = vdpasim_get_vendor_id, .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, + .get_config_size= vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, .get_generation = 
vdpasim_get_generation, @@ -593,6 +601,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_vendor_id = vdpasim_get_vendor_id,
[RFC PATCH 02/10] vdpa: check vdpa_get_config() parameters and return bytes read
Now that we have the 'get_config_size()' callback available, we can check that the 'offset' and 'len' parameters are valid. When these exceed the boundaries, we limit the read to the available configuration space in the device, and we return the number of bytes read. We also move the vdpa_get_config() implementation to drivers/vdpa/vdpa.c, since the function is growing. Signed-off-by: Stefano Garzarella --- include/linux/vdpa.h | 16 ++-- drivers/vdpa/vdpa.c | 35 +++ 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index fddf42b17573..8a679c98f8b1 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -332,20 +332,8 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) return ops->set_features(vdev, features); } - -static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, - void *buf, unsigned int len) -{ -const struct vdpa_config_ops *ops = vdev->config; - - /* -* Config accesses aren't supposed to trigger before features are set. -* If it does happen we assume a legacy guest. 
-*/ - if (!vdev->features_valid) - vdpa_set_features(vdev, 0); - ops->get_config(vdev, offset, buf, len); -} +int vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, + void *buf, unsigned int len); /** * vdpa_mgmtdev_ops - vdpa device ops diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 3d997b389345..9ed6c779c63c 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -51,6 +51,41 @@ static struct bus_type vdpa_bus = { .remove = vdpa_dev_remove, }; +static int vdpa_config_size_wrap(struct vdpa_device *vdev, unsigned int offset, +unsigned int len) +{ + const struct vdpa_config_ops *ops = vdev->config; + unsigned int config_size = ops->get_config_size(vdev); + + if (offset > config_size || len > config_size) + return -1; + + return min(len, config_size - offset); +} + +int vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, + void *buf, unsigned int len) +{ + const struct vdpa_config_ops *ops = vdev->config; + int bytes_get; + + bytes_get = vdpa_config_size_wrap(vdev, offset, len); + if (bytes_get <= 0) + return bytes_get; + + /* +* Config accesses aren't supposed to trigger before features are set. +* If it does happen we assume a legacy guest. +*/ + if (!vdev->features_valid) + vdpa_set_features(vdev, 0); + + ops->get_config(vdev, offset, buf, bytes_get); + + return bytes_get; +} +EXPORT_SYMBOL_GPL(vdpa_get_config); + static void vdpa_release_dev(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); -- 2.29.2
[RFC PATCH 00/10] vdpa: get/set_config() rework
Following the discussion with Michael and Jason [1], I reworked get/set_config() in vdpa a bit. I changed vdpa_get_config() to check the boundaries and added vdpa_set_config(). When the 'offset' or 'len' parameters exceed the boundaries, we limit the access to the available configuration space in the device, and we return the number of bytes read/written. This way, user space can pass buffers bigger than the config space. I also return the number of bytes read and written to user space. The patches are also available here: https://github.com/stefano-garzarella/linux/tree/vdpa-get-set-config-refactoring Thanks for your comments, Stefano [1] https://lkml.org/lkml/2021/2/10/350 Stefano Garzarella (10): vdpa: add get_config_size callback in vdpa_config_ops vdpa: check vdpa_get_config() parameters and return bytes read vdpa: add vdpa_set_config() helper vdpa: remove param checks in the get/set_config callbacks vdpa: remove WARN_ON() in the get/set_config callbacks virtio_vdpa: use vdpa_set_config() vhost/vdpa: use vdpa_set_config() vhost/vdpa: allow user space to pass buffers bigger than config space vhost/vdpa: use get_config_size callback in vhost_vdpa_config_validate() vhost/vdpa: return configuration bytes read and written to user space include/linux/vdpa.h | 22 --- drivers/vdpa/ifcvf/ifcvf_base.c | 3 +- drivers/vdpa/ifcvf/ifcvf_main.c | 8 +++- drivers/vdpa/mlx5/net/mlx5_vnet.c | 9 - drivers/vdpa/vdpa.c | 51 drivers/vdpa/vdpa_sim/vdpa_sim.c | 15 +--- drivers/vhost/vdpa.c | 64 --- drivers/virtio/virtio_vdpa.c | 3 +- 8 files changed, 116 insertions(+), 59 deletions(-) -- 2.29.2
[RFC PATCH 03/10] vdpa: add vdpa_set_config() helper
Let's add a function similar to vdpa_get_config() to check the 'offset' and 'len' parameters, call the set_config() device callback, and return the number of bytes written. Signed-off-by: Stefano Garzarella --- include/linux/vdpa.h | 2 ++ drivers/vdpa/vdpa.c | 16 2 files changed, 18 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 8a679c98f8b1..562fcd14f4b5 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -334,6 +334,8 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) int vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); +int vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, + const void *buf, unsigned int len); /** * vdpa_mgmtdev_ops - vdpa device ops diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 9ed6c779c63c..825afc690a7e 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -86,6 +86,22 @@ int vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, } EXPORT_SYMBOL_GPL(vdpa_get_config); +int vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, + const void *buf, unsigned int len) +{ + const struct vdpa_config_ops *ops = vdev->config; + int bytes_set; + + bytes_set = vdpa_config_size_wrap(vdev, offset, len); + if (bytes_set <= 0) + return bytes_set; + + ops->set_config(vdev, offset, buf, bytes_set); + + return bytes_set; +} +EXPORT_SYMBOL_GPL(vdpa_set_config); + static void vdpa_release_dev(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); -- 2.29.2
[RFC PATCH 04/10] vdpa: remove param checks in the get/set_config callbacks
vdpa_get_config() and vdpa_set_config() now check parameters before calling callbacks, so we can remove these redundant checks. Signed-off-by: Stefano Garzarella --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +-- drivers/vdpa/vdpa_sim/vdpa_sim.c | 6 -- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 78043ee567b6..ab63dc9b8432 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1825,8 +1825,7 @@ static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - if (offset + len <= sizeof(struct virtio_net_config)) - memcpy(buf, (u8 *)&ndev->config + offset, len); + memcpy(buf, (u8 *)&ndev->config + offset, len); } static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 779ae6c144d7..392180c6f2cf 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -451,9 +451,6 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - if (offset + len > vdpasim->dev_attr.config_size) - return; - if (vdpasim->dev_attr.get_config) vdpasim->dev_attr.get_config(vdpasim, vdpasim->config); @@ -465,9 +462,6 @@ static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset, { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - if (offset + len > vdpasim->dev_attr.config_size) - return; - memcpy(vdpasim->config + offset, buf, len); if (vdpasim->dev_attr.set_config) -- 2.29.2
[RFC PATCH 05/10] vdpa: remove WARN_ON() in the get/set_config callbacks
vdpa_get_config() and vdpa_set_config() now check parameters before calling callbacks, so we can remove these warnings. Signed-off-by: Stefano Garzarella --- Maybe we can skip this patch and leave the WARN_ONs in place. What do you recommend? --- drivers/vdpa/ifcvf/ifcvf_base.c | 3 +-- drivers/vdpa/ifcvf/ifcvf_main.c | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c index f2a128e56de5..5941ecf934d0 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.c +++ b/drivers/vdpa/ifcvf/ifcvf_base.c @@ -222,7 +222,6 @@ void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset, u8 old_gen, new_gen, *p; int i; - WARN_ON(offset + length > sizeof(struct virtio_net_config)); do { old_gen = ifc_ioread8(&hw->common_cfg->config_generation); p = dst; @@ -240,7 +239,7 @@ void ifcvf_write_net_config(struct ifcvf_hw *hw, u64 offset, int i; p = src; - WARN_ON(offset + length > sizeof(struct virtio_net_config)); + for (i = 0; i < length; i++) ifc_iowrite8(*p++, hw->net_cfg + offset + i); } diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 2443271e17d2..e55f88c57461 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -343,7 +343,6 @@ static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev, { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - WARN_ON(offset + len > sizeof(struct virtio_net_config)); ifcvf_read_net_config(vf, offset, buf, len); } @@ -353,7 +352,6 @@ static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev, { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - WARN_ON(offset + len > sizeof(struct virtio_net_config)); ifcvf_write_net_config(vf, offset, buf, len); } -- 2.29.2
[RFC PATCH 06/10] virtio_vdpa: use vdpa_set_config()
Instead of calling the 'set_config' callback directly, we call the new vdpa_set_config() helper which also checks the parameters. Signed-off-by: Stefano Garzarella --- drivers/virtio/virtio_vdpa.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index e28acf482e0c..2f1c4a2dd241 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -65,9 +65,8 @@ static void virtio_vdpa_set(struct virtio_device *vdev, unsigned offset, const void *buf, unsigned len) { struct vdpa_device *vdpa = vd_get_vdpa(vdev); - const struct vdpa_config_ops *ops = vdpa->config; - ops->set_config(vdpa, offset, buf, len); + vdpa_set_config(vdpa, offset, buf, len); } static u32 virtio_vdpa_generation(struct virtio_device *vdev) -- 2.29.2
[RFC PATCH 07/10] vhost/vdpa: use vdpa_set_config()
Instead of calling the 'set_config' callback directly, we call the new vdpa_set_config() helper which also checks the parameters. Signed-off-by: Stefano Garzarella --- drivers/vhost/vdpa.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index ef688c8c0e0e..cdd8f24168b2 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -236,7 +236,6 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, struct vhost_vdpa_config __user *c) { struct vdpa_device *vdpa = v->vdpa; - const struct vdpa_config_ops *ops = vdpa->config; struct vhost_vdpa_config config; unsigned long size = offsetof(struct vhost_vdpa_config, buf); u8 *buf; @@ -250,7 +249,7 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, if (IS_ERR(buf)) return PTR_ERR(buf); - ops->set_config(vdpa, config.off, buf, config.len); + vdpa_set_config(vdpa, config.off, buf, config.len); kvfree(buf); return 0; -- 2.29.2
[RFC PATCH 08/10] vhost/vdpa: allow user space to pass buffers bigger than config space
vdpa_get_config() and vdpa_set_config() are now able to read/write only the bytes available in the device configuration space, even if the buffer provided is bigger than that. Let's use this feature to allow the user space application to pass any buffer. We limit the size of the internal bounce buffer to the device config size. Signed-off-by: Stefano Garzarella --- drivers/vhost/vdpa.c | 36 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index cdd8f24168b2..544f8582a42b 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -185,10 +185,10 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) return 0; } -static int vhost_vdpa_config_validate(struct vhost_vdpa *v, - struct vhost_vdpa_config *c) +static ssize_t vhost_vdpa_config_validate(struct vhost_vdpa *v, + struct vhost_vdpa_config *c) { - long size = 0; + u32 size = 0; switch (v->virtio_id) { case VIRTIO_ID_NET: @@ -199,10 +199,7 @@ static int vhost_vdpa_config_validate(struct vhost_vdpa *v, if (c->len == 0) return -EINVAL; - if (c->len > size - c->off) - return -E2BIG; - - return 0; + return min(c->len, size); } static long vhost_vdpa_get_config(struct vhost_vdpa *v, @@ -211,19 +208,23 @@ static long vhost_vdpa_get_config(struct vhost_vdpa *v, struct vdpa_device *vdpa = v->vdpa; struct vhost_vdpa_config config; unsigned long size = offsetof(struct vhost_vdpa_config, buf); + ssize_t config_size; u8 *buf; if (copy_from_user(&config, c, size)) return -EFAULT; - if (vhost_vdpa_config_validate(v, &config)) - return -EINVAL; - buf = kvzalloc(config.len, GFP_KERNEL); + + config_size = vhost_vdpa_config_validate(v, &config); + if (config_size <= 0) + return config_size; + + buf = kvzalloc(config_size, GFP_KERNEL); if (!buf) return -ENOMEM; - vdpa_get_config(vdpa, config.off, buf, config.len); + vdpa_get_config(vdpa, config.off, buf, config_size); - if (copy_to_user(c->buf, buf, config.len)) { + if 
(copy_to_user(c->buf, buf, config_size)) { kvfree(buf); return -EFAULT; } @@ -238,18 +239,21 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, struct vdpa_device *vdpa = v->vdpa; struct vhost_vdpa_config config; unsigned long size = offsetof(struct vhost_vdpa_config, buf); + ssize_t config_size; u8 *buf; if (copy_from_user(&config, c, size)) return -EFAULT; - if (vhost_vdpa_config_validate(v, &config)) - return -EINVAL; - buf = vmemdup_user(c->buf, config.len); + config_size = vhost_vdpa_config_validate(v, &config); + if (config_size <= 0) + return config_size; + + buf = vmemdup_user(c->buf, config_size); if (IS_ERR(buf)) return PTR_ERR(buf); - vdpa_set_config(vdpa, config.off, buf, config.len); + vdpa_set_config(vdpa, config.off, buf, config_size); kvfree(buf); return 0; -- 2.29.2
[RFC PATCH 09/10] vhost/vdpa: use get_config_size callback in vhost_vdpa_config_validate()
Let's use the new 'get_config_size()' callback available instead of using the 'virtio_id' to get the size of the device config space. Signed-off-by: Stefano Garzarella --- drivers/vhost/vdpa.c | 9 ++--- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 544f8582a42b..21eea2be5afa 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -188,13 +188,8 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) static ssize_t vhost_vdpa_config_validate(struct vhost_vdpa *v, struct vhost_vdpa_config *c) { - u32 size = 0; - - switch (v->virtio_id) { - case VIRTIO_ID_NET: - size = sizeof(struct virtio_net_config); - break; - } + struct vdpa_device *vdpa = v->vdpa; + u32 size = vdpa->config->get_config_size(vdpa); if (c->len == 0) return -EINVAL; -- 2.29.2
[RFC PATCH 10/10] vhost/vdpa: return configuration bytes read and written to user space
vdpa_get_config() and vdpa_set_config() now return the amount of bytes read and written, so let's return them to the user space. We also modify vhost_vdpa_config_validate() to return 0 (bytes read or written) instead of an error, when the buffer length is 0. Signed-off-by: Stefano Garzarella --- drivers/vhost/vdpa.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 21eea2be5afa..b754c53171a7 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -191,9 +191,6 @@ static ssize_t vhost_vdpa_config_validate(struct vhost_vdpa *v, struct vdpa_device *vdpa = v->vdpa; u32 size = vdpa->config->get_config_size(vdpa); - if (c->len == 0) - return -EINVAL; - return min(c->len, size); } @@ -204,6 +201,7 @@ static long vhost_vdpa_get_config(struct vhost_vdpa *v, struct vhost_vdpa_config config; unsigned long size = offsetof(struct vhost_vdpa_config, buf); ssize_t config_size; + long ret; u8 *buf; if (copy_from_user(&config, c, size)) @@ -217,15 +215,18 @@ static long vhost_vdpa_get_config(struct vhost_vdpa *v, if (!buf) return -ENOMEM; - vdpa_get_config(vdpa, config.off, buf, config_size); - - if (copy_to_user(c->buf, buf, config_size)) { - kvfree(buf); - return -EFAULT; + ret = vdpa_get_config(vdpa, config.off, buf, config_size); + if (ret < 0) { + ret = -EFAULT; + goto out; } + if (copy_to_user(c->buf, buf, config_size)) + ret = -EFAULT; + +out: kvfree(buf); - return 0; + return ret; } static long vhost_vdpa_set_config(struct vhost_vdpa *v, @@ -235,6 +236,7 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, struct vhost_vdpa_config config; unsigned long size = offsetof(struct vhost_vdpa_config, buf); ssize_t config_size; + long ret; u8 *buf; if (copy_from_user(&config, c, size)) @@ -248,10 +250,12 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, if (IS_ERR(buf)) return PTR_ERR(buf); - vdpa_set_config(vdpa, config.off, buf, config_size); + ret = vdpa_set_config(vdpa, config.off, 
buf, config_size); + if (ret < 0) + ret = -EFAULT; kvfree(buf); - return 0; + return ret; } static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) -- 2.29.2
Re: [PATCH for 5.10] vdpa_sim: fix param validation in vdpasim_get_config()
On Mon, Feb 15, 2021 at 04:23:54PM +0100, Greg KH wrote: On Mon, Feb 15, 2021 at 04:03:21PM +0100, Stefano Garzarella wrote: On Mon, Feb 15, 2021 at 03:32:19PM +0100, Greg KH wrote: > On Thu, Feb 11, 2021 at 05:25:19PM +0100, Stefano Garzarella wrote: > > Commit 65b709586e222fa6ffd4166ac7fdb5d5dad113ee upstream. > > No, this really is not that commit, so please do not say it is. Oops, sorry. > > > Before this patch, if 'offset + len' was equal to > > sizeof(struct virtio_net_config), the entire buffer wasn't filled, > > returning incorrect values to the caller. > > > > Since 'vdpasim->config' type is 'struct virtio_net_config', we can > > safely copy its content under this condition. > > > > Commit 65b709586e22 ("vdpa_sim: add get_config callback in > > vdpasim_dev_attr") unintentionally solved it upstream while > > refactoring vdpa_sim.c to support multiple devices. But we don't want > > to backport it to stable branches as it contains many changes. > > > > Fixes: 2c53d0f64c06 ("vdpasim: vDPA device simulator") > > Cc: # 5.10.x > > Signed-off-by: Stefano Garzarella > > --- > > drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c > > index 6a90fdb9cbfc..8ca178d7b02f 100644 > > --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c > > +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c > > @@ -572,7 +572,7 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, > > { > > struct vdpasim *vdpasim = vdpa_to_sim(vdpa); > > > > - if (offset + len < sizeof(struct virtio_net_config)) > > + if (offset + len <= sizeof(struct virtio_net_config)) > > memcpy(buf, (u8 *)&vdpasim->config + offset, len); > > } > > I'll be glad to take a one-off patch, but why can't we take the real > upstream patch? That is always the better long-term solution, right? 
Because that patch depends on the following patches merged in v5.11-rc1 while refactoring vdpa_sim: f37cbbc65178 vdpa_sim: make 'config' generic and usable for any device type cf1a3b35382c vdpa_sim: store parsed MAC address in a buffer a13b5918fdd0 vdpa_sim: add work_fn in vdpasim_dev_attr 011c35bac5ef vdpa_sim: add supported_features field in vdpasim_dev_attr 2f8f46188805 vdpa_sim: add device id field in vdpasim_dev_attr 6c6e28fe4579 vdpa_sim: add struct vdpasim_dev_attr for device attributes 36a9c3063025 vdpa_sim: rename vdpasim_config_ops variables 423248d60d2b vdpa_sim: remove hard-coded virtq count Maybe we can skip some of them, but IMHO should be less risky to apply only this change. If you want I can try to figure out the minimum sub-set of patches needed for 65b709586e22 ("vdpa_sim: add get_config callback in vdpasim_dev_attr"). The minimum is always nice :) The minimum set, including the patch that fixes the issue, is the following: 65b709586e22 vdpa_sim: add get_config callback in vdpasim_dev_attr f37cbbc65178 vdpa_sim: make 'config' generic and usable for any device type cf1a3b35382c vdpa_sim: store parsed MAC address in a buffer 6c6e28fe4579 vdpa_sim: add struct vdpasim_dev_attr for device attributes 423248d60d2b vdpa_sim: remove hard-coded virtq count The patches apply fairly cleanly. 
There are a few contextual differences due to the lack of the other patches: $ git backport-diff -u master -r linux-5.10.y..HEAD Key: [----] : patches are identical [####] : number of functional differences between upstream/downstream patch [down] : patch is downstream-only The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively 001/5:[----] [--] 'vdpa_sim: remove hard-coded virtq count' 002/5:[----] [-C] 'vdpa_sim: add struct vdpasim_dev_attr for device attributes' 003/5:[----] [--] 'vdpa_sim: store parsed MAC address in a buffer' 004/5:[----] [-C] 'vdpa_sim: make 'config' generic and usable for any device type' 005/5:[----] [-C] 'vdpa_sim: add get_config callback in vdpasim_dev_attr' If it's just too much churn for no good reason, then yes, the one-line change above will be ok, but you need to document the heck out of why this is not upstream and that it is a one-off thing. Shortly I'll send the series to sta...@vger.kernel.org so you can judge if it's okay or better to resend this patch with a better description. Thanks Stefano
[PATCH for 5.10 v2 0/5] vdpa_sim: fix param validation in vdpasim_get_config()
v1: https://lore.kernel.org/stable/20210211162519.215418-1-sgarz...@redhat.com/ v2: - backport the upstream patch and related patches needed Commit 65b709586e22 ("vdpa_sim: add get_config callback in vdpasim_dev_attr") unintentionally solved an issue in vdpasim_get_config() upstream while refactoring vdpa_sim.c to support multiple devices. Before that patch, if 'offset + len' was equal to sizeof(struct virtio_net_config), the entire buffer wasn't filled, returning incorrect values to the caller. Since 'vdpasim->config' type is 'struct virtio_net_config', we can safely copy its content under this condition. The minimum set of patches to backport the patch that fixes the issue, is the following: 423248d60d2b vdpa_sim: remove hard-coded virtq count 6c6e28fe4579 vdpa_sim: add struct vdpasim_dev_attr for device attributes cf1a3b35382c vdpa_sim: store parsed MAC address in a buffer f37cbbc65178 vdpa_sim: make 'config' generic and usable for any device type 65b709586e22 vdpa_sim: add get_config callback in vdpasim_dev_attr The patches apply fairly cleanly. 
There are a few contextual differences due to the lack of the other patches: $ git backport-diff -u master -r linux-5.10.y..HEAD Key: [----] : patches are identical [####] : number of functional differences between upstream/downstream patch [down] : patch is downstream-only The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively 001/5:[----] [--] 'vdpa_sim: remove hard-coded virtq count' 002/5:[----] [-C] 'vdpa_sim: add struct vdpasim_dev_attr for device attributes' 003/5:[----] [--] 'vdpa_sim: store parsed MAC address in a buffer' 004/5:[----] [-C] 'vdpa_sim: make 'config' generic and usable for any device type' 005/5:[----] [-C] 'vdpa_sim: add get_config callback in vdpasim_dev_attr' Thanks, Stefano Max Gurtovoy (1): vdpa_sim: remove hard-coded virtq count Stefano Garzarella (4): vdpa_sim: add struct vdpasim_dev_attr for device attributes vdpa_sim: store parsed MAC address in a buffer vdpa_sim: make 'config' generic and usable for any device type vdpa_sim: add get_config callback in vdpasim_dev_attr drivers/vdpa/vdpa_sim/vdpa_sim.c | 83 +++- 1 file changed, 60 insertions(+), 23 deletions(-) -- 2.29.2
[PATCH for 5.10 v2 1/5] vdpa_sim: remove hard-coded virtq count
From: Max Gurtovoy commit 423248d60d2b655321fc49eca1545f95a1bc9d6c upstream. Add a new attribute that will define the number of virt queues to be created for the vdpasim device. Signed-off-by: Max Gurtovoy [sgarzare: replace kmalloc_array() with kcalloc()] Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-4-sgarz...@redhat.com Signed-off-by: Michael S. Tsirkin Cc: # 5.10.x Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 6a90fdb9cbfc..ee8f24a4643b 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -70,7 +70,7 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | /* State of each vdpasim device */ struct vdpasim { struct vdpa_device vdpa; - struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM]; + struct vdpasim_virtqueue *vqs; struct work_struct work; /* spinlock to synchronize virtqueue state */ spinlock_t lock; @@ -80,6 +80,7 @@ struct vdpasim { u32 status; u32 generation; u64 features; + int nvqs; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; @@ -144,7 +145,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) { int i; - for (i = 0; i < VDPASIM_VQ_NUM; i++) + for (i = 0; i < vdpasim->nvqs; i++) vdpasim_vq_reset(&vdpasim->vqs[i]); spin_lock(&vdpasim->iommu_lock); @@ -350,7 +351,7 @@ static struct vdpasim *vdpasim_create(void) const struct vdpa_config_ops *ops; struct vdpasim *vdpasim; struct device *dev; - int ret = -ENOMEM; + int i, ret = -ENOMEM; if (batch_mapping) ops = &vdpasim_net_batch_config_ops; @@ -361,6 +362,7 @@ static struct vdpasim *vdpasim_create(void) if (!vdpasim) goto err_alloc; + vdpasim->nvqs = VDPASIM_VQ_NUM; INIT_WORK(&vdpasim->work, vdpasim_work); spin_lock_init(&vdpasim->lock); spin_lock_init(&vdpasim->iommu_lock); @@ -371,6 +373,11 @@ static struct vdpasim 
*vdpasim_create(void) goto err_iommu; set_dma_ops(dev, &vdpasim_dma_ops); + vdpasim->vqs = kcalloc(vdpasim->nvqs, sizeof(struct vdpasim_virtqueue), + GFP_KERNEL); + if (!vdpasim->vqs) + goto err_iommu; + vdpasim->iommu = vhost_iotlb_alloc(2048, 0); if (!vdpasim->iommu) goto err_iommu; @@ -389,8 +396,8 @@ static struct vdpasim *vdpasim_create(void) eth_random_addr(vdpasim->config.mac); } - vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu); - vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu); + for (i = 0; i < vdpasim->nvqs; i++) + vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu); vdpasim->vdpa.dma_dev = dev; ret = vdpa_register_device(&vdpasim->vdpa); @@ -659,6 +666,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) kfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); + kfree(vdpasim->vqs); } static const struct vdpa_config_ops vdpasim_net_config_ops = { -- 2.29.2