On Fri, Jun 28, 2024 at 9:38 AM Mina Almasry <almasrym...@google.com> wrote:
>

Hi Mina,
Thank you so much for this work!

> ncdevmem is a devmem TCP netcat. It works similarly to netcat, but it
> sends and receives data using the devmem TCP APIs. It uses udmabuf as
> the dmabuf provider. It is compatible with a regular netcat running on
> a peer, or a ncdevmem running on a peer.
>
> In addition to normal netcat support, ncdevmem has a validation mode,
> where it sends a specific pattern and validates this pattern on the
> receiver side to ensure data integrity.
>
> Suggested-by: Stanislav Fomichev <s...@google.com>
> Signed-off-by: Mina Almasry <almasrym...@google.com>
>
> ---
> v15:
> - Fix linking against libynl. (Jakub)
>
> v9: 
> https://lore.kernel.org/netdev/20240403002053.2376017-15-almasrym...@google.com/
> - Remove unused nic_pci_addr entry (Cong).
>
> v6:
> - Updated to bind 8 queues.
> - Added RSS configuration.
> - Added some more tests for the netlink API.
>
> Changes in v1:
> - Many more general cleanups (Willem).
> - Removed driver reset (Jakub).
> - Removed hardcoded if index (Paolo).
>
> RFC v2:
> - General cleanups (Willem).
>
> ---
> tools/testing/selftests/net/.gitignore | 1 +
> tools/testing/selftests/net/Makefile | 9 +
> tools/testing/selftests/net/ncdevmem.c | 542 +++++++++++++++++++++++++
> 3 files changed, 552 insertions(+)
> create mode 100644 tools/testing/selftests/net/ncdevmem.c
>
> diff --git a/tools/testing/selftests/net/.gitignore 
> b/tools/testing/selftests/net/.gitignore
> index 666ab7d9390b1..fe770903118c5 100644
> --- a/tools/testing/selftests/net/.gitignore
> +++ b/tools/testing/selftests/net/.gitignore
> @@ -17,6 +17,7 @@ ipv6_flowlabel
> ipv6_flowlabel_mgr
> log.txt
> msg_zerocopy
> +ncdevmem
> nettest
> psock_fanout
> psock_snd
> diff --git a/tools/testing/selftests/net/Makefile 
> b/tools/testing/selftests/net/Makefile
> index bc3925200637c..39420a6e86b7f 100644
> --- a/tools/testing/selftests/net/Makefile
> +++ b/tools/testing/selftests/net/Makefile
> @@ -95,6 +95,11 @@ TEST_PROGS += fq_band_pktlimit.sh
> TEST_PROGS += vlan_hw_filter.sh
> TEST_PROGS += bpf_offload.py
>
> +# YNL files, must be before "include ..lib.mk"
> +EXTRA_CLEAN += $(OUTPUT)/libynl.a
> +YNL_GEN_FILES := ncdevmem
> +TEST_GEN_FILES += $(YNL_GEN_FILES)
> +
> TEST_FILES := settings
> TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh
>
> @@ -104,6 +109,10 @@ TEST_INCLUDES := forwarding/lib.sh
>
> include ../lib.mk
>
> +# YNL build
> +YNL_GENS := netdev
> +include ynl.mk
> +
> $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
> $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
> $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
> diff --git a/tools/testing/selftests/net/ncdevmem.c 
> b/tools/testing/selftests/net/ncdevmem.c
> new file mode 100644
> index 0000000000000..e00255e54f77b
> --- /dev/null
> +++ b/tools/testing/selftests/net/ncdevmem.c
> @@ -0,0 +1,542 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#define __EXPORTED_HEADERS__
> +
> +#include <linux/uio.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <stdbool.h>
> +#include <string.h>
> +#include <errno.h>
> +#define __iovec_defined
> +#include <fcntl.h>
> +#include <malloc.h>
> +#include <error.h>
> +
> +#include <arpa/inet.h>
> +#include <sys/socket.h>
> +#include <sys/mman.h>
> +#include <sys/ioctl.h>
> +#include <sys/syscall.h>
> +
> +#include <linux/memfd.h>
> +#include <linux/if.h>
> +#include <linux/dma-buf.h>
> +#include <linux/udmabuf.h>
> +#include <libmnl/libmnl.h>
> +#include <linux/types.h>
> +#include <linux/netlink.h>
> +#include <linux/genetlink.h>
> +#include <linux/netdev.h>
> +#include <time.h>
> +
> +#include "netdev-user.h"
> +#include <ynl.h>
> +
> +#define PAGE_SHIFT 12
> +#define TEST_PREFIX "ncdevmem"
> +#define NUM_PAGES 16000
> +
> +#ifndef MSG_SOCK_DEVMEM
> +#define MSG_SOCK_DEVMEM 0x2000000
> +#endif
> +
> +/*
> + * tcpdevmem netcat. Works similarly to netcat but does device memory TCP
> + * instead of regular TCP. Uses udmabuf to mock a dmabuf provider.
> + *
> + * Usage:
> + *
> + * On server:
> + * ncdevmem -s <server IP> -c <client IP> -f eth1 -d 3 -n 0000:06:00.0 -l \
> + * -p 5201 -v 7

The '-n' option no longer exists in this version, so please remove it from this usage example.


> + *
> + * On client:
> + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \
> + * tr \\n \\0 | \
> + * head -c 5G | \
> + * nc <server IP> 5201 -p 5201
> + *
> + * Note this is compatible with regular netcat. i.e. the sender or receiver 
> can
> + * be replaced with regular netcat to test the RX or TX path in isolation.
> + */
> +
> +static char *server_ip = "192.168.1.4";
> +static char *client_ip = "192.168.1.2";
> +static char *port = "5201";
> +static size_t do_validation;
> +static int start_queue = 8;
> +static int num_queues = 8;
> +static char *ifname = "eth1";
> +static unsigned int ifindex = 3;
> +static unsigned int iterations;
> +static unsigned int dmabuf_id;
> +
> +void print_bytes(void *ptr, size_t size)
> +{
> + unsigned char *p = ptr;
> + int i;
> +
> + for (i = 0; i < size; i++)
> + printf("%02hhX ", p[i]);
> + printf("\n");
> +}
> +
> +void print_nonzero_bytes(void *ptr, size_t size)
> +{
> + unsigned char *p = ptr;
> + unsigned int i;
> +
> + for (i = 0; i < size; i++)
> + putchar(p[i]);
> + printf("\n");
> +}
> +
> +void validate_buffer(void *line, size_t size)
> +{
> + static unsigned char seed = 1;
> + unsigned char *ptr = line;
> + int errors = 0;
> + size_t i;
> +
> + for (i = 0; i < size; i++) {
> + if (ptr[i] != seed) {
> + fprintf(stderr,
> + "Failed validation: expected=%u, actual=%u, index=%lu\n",
> + seed, ptr[i], i);
> + errors++;
> + if (errors > 20)
> + error(1, 0, "validation failed.");
> + }
> + seed++;
> + if (seed == do_validation)
> + seed = 0;
> + }
> +
> + fprintf(stdout, "Validated buffer\n");
> +}
> +
> +static void reset_flow_steering(void)
> +{
> + char command[256];
> +
> + memset(command, 0, sizeof(command));
> + snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple off",
> + "eth1");

I think we should use ifname here instead of the hardcoded "eth1".

> + system(command);
> +
> + memset(command, 0, sizeof(command));
> + snprintf(command, sizeof(command), "sudo ethtool -K %s ntuple on",
> + "eth1");

Please use ifname instead of the hardcoded "eth1" here too.

> + system(command);
> +}
> +
> +static void configure_rss(void)
> +{
> + char command[256];
> +
> + memset(command, 0, sizeof(command));
> + snprintf(command, sizeof(command), "sudo ethtool -X %s equal %d",
> + ifname, start_queue);
> + system(command);
> +}
> +
> +static void configure_flow_steering(void)
> +{
> + char command[256];
> +
> + memset(command, 0, sizeof(command));
> + snprintf(command, sizeof(command),
> + "sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port 
> %s queue %d",
> + ifname, client_ip, server_ip, port, port, start_queue);
> + system(command);
> +}
> +
> +static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd,
> + struct netdev_queue_dmabuf *queues,
> + unsigned int n_queue_index, struct ynl_sock **ys)
> +{
> + struct netdev_bind_rx_req *req = NULL;
> + struct netdev_bind_rx_rsp *rsp = NULL;
> + struct ynl_error yerr;
> +
> + *ys = ynl_sock_create(&ynl_netdev_family, &yerr);
> + if (!*ys) {
> + fprintf(stderr, "YNL: %s\n", yerr.msg);
> + return -1;
> + }
> +
> + req = netdev_bind_rx_req_alloc();
> + netdev_bind_rx_req_set_ifindex(req, ifindex);
> + netdev_bind_rx_req_set_dmabuf_fd(req, dmabuf_fd);
> + __netdev_bind_rx_req_set_queues(req, queues, n_queue_index);
> +
> + rsp = netdev_bind_rx(*ys, req);
> + if (!rsp) {
> + perror("netdev_bind_rx");
> + goto err_close;
> + }
> +
> + if (!rsp->_present.dmabuf_id) {
> + perror("dmabuf_id not present");
> + goto err_close;
> + }
> +
> + printf("got dmabuf id=%d\n", rsp->dmabuf_id);
> + dmabuf_id = rsp->dmabuf_id;
> +
> + netdev_bind_rx_req_free(req);
> + netdev_bind_rx_rsp_free(rsp);
> +
> + return 0;
> +
> +err_close:
> + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg);
> + netdev_bind_rx_req_free(req);
> + ynl_sock_destroy(*ys);
> + return -1;
> +}
> +
> +static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t 
> dmabuf_size)
> +{
> + struct udmabuf_create create;
> + int ret;
> +
> + *devfd = open("/dev/udmabuf", O_RDWR);
> + if (*devfd < 0) {
> + error(70, 0,
> + "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n",
> + TEST_PREFIX);
> + }
> +
> + *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
> + if (*memfd < 0)
> + error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX);
> +
> + /* Required for udmabuf */
> + ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK);
> + if (ret < 0)
> + error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
> +
> + ret = ftruncate(*memfd, dmabuf_size);
> + if (ret == -1)
> + error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
> +
> + memset(&create, 0, sizeof(create));
> +
> + create.memfd = *memfd;
> + create.offset = 0;
> + create.size = dmabuf_size;
> + *buf = ioctl(*devfd, UDMABUF_CREATE, &create);
> + if (*buf < 0)
> + error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX);
> +}
> +
> +int do_server(void)
> +{
> + char ctrl_data[sizeof(int) * 20000];
> + struct netdev_queue_dmabuf *queues;
> + size_t non_page_aligned_frags = 0;
> + struct sockaddr_in client_addr;
> + struct sockaddr_in server_sin;
> + size_t page_aligned_frags = 0;
> + int devfd, memfd, buf, ret;
> + size_t total_received = 0;
> + socklen_t client_addr_len;
> + bool is_devmem = false;
> + char *buf_mem = NULL;
> + struct ynl_sock *ys;
> + size_t dmabuf_size;
> + char iobuf[819200];
> + char buffer[256];
> + int socket_fd;
> + int client_fd;
> + size_t i = 0;
> + int opt = 1;
> +
> + dmabuf_size = getpagesize() * NUM_PAGES;
> +
> + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
> +
> + reset_flow_steering();
> +
> + /* Configure RSS to divert all traffic from our devmem queues */
> + configure_rss();
> +
> + /* Flow steer our devmem flows to start_queue */
> + configure_flow_steering();
> +
> + sleep(1);
> +
> + queues = malloc(sizeof(*queues) * num_queues);
> +
> + for (i = 0; i < num_queues; i++) {
> + queues[i]._present.type = 1;
> + queues[i]._present.idx = 1;
> + queues[i].type = NETDEV_QUEUE_TYPE_RX;
> + queues[i].idx = start_queue + i;
> + }
> +
> + if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
> + error(1, 0, "Failed to bind\n");
> +
> + buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> + buf, 0);
> + if (buf_mem == MAP_FAILED)
> + error(1, 0, "mmap()");
> +
> + server_sin.sin_family = AF_INET;
> + server_sin.sin_port = htons(atoi(port));
> +
> + ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr);
> + if (socket < 0)
> + error(79, 0, "%s: [FAIL, create socket]\n", TEST_PREFIX);
> +
> + socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0);
> + if (socket < 0)
> + error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX);
> +
> + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt,
> + sizeof(opt));
> + if (ret)
> + error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
> +
> + ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt,
> + sizeof(opt));
> + if (ret)
> + error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX);
> +
> + printf("binding to address %s:%d\n", server_ip,
> + ntohs(server_sin.sin_port));
> +
> + ret = bind(socket_fd, &server_sin, sizeof(server_sin));
> + if (ret)
> + error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);
> +
> + ret = listen(socket_fd, 1);
> + if (ret)
> + error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX);
> +
> + client_addr_len = sizeof(client_addr);
> +
> + inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer,
> + sizeof(buffer));
> + printf("Waiting or connection on %s:%d\n", buffer,
> + ntohs(server_sin.sin_port));
> + client_fd = accept(socket_fd, &client_addr, &client_addr_len);
> +
> + inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer,
> + sizeof(buffer));
> + printf("Got connection from %s:%d\n", buffer,
> + ntohs(client_addr.sin_port));
> +
> + while (1) {
> + struct iovec iov = { .iov_base = iobuf,
> + .iov_len = sizeof(iobuf) };
> + struct dmabuf_cmsg *dmabuf_cmsg = NULL;
> + struct dma_buf_sync sync = { 0 };
> + struct cmsghdr *cm = NULL;
> + struct msghdr msg = { 0 };
> + struct dmabuf_token token;
> + ssize_t ret;
> +
> + is_devmem = false;
> + printf("\n\n");
> +
> + msg.msg_iov = &iov;
> + msg.msg_iovlen = 1;
> + msg.msg_control = ctrl_data;
> + msg.msg_controllen = sizeof(ctrl_data);
> + ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM);
> + printf("recvmsg ret=%ld\n", ret);
> + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
> + continue;
> + if (ret < 0) {
> + perror("recvmsg");
> + continue;
> + }
> + if (ret == 0) {
> + printf("client exited\n");
> + goto cleanup;
> + }
> +
> + i++;
> + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
> + if (cm->cmsg_level != SOL_SOCKET ||
> + (cm->cmsg_type != SCM_DEVMEM_DMABUF &&
> + cm->cmsg_type != SCM_DEVMEM_LINEAR)) {
> + fprintf(stdout, "skipping non-devmem cmsg\n");
> + continue;
> + }
> +
> + dmabuf_cmsg = (struct dmabuf_cmsg *)CMSG_DATA(cm);
> + is_devmem = true;
> +
> + if (cm->cmsg_type == SCM_DEVMEM_LINEAR) {
> + /* TODO: process data copied from skb's linear
> + * buffer.
> + */
> + fprintf(stdout,
> + "SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n",
> + dmabuf_cmsg->frag_size);
> +
> + continue;
> + }
> +
> + token.token_start = dmabuf_cmsg->frag_token;
> + token.token_count = 1;
> +
> + total_received += dmabuf_cmsg->frag_size;
> + printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, 
> frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n",
> + dmabuf_cmsg->frag_offset >> PAGE_SHIFT,
> + dmabuf_cmsg->frag_offset % getpagesize(),
> + dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size,
> + dmabuf_cmsg->frag_token, total_received,
> + dmabuf_cmsg->dmabuf_id);
> +
> + if (dmabuf_cmsg->dmabuf_id != dmabuf_id)
> + error(1, 0,
> + "received on wrong dmabuf_id: flow steering error\n");
> +
> + if (dmabuf_cmsg->frag_size % getpagesize())
> + non_page_aligned_frags++;
> + else
> + page_aligned_frags++;
> +
> + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START;
> + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
> +
> + if (do_validation)
> + validate_buffer(
> + ((unsigned char *)buf_mem) +
> + dmabuf_cmsg->frag_offset,
> + dmabuf_cmsg->frag_size);
> + else
> + print_nonzero_bytes(
> + ((unsigned char *)buf_mem) +
> + dmabuf_cmsg->frag_offset,
> + dmabuf_cmsg->frag_size);
> +
> + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END;
> + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync);
> +
> + ret = setsockopt(client_fd, SOL_SOCKET,
> + SO_DEVMEM_DONTNEED, &token,
> + sizeof(token));
> + if (ret != 1)
> + error(1, 0,
> + "SO_DEVMEM_DONTNEED not enough tokens");
> + }
> + if (!is_devmem)
> + error(1, 0, "flow steering error\n");
> +
> + printf("total_received=%lu\n", total_received);
> + }
> +
> + fprintf(stdout, "%s: ok\n", TEST_PREFIX);
> +
> + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
> + page_aligned_frags, non_page_aligned_frags);
> +
> + fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n",
> + page_aligned_frags, non_page_aligned_frags);
> +
> +cleanup:
> +
> + munmap(buf_mem, dmabuf_size);
> + close(client_fd);
> + close(socket_fd);
> + close(buf);
> + close(memfd);
> + close(devfd);
> + ynl_sock_destroy(ys);
> +
> + return 0;
> +}
> +
> +void run_devmem_tests(void)
> +{
> + struct netdev_queue_dmabuf *queues;
> + int devfd, memfd, buf;
> + struct ynl_sock *ys;
> + size_t dmabuf_size;
> + size_t i = 0;
> +
> + dmabuf_size = getpagesize() * NUM_PAGES;
> +
> + create_udmabuf(&devfd, &memfd, &buf, dmabuf_size);
> +
> + /* Configure RSS to divert all traffic from our devmem queues */
> + configure_rss();
> +
> + sleep(1);
> +
> + queues = malloc(sizeof(*queues) * num_queues);
> +
> + for (i = 0; i < num_queues; i++) {
> + queues[i]._present.type = 1;
> + queues[i]._present.idx = 1;
> + queues[i].type = NETDEV_QUEUE_TYPE_RX;
> + queues[i].idx = start_queue + i;
> + }
> +
> + if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys))
> + error(1, 0, "Failed to bind\n");
> +
> + /* Closing the netlink socket does an implicit unbind */
> + ynl_sock_destroy(ys);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + int is_server = 0, opt;
> +
> + while ((opt = getopt(argc, argv, "ls:c:p:v:q:f:n:i:d:")) != -1) {

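I think the 't' option should be added to the getopt() option string here:
there is a "case 't':" below, but "t:" is missing from the string, so getopt()
will never return it.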

> + switch (opt) {
> + case 'l':
> + is_server = 1;
> + break;
> + case 's':
> + server_ip = optarg;
> + break;
> + case 'c':
> + client_ip = optarg;
> + break;
> + case 'p':
> + port = optarg;
> + break;
> + case 'v':
> + do_validation = atoll(optarg);
> + break;
> + case 'q':
> + num_queues = atoi(optarg);
> + break;
> + case 't':
> + start_queue = atoi(optarg);
> + break;
> + case 'f':
> + ifname = optarg;
> + break;
> + case 'd':
> + ifindex = atoi(optarg);

How about using if_nametoindex() on ifname instead of the 'd' option?

> + break;
> + case 'i':
> + iterations = atoll(optarg);

I couldn't find any use of the 'iterations' variable.

> + break;
> + case '?':
> + printf("unknown option: %c\n", optopt);
> + break;
> + }
> + }
> +
> + for (; optind < argc; optind++)
> + printf("extra arguments: %s\n", argv[optind]);
> +
> + run_devmem_tests();
> +
> + if (is_server)
> + return do_server();
> +
> + return 0;
> +}
> --
> 2.45.2.803.g4e1b14247a-goog
>
>

Thanks a lot!
Taehee Yoo

Reply via email to