DPDK 16.04 introduces the vHost PMD, which allows 'dpdkvhostuser' ports to be controlled by the librte_ether API in the same way as physical 'dpdk' ports. This commit integrates that functionality into OVS and refactors the existing vhost code so that the parts which remain are vhost-cuse specific. As a result, the 'dpdk' and 'dpdkvhostuser' port types now share much of their code.
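For reviewers unfamiliar with the vHost PMD, the shape of what this enables is roughly the following. This is a minimal standalone sketch, not code from the patch: the socket path, descriptor counts and burst size are invented, EAL setup is assumed done, and error handling is reduced to early returns.

```c
/* Minimal sketch: once attached, a vhost-user port is driven through the
 * same rte_ether calls as a physical port. Assumes DPDK 16.04 with the
 * vHost PMD; the socket path and sizes are illustrative only. */
#include <rte_ethdev.h>
#include <rte_mbuf.h>

static int
demo_vhost_as_eth_port(struct rte_mempool *mp)
{
    static const struct rte_eth_conf conf;   /* zeroed: all defaults */
    struct rte_mbuf *pkts[32];
    uint8_t port_id;
    uint16_t n;

    if (rte_eth_dev_attach("eth_vhost0,iface=/tmp/vhost0,queues=1",
                           &port_id)
        || rte_eth_dev_configure(port_id, 1, 1, &conf)
        || rte_eth_rx_queue_setup(port_id, 0, 128,
                                  rte_eth_dev_socket_id(port_id), NULL, mp)
        || rte_eth_tx_queue_setup(port_id, 0, 128,
                                  rte_eth_dev_socket_id(port_id), NULL)
        || rte_eth_dev_start(port_id)) {
        return -1;
    }

    /* Exactly the burst API already used for 'dpdk' ports. */
    n = rte_eth_rx_burst(port_id, 0, pkts, 32);
    rte_eth_tx_burst(port_id, 0, pkts, n);
    return 0;
}
```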
Signed-off-by: Ciara Loftus <ciara.lof...@intel.com>
---
 INSTALL.DPDK.md   |  12 ++
 NEWS              |   2 +
 lib/netdev-dpdk.c | 628 +++++++++++++++++++++++++++++++++---------------------
 3 files changed, 396 insertions(+), 246 deletions(-)

diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
index 93f92e4..db7153a 100644
--- a/INSTALL.DPDK.md
+++ b/INSTALL.DPDK.md
@@ -990,6 +990,18 @@ Restrictions:
     increased to the desired number of queues. Both DPDK and OVS must be
     recompiled for this change to take effect.

+  DPDK 'eth' type ports:
+  - dpdk, dpdkr and dpdkvhostuser ports are 'eth' type ports in the context
+    of DPDK as they are all managed by the rte_ether API. They are therefore
+    subject to the DPDK configuration option CONFIG_RTE_MAX_ETHPORTS, which
+    defaults to 32, so by default the combined total number of dpdk, dpdkr
+    and dpdkvhostuser ports allowable in OVS with DPDK is 32. This value can
+    be changed if desired by modifying the configuration file in DPDK, or by
+    overriding the default value on the command line when building DPDK,
+    e.g.
+
+    `make install CONFIG_RTE_MAX_ETHPORTS=64`
+
 Bug Reporting:
 --------------

diff --git a/NEWS b/NEWS
index 4e81cad..841314b 100644
--- a/NEWS
+++ b/NEWS
@@ -32,6 +32,8 @@ Post-v2.5.0
      * DB entries have been added for many of the DPDK EAL command line
        arguments. Additional arguments can be passed via the dpdk-extra
        entry.
+     * vHost PMD integration brings vhost-user ports under control of the
+       rte_ether DPDK API.
    - ovs-benchmark: This utility has been removed due to lack of use and
      bitrot.
    - ovs-appctl:

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 89d783a..814ef83 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -55,6 +55,7 @@
 #include "unixctl.h"

 #include "rte_config.h"
+#include "rte_eth_vhost.h"
 #include "rte_mbuf.h"
 #include "rte_meter.h"
 #include "rte_virtio_net.h"
@@ -139,6 +140,11 @@
 static char *cuse_dev_name = NULL;    /* Character device cuse_dev_name. */
 #endif
 static char *vhost_sock_dir = NULL;   /* Location of vhost-user sockets */

+/* Array that tracks the used & unused vHost user driver IDs */
+static unsigned int vhost_user_drv_ids[RTE_MAX_ETHPORTS];
+/* Maximum length of the string passed to the rte_eth_dev_attach()
+ * function */
+#define DEVARGS_MAX (RTE_ETH_NAME_MAX_LEN + PATH_MAX + 18)
+
 /*
  * Maximum amount of time in micro seconds to try and enqueue to vhost.
  */
@@ -172,7 +178,8 @@ enum { DRAIN_TSC = 200000ULL };

 enum dpdk_dev_type {
     DPDK_DEV_ETH = 0,
-    DPDK_DEV_VHOST = 1,
+    DPDK_DEV_VHOST_USER = 1,
+    DPDK_DEV_VHOST_CUSE = 2,
 };

 static int rte_eal_init_ret = ENODEV;
@@ -358,12 +365,22 @@ struct netdev_dpdk {
     int real_n_rxq;
     bool txq_needs_locking;

-    /* virtio-net structure for vhost device */
+    /* Spinlock for vhost-cuse transmission. Other DPDK devices use
+     * spinlocks in dpdk_tx_queue */
+    rte_spinlock_t vhost_cuse_tx_lock;
+
+    /* virtio-net structure for vhost-cuse device */
     OVSRCU_TYPE(struct virtio_net *) virtio_dev;

+    /* Number of virtqueue pairs reported by the guest */
+    uint32_t vhost_qp_nb;
+
     /* Identifier used to distinguish vhost devices from each other */
     char vhost_id[PATH_MAX];

+    /* ID of vhost user port given to the PMD driver */
+    unsigned int vhost_pmd_id;
+
     /* In dpdk_list. */
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
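A note on DEVARGS_MAX above: the budget is the ethdev name ("eth_vhost%u"), the socket path, and the literal option text of the devargs format used later in the patch. The breakdown of the trailing 18 bytes below is my reading of that format string, not something the patch states:

```c
/* Sanity check of the DEVARGS_MAX budget, written against the devargs
 * format used later in the patch. The accounting of the "18" in the
 * comments is an assumption on my part. */
#include <limits.h>    /* PATH_MAX */
#include <stdio.h>

#define NAME_MAX_LEN 32    /* RTE_ETH_NAME_MAX_LEN as of DPDK 16.04 */
#define DEVARGS_MAX (NAME_MAX_LEN + PATH_MAX + 18)

int
main(void)
{
    char devargs[DEVARGS_MAX];
    /* "eth_vhost%u" fits within the ethdev name budget; the iface path is
     * bounded by PATH_MAX; ",iface=" (7) + ",queues=" (8) + NUL (1) take
     * 16 of the 18 bytes.  Larger queue counts are absorbed by slack in
     * the name budget, since "eth_vhost%u" is much shorter than 32. */
    int n = snprintf(devargs, sizeof devargs,
                     "eth_vhost%u,iface=%s,queues=%i",
                     0u, "/tmp/vhost0", 16);

    printf("%d of %d bytes used\n", n, DEVARGS_MAX);
    return n < (int) sizeof devargs ? 0 : 1;
}
```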
@@ -381,16 +398,20 @@ struct netdev_rxq_dpdk {
 static bool dpdk_thread_is_pmd(void);

 static int netdev_dpdk_construct(struct netdev *);
+static int netdev_dpdk_vhost_user_construct(struct netdev *);

 struct virtio_net * netdev_dpdk_get_virtio(const struct netdev_dpdk *dev);

 void link_status_changed_callback(uint8_t port_id,
         enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);
+void vring_state_changed_callback(uint8_t port_id,
+        enum rte_eth_event_type type OVS_UNUSED, void *param OVS_UNUSED);

 static bool
-is_dpdk_class(const struct netdev_class *class)
+is_dpdk_eth_class(const struct netdev_class *class)
 {
-    return class->construct == netdev_dpdk_construct;
+    return ((class->construct == netdev_dpdk_construct) ||
+            (class->construct == netdev_dpdk_vhost_user_construct));
 }

 /* DPDK NIC drivers allocate RX buffers at a particular granularity, typically
@@ -592,7 +613,12 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
     }

     dev->up.n_rxq = n_rxq;
-    dev->real_n_txq = n_txq;
+    /* Only set real_n_txq for physical devices. vHost User devices will
+     * set this value correctly during vring_state_changed callbacks.
+     */
+    if (dev->type == DPDK_DEV_ETH) {
+        dev->real_n_txq = n_txq;
+    }

     return 0;
 }
@@ -697,6 +723,118 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
     }
 }

+/*
+ * Fixes mapping for vhost-user tx queues. Must be called after each
+ * enabling/disabling of queues and real_n_txq modifications.
+ */
+static void
+netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
+    OVS_REQUIRES(dev->mutex)
+{
+    int *enabled_queues, n_enabled = 0;
+    int i, k, total_txqs = dev->real_n_txq;
+
+    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
+
+    for (i = 0; i < total_txqs; i++) {
+        /* Enabled queues always mapped to themselves. */
+        if (dev->tx_q[i].map == i) {
+            enabled_queues[n_enabled++] = i;
+        }
+    }
+
+    if (n_enabled == 0 && total_txqs != 0) {
+        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
+        n_enabled = 1;
+    }
+
+    k = 0;
+    for (i = 0; i < total_txqs; i++) {
+        if (dev->tx_q[i].map != i) {
+            dev->tx_q[i].map = enabled_queues[k];
+            k = (k + 1) % n_enabled;
+        }
+    }
+
+    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
+    for (i = 0; i < total_txqs; i++) {
+        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
+    }
+
+    rte_free(enabled_queues);
+}
+
+static void
+netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev)
+    OVS_REQUIRES(dev->mutex)
+{
+    dev->real_n_rxq = dev->vhost_qp_nb;
+    dev->real_n_txq = dev->vhost_qp_nb;
+    dev->txq_needs_locking = true;
+
+    /* Enable TX queue 0 by default if it wasn't disabled. */
+    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
+        dev->tx_q[0].map = 0;
+    }
+
+    netdev_dpdk_remap_txqs(dev);
+
+    return;
+}
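To make the remap above concrete, here is a standalone walkthrough with made-up queue state (queues 0 and 2 enabled out of 4); -1 stands in for OVS_VHOST_QUEUE_MAP_UNKNOWN, and the n_enabled == 0 fallback of the real code is omitted for brevity:

```c
/* Standalone walkthrough of the tx-queue remap above: enabled queues map
 * to themselves, every other slot is filled round-robin from the enabled
 * set. Queue state is invented for illustration. */
#include <stdio.h>

int
main(void)
{
    enum { TOTAL_TXQS = 4 };
    /* Queues 0 and 2 were enabled by the guest, so their map entries
     * already equal their own index; 1 and 3 are unmapped (-1). */
    int map[TOTAL_TXQS] = { 0, -1, 2, -1 };
    int enabled[TOTAL_TXQS], n_enabled = 0, i, k = 0;

    for (i = 0; i < TOTAL_TXQS; i++) {
        if (map[i] == i) {
            enabled[n_enabled++] = i;
        }
    }
    for (i = 0; i < TOTAL_TXQS; i++) {
        if (map[i] != i) {
            map[i] = enabled[k];
            k = (k + 1) % n_enabled;
        }
    }
    for (i = 0; i < TOTAL_TXQS; i++) {
        /* Prints 0-->0, 1-->0, 2-->2, 3-->2. */
        printf("%2d --> %2d\n", i, map[i]);
    }
    return 0;
}
```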
+/* Clears mapping for all available queues of vhost interface. */
+static void
+netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
+    OVS_REQUIRES(dev->mutex)
+{
+    int i;
+
+    for (i = 0; i < dev->real_n_txq; i++) {
+        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
+    }
+}
+
+void
+vring_state_changed_callback(uint8_t port_id,
+                             enum rte_eth_event_type type OVS_UNUSED,
+                             void *param OVS_UNUSED)
+{
+    struct netdev_dpdk *dev;
+    struct rte_eth_vhost_queue_event event;
+    int err = 0;
+
+    err = rte_eth_vhost_get_queue_event(port_id, &event);
+    if (err || (event.rx == 1)) {
+        return;
+    }
+
+    ovs_mutex_lock(&dpdk_mutex);
+    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
+        if (port_id == dev->port_id) {
+            ovs_mutex_lock(&dev->mutex);
+            if (event.enable) {
+                dev->tx_q[event.queue_id].map = event.queue_id;
+                dev->vhost_qp_nb++;
+            } else {
+                dev->tx_q[event.queue_id].map = OVS_VHOST_QUEUE_DISABLED;
+                dev->vhost_qp_nb--;
+            }
+            if (dev->vhost_qp_nb > dev->up.n_rxq) {
+                VLOG_ERR("Error configuring vhost device %s - guest queues "
+                         "outnumber the queues configured in OVS (%d > %d)",
+                         dev->vhost_id, dev->vhost_qp_nb, dev->up.n_rxq);
+            } else {
+                netdev_dpdk_vhost_set_queues(dev);
+            }
+            ovs_mutex_unlock(&dev->mutex);
+            break;
+        }
+    }
+    ovs_mutex_unlock(&dpdk_mutex);
+
+    return;
+}
+
 void
 link_status_changed_callback(uint8_t port_id,
                              enum rte_eth_event_type type OVS_UNUSED,
@@ -709,6 +847,25 @@ link_status_changed_callback(uint8_t port_id,
         if (port_id == dev->port_id) {
             ovs_mutex_lock(&dev->mutex);
             check_link_status(dev);
+            if (dev->type == DPDK_DEV_VHOST_USER) {
+                if (dev->link.link_status == ETH_LINK_UP) {
+                    /* new device */
+                    if (dev->vhost_qp_nb > dev->up.n_rxq) {
+                        ovs_mutex_unlock(&dev->mutex);
+                        ovs_mutex_unlock(&dpdk_mutex);
+                        return;
+                    } else {
+                        netdev_dpdk_vhost_set_queues(dev);
+                        VLOG_INFO("vHost Device '%s' has been added",
                                  dev->vhost_id);
+                    }
+                } else {
+                    /* destroy device */
+                    netdev_dpdk_txq_map_clear(dev);
+                    VLOG_INFO("vHost Device '%s' has been removed",
+                              dev->vhost_id);
+                }
+            }
             ovs_mutex_unlock(&dev->mutex);
             break;
         }
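One observation on vring_state_changed_callback() above: rte_eth_vhost_get_queue_event() pops a single event per call, so the code relies on the PMD raising the callback once per state change. If that assumption were ever to break, a drain loop is the usual defensive shape; a sketch, not part of the patch:

```c
/* Hedged sketch of draining all pending queue-state events for a port.
 * Assumes the DPDK 16.04 rte_eth_vhost API, where
 * rte_eth_vhost_get_queue_event() returns non-zero once no events remain,
 * so the loop terminates. */
#include <rte_eth_vhost.h>

static void
drain_vhost_queue_events(uint8_t port_id)
{
    struct rte_eth_vhost_queue_event event;

    while (rte_eth_vhost_get_queue_event(port_id, &event) == 0) {
        if (event.rx) {
            /* Only tx-queue state feeds the tx map. */
            continue;
        }
        /* Here one would update tx_q[event.queue_id].map and vhost_qp_nb
         * exactly as the callback above does. */
    }
}
```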
@@ -727,6 +884,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     int sid;
     int err = 0;
     uint32_t buf_size;
+    unsigned int nr_q = 0;

     ovs_mutex_init(&dev->mutex);
     ovs_mutex_lock(&dev->mutex);
@@ -736,7 +894,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     /* If the 'sid' is negative, it means that the kernel fails
      * to obtain the pci numa info. In that situation, always
      * use 'SOCKET0'. */
-    if (type == DPDK_DEV_ETH) {
+    if (type != DPDK_DEV_VHOST_CUSE) {
         sid = rte_eth_dev_socket_id(port_no);
     } else {
         sid = rte_lcore_to_socket_id(rte_get_master_lcore());
@@ -764,22 +922,27 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
     netdev->n_rxq = NR_QUEUE;
     netdev->requested_n_rxq = NR_QUEUE;
     dev->real_n_txq = NR_QUEUE;
+    dev->vhost_qp_nb = 0;

-    if (type == DPDK_DEV_ETH) {
-        netdev_dpdk_alloc_txq(dev, NR_QUEUE);
+    if (type != DPDK_DEV_VHOST_CUSE) {
+        nr_q = (type == DPDK_DEV_ETH ? 1 : RTE_MAX_QUEUES_PER_PORT);
+        netdev_dpdk_alloc_txq(dev, nr_q);
         err = dpdk_eth_dev_init(dev);
         if (err) {
             goto unlock;
         }
-    } else {
-        netdev_dpdk_alloc_txq(dev, OVS_VHOST_MAX_QUEUE_NUM);
     }

-    if (type == DPDK_DEV_ETH) {
+    if (type != DPDK_DEV_VHOST_CUSE) {
         rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_INTR_LSC,
                                       (void*)link_status_changed_callback,
                                       NULL);
     }
+    if (type == DPDK_DEV_VHOST_USER) {
+        rte_eth_dev_callback_register(port_no, RTE_ETH_EVENT_QUEUE_STATE,
+                                      (void*)vring_state_changed_callback,
+                                      NULL);
+    }

     ovs_list_push_back(&dpdk_list, &dev->list_node);

@@ -813,16 +976,6 @@ dpdk_dev_parse_name(const char dev_name[], const char prefix[],
 }

 static int
-vhost_construct_helper(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
-{
-    if (rte_eal_init_ret) {
-        return rte_eal_init_ret;
-    }
-
-    return netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST);
-}
-
-static int
 netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
@@ -834,22 +987,48 @@ netdev_dpdk_vhost_cuse_construct(struct netdev *netdev)
     ovs_mutex_lock(&dpdk_mutex);
     strncpy(dev->vhost_id, netdev->name, sizeof(dev->vhost_id));
-    err = vhost_construct_helper(netdev);
+
+    rte_spinlock_init(&dev->vhost_cuse_tx_lock);
+
+    err = rte_eal_init_ret ? rte_eal_init_ret :
+        netdev_dpdk_init(netdev, -1, DPDK_DEV_VHOST_CUSE);
+
     ovs_mutex_unlock(&dpdk_mutex);
     return err;
 }

 static int
+get_vhost_user_drv_id(void)
+{
+    int i = 0;
+
+    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
+        if (vhost_user_drv_ids[i] == 0) {
+            return i;
+        }
+    }
+
+    return -1;
+}
+
+static int
 netdev_dpdk_vhost_user_construct(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     const char *name = netdev->name;
     int err;
+    uint8_t port_no = 0;
+    char devargs[DEVARGS_MAX];
+    int driver_id = 0;
+
+    if (rte_eal_init_ret) {
+        return rte_eal_init_ret;
+    }

     /* 'name' is appended to 'vhost_sock_dir' and used to create a socket in
      * the file system. '/' or '\' would traverse directories, so they're not
      * acceptable in 'name'. */
-    if (strchr(name, '/') || strchr(name, '\\')) {
+    if (strchr(name, '/') || strchr(name, '\\') || strchr(name, ',')) {
-        VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
-                 "A valid name must not include '/' or '\\'",
-                 name);
+        VLOG_ERR("\"%s\" is not a valid name for a vhost-user port. "
+                 "A valid name must not include '/', '\\' or ','",
+                 name);
" "A valid name must not include '/' or '\\'", name); @@ -866,18 +1045,32 @@ netdev_dpdk_vhost_user_construct(struct netdev *netdev) */ snprintf(dev->vhost_id, sizeof(dev->vhost_id), "%s/%s", vhost_sock_dir, name); + driver_id = get_vhost_user_drv_id(); + if (driver_id == -1) { + VLOG_ERR("Unable to create vhost-user device %s - too many vhost-user" + "devices registered with PMD", dev->vhost_id); + err = ENODEV; + goto out; + + } else { + snprintf(devargs, sizeof(devargs), "eth_vhost%u,iface=%s,queues=%i", + driver_id, dev->vhost_id, RTE_MAX_QUEUES_PER_PORT); + err = rte_eth_dev_attach(devargs, &port_no); + } - err = rte_vhost_driver_register(dev->vhost_id); if (err) { - VLOG_ERR("vhost-user socket device setup failure for socket %s\n", + VLOG_ERR("Failed to attach vhost-user device %s to DPDK", dev->vhost_id); } else { fatal_signal_add_file_to_unlink(dev->vhost_id); VLOG_INFO("Socket %s created for vhost-user port %s\n", dev->vhost_id, name); - err = vhost_construct_helper(netdev); + dev->vhost_pmd_id = driver_id; + vhost_user_drv_ids[dev->vhost_pmd_id] = 1; + err = netdev_dpdk_init(netdev, port_no, DPDK_DEV_VHOST_USER); } +out: ovs_mutex_unlock(&dpdk_mutex); return err; } @@ -910,6 +1103,12 @@ netdev_dpdk_destruct(struct netdev *netdev) struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); ovs_mutex_lock(&dev->mutex); + + if (dev->type == DPDK_DEV_VHOST_USER) { + rte_eth_dev_detach(dev->port_id, dev->vhost_id); + vhost_user_drv_ids[dev->vhost_pmd_id] = 0; + } + rte_eth_dev_stop(dev->port_id); ovs_mutex_unlock(&dev->mutex); @@ -921,7 +1120,7 @@ netdev_dpdk_destruct(struct netdev *netdev) } static void -netdev_dpdk_vhost_destruct(struct netdev *netdev) +netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); @@ -929,15 +1128,8 @@ netdev_dpdk_vhost_destruct(struct netdev *netdev) if (netdev_dpdk_get_virtio(dev) != NULL) { VLOG_ERR("Removing port '%s' while vhost device still attached.", netdev->name); - VLOG_ERR("To restore connectivity after re-adding of port, VM on socket" - " '%s' must be restarted.", - dev->vhost_id); - } - - if (rte_vhost_driver_unregister(dev->vhost_id)) { - VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id); - } else { - fatal_signal_remove_file_to_unlink(dev->vhost_id); + VLOG_ERR("To restore connectivity after re-adding of port, VM with" + "port '%s' must be restarted.", dev->vhost_id); } ovs_mutex_lock(&dpdk_mutex); @@ -1037,11 +1229,12 @@ netdev_dpdk_set_multiq(struct netdev *netdev, unsigned int n_txq, } static int -netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq, - unsigned int n_rxq) +netdev_dpdk_vhost_user_set_multiq(struct netdev *netdev, unsigned int n_txq, + unsigned int n_rxq) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); int err = 0; + int old_rxq, old_txq; if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) { return err; @@ -1050,10 +1243,20 @@ netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq, ovs_mutex_lock(&dpdk_mutex); ovs_mutex_lock(&dev->mutex); + rte_eth_dev_stop(dev->port_id); + + old_txq = netdev->n_txq; + old_rxq = netdev->n_rxq; netdev->n_txq = n_txq; - dev->real_n_txq = 1; - netdev->n_rxq = 1; - dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq; + netdev->n_rxq = n_rxq; + + err = dpdk_eth_dev_init(dev); + if (err) { + /* If there has been an error, it means that the requested queues + * have not been created. Restore the old numbers. 
@@ -910,6 +1103,12 @@ netdev_dpdk_destruct(struct netdev *netdev)
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

     ovs_mutex_lock(&dev->mutex);
+
+    if (dev->type == DPDK_DEV_VHOST_USER) {
+        rte_eth_dev_detach(dev->port_id, dev->vhost_id);
+        vhost_user_drv_ids[dev->vhost_pmd_id] = 0;
+    }
+
     rte_eth_dev_stop(dev->port_id);
     ovs_mutex_unlock(&dev->mutex);

@@ -921,7 +1120,7 @@ netdev_dpdk_destruct(struct netdev *netdev)
 static void
-netdev_dpdk_vhost_destruct(struct netdev *netdev)
+netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

@@ -929,15 +1128,8 @@ netdev_dpdk_vhost_cuse_destruct(struct netdev *netdev)
     if (netdev_dpdk_get_virtio(dev) != NULL) {
         VLOG_ERR("Removing port '%s' while vhost device still attached.",
                  netdev->name);
-        VLOG_ERR("To restore connectivity after re-adding of port, VM on socket"
-                 " '%s' must be restarted.",
-                 dev->vhost_id);
-    }
-
-    if (rte_vhost_driver_unregister(dev->vhost_id)) {
-        VLOG_ERR("Unable to remove vhost-user socket %s", dev->vhost_id);
-    } else {
-        fatal_signal_remove_file_to_unlink(dev->vhost_id);
+        VLOG_ERR("To restore connectivity after re-adding the port, the VM "
+                 "with port '%s' must be restarted.", dev->vhost_id);
     }

     ovs_mutex_lock(&dpdk_mutex);
@@ -1037,11 +1229,12 @@ netdev_dpdk_set_multiq(struct netdev *netdev, unsigned int n_txq,
 }

 static int
-netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
-                                  unsigned int n_rxq)
+netdev_dpdk_vhost_user_set_multiq(struct netdev *netdev, unsigned int n_txq,
+                                  unsigned int n_rxq)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     int err = 0;
+    int old_rxq, old_txq;

     if (netdev->n_txq == n_txq && netdev->n_rxq == n_rxq) {
         return err;
@@ -1050,10 +1243,20 @@ netdev_dpdk_vhost_user_set_multiq(struct netdev *netdev, unsigned int n_txq,
     ovs_mutex_lock(&dpdk_mutex);
     ovs_mutex_lock(&dev->mutex);

+    rte_eth_dev_stop(dev->port_id);
+
+    old_txq = netdev->n_txq;
+    old_rxq = netdev->n_rxq;
     netdev->n_txq = n_txq;
-    dev->real_n_txq = 1;
-    netdev->n_rxq = 1;
-    dev->txq_needs_locking = dev->real_n_txq != netdev->n_txq;
+    netdev->n_rxq = n_rxq;
+
+    err = dpdk_eth_dev_init(dev);
+    if (err) {
+        /* If there has been an error, it means that the requested queues
+         * have not been created. Restore the old numbers. */
+        netdev->n_txq = old_txq;
+        netdev->n_rxq = old_rxq;
+    }

     ovs_mutex_unlock(&dev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
@@ -1062,8 +1265,8 @@ netdev_dpdk_vhost_user_set_multiq(struct netdev *netdev, unsigned int n_txq,
 }

 static int
-netdev_dpdk_vhost_set_multiq(struct netdev *netdev, unsigned int n_txq,
-                             unsigned int n_rxq)
+netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
+                                  unsigned int n_rxq)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     int err = 0;
@@ -1076,7 +1279,8 @@ netdev_dpdk_vhost_cuse_set_multiq(struct netdev *netdev, unsigned int n_txq,
     ovs_mutex_lock(&dev->mutex);

     netdev->n_txq = n_txq;
-    netdev->n_rxq = n_rxq;
+    dev->real_n_txq = 1;
+    netdev->n_rxq = 1;

     ovs_mutex_unlock(&dev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
@@ -1171,13 +1375,13 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
 }

 static bool
-is_vhost_running(struct virtio_net *virtio_dev)
+is_vhost_cuse_running(struct virtio_net *virtio_dev)
 {
     return (virtio_dev != NULL && (virtio_dev->flags & VIRTIO_DEV_RUNNING));
 }

 static inline void
-netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats,
+netdev_dpdk_vhost_cuse_update_rx_size_counters(struct netdev_stats *stats,
                                           unsigned int packet_size)
 {
     /* Hard-coded search for the size bucket. */
@@ -1203,7 +1407,7 @@ netdev_dpdk_vhost_cuse_update_rx_size_counters(struct netdev_stats *stats,
 }

 static inline void
-netdev_dpdk_vhost_update_rx_counters(struct netdev_stats *stats,
+netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats *stats,
                                      struct dp_packet **packets, int count)
 {
     int i;
@@ -1224,7 +1428,7 @@ netdev_dpdk_vhost_cuse_update_rx_counters(struct netdev_stats *stats,
             continue;
         }

-        netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size);
+        netdev_dpdk_vhost_cuse_update_rx_size_counters(stats, packet_size);

         struct eth_header *eh = (struct eth_header *) dp_packet_data(packet);
         if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) {
*/ static int -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, +netdev_dpdk_vhost_cuse_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets, int *c) { struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev); struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev); - int qid = rxq->queue_id; + int qid = 1; uint16_t nb_rx = 0; - if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) { + if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) { return EAGAIN; } - if (rxq->queue_id >= dev->real_n_rxq) { - return EOPNOTSUPP; - } - - nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid * VIRTIO_QNUM + VIRTIO_TXQ, + nb_rx = rte_vhost_dequeue_burst(virtio_dev, qid, dev->dpdk_mp->mp, (struct rte_mbuf **)packets, NETDEV_MAX_BURST); @@ -1264,7 +1464,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, } rte_spinlock_lock(&dev->stats_lock); - netdev_dpdk_vhost_update_rx_counters(&dev->stats, packets, nb_rx); + netdev_dpdk_vhost_cuse_update_rx_counters(&dev->stats, packets, nb_rx); rte_spinlock_unlock(&dev->stats_lock); *c = (int) nb_rx; @@ -1300,6 +1500,18 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets, return 0; } +static int +netdev_dpdk_vhost_user_rxq_recv(struct netdev_rxq *rxq, + struct dp_packet **packets, int *c) +{ + struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev); + + if (rxq->queue_id >= dev->real_n_rxq) { + return EOPNOTSUPP; + } + + return netdev_dpdk_rxq_recv(rxq, packets, c); +} static inline int netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts, int cnt) @@ -1318,7 +1530,7 @@ netdev_dpdk_qos_run__(struct netdev_dpdk *dev, struct rte_mbuf **pkts, } static inline void -netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats, +netdev_dpdk_vhost_cuse_update_tx_counters(struct netdev_stats *stats, struct dp_packet **packets, int attempted, int dropped) @@ -1335,9 +1547,8 @@ netdev_dpdk_vhost_update_tx_counters(struct netdev_stats *stats, } static void -__netdev_dpdk_vhost_send(struct netdev *netdev, int qid, - struct dp_packet **pkts, int cnt, - bool may_steal) +__netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct dp_packet **pkts, + int cnt, bool may_steal) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev); @@ -1346,26 +1557,24 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, unsigned int qos_pkts = cnt; uint64_t start = 0; - qid = dev->tx_q[qid % dev->real_n_txq].map; - - if (OVS_UNLIKELY(!is_vhost_running(virtio_dev) || qid < 0)) { + if (OVS_UNLIKELY(!is_vhost_cuse_running(virtio_dev))) { rte_spinlock_lock(&dev->stats_lock); dev->stats.tx_dropped+= cnt; rte_spinlock_unlock(&dev->stats_lock); goto out; } - rte_spinlock_lock(&dev->tx_q[qid].tx_lock); + /* There is a single vhost-cuse TX queue, So we need to lock it for TX. */ + rte_spinlock_lock(&dev->vhost_cuse_tx_lock); /* Check has QoS has been configured for the netdev */ cnt = netdev_dpdk_qos_run__(dev, cur_pkts, cnt); qos_pkts -= cnt; do { - int vhost_qid = qid * VIRTIO_QNUM + VIRTIO_RXQ; unsigned int tx_pkts; - tx_pkts = rte_vhost_enqueue_burst(virtio_dev, vhost_qid, + tx_pkts = rte_vhost_enqueue_burst(virtio_dev, VIRTIO_RXQ, cur_pkts, cnt); if (OVS_LIKELY(tx_pkts)) { /* Packets have been sent.*/ @@ -1384,7 +1593,7 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, int qid, * Unable to enqueue packets to vhost interface. * Check available entries before retrying. 
              */
-            while (!rte_vring_available_entries(virtio_dev, vhost_qid)) {
+            while (!rte_vring_available_entries(virtio_dev, VIRTIO_RXQ)) {
                 if (OVS_UNLIKELY((rte_get_timer_cycles() - start) > timeout)) {
                     expired = 1;
                     break;
@@ -1396,12 +1605,12 @@ __netdev_dpdk_vhost_cuse_send(struct netdev *netdev, struct dp_packet **pkts,
             }
         }
     } while (cnt);
-
-    rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+    rte_spinlock_unlock(&dev->vhost_cuse_tx_lock);

     rte_spinlock_lock(&dev->stats_lock);
     cnt += qos_pkts;
-    netdev_dpdk_vhost_update_tx_counters(&dev->stats, pkts, total_pkts, cnt);
+    netdev_dpdk_vhost_cuse_update_tx_counters(&dev->stats, pkts, total_pkts,
+                                              cnt);
     rte_spinlock_unlock(&dev->stats_lock);

 out:
@@ -1495,8 +1704,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
         newcnt++;
     }

-    if (dev->type == DPDK_DEV_VHOST) {
-        __netdev_dpdk_vhost_send(netdev, qid, (struct dp_packet **) mbufs, newcnt, true);
+    if (dev->type == DPDK_DEV_VHOST_CUSE) {
+        __netdev_dpdk_vhost_cuse_send(netdev, (struct dp_packet **) mbufs,
+                                      newcnt, true);
     } else {
         unsigned int qos_pkts = newcnt;
@@ -1520,8 +1730,8 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
 }

 static int
-netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet **pkts,
-                       int cnt, bool may_steal)
+netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid OVS_UNUSED,
+                            struct dp_packet **pkts, int cnt, bool may_steal)
 {
     if (OVS_UNLIKELY(pkts[0]->source != DPBUF_DPDK)) {
         int i;
@@ -1533,7 +1743,7 @@ netdev_dpdk_vhost_cuse_send(struct netdev *netdev, int qid OVS_UNUSED,
             }
         }
     } else {
-        __netdev_dpdk_vhost_send(netdev, qid, pkts, cnt, may_steal);
+        __netdev_dpdk_vhost_cuse_send(netdev, pkts, cnt, may_steal);
     }
     return 0;
 }
@@ -1544,11 +1754,6 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
 {
     int i;

-    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
-        qid = qid % dev->real_n_txq;
-        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
-    }
-
     if (OVS_UNLIKELY(!may_steal || pkts[0]->source != DPBUF_DPDK)) {
         struct netdev *netdev = &dev->up;
@@ -1607,19 +1812,49 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
             rte_spinlock_unlock(&dev->stats_lock);
         }
     }
+}
+
+static int
+netdev_dpdk_eth_send(struct netdev *netdev, int qid,
+                     struct dp_packet **pkts, int cnt, bool may_steal)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+        qid = qid % dev->real_n_txq;
+        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+    }
+
+    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);

     if (OVS_UNLIKELY(dev->txq_needs_locking)) {
         rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
     }
+
+    return 0;
 }

 static int
-netdev_dpdk_eth_send(struct netdev *netdev, int qid,
+netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,
                      struct dp_packet **pkts, int cnt, bool may_steal)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

-    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+    qid = dev->tx_q[qid % dev->real_n_txq].map;
+
+    if (qid == -1) {
+        if (may_steal) {
+            int i;
+
+            for (i = 0; i < cnt; i++) {
+                dp_packet_delete(pkts[i]);
+            }
+        }
+    } else {
+        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+        netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+    }
+
     return 0;
 }
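The send-side mapping in netdev_dpdk_vhost_user_send() is easiest to see with numbers. With real_n_txq = 2 and only queue 0 enabled by the guest, the remap earlier in the patch leaves map = {0, 0}, so every OVS tx queue id folds into queue 0; if the guest enables no queues at all, the map holds the disabled sentinel and the packets are dropped — and must still be freed when may_steal is set, since ownership was handed over by the caller. A worked example with made-up state:

```c
/* Worked example of the vhost-user send-side qid folding above, using
 * made-up queue state. -1 stands in for the disabled sentinel checked
 * in the patch. */
#include <stdio.h>

static void
show(const char *label, const int *map, int real_n_txq)
{
    int qid;

    for (qid = 0; qid < 4; qid++) {
        int mapped = map[qid % real_n_txq];

        if (mapped == -1) {
            printf("%s: qid %d -> drop (free if may_steal)\n", label, qid);
        } else {
            printf("%s: qid %d -> tx queue %d\n", label, qid, mapped);
        }
    }
}

int
main(void)
{
    const int one_enabled[2] = { 0, 0 };    /* after remap: 1 folded to 0 */
    const int none_enabled[2] = { -1, -1 }; /* guest enabled no queues */

    show("one-enabled", one_enabled, 2);
    show("none-enabled", none_enabled, 2);
    return 0;
}
```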
@@ -1717,7 +1952,7 @@ netdev_dpdk_vhost_user_send(struct netdev *netdev, int qid,
 static int
 netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier);

 static int
-netdev_dpdk_vhost_get_stats(const struct netdev *netdev,
+netdev_dpdk_vhost_cuse_get_stats(const struct netdev *netdev,
                             struct netdev_stats *stats)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
@@ -1847,14 +2082,26 @@ netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
     stats->tx_packets = rte_stats.opackets;
     stats->rx_bytes = rte_stats.ibytes;
     stats->tx_bytes = rte_stats.obytes;
-    /* DPDK counts imissed as errors, but count them here as dropped instead */
-    stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
-    stats->tx_errors = rte_stats.oerrors;
-    stats->multicast = rte_stats.imcasts;

-    rte_spinlock_lock(&dev->stats_lock);
-    stats->tx_dropped = dev->stats.tx_dropped;
-    rte_spinlock_unlock(&dev->stats_lock);
+    if (dev->type == DPDK_DEV_ETH) {
+        /* DPDK counts imissed as errors, but count them here as dropped
+         * instead */
+        stats->rx_errors = rte_stats.ierrors - rte_stats.imissed;
+        stats->tx_errors = rte_stats.oerrors;
+        stats->multicast = rte_stats.imcasts;
+
+        rte_spinlock_lock(&dev->stats_lock);
+        stats->tx_dropped = dev->stats.tx_dropped;
+        rte_spinlock_unlock(&dev->stats_lock);
+    } else {
+        stats->rx_errors = UINT64_MAX;
+        stats->tx_errors = UINT64_MAX;
+        stats->multicast = UINT64_MAX;
+
+        rte_spinlock_lock(&dev->stats_lock);
+        stats->tx_dropped = UINT64_MAX;
+        rte_spinlock_unlock(&dev->stats_lock);
+    }

     /* These are the available DPDK counters for packets not received due to
      * local resource constraints in DPDK and NIC respectively. */
@@ -1940,14 +2187,14 @@ netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier)
 }

 static int
-netdev_dpdk_vhost_get_carrier(const struct netdev *netdev, bool *carrier)
+netdev_dpdk_vhost_cuse_get_carrier(const struct netdev *netdev, bool *carrier)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     struct virtio_net *virtio_dev = netdev_dpdk_get_virtio(dev);

     ovs_mutex_lock(&dev->mutex);

-    if (is_vhost_running(virtio_dev)) {
+    if (is_vhost_cuse_running(virtio_dev)) {
         *carrier = 1;
     } else {
         *carrier = 0;
@@ -1997,7 +2244,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev,
         return 0;
     }

-    if (dev->type == DPDK_DEV_ETH) {
+    if (dev->type != DPDK_DEV_VHOST_CUSE) {
         if (dev->flags & NETDEV_UP) {
             err = rte_eth_dev_start(dev->port_id);
             if (err)
@@ -2098,7 +2345,7 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
     if (argc > 2) {
         struct netdev *netdev = netdev_from_name(argv[1]);

-        if (netdev && is_dpdk_class(netdev->netdev_class)) {
+        if (netdev && is_dpdk_eth_class(netdev->netdev_class)) {
             struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);

             ovs_mutex_lock(&dpdk_dev->mutex);
@@ -2142,75 +2389,7 @@ set_irq_status(struct virtio_net *virtio_dev)
 }

 /*
- * Fixes mapping for vhost-user tx queues. Must be called after each
- * enabling/disabling of queues and real_n_txq modifications.
- */
-static void
-netdev_dpdk_remap_txqs(struct netdev_dpdk *dev)
-    OVS_REQUIRES(dev->mutex)
-{
-    int *enabled_queues, n_enabled = 0;
-    int i, k, total_txqs = dev->real_n_txq;
-
-    enabled_queues = dpdk_rte_mzalloc(total_txqs * sizeof *enabled_queues);
-
-    for (i = 0; i < total_txqs; i++) {
-        /* Enabled queues always mapped to themselves. */
-        if (dev->tx_q[i].map == i) {
-            enabled_queues[n_enabled++] = i;
-        }
-    }
-
-    if (n_enabled == 0 && total_txqs != 0) {
-        enabled_queues[0] = OVS_VHOST_QUEUE_DISABLED;
-        n_enabled = 1;
-    }
-
-    k = 0;
-    for (i = 0; i < total_txqs; i++) {
-        if (dev->tx_q[i].map != i) {
-            dev->tx_q[i].map = enabled_queues[k];
-            k = (k + 1) % n_enabled;
-        }
-    }
-
-    VLOG_DBG("TX queue mapping for %s\n", dev->vhost_id);
-    for (i = 0; i < total_txqs; i++) {
-        VLOG_DBG("%2d --> %2d", i, dev->tx_q[i].map);
-    }
-
-    rte_free(enabled_queues);
-}
-
-static int
-netdev_dpdk_vhost_set_queues(struct netdev_dpdk *dev, struct virtio_net *virtio_dev)
-    OVS_REQUIRES(dev->mutex)
-{
-    uint32_t qp_num;
-
-    qp_num = virtio_dev->virt_qp_nb;
-    if (qp_num > dev->up.n_rxq) {
-        VLOG_ERR("vHost Device '%s' %"PRIu64" can't be added - "
-                 "too many queues %d > %d", virtio_dev->ifname,
-                 virtio_dev->device_fh, qp_num, dev->up.n_rxq);
-        return -1;
-    }
-
-    dev->real_n_rxq = qp_num;
-    dev->real_n_txq = qp_num;
-    dev->txq_needs_locking = true;
-
-    /* Enable TX queue 0 by default if it wasn't disabled. */
-    if (dev->tx_q[0].map == OVS_VHOST_QUEUE_MAP_UNKNOWN) {
-        dev->tx_q[0].map = 0;
-    }
-
-    netdev_dpdk_remap_txqs(dev);
-
-    return 0;
-}
-
-/*
- * A new virtio-net device is added to a vhost port.
+/*
+ * A new virtio-net device is added to a vhost-cuse port.
  */
 static int
 new_device(struct virtio_net *virtio_dev)
@@ -2223,11 +2402,6 @@ new_device(struct virtio_net *virtio_dev)
     LIST_FOR_EACH(dev, list_node, &dpdk_list) {
         if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
             ovs_mutex_lock(&dev->mutex);
-            if (netdev_dpdk_vhost_set_queues(dev, virtio_dev)) {
-                ovs_mutex_unlock(&dev->mutex);
-                ovs_mutex_unlock(&dpdk_mutex);
-                return -1;
-            }
             ovsrcu_set(&dev->virtio_dev, virtio_dev);
             exists = true;
             virtio_dev->flags |= VIRTIO_DEV_RUNNING;
@@ -2251,23 +2425,11 @@ new_device(struct virtio_net *virtio_dev)
     return 0;
 }

-/* Clears mapping for all available queues of vhost interface. */
-static void
-netdev_dpdk_txq_map_clear(struct netdev_dpdk *dev)
-    OVS_REQUIRES(dev->mutex)
-{
-    int i;
-
-    for (i = 0; i < dev->real_n_txq; i++) {
-        dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
-    }
-}
-
 /*
- * Remove a virtio-net device from the specific vhost port. Use dev->remove
- * flag to stop any more packets from being sent or received to/from a VM and
- * ensure all currently queued packets have been sent/received before removing
- * the device.
+ * Remove a virtio-net device from the specific vhost-cuse port. Use
+ * dev->remove flag to stop any more packets from being sent or received
+ * to/from a VM and ensure all currently queued packets have been
+ * sent/received before removing the device.
  */
 static void
 destroy_device(volatile struct virtio_net *virtio_dev)
@@ -2282,7 +2444,6 @@ destroy_device(volatile struct virtio_net *virtio_dev)
             ovs_mutex_lock(&dev->mutex);
             virtio_dev->flags &= ~VIRTIO_DEV_RUNNING;
             ovsrcu_set(&dev->virtio_dev, NULL);
-            netdev_dpdk_txq_map_clear(dev);
             exists = true;
             ovs_mutex_unlock(&dev->mutex);
             break;
@@ -2310,49 +2471,6 @@ destroy_device(volatile struct virtio_net *virtio_dev)
     }
 }

-static int
-vring_state_changed(struct virtio_net *virtio_dev, uint16_t queue_id,
-                    int enable)
-{
-    struct netdev_dpdk *dev;
-    bool exists = false;
-    int qid = queue_id / VIRTIO_QNUM;
-
-    if (queue_id % VIRTIO_QNUM == VIRTIO_TXQ) {
-        return 0;
-    }
-
-    ovs_mutex_lock(&dpdk_mutex);
-    LIST_FOR_EACH (dev, list_node, &dpdk_list) {
-        if (strncmp(virtio_dev->ifname, dev->vhost_id, IF_NAME_SZ) == 0) {
-            ovs_mutex_lock(&dev->mutex);
-            if (enable) {
-                dev->tx_q[qid].map = qid;
-            } else {
-                dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED;
-            }
-            netdev_dpdk_remap_txqs(dev);
-            exists = true;
-            ovs_mutex_unlock(&dev->mutex);
-            break;
-        }
-    }
-    ovs_mutex_unlock(&dpdk_mutex);
-
-    if (exists) {
-        VLOG_INFO("State of queue %d ( tx_qid %d ) of vhost device '%s' %"
-                  PRIu64" changed to \'%s\'", queue_id, qid,
-                  virtio_dev->ifname, virtio_dev->device_fh,
-                  (enable == 1) ? "enabled" : "disabled");
-    } else {
-        VLOG_INFO("vHost Device '%s' %"PRIu64" not found", virtio_dev->ifname,
-                  virtio_dev->device_fh);
-        return -1;
-    }
-
-    return 0;
-}
-
 struct virtio_net *
 netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
 {
@@ -2360,18 +2478,18 @@ netdev_dpdk_get_virtio(const struct netdev_dpdk *dev)
 }

 /*
- * These callbacks allow virtio-net devices to be added to vhost ports when
- * configuration has been fully complete.
+ * These callbacks allow virtio-net devices to be added to vhost-cuse ports
+ * when configuration has been fully completed.
  */
 static const struct virtio_net_device_ops virtio_net_device_ops =
 {
     .new_device =  new_device,
     .destroy_device = destroy_device,
-    .vring_state_changed = vring_state_changed
 };

+#ifdef VHOST_CUSE
 static void *
-start_vhost_loop(void *dummy OVS_UNUSED)
+start_vhost_cuse_loop(void *dummy OVS_UNUSED)
 {
      pthread_detach(pthread_self());
      /* Put the cuse thread into quiescent state. */
@@ -2379,19 +2497,27 @@ start_vhost_cuse_loop(void *dummy OVS_UNUSED)
      rte_vhost_driver_session_start();
      return NULL;
 }
+#endif

 static int
 dpdk_vhost_class_init(void)
 {
+#ifdef VHOST_CUSE
     rte_vhost_driver_callback_register(&virtio_net_device_ops);
     rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
                               | 1ULL << VIRTIO_NET_F_HOST_TSO6
                               | 1ULL << VIRTIO_NET_F_CSUM);
-    ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
+    ovs_thread_create("vhost_cuse_thread", start_vhost_cuse_loop, NULL);
+#else
+    rte_eth_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
+                                  | 1ULL << VIRTIO_NET_F_HOST_TSO6
+                                  | 1ULL << VIRTIO_NET_F_CSUM);
+#endif
+
     return 0;
 }
+
 static int
 dpdk_vhost_cuse_class_init(void)
 {
@@ -2511,7 +2637,17 @@ netdev_dpdk_ring_send(struct netdev *netdev, int qid,
         dp_packet_rss_invalidate(pkts[i]);
     }

+    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+        qid = qid % dev->real_n_txq;
+        rte_spinlock_lock(&dev->tx_q[qid].tx_lock);
+    }
+
     netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+
+    if (OVS_UNLIKELY(dev->txq_needs_locking)) {
+        rte_spinlock_unlock(&dev->tx_q[qid].tx_lock);
+    }
+
     return 0;
 }

@@ -3276,28 +3412,28 @@ static const struct netdev_class OVS_UNUSED dpdk_vhost_cuse_class =
     NETDEV_DPDK_CLASS(
         "dpdkvhostcuse",
         dpdk_vhost_cuse_class_init,
         netdev_dpdk_vhost_cuse_construct,
-        netdev_dpdk_vhost_destruct,
+        netdev_dpdk_vhost_cuse_destruct,
         netdev_dpdk_vhost_cuse_set_multiq,
-        netdev_dpdk_vhost_send,
-        netdev_dpdk_vhost_get_carrier,
-        netdev_dpdk_vhost_get_stats,
+        netdev_dpdk_vhost_cuse_send,
+        netdev_dpdk_vhost_cuse_get_carrier,
+        netdev_dpdk_vhost_cuse_get_stats,
         NULL,
         NULL,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_cuse_rxq_recv);

 static const struct netdev_class OVS_UNUSED dpdk_vhost_user_class =
     NETDEV_DPDK_CLASS(
         "dpdkvhostuser",
         dpdk_vhost_user_class_init,
         netdev_dpdk_vhost_user_construct,
-        netdev_dpdk_vhost_destruct,
-        netdev_dpdk_vhost_set_multiq,
-        netdev_dpdk_vhost_send,
-        netdev_dpdk_vhost_get_carrier,
-        netdev_dpdk_vhost_get_stats,
+        netdev_dpdk_destruct,
+        netdev_dpdk_vhost_user_set_multiq,
+        netdev_dpdk_vhost_user_send,
+        netdev_dpdk_get_carrier,
+        netdev_dpdk_get_stats,
         NULL,
         NULL,
-        netdev_dpdk_vhost_rxq_recv);
+        netdev_dpdk_vhost_user_rxq_recv);

 void
 netdev_dpdk_register(void)
-- 
2.4.3