Regards, Keith > On Oct 11, 2016, at 6:49 AM, Yigit, Ferruh <ferruh.yigit at intel.com> wrote: > > On 10/4/2016 3:45 PM, Keith Wiles wrote: >> The rte_eth_tap.c PMD creates a device using TUN/TAP interfaces >> on the local host. The PMD allows for DPDK and the host to >> communicate using a raw device interface on the host and in >> the DPDK application. The device created is a Tap device with >> a L2 packet header. >>
Will try to ship out a v5 soon. >> v4 - merge with latest driver changes >> v3 - fix includes by removing ifdef for other type besides Linux. >> Fix the copyright notice in the Makefile >> v2 - merge all of the patches into one patch. >> Fix a typo on naming the tap device. >> Update the maintainers list >> >> Signed-off-by: Keith Wiles <keith.wiles at intel.com> >> --- >> MAINTAINERS | 5 + >> config/common_linuxapp | 2 + >> doc/guides/nics/tap.rst | 84 ++++ >> drivers/net/Makefile | 1 + >> drivers/net/tap/Makefile | 57 +++ >> drivers/net/tap/rte_eth_tap.c | 866 >> ++++++++++++++++++++++++++++++++ >> drivers/net/tap/rte_pmd_tap_version.map | 4 + >> mk/rte.app.mk | 1 + >> 8 files changed, 1020 insertions(+) >> create mode 100644 doc/guides/nics/tap.rst >> create mode 100644 drivers/net/tap/Makefile >> create mode 100644 drivers/net/tap/rte_eth_tap.c >> create mode 100644 drivers/net/tap/rte_pmd_tap_version.map >> > <> >> diff --git a/config/common_linuxapp b/config/common_linuxapp >> index 2483dfa..59a2053 100644 >> --- a/config/common_linuxapp >> +++ b/config/common_linuxapp >> @@ -44,3 +44,5 @@ CONFIG_RTE_LIBRTE_PMD_VHOST=y >> CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y >> CONFIG_RTE_LIBRTE_POWER=y >> CONFIG_RTE_VIRTIO_USER=y >> +CONFIG_RTE_LIBRTE_PMD_TAP=y > > According existing config items, a default value of a config option > should go to config/common_base, and environment specific config file > overwrites it if required. > So this option needs to be added into config/common_base too as disabled > by default. Add the define to common_base as no, plus a comment for Linux only. > >> +CONFIG_RTE_PMD_TAP_MAX_QUEUES=32 Moved this to the .c file as a define. > > Is the number of max queues really needs to be a config option, I assume > in normal use case user won't need to update this and will use single > queue, if that is true what about pushing this into source code to not > make config file more complex? > >> diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst > > <...> > >> +.. code-block:: console >> + >> + The interfaced name can be changed by adding the iface=foo0 >> + e.g. --vedv=eth_tap,iface=foo0 --vdev=eth_tap,iface=foo1, ... > > s/vedv/vdev > eth_tap needs to be net_tap as part of unifying device names work Fixed. > > <...> > >> diff --git a/drivers/net/Makefile b/drivers/net/Makefile >> index bc93230..b4afa98 100644 >> --- a/drivers/net/Makefile >> +++ b/drivers/net/Makefile >> @@ -55,6 +55,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD) += thunderx >> DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio >> DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += vmxnet3 >> DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += xenvirt >> +DIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap > > Rest of the PMDs sorted alphabetically, please do same. Done. > >> >> ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y) >> DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost > > <...> > >> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c > > <...> > >> + >> +static const char *drivername = "Tap PMD"; >> +static int tap_unit = 0; > > No need to initialize to zero. Fixed > > <...> > >> + >> +struct pmd_internals { >> + char name[RTE_ETH_NAME_MAX_LEN]; /* Internal Tap device name */ >> + uint16_t nb_queues; /* Number of queues supported */ >> + uint16_t pad0; > > Why this padding? Is it reserved? Removed pad0. I just like to know about gaps in the structures is the reason. > >> + struct ether_addr eth_addr; /* Mac address of the device port */ >> + >> + int if_index; /* IF_INDEX for the port */ >> + int fds[RTE_PMD_TAP_MAX_QUEUES]; /* List of all file descriptors */ >> + >> + struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */ >> + struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */ >> +}; >> + >> +/* >> + * Tun/Tap allocation routine >> + * >> + * name is the number of the interface to use, unless NULL to take the host >> + * supplied name. >> + */ >> +static int >> +tun_alloc(char * name) > > char *name Fixed. > > <...> > >> + >> + /* Always set the fiile descriptor to non-blocking */ > > s/fiile/file > >> + if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) { >> + RTE_LOG(ERR, PMD, "Unable to set to nonblocking\n"); >> + perror("F_SETFL, NONBLOCK"); >> + goto error; >> + } >> + >> + /* If the name is different that new name as default */ >> + if (name && strcmp(name, ifr.ifr_name)) >> + strcpy(name, ifr.ifr_name); > What about more secure copy? Changed to be more secure. > >> + >> + return fd; >> + >> +error: >> + if (fd > 0) >> + close(fd); >> + return -1; >> +} >> + > > <...> > >> + >> +static void >> +tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) >> +{ >> + struct pmd_internals *internals = dev->data->dev_private; >> + >> + dev_info->driver_name = drivername; >> + dev_info->if_index = internals->if_index; >> + dev_info->max_mac_addrs = 1; >> + dev_info->max_rx_pktlen = (uint32_t)ETHER_MAX_VLAN_FRAME_LEN; >> + dev_info->max_rx_queues = (uint16_t)internals->nb_queues; >> + dev_info->max_tx_queues = (uint16_t)internals->nb_queues; > casting to uint16_t is not requires, it is already uint16_t. Removed > >> + dev_info->min_rx_bufsize = 0; >> + dev_info->pci_dev = NULL; >> +} >> + >> +static void >> +tap_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats) > igb_stats? > >> +{ >> + unsigned i, imax; >> + unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0; >> + unsigned long rx_bytes_total = 0, tx_bytes_total = 0; >> + const struct pmd_internals *internal = dev->data->dev_private; >> + >> + imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ? >> + internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS; >> + >> + for (i = 0; i < imax; i++) { >> + igb_stats->q_ipackets[i] = internal->rxq[i].stats.ipackets; >> + igb_stats->q_ibytes[i] = internal->rxq[i].stats.ibytes; >> + rx_total += igb_stats->q_ipackets[i]; >> + rx_bytes_total += igb_stats->q_ibytes[i]; >> + } >> + >> + imax = (internal->nb_queues < RTE_ETHDEV_QUEUE_STAT_CNTRS) ? >> + internal->nb_queues : RTE_ETHDEV_QUEUE_STAT_CNTRS; > Do we need to duplicate imax calculation? Removed > > >> + >> + for (i = 0; i < imax; i++) { >> + igb_stats->q_opackets[i] = internal->txq[i].stats.opackets; >> + igb_stats->q_errors[i] = internal->txq[i].stats.errs; >> + igb_stats->q_obytes[i] = internal->txq[i].stats.obytes; >> + tx_total += igb_stats->q_opackets[i]; >> + tx_err_total += igb_stats->q_errors[i]; >> + tx_bytes_total += igb_stats->q_obytes[i]; >> + } >> + >> + igb_stats->ipackets = rx_total; >> + igb_stats->ibytes = rx_bytes_total; >> + igb_stats->opackets = tx_total; >> + igb_stats->oerrors = tx_err_total; >> + igb_stats->obytes = tx_bytes_total; >> +} >> + > > <...> > >> + >> +static int >> +rte_eth_dev_create(const char *name, >> + struct rte_eth_dev **eth_dev, >> + const struct eth_dev_ops *dev_ops, >> + void **internals, size_t internal_size, >> + uint16_t flag) >> +{ >> + char buff[RTE_ETH_NAME_MAX_LEN]; >> + int numa_node = rte_socket_id(); >> + struct rte_eth_dev *dev = NULL; >> + struct rte_eth_dev_data *data = NULL; >> + void *priv = NULL; >> + >> + if ((name == NULL) || (eth_dev == NULL) || (dev_ops == NULL) || >> + (internals == NULL) || (internal_size == 0)) { >> + RTE_PMD_DEBUG_TRACE("Paramters are invalid\n"); >> + return -1; >> + } >> + >> + dev = rte_eth_dev_allocate(name); >> + if (dev == NULL) { >> + RTE_PMD_DEBUG_TRACE("%s: rte_eth_dev_allocate failed for %s\n", >> + name, buff); >> + goto error; >> + } >> + >> + if (flag & RTE_USE_PRIVATE_DATA) { > > You may need to save this flag value somewhere in internals, to decide > how to free data later. Let me look into this one more and see if it is required at all. > >> + /* >> + * now do all data allocation - for eth_dev structure, dummy >> + * pci driver and internal (private) data >> + */ >> + snprintf(buff, sizeof(buff), "D-%s-%d", name, numa_node); >> + data = rte_zmalloc_socket(buff, sizeof(struct rte_eth_dev_data), >> + 0, numa_node); >> + if (data == NULL) { >> + RTE_PMD_DEBUG_TRACE("%s: Unable to allocate memory\n", >> + name); >> + goto error; >> + } >> + /* move the current state of the structure to the new one */ >> + rte_memcpy(data, dev->data, sizeof(struct rte_eth_dev_data)); > Why do we need to copy, trying to preserve which data? > >> + dev->data = data; /* Override the current data pointer */ >> + } else >> + data = dev->data; >> + >> + snprintf(buff, sizeof(buff), "I-%s-%d", name, numa_node); >> + priv = rte_zmalloc_socket(buff, internal_size, 0, numa_node); >> + if (priv == NULL) { >> + RTE_PMD_DEBUG_TRACE("Unable to allocate internal memory %lu\n", >> + internal_size); >> + goto error; >> + } >> + >> + /* Setup some default values */ >> + dev->dev_ops = dev_ops; >> + data->dev_private = priv; > >> + data->port_id = dev->data->port_id; >> + memmove(data->name, dev->data->name, strlen(dev->data->name)); > These two assignments are useless, needs to be done before "dev->data = > data" assignment. Reworked this code area to remove it. > >> + >> + dev->driver = NULL; >> + data->dev_flags = RTE_ETH_DEV_DETACHABLE; >> + data->kdrv = RTE_KDRV_NONE; >> + data->numa_node = numa_node; >> + >> + *eth_dev = dev; >> + *internals = priv; >> + >> + return 0; >> +error: >> + rte_free(priv); >> + >> + if (flag & RTE_USE_PRIVATE_DATA) >> + rte_free(data); >> + >> + rte_eth_dev_release_port(dev); >> + >> + return -1; >> +} >> + >> +static int >> +pmd_init_internals(const char *name, struct tap_info *tap, >> + struct pmd_internals **internals, >> + struct rte_eth_dev **eth_dev) >> +{ >> + struct rte_eth_dev *dev = NULL; >> + struct pmd_internals *internal = NULL; >> + struct rte_eth_dev_data *data = NULL; >> + int ret, i, fd = -1; >> + >> + RTE_LOG(INFO, PMD, >> + "%s: Create TUN/TAP Ethernet device with %d queues on numa >> %u\n", >> + name, RTE_PMD_TAP_MAX_QUEUES, rte_socket_id()); >> + >> + pmd_link.link_speed = tap->speed; >> + >> + ret = rte_eth_dev_create(tap->name, &dev, &ops, >> + (void **)&internal, sizeof(struct >> pmd_internals), > Why rte_eth_dev_create() get "void **internals" which requires casting, > but not "struct pmd_internals **internals? ? Fixed. > >> + RTE_USE_PRIVATE_DATA); >> + if (ret < 0) >> + return -1; >> + >> + strncpy(internal->name, tap->name, sizeof(internal->name)); >> + >> + internal->nb_queues = RTE_PMD_TAP_MAX_QUEUES; >> + >> + /* Create the first Tap device */ >> + if ((fd = tun_alloc(dev->data->name)) < 0) { >> + RTE_LOG(ERR, PMD, "tun_alloc(%s) failed\n", dev->data->name); >> + rte_free(internal); > rte_free(dev->data); ? > But needs to check RTE_USE_PRIVATE_DATA .. See above > >> + rte_eth_dev_release_port(dev); >> + return -1; >> + } >> + >> + /* Presetup the fds to -1 as being not working */ >> + for(i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) { >> + internal->fds[i] = -1; >> + internal->rxq[i].fd = -1; >> + internal->txq[i].fd = -1; >> + } >> + >> + /* Take the TUN/TAP fd and place in the first location */ >> + internal->rxq[0].fd = fd; >> + internal->txq[0].fd = fd; >> + internal->fds[0] = fd; >> + >> + if (pmd_mac_address(fd, dev, &internal->eth_addr) < 0) { >> + rte_free(internal); > rte_free(dev->data); ? Yes Added. > >> + rte_eth_dev_release_port(dev); >> + return -1; >> + } >> + >> + data = dev->data; >> + >> + data->dev_link = pmd_link; >> + data->mac_addrs = &internal->eth_addr; >> + >> + data->nb_rx_queues = (uint16_t)internal->nb_queues; >> + data->nb_tx_queues = (uint16_t)internal->nb_queues; > no cast required. Removed > >> + data->drv_name = drivername; >> + >> + *eth_dev = dev; >> + *internals = internal; >> + >> + return 0; >> +} >> + > > <...> > >> + >> +static int >> +set_interface_speed(const char *key __rte_unused, >> + const char *value, >> + void *extra_args __rte_unused) > need to drop __rte_unused for extra_args > >> +{ >> + struct tap_info *tap = (struct tap_info *)extra_args; >> + >> + pmd_link.link_speed = (value) ? atoi(value) : ETH_SPEED_NUM_10G; >> + tap->speed = pmd_link.link_speed; >> + >> + return 0; >> +} >> + >> +/* >> + * Open a TAP interface device. >> + */ >> +static int >> +rte_pmd_tap_devinit(const char *name, const char *params) >> +{ >> + int ret = 0; >> + struct rte_kvargs *kvlist; >> + struct tap_info tap_info; >> + >> + /* Setup default values */ >> + memset(&tap_info, 0, sizeof(tap_info)); >> + >> + tap_info.speed = ETH_SPEED_NUM_10G; >> + snprintf(tap_info.name, sizeof(tap_info.name), "dtap%d", tap_unit++); > What about extracting iface name "dtap" into a macro to make it more > visible. Created macro for the default name. > >> + >> + if ((params == NULL) || (params[0] == '\0')) { >> + RTE_LOG(INFO, PMD, "Initializing pmd_tap for %s\n", name); >> + >> + ret = eth_dev_tap_create(name, &tap_info); > This "name" is not used at all (except from RTE_LOG), instead tap->name > is used for interface name, so why carying this variable around? Fixed and changed the API. > >> + goto leave; >> + } >> + >> + RTE_LOG(INFO, PMD, "Initialize %s with params (%s)\n", name, params); >> + >> + kvlist = rte_kvargs_parse(params, valid_arguments); >> + if (!kvlist) { >> + ret = eth_dev_tap_create(name, &tap_info); >> + goto leave; >> + } >> + >> + if (rte_kvargs_count(kvlist, ETH_TAP_SPEED_ARG) == 1) { >> + ret = rte_kvargs_process(kvlist, ETH_TAP_SPEED_ARG, >> + &set_interface_speed, &tap_info); >> + if (ret < 0) >> + goto leave; >> + } else >> + set_interface_speed(NULL, NULL, &tap_info); > This call is redundant, tap_info already has default speed value set. Removed > >> + >> + if (rte_kvargs_count(kvlist, ETH_TAP_IFACE_ARG) == 1) { >> + ret = rte_kvargs_process(kvlist, ETH_TAP_IFACE_ARG, >> + &set_interface_name, &tap_info); >> + if (ret < 0) >> + goto leave; >> + } else >> + set_interface_name(NULL, NULL, (void *)&tap_info); > tap_info->name already set to default value (dtap%d), this call is not > required. Removed > >> + >> + rte_kvargs_free(kvlist); >> + >> +leave: >> + if (ret == -1) >> + RTE_LOG(INFO, PMD, "Failed to create pmd_tap for %s\n", name); >> + >> + return ret; >> +} >> + >> +/* >> + * detach a TAP device. >> + */ >> +static int >> +rte_pmd_tap_devuninit(const char *name) >> +{ >> + struct rte_eth_dev *eth_dev = NULL; >> + struct pmd_internals *internals; >> + int i; >> + >> + RTE_LOG(INFO, PMD, "Closing TUN/TAP Ethernet device on numa %u\n", >> + rte_socket_id()); >> + >> + if (name == NULL) > This check is redundant, eal layer won't call this function with "name > == NULL? Removed > >> + return 0; >> + >> + /* find the ethdev entry */ >> + eth_dev = rte_eth_dev_allocated(name); >> + if (eth_dev == NULL) >> + return 0; >> + >> + internals = eth_dev->data->dev_private; >> + for (i = 0; i < internals->nb_queues; i++) >> + if (internals->fds[i] != -1) >> + close(internals->fds[i]); >> + >> + rte_free(eth_dev->data->dev_private); >> + rte_free(eth_dev->data); > data can be shared? > Don't we need a RTE_USE_PRIVATE_DATA flag check? > >> + >> + rte_eth_dev_release_port(eth_dev); >> + >> + return 0; >> +} >> + >> +static struct rte_vdev_driver pmd_tap_drv = { >> + .init = rte_pmd_tap_devinit, >> + .uninit = rte_pmd_tap_devuninit, >> +}; >> + >> +DRIVER_REGISTER_VDEV(eth_tap, pmd_tap_drv); > name convention is now: ?net_tap" Fixed > >> +DRIVER_REGISTER_PARAM_STRING(eth_tap, >> + "iface=<string>,speed=N"); >> diff --git a/drivers/net/tap/rte_pmd_tap_version.map >> b/drivers/net/tap/rte_pmd_tap_version.map >> new file mode 100644 >> index 0000000..61463bf >> --- /dev/null >> +++ b/drivers/net/tap/rte_pmd_tap_version.map >> @@ -0,0 +1,4 @@ >> +DPDK_16.11 { >> + >> + local: *; >> +}; >> diff --git a/mk/rte.app.mk b/mk/rte.app.mk >> index 1a0095b..bd1d10f 100644 >> --- a/mk/rte.app.mk >> +++ b/mk/rte.app.mk >> @@ -129,6 +129,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y) >> _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += -lrte_pmd_vhost >> endif # $(CONFIG_RTE_LIBRTE_VHOST) >> _LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += -lrte_pmd_vmxnet3_uio >> +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += -lrte_pmd_tap > please put in alphebetical order Done > >> >> ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y) >> _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB) += -lrte_pmd_aesni_mb