Implement: socket server; fd event dispatch mechanism; vhost sock message handling; memory map for each region. VHOST_USER_SET_VRING_KICK_FD is used as the indicator that a vring is available; VHOST_USER_GET_VRING_BASE is used as the message that a vring should be released.
The message flow of vhost-user differs somewhat from that of vhost-cuse, which makes it difficult and complicated for the common virtio-net message handler layer to handle both cases in new_device/destroy_device/memory map/resource cleanup. This patch leaves only the most common message handling in virtio-net and moves the control logic to the cuse/fuse layer. Signed-off-by: Huawei Xie <huawei.xie at intel.com> --- lib/librte_vhost/Makefile | 14 +- lib/librte_vhost/eventfd_link/eventfd_link.c | 27 +- lib/librte_vhost/eventfd_link/eventfd_link.h | 48 +- lib/librte_vhost/libvirt/qemu-wrap.py | 367 --------------- lib/librte_vhost/rte_virtio_net.h | 106 ++--- lib/librte_vhost/vhost-cuse/vhost-net-cdev.c | 436 ++++++++++++++++++ lib/librte_vhost/vhost-cuse/virtio-net-cdev.c | 314 +++++++++++++ lib/librte_vhost/vhost-cuse/virtio-net-cdev.h | 43 ++ lib/librte_vhost/vhost-net-cdev.c | 389 ---------------- lib/librte_vhost/vhost-net-cdev.h | 113 ----- lib/librte_vhost/vhost-user/fd_man.c | 158 +++++++ lib/librte_vhost/vhost-user/fd_man.h | 31 ++ lib/librte_vhost/vhost-user/vhost-net-user.c | 417 +++++++++++++++++ lib/librte_vhost/vhost-user/vhost-net-user.h | 74 +++ lib/librte_vhost/vhost-user/virtio-net-user.c | 208 +++++++++ lib/librte_vhost/vhost-user/virtio-net-user.h | 11 + lib/librte_vhost/vhost_rxtx.c | 625 ++++---------------------- lib/librte_vhost/virtio-net.c | 450 ++++--------------- 18 files changed, 1939 insertions(+), 1892 deletions(-) delete mode 100755 lib/librte_vhost/libvirt/qemu-wrap.py create mode 100644 lib/librte_vhost/vhost-cuse/vhost-net-cdev.c create mode 100644 lib/librte_vhost/vhost-cuse/virtio-net-cdev.c create mode 100644 lib/librte_vhost/vhost-cuse/virtio-net-cdev.h delete mode 100644 lib/librte_vhost/vhost-net-cdev.c delete mode 100644 lib/librte_vhost/vhost-net-cdev.h create mode 100644 lib/librte_vhost/vhost-user/fd_man.c create mode 100644 lib/librte_vhost/vhost-user/fd_man.h create mode 100644 lib/librte_vhost/vhost-user/vhost-net-user.c create mode 100644 
lib/librte_vhost/vhost-user/vhost-net-user.h create mode 100644 lib/librte_vhost/vhost-user/virtio-net-user.c create mode 100644 lib/librte_vhost/vhost-user/virtio-net-user.h diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index c008d64..cb4e172 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -34,17 +34,19 @@ include $(RTE_SDK)/mk/rte.vars.mk # library name LIB = librte_vhost.a -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64 -lfuse +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I. -I vhost-user -I vhost-cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse LDFLAGS += -lfuse # all source are stored in SRCS-y -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-net-cdev.c virtio-net.c vhost_rxtx.c +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-cuse/vhost-net-cdev.c vhost-cuse/virtio-net-cdev.c + +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-user/fd_man.c vhost-user/vhost-net-user.c vhost-user/virtio-net-user.c + +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net.c vhost_rxtx.c # install includes SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h -# dependencies -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_ether -DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_mbuf +# this lib needs eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal lib/librte_mbuf include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.c b/lib/librte_vhost/eventfd_link/eventfd_link.c index 7755dd6..4c9b628 100644 --- a/lib/librte_vhost/eventfd_link/eventfd_link.c +++ b/lib/librte_vhost/eventfd_link/eventfd_link.c @@ -13,8 +13,7 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * along with this program; If not, see <http://www.gnu.org/licenses/>. 
* The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. * @@ -78,8 +77,7 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) switch (ioctl) { case EVENTFD_COPY: - if (copy_from_user(&eventfd_copy, argp, - sizeof(struct eventfd_copy))) + if (copy_from_user(&eventfd_copy, argp, sizeof(struct eventfd_copy))) return -EFAULT; /* @@ -88,28 +86,28 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) task_target = pid_task(find_vpid(eventfd_copy.target_pid), PIDTYPE_PID); if (task_target == NULL) { - pr_debug("Failed to get mem ctx for target pid\n"); + printk(KERN_DEBUG "Failed to get mem ctx for target pid\n"); return -EFAULT; } files = get_files_struct(current); if (files == NULL) { - pr_debug("Failed to get files struct\n"); + printk(KERN_DEBUG "Failed to get files struct\n"); return -EFAULT; } rcu_read_lock(); file = fcheck_files(files, eventfd_copy.source_fd); if (file) { - if (file->f_mode & FMODE_PATH || - !atomic_long_inc_not_zero(&file->f_count)) + if (file->f_mode & FMODE_PATH + || !atomic_long_inc_not_zero(&file->f_count)) file = NULL; } rcu_read_unlock(); put_files_struct(files); if (file == NULL) { - pr_debug("Failed to get file from source pid\n"); + printk(KERN_DEBUG "Failed to get file from source pid\n"); return 0; } @@ -128,25 +126,26 @@ eventfd_link_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) files = get_files_struct(task_target); if (files == NULL) { - pr_debug("Failed to get files struct\n"); + printk(KERN_DEBUG "Failed to get files struct\n"); return -EFAULT; } rcu_read_lock(); file = fcheck_files(files, eventfd_copy.target_fd); if (file) { - if (file->f_mode & FMODE_PATH || - !atomic_long_inc_not_zero(&file->f_count)) - file = NULL; + if (file->f_mode & FMODE_PATH + || !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; } rcu_read_unlock(); put_files_struct(files); if (file == NULL) { - pr_debug("Failed to get file from target 
pid\n"); + printk(KERN_DEBUG "Failed to get file from target pid\n"); return 0; } + /* * Install the file struct from the target process into the * file desciptor of the source process, diff --git a/lib/librte_vhost/eventfd_link/eventfd_link.h b/lib/librte_vhost/eventfd_link/eventfd_link.h index ea619ec..38052e2 100644 --- a/lib/librte_vhost/eventfd_link/eventfd_link.h +++ b/lib/librte_vhost/eventfd_link/eventfd_link.h @@ -1,7 +1,4 @@ /*- - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * * GPL LICENSE SUMMARY * * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. @@ -16,61 +13,28 @@ * General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * along with this program; If not, see <http://www.gnu.org/licenses/>. * The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. * * Contact Information: * Intel Corporation - * - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * */ #ifndef _EVENTFD_LINK_H_ #define _EVENTFD_LINK_H_ /* - * ioctl to copy an fd entry in calling process to an fd in a target process + * ioctl to copy an fd entry in calling process to an fd in a target process */ #define EVENTFD_COPY 1 /* - * arguements for the EVENTFD_COPY ioctl + * arguements for the EVENTFD_COPY ioctl */ struct eventfd_copy { - unsigned target_fd; /* fd in the target pid */ - unsigned source_fd; /* fd in the calling pid */ - pid_t target_pid; /* pid of the target pid */ + unsigned target_fd; /**< fd in the target pid */ + unsigned source_fd; /**< fd in the calling pid */ + pid_t target_pid; /**< pid of the target pid */ }; #endif /* _EVENTFD_LINK_H_ */ diff --git a/lib/librte_vhost/libvirt/qemu-wrap.py b/lib/librte_vhost/libvirt/qemu-wrap.py deleted file mode 100755 index e2d68a0..0000000 --- a/lib/librte_vhost/libvirt/qemu-wrap.py +++ /dev/null @@ -1,367 +0,0 @@ -#!/usr/bin/python -#/* -# * BSD LICENSE -# * -# * Copyright(c) 2010-2014 Intel Corporation. 
All rights reserved. -# * All rights reserved. -# * -# * Redistribution and use in source and binary forms, with or without -# * modification, are permitted provided that the following conditions -# * are met: -# * -# * * Redistributions of source code must retain the above copyright -# * notice, this list of conditions and the following disclaimer. -# * * Redistributions in binary form must reproduce the above copyright -# * notice, this list of conditions and the following disclaimer in -# * the documentation and/or other materials provided with the -# * distribution. -# * * Neither the name of Intel Corporation nor the names of its -# * contributors may be used to endorse or promote products derived -# * from this software without specific prior written permission. -# * -# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# */ - -##################################################################### -# This script is designed to modify the call to the QEMU emulator -# to support userspace vhost when starting a guest machine through -# libvirt with vhost enabled. The steps to enable this are as follows -# and should be run as root: -# -# 1. 
Place this script in a libvirtd's binary search PATH ($PATH) -# A good location would be in the same directory that the QEMU -# binary is located -# -# 2. Ensure that the script has the same owner/group and file -# permissions as the QEMU binary -# -# 3. Update the VM xml file using "virsh edit VM.xml" -# -# 3.a) Set the VM to use the launch script -# -# Set the emulator path contained in the -# <emulator><emulator/> tags -# -# e.g replace <emulator>/usr/bin/qemu-kvm<emulator/> -# with <emulator>/usr/bin/qemu-wrap.py<emulator/> -# -# 3.b) Set the VM's device's to use vhost-net offload -# -# <interface type="network"> -# <model type="virtio"/> -# <driver name="vhost"/> -# <interface/> -# -# 4. Enable libvirt to access our userpace device file by adding it to -# controllers cgroup for libvirtd using the following steps -# -# 4.a) In /etc/libvirt/qemu.conf add/edit the following lines: -# 1) cgroup_controllers = [ ... "devices", ... ] -# 2) clear_emulator_capabilities = 0 -# 3) user = "root" -# 4) group = "root" -# 5) cgroup_device_acl = [ -# "/dev/null", "/dev/full", "/dev/zero", -# "/dev/random", "/dev/urandom", -# "/dev/ptmx", "/dev/kvm", "/dev/kqemu", -# "/dev/rtc", "/dev/hpet", "/dev/net/tun", -# "/dev/<devbase-name>-<index>", -# ] -# -# 4.b) Disable SELinux or set to permissive mode -# -# 4.c) Mount cgroup device controller -# "mkdir /dev/cgroup" -# "mount -t cgroup none /dev/cgroup -o devices" -# -# 4.d) Set hugetlbfs_mount variable - ( Optional ) -# VMs using userspace vhost must use hugepage backed -# memory. This can be enabled in the libvirt XML -# config by adding a memory backing section to the -# XML config e.g. -# <memoryBacking> -# <hugepages/> -# </memoryBacking> -# This memory backing section should be added after the -# <memory> and <currentMemory> sections. This will add -# flags "-mem-prealloc -mem-path <path>" to the QEMU -# command line. The hugetlbfs_mount variable can be used -# to override the default <path> passed through by libvirt. 
-# -# if "-mem-prealloc" or "-mem-path <path>" are not passed -# through and a vhost device is detected then these options will -# be automatically added by this script. This script will detect -# the system hugetlbfs mount point to be used for <path>. The -# default <path> for this script can be overidden by the -# hugetlbfs_dir variable in the configuration section of this script. -# -# -# 4.e) Restart the libvirtd system process -# e.g. on Fedora "systemctl restart libvirtd.service" -# -# -# 4.f) Edit the Configuration Parameters section of this script -# to point to the correct emulator location and set any -# addition options -# -# The script modifies the libvirtd Qemu call by modifying/adding -# options based on the configuration parameters below. -# NOTE: -# emul_path and us_vhost_path must be set -# All other parameters are optional -##################################################################### - - -############################################# -# Configuration Parameters -############################################# -#Path to QEMU binary -emul_path = "/usr/local/bin/qemu-system-x86_64" - -#Path to userspace vhost device file -# This filename should match the --dev-basename --dev-index parameters of -# the command used to launch the userspace vhost sample application e.g. -# if the sample app lauch command is: -# ./build/vhost-switch ..... --dev-basename usvhost --dev-index 1 -# then this variable should be set to: -# us_vhost_path = "/dev/usvhost-1" -us_vhost_path = "/dev/usvhost-1" - -#List of additional user defined emulation options. These options will -#be added to all Qemu calls -emul_opts_user = [] - -#List of additional user defined emulation options for vhost only. 
-#These options will only be added to vhost enabled guests -emul_opts_user_vhost = [] - -#For all VHOST enabled VMs, the VM memory is preallocated from hugetlbfs -# Set this variable to one to enable this option for all VMs -use_huge_all = 0 - -#Instead of autodetecting, override the hugetlbfs directory by setting -#this variable -hugetlbfs_dir = "" - -############################################# - - -############################################# -# ****** Do Not Modify Below this Line ****** -############################################# - -import sys, os, subprocess - - -#List of open userspace vhost file descriptors -fd_list = [] - -#additional virtio device flags when using userspace vhost -vhost_flags = [ "csum=off", - "gso=off", - "guest_tso4=off", - "guest_tso6=off", - "guest_ecn=off" - ] - - -############################################# -# Find the system hugefile mount point. -# Note: -# if multiple hugetlbfs mount points exist -# then the first one found will be used -############################################# -def find_huge_mount(): - - if (len(hugetlbfs_dir)): - return hugetlbfs_dir - - huge_mount = "" - - if (os.access("/proc/mounts", os.F_OK)): - f = open("/proc/mounts", "r") - line = f.readline() - while line: - line_split = line.split(" ") - if line_split[2] == 'hugetlbfs': - huge_mount = line_split[1] - break - line = f.readline() - else: - print "/proc/mounts not found" - exit (1) - - f.close - if len(huge_mount) == 0: - print "Failed to find hugetlbfs mount point" - exit (1) - - return huge_mount - - -############################################# -# Get a userspace Vhost file descriptor -############################################# -def get_vhost_fd(): - - if (os.access(us_vhost_path, os.F_OK)): - fd = os.open( us_vhost_path, os.O_RDWR) - else: - print ("US-Vhost file %s not found" %us_vhost_path) - exit (1) - - return fd - - -############################################# -# Check for vhostfd. 
if found then replace -# with our own vhost fd and append any vhost -# flags onto the end -############################################# -def modify_netdev_arg(arg): - - global fd_list - vhost_in_use = 0 - s = '' - new_opts = [] - netdev_opts = arg.split(",") - - for opt in netdev_opts: - #check if vhost is used - if "vhost" == opt[:5]: - vhost_in_use = 1 - else: - new_opts.append(opt) - - #if using vhost append vhost options - if vhost_in_use == 1: - #append vhost on option - new_opts.append('vhost=on') - #append vhostfd ption - new_fd = get_vhost_fd() - new_opts.append('vhostfd=' + str(new_fd)) - fd_list.append(new_fd) - - #concatenate all options - for opt in new_opts: - if len(s) > 0: - s+=',' - - s+=opt - - return s - - -############################################# -# Main -############################################# -def main(): - - global fd_list - global vhost_in_use - new_args = [] - num_cmd_args = len(sys.argv) - emul_call = '' - mem_prealloc_set = 0 - mem_path_set = 0 - num = 0; - - #parse the parameters - while (num < num_cmd_args): - arg = sys.argv[num] - - #Check netdev +1 parameter for vhostfd - if arg == '-netdev': - num_vhost_devs = len(fd_list) - new_args.append(arg) - - num+=1 - arg = sys.argv[num] - mod_arg = modify_netdev_arg(arg) - new_args.append(mod_arg) - - #append vhost flags if this is a vhost device - # and -device is the next arg - # i.e -device -opt1,-opt2,...,-opt3,%vhost - if (num_vhost_devs < len(fd_list)): - num+=1 - arg = sys.argv[num] - if arg == '-device': - new_args.append(arg) - num+=1 - new_arg = sys.argv[num] - for flag in vhost_flags: - new_arg = ''.join([new_arg,',',flag]) - new_args.append(new_arg) - else: - new_args.append(arg) - elif arg == '-mem-prealloc': - mem_prealloc_set = 1 - new_args.append(arg) - elif arg == '-mem-path': - mem_path_set = 1 - new_args.append(arg) - - else: - new_args.append(arg) - - num+=1 - - #Set Qemu binary location - emul_call+=emul_path - emul_call+=" " - - #Add prealloc mem options if 
using vhost and not already added - if ((len(fd_list) > 0) and (mem_prealloc_set == 0)): - emul_call += "-mem-prealloc " - - #Add mempath mem options if using vhost and not already added - if ((len(fd_list) > 0) and (mem_path_set == 0)): - #Detect and add hugetlbfs mount point - mp = find_huge_mount() - mp = "".join(["-mem-path ", mp]) - emul_call += mp - emul_call += " " - - - #add user options - for opt in emul_opts_user: - emul_call += opt - emul_call += " " - - #Add add user vhost only options - if len(fd_list) > 0: - for opt in emul_opts_user_vhost: - emul_call += opt - emul_call += " " - - #Add updated libvirt options - iter_args = iter(new_args) - #skip 1st arg i.e. call to this script - next(iter_args) - for arg in iter_args: - emul_call+=str(arg) - emul_call+= " " - - #Call QEMU - subprocess.call(emul_call, shell=True) - - - #Close usvhost files - for fd in fd_list: - os.close(fd) - - -if __name__ == "__main__": - main() - diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h index 00b1328..7a05dab 100644 --- a/lib/librte_vhost/rte_virtio_net.h +++ b/lib/librte_vhost/rte_virtio_net.h @@ -34,11 +34,6 @@ #ifndef _VIRTIO_NET_H_ #define _VIRTIO_NET_H_ -/** - * @file - * Interface to vhost net - */ - #include <stdint.h> #include <linux/virtio_ring.h> #include <linux/virtio_net.h> @@ -48,66 +43,38 @@ #include <rte_mempool.h> #include <rte_mbuf.h> -/* Used to indicate that the device is running on a data core */ -#define VIRTIO_DEV_RUNNING 1 - -/* Backend value set by guest. */ -#define VIRTIO_DEV_STOPPED -1 - +#define VIRTIO_DEV_RUNNING 1 /**< Used to indicate that the device is running on a data core. */ +#define VIRTIO_DEV_STOPPED -1 /**< Backend value set by guest. */ /* Enum for virtqueue management. */ enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; -#define BUF_VECTOR_MAX 256 - -/** - * Structure contains buffer address, length and descriptor index - * from vring to do scatter RX. 
- */ -struct buf_vector { - uint64_t buf_addr; - uint32_t buf_len; - uint32_t desc_idx; -}; - /** * Structure contains variables relevant to RX/TX virtqueues. */ struct vhost_virtqueue { - struct vring_desc *desc; /**< Virtqueue descriptor ring. */ - struct vring_avail *avail; /**< Virtqueue available ring. */ - struct vring_used *used; /**< Virtqueue used ring. */ - uint32_t size; /**< Size of descriptor ring. */ - uint32_t backend; /**< Backend value to determine if device should started/stopped. */ - uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ - volatile uint16_t last_used_idx; /**< Last index used on the available ring */ - volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ - eventfd_t callfd; /**< Currently unused as polling mode is enabled. */ - eventfd_t kickfd; /**< Used to notify the guest (trigger interrupt). */ - struct buf_vector buf_vec[BUF_VECTOR_MAX]; /**< for scatter RX. */ -} __rte_cache_aligned; - -/** - * Device structure contains all configuration information relating to the device. - */ -struct virtio_net { - struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. */ - struct virtio_memory *mem; /**< QEMU memory and memory region information. */ - uint64_t features; /**< Negotiated feature set. */ - uint64_t device_fh; /**< device identifier. */ - uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */ - void *priv; /**< private context */ + struct vring_desc *desc; /**< descriptor ring. */ + struct vring_avail *avail; /**< available ring. */ + struct vring_used *used; /**< used ring. */ + uint32_t size; /**< Size of descriptor ring. */ + uint32_t backend; /**< Backend value to determine if device should be started/stopped. */ + uint16_t vhost_hlen; /**< Vhost header length (varies depending on RX merge buffers. */ + volatile uint16_t last_used_idx; /**< Last index used on the available ring. 
*/ + volatile uint16_t last_used_idx_res; /**< Used for multiple devices reserving buffers. */ + eventfd_t callfd; /**< Currently unused as polling mode is enabled. */ + eventfd_t kickfd; /**< Used to notify the guest (trigger interrupt). */ } __rte_cache_aligned; /** - * Information relating to memory regions including offsets to addresses in QEMUs memory file. + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. */ struct virtio_memory_regions { - uint64_t guest_phys_address; /**< Base guest physical address of region. */ - uint64_t guest_phys_address_end; /**< End guest physical address of region. */ - uint64_t memory_size; /**< Size of region. */ - uint64_t userspace_address; /**< Base userspace address of region. */ - uint64_t address_offset; /**< Offset of region for address translation. */ + uint64_t guest_phys_address; /**< Base guest physical address of region. */ + uint64_t guest_phys_address_end; /**< End guest physical address of region. */ + uint64_t memory_size; /**< Size of region. */ + uint64_t userspace_address; /**< Base userspace address of region. */ + uint64_t address_offset; /**< Offset of region for address translation. */ }; @@ -115,21 +82,34 @@ struct virtio_memory_regions { * Memory structure includes region and mapping information. */ struct virtio_memory { - uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ - uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ - uint64_t mapped_size; /**< Total size of memory file. */ - uint32_t nregions; /**< Number of memory regions. */ + uint64_t base_address; /**< Base QEMU userspace address of the memory file. */ + uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */ + uint64_t mapped_size; /**< Total size of memory file. */ + uint32_t nregions; /**< Number of memory regions. 
*/ struct virtio_memory_regions regions[0]; /**< Memory region information. */ }; /** + * Device structure contains all configuration information relating to the device. + */ +struct virtio_net { + struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. */ + struct virtio_memory *mem; /**< QEMU memory and memory region information. */ + uint64_t features; /**< Negotiated feature set. */ + uint64_t device_fh; /**< Device identifier. */ + uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */ + void *priv; +} __rte_cache_aligned; + +/** * Device operations to add/remove device. */ struct virtio_net_device_ops { - int (*new_device)(struct virtio_net *); /**< Add device. */ - void (*destroy_device)(volatile struct virtio_net *); /**< Remove device. */ + int (*new_device)(struct virtio_net *); /**< Add device. */ + void (*destroy_device)(struct virtio_net *); /**< Remove device. */ }; + static inline uint16_t __attribute__((always_inline)) rte_vring_available_entries(struct virtio_net *dev, uint16_t queue_id) { @@ -179,7 +159,7 @@ int rte_vhost_driver_register(const char *dev_name); /* Register callbacks. */ int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const); -/* Start vhost driver session blocking loop. 
*/ + int rte_vhost_driver_session_start(void); /** @@ -192,8 +172,8 @@ int rte_vhost_driver_session_start(void); * @return * num of packets enqueued */ -uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count); +uint32_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count); /** * This function gets guest buffers from the virtio device TX virtqueue, @@ -206,7 +186,7 @@ uint16_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, * @return * num of packets dequeued */ -uint16_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); +uint32_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count); #endif /* _VIRTIO_NET_H_ */ diff --git a/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c new file mode 100644 index 0000000..4671643 --- /dev/null +++ b/lib/librte_vhost/vhost-cuse/vhost-net-cdev.c @@ -0,0 +1,436 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdint.h> +#include <fuse/cuse_lowlevel.h> +#include <linux/limits.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <string.h> +#include <unistd.h> +#include <sys/ioctl.h> + +#include <rte_ethdev.h> +#include <rte_log.h> +#include <rte_string_fns.h> +#include <rte_virtio_net.h> + +#include "virtio-net-cdev.h" +#include "vhost-net.h" +#include "eventfd_link/eventfd_link.h" + +#define FUSE_OPT_DUMMY "\0\0" +#define FUSE_OPT_FORE "-f\0\0" +#define FUSE_OPT_NOMULTI "-s\0\0" + +static const uint32_t default_major = 231; +static const uint32_t default_minor = 1; +static const char cuse_device_name[] = "/dev/cuse"; +static const char default_cdev[] = "vhost-net"; +static const char eventfd_cdev[] = "/dev/eventfd-link"; + +static struct fuse_session *session; +const struct vhost_net_device_ops const *ops; + +/* + * Returns vhost_device_ctx from given fuse_req_t. The index is populated later + * when the device is added to the device linked list. 
+ */ +static struct vhost_device_ctx +fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) +{ + struct vhost_device_ctx ctx; + struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); + + ctx.pid = req_ctx->pid; + ctx.fh = fi->fh; + + return ctx; +} + +/* + * When the device is created in QEMU it gets initialised here and + * added to the device linked list. + */ +static void +vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) +{ + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); + int err = 0; + + err = ops->new_device(ctx); + if (err == -1) { + fuse_reply_err(req, EPERM); + return; + } + + fi->fh = err; + + RTE_LOG(INFO, VHOST_CONFIG, + "(%"PRIu64") Device configuration started\n", fi->fh); + fuse_reply_open(req, fi); +} + +/* + * When QEMU is shutdown or killed the device gets released. + */ +static void +vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) +{ + int err = 0; + struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); + + ops->destroy_device(ctx); + RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); + fuse_reply_err(req, err); +} + +/* + * Boilerplate code for CUSE IOCTL + * Implicit arguments: ctx, req, result. + */ +#define VHOST_IOCTL(func) do { \ + result = (func)(ctx); \ + fuse_reply_ioctl(req, result, NULL, 0); \ +} while (0) + +/* + * Boilerplate IOCTL RETRY + * Implicit arguments: req. + */ +#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ + struct iovec iov_r = { arg, (size_r) }; \ + struct iovec iov_w = { arg, (size_w) }; \ + fuse_reply_ioctl_retry(req, &iov_r, \ + (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ +} while (0) + +/* + * Boilerplate code for CUSE Read IOCTL + * Implicit arguments: ctx, req, result, in_bufsz, in_buf. 
+ */
+#define VHOST_IOCTL_R(type, var, func) do {			\
+	if (!in_bufsz) {					\
+		VHOST_IOCTL_RETRY(sizeof(type), 0);		\
+	} else {						\
+		(var) = *(const type*)in_buf;			\
+		result = (func)(ctx, &(var));			\
+		fuse_reply_ioctl(req, result, NULL, 0);		\
+	}							\
+} while (0)
+
+/*
+ * Boilerplate code for CUSE Write IOCTL
+ * Implicit arguments: ctx, req, result, out_bufsz.
+ */
+#define VHOST_IOCTL_W(type, var, func) do {			\
+	if (!out_bufsz) {					\
+		VHOST_IOCTL_RETRY(0, sizeof(type));		\
+	} else {						\
+		result = (func)(ctx, &(var));			\
+		fuse_reply_ioctl(req, result, &(var), sizeof(type));\
+	}							\
+} while (0)
+
+/*
+ * Boilerplate code for CUSE Read/Write IOCTL
+ * Implicit arguments: ctx, req, result, in_bufsz, in_buf.
+ */
+#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do {	\
+	if (!in_bufsz) {					\
+		VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\
+	} else {						\
+		(var1) = *(const type1*) (in_buf);		\
+		result = (func)(ctx, (var1), &(var2));		\
+		fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\
+	}							\
+} while (0)
+
+/*
+ * This function uses the eventfd_link kernel module to copy an eventfd file
+ * descriptor provided by QEMU in to our process space.
+ *
+ * target_fd is the descriptor number inside process target_pid.
+ * Returns the local eventfd on success, -1 on failure. No descriptor is
+ * left open when -1 is returned.
+ */
+static int
+eventfd_copy(int target_fd, int target_pid)
+{
+	int eventfd_link, ret;
+	struct eventfd_copy eventfd_copy;
+	int fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+
+	if (fd == -1)
+		return -1;
+
+	/* Open the character device to the kernel module. */
+	/* TODO: check this earlier rather than fail until VM boots! */
+	eventfd_link = open(eventfd_cdev, O_RDWR);
+	if (eventfd_link < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"eventfd_link module is not loaded\n");
+		/* Do not leak the eventfd created above. */
+		close(fd);
+		return -1;
+	}
+
+	eventfd_copy.source_fd = fd;
+	eventfd_copy.target_fd = target_fd;
+	eventfd_copy.target_pid = target_pid;
+	/* Call the IOCTL to copy the eventfd. */
+	ret = ioctl(eventfd_link, EVENTFD_COPY, &eventfd_copy);
+	close(eventfd_link);
+
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"EVENTFD_COPY ioctl failed\n");
+		/* Do not leak the eventfd created above. */
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on
+ * the type of IOCTL a buffer is requested to read or to write. This
+ * request is handled by FUSE and the buffer is then given to CUSE.
+ *
+ * Every path through this handler sends exactly one reply for req.
+ */
+static void
+vhost_net_ioctl(fuse_req_t req, int cmd, void *arg,
+		struct fuse_file_info *fi, __rte_unused unsigned flags,
+		const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+	struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi);
+	struct vhost_vring_file file;
+	struct vhost_vring_state state;
+	struct vhost_vring_addr addr;
+	uint64_t features;
+	uint32_t index;
+	uint32_t nregions;
+	size_t mem_table_sz;
+	int fd;
+	int result = 0;
+
+	switch (cmd) {
+	case VHOST_NET_SET_BACKEND:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh);
+		VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_backend);
+		break;
+
+	case VHOST_GET_FEATURES:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh);
+		VHOST_IOCTL_W(uint64_t, features, ops->get_features);
+		break;
+
+	case VHOST_SET_FEATURES:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh);
+		VHOST_IOCTL_R(uint64_t, features, ops->set_features);
+		break;
+
+	case VHOST_RESET_OWNER:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh);
+		VHOST_IOCTL(ops->reset_owner);
+		break;
+
+	case VHOST_SET_OWNER:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh);
+		VHOST_IOCTL(ops->set_owner);
+		break;
+
+	case VHOST_SET_MEM_TABLE:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh);
+
+		/* Pass 1: no data yet, request the fixed-size header. */
+		if (in_bufsz == 0) {
+			VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0);
+			break;
+		}
+
+		/* A buffer too small to hold the header cannot be parsed. */
+		if (in_bufsz < sizeof(struct vhost_memory)) {
+			result = -1;
+			fuse_reply_err(req, EINVAL);
+			break;
+		}
+
+		/*
+		 * The region count is read from the buffer of this very
+		 * request instead of a static copy saved by an earlier one,
+		 * so the count always matches the data being parsed.
+		 */
+		nregions = ((const struct vhost_memory *)in_buf)->nregions;
+		if (nregions == 0) {
+			result = -1;
+			fuse_reply_ioctl(req, result, NULL, 0);
+			break;
+		}
+
+		/* Pass 2: header seen, request header plus all regions. */
+		mem_table_sz = sizeof(struct vhost_memory) +
+			(sizeof(struct vhost_memory_region) * nregions);
+		if (in_bufsz < mem_table_sz) {
+			VHOST_IOCTL_RETRY(mem_table_sz, 0);
+			break;
+		}
+
+		/* Pass 3: the whole table is available. */
+		result = cuse_set_mem_table(ctx, in_buf, nregions);
+		if (result)
+			fuse_reply_err(req, EINVAL);
+		else
+			fuse_reply_ioctl(req, result, NULL, 0);
+		break;
+
+	case VHOST_SET_VRING_NUM:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh);
+		VHOST_IOCTL_R(struct vhost_vring_state, state, ops->set_vring_num);
+		break;
+
+	case VHOST_SET_VRING_BASE:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh);
+		VHOST_IOCTL_R(struct vhost_vring_state, state, ops->set_vring_base);
+		break;
+
+	case VHOST_GET_VRING_BASE:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh);
+		VHOST_IOCTL_RW(uint32_t, index,
+			struct vhost_vring_state, state, ops->get_vring_base);
+		break;
+
+	case VHOST_SET_VRING_ADDR:
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: VHOST_SET_VRING_ADDR\n", ctx.fh);
+		VHOST_IOCTL_R(struct vhost_vring_addr, addr, ops->set_vring_addr);
+		break;
+
+	case VHOST_SET_VRING_KICK:
+	case VHOST_SET_VRING_CALL:
+		if (!in_buf) {
+			VHOST_IOCTL_RETRY(sizeof(struct vhost_vring_file), 0);
+			break;
+		}
+
+		file = *(const struct vhost_vring_file *)in_buf;
+		LOG_DEBUG(VHOST_CONFIG,
+			"kick/call idx:%d fd:%d\n", file.index, file.fd);
+
+		/* file.fd is a descriptor of the caller; get a local copy. */
+		fd = eventfd_copy(file.fd, ctx.pid);
+		if (fd < 0) {
+			result = -1;
+			fuse_reply_ioctl(req, result, NULL, 0);
+			break;
+		}
+		file.fd = fd;
+
+		/*
+		 * The callbacks are invoked directly: VHOST_IOCTL_R would
+		 * reload 'file' from in_buf and drop the translated fd.
+		 */
+		if (cmd == VHOST_SET_VRING_KICK)
+			result = ops->set_vring_kick(ctx, &file);
+		else
+			result = ops->set_vring_call(ctx, &file);
+		fuse_reply_ioctl(req, result, NULL, 0);
+		break;
+
+	default:
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: DOES NOT EXIST\n", ctx.fh);
+		result = -1;
+		fuse_reply_ioctl(req, result, NULL, 0);
+	}
+
+	/* A retry request leaves result at 0 and is logged as success. */
+	if (result < 0)
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: FAIL\n", ctx.fh);
+	else
+		LOG_DEBUG(VHOST_CONFIG,
+			"(%"PRIu64") IOCTL: SUCCESS\n", ctx.fh);
+}
+
+/*
+ * Structure handling open, release and ioctl function pointers is populated.
+ */
+static const struct cuse_lowlevel_ops vhost_net_ops = {
+	.open		= vhost_net_open,
+	.release	= vhost_net_release,
+	.ioctl		= vhost_net_ioctl,
+};
+
+/*
+ * cuse_info is populated and used to register the cuse device.
+ * The virtio-net callbacks are fetched with get_virtio_net_callbacks().
+ *
+ * dev_name is the name of the character device to create under /dev.
+ * Returns 0 on success, -1 when dev_name is NULL or too long, /dev/cuse is
+ * not accessible, the device already exists or the CUSE setup fails.
+ */
+int
+rte_vhost_driver_register(const char *dev_name)
+{
+	struct cuse_info cuse_info;
+	char device_name[PATH_MAX] = "";
+	char char_device_name[PATH_MAX] = "";
+	const char *device_argv[] = { device_name };
+	int len;
+
+	char fuse_opt_dummy[] = FUSE_OPT_DUMMY;
+	char fuse_opt_fore[] = FUSE_OPT_FORE;
+	char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI;
+	char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti};
+
+	if (dev_name == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG, "device name is missing\n");
+		return -1;
+	}
+
+	if (access(cuse_device_name, R_OK | W_OK) < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"char device %s can't be accessed, maybe not exist\n",
+			cuse_device_name);
+		return -1;
+	}
+
+	/*
+	 * The device name is created. This is passed to QEMU so that it can
+	 * register the device with our application.
+	 */
+	len = snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name);
+	if (len < 0 || len >= PATH_MAX) {
+		/* A truncated name would register the wrong device. */
+		RTE_LOG(ERR, VHOST_CONFIG, "device name is too long\n");
+		return -1;
+	}
+	snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name);
+
+	/* Check if device already exists. */
+	if (access(char_device_name, F_OK) != -1) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"char device %s already exists\n", char_device_name);
+		return -1;
+	}
+
+	memset(&cuse_info, 0, sizeof(cuse_info));
+	cuse_info.dev_major = default_major;
+	cuse_info.dev_minor = default_minor;
+	cuse_info.dev_info_argc = 1;
+	cuse_info.dev_info_argv = device_argv;
+	cuse_info.flags = CUSE_UNRESTRICTED_IOCTL;
+
+	ops = get_virtio_net_callbacks();
+
+	session = cuse_lowlevel_setup(3, fuse_argv,
+			&cuse_info, &vhost_net_ops, 0, NULL);
+	if (session == NULL)
+		return -1;
+
+	return 0;
+}
+
+/**
+ * The CUSE session is launched allowing the application to receive open,
+ * release and ioctl calls.
+ *
+ * @return
+ *  0 once the session loop has ended, -1 if no session was registered.
+ */
+int
+rte_vhost_driver_session_start(void)
+{
+	if (session == NULL)
+		return -1;
+
+	fuse_session_loop(session);
+
+	return 0;
+}
diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c
new file mode 100644
index 0000000..5c16aa5
--- /dev/null
+++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.c
@@ -0,0 +1,314 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <inttypes.h>
+#include <dirent.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <fuse/cuse_lowlevel.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <rte_log.h>
+
+#include "vhost-net.h"
+#include "virtio-net-cdev.h"
+
+extern struct vhost_net_device_ops const *ops;
+
+/* Line size for reading maps file. */
+static const uint32_t BUFSIZE = PATH_MAX;
+
+/* Size of prot char array in procmap. */
+#define PROT_SZ 5
+
+/* Number of elements in procmap struct. */
+#define PROCMAP_SZ 8
+
+/* Fields parsed from one line of /proc/<pid>/maps. */
+struct procmap {
+	uint64_t va_start;	/* Start virtual address of the mapping. */
+	uint64_t len;		/*
+				 * End address as parsed; host_memory_map()
+				 * turns it into the size of the mapping.
+				 */
+	uint64_t pgoff;		/* Not used. */
+	uint32_t maj;		/* Not used. */
+	uint32_t min;		/* Not used. */
+	uint32_t ino;		/* Not used. */
+	char prot[PROT_SZ];	/* Not used. */
+	char fname[PATH_MAX];	/* File name, without trailing newline. */
+};
+
+/*
+ * Convert one textual maps field to an integer in the given base.
+ * Returns 0 on success, -1 when the field is empty, is followed by
+ * unparsed characters or strtoull() reports an error through errno.
+ */
+static int
+procmap_parse_num(const char *field, int base, uint64_t *value)
+{
+	char *end = NULL;
+
+	errno = 0;
+	*value = strtoull(field, &end, base);
+	if ((*field == '\0') || (end == NULL) || (*end != '\0') ||
+			(errno != 0))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Split one line of the maps file (modified in place) into its
+ * PROCMAP_SZ fields and store them in *procmap.
+ * Returns 0 on success, -1 when a field is missing or malformed.
+ */
+static int
+procmap_parse_line(char *line, struct procmap *procmap)
+{
+	/*
+	 * Field i is split on every character from dlm[i] onwards, so dlm
+	 * must hold exactly PROCMAP_SZ characters.
+	 */
+	char dlm[] = "-   :   ";
+	char *in[PROCMAP_SZ];
+	char *str = line;
+	char *sp = NULL;
+	uint64_t val;
+	int i;
+
+	for (i = 0; i < PROCMAP_SZ; i++) {
+		in[i] = strtok_r(str, &dlm[i], &sp);
+		if (in[i] == NULL)
+			return -1;
+		str = NULL;
+	}
+
+	if (procmap_parse_num(in[0], 16, &procmap->va_start) != 0)
+		return -1;
+
+	if (procmap_parse_num(in[1], 16, &procmap->len) != 0)
+		return -1;
+
+	if (procmap_parse_num(in[3], 16, &procmap->pgoff) != 0)
+		return -1;
+
+	if (procmap_parse_num(in[4], 16, &val) != 0)
+		return -1;
+	procmap->maj = (uint32_t)val;
+
+	if (procmap_parse_num(in[5], 16, &val) != 0)
+		return -1;
+	procmap->min = (uint32_t)val;
+
+	/* The inode column of the maps file is decimal. */
+	if (procmap_parse_num(in[6], 10, &val) != 0)
+		return -1;
+	procmap->ino = (uint32_t)val;
+
+	/* Bounded copies: the tokens live inside the caller's line buffer. */
+	snprintf(procmap->prot, sizeof(procmap->prot), "%s", in[2]);
+	snprintf(procmap->fname, sizeof(procmap->fname), "%s", in[7]);
+	/* fgets() keeps the newline; it is not part of the file name. */
+	procmap->fname[strcspn(procmap->fname, "\n")] = '\0';
+
+	return 0;
+}
+
+/*
+ * Locate the file containing QEMU's memory space and
+ * map it to our address space.
+ *
+ * pid is the process whose maps file is searched, addr the start address
+ * of the wanted mapping in that process. On success the local address and
+ * the size of the new mapping are stored in *mapped_address and
+ * *mapped_size and 0 is returned; -1 is returned on any failure.
+ */
+static int
+host_memory_map(pid_t pid, uint64_t addr,
+	uint64_t *mapped_address, uint64_t *mapped_size)
+{
+	struct dirent *dptr = NULL;
+	struct procmap procmap;
+	DIR *dp = NULL;
+	int fd;
+	char memfile[PATH_MAX];
+	char mapfile[PATH_MAX];
+	char procdir[PATH_MAX];
+	char resolved_path[PATH_MAX];
+	FILE *fmap;
+	void *map;
+	uint8_t found = 0;
+	char line[BUFSIZE];
+
+	/* Path where mem files are located. */
+	snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid);
+	/* Maps file used to locate mem file. */
+	snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid);
+
+	fmap = fopen(mapfile, "r");
+	if (fmap == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to open maps file for pid %d\n", pid);
+		return -1;
+	}
+
+	/* Read through maps file until we find out base_address. */
+	while (fgets(line, BUFSIZE, fmap) != NULL) {
+		if (procmap_parse_line(line, &procmap) != 0) {
+			fclose(fmap);
+			return -1;
+		}
+
+		if (procmap.va_start == addr) {
+			/* len held the end address up to this point. */
+			procmap.len = procmap.len - procmap.va_start;
+			found = 1;
+			break;
+		}
+	}
+	fclose(fmap);
+
+	if (!found) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to find memory file in pid %d maps file\n", pid);
+		return -1;
+	}
+
+	/* An empty name would match every fd in the prefix compare below. */
+	if (procmap.fname[0] == '\0') {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Mapping at base address in pid %d has no file name\n",
+			pid);
+		return -1;
+	}
+
+	/* Find the guest memory file among the process fds. */
+	dp = opendir(procdir);
+	if (dp == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Cannot open pid %d process directory\n",
+			pid);
+		return -1;
+	}
+
+	found = 0;
+
+	/* Read the fd directory contents. */
+	while (NULL != (dptr = readdir(dp))) {
+		snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s",
+				pid, dptr->d_name);
+		/* Never compare against the result of a previous entry. */
+		resolved_path[0] = '\0';
+		/*
+		 * A realpath() failure is not fatal here, as before.
+		 * NOTE(review): presumably the prefix compare below relies
+		 * on the partial path glibc leaves in resolved_path for an
+		 * unlinked backing file - confirm before tightening this.
+		 */
+		if (realpath(memfile, resolved_path) == NULL)
+			LOG_DEBUG(VHOST_CONFIG,
+				"realpath failed for %s\n", memfile);
+		if (strncmp(resolved_path, procmap.fname,
+				strnlen(procmap.fname, PATH_MAX)) == 0) {
+			found = 1;
+			break;
+		}
+	}
+
+	closedir(dp);
+
+	if (found == 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to find memory file for pid %d\n",
+			pid);
+		return -1;
+	}
+	/* Open the shared memory file and map the memory into this process. */
+	fd = open(memfile, O_RDWR);
+
+	if (fd == -1) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to open %s for pid %d\n",
+			memfile, pid);
+		return -1;
+	}
+
+	map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE,
+			MAP_POPULATE|MAP_SHARED, fd, 0);
+	/* The descriptor is closed right after the mmap() call. */
+	close(fd);
+
+	if (map == MAP_FAILED) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Error mapping the file %s for pid %d\n",
+			memfile, pid);
+		return -1;
+	}
+
+	/* Store the memory address and size in the device data structure */
+	*mapped_address = (uint64_t)(uintptr_t)map;
+	*mapped_size = procmap.len;
+
+	LOG_DEBUG(VHOST_CONFIG,
+		"Mem File: %s->%s - Size: %llu - VA: %p\n",
+		memfile, resolved_path,
+		(unsigned long long)*mapped_size, map);
+
+	return 0;
+}
+
+/*
+ * Translate the vhost memory table received through the CUSE ioctl into
+ * virtio_memory_regions, map the guest memory file of process ctx.pid and
+ * pass the result to ops->set_mem_table().
+ *
+ * mem_regions_addr points to the vhost_memory header followed by nregions
+ * region entries. Returns 0 on success, -1 when the table is missing or
+ * has more entries than the local array, when the region starting at
+ * guest physical address 0 cannot be mapped, or when no such region exists.
+ */
+int
+cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_regions_addr,
+	uint32_t nregions)
+{
+	uint32_t idx;
+	struct virtio_memory_regions regions[8]; /* VHOST_MAX_MEMORY_REGIONS */
+	const struct vhost_memory_region *mem_regions;
+	uint64_t base_address = 0, mapped_address = 0, mapped_size = 0;
+
+	/* regions[] is a fixed-size stack array; never write past its end. */
+	if ((mem_regions_addr == NULL) ||
+			(nregions > sizeof(regions) / sizeof(regions[0]))) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Invalid memory table (%u regions)\n", nregions);
+		return -1;
+	}
+
+	/* The region entries directly follow the vhost_memory header. */
+	mem_regions = mem_regions_addr->regions;
+
+	for (idx = 0; idx < nregions; idx++) {
+		regions[idx].guest_phys_address =
+			mem_regions[idx].guest_phys_addr;
+		regions[idx].guest_phys_address_end =
+			regions[idx].guest_phys_address +
+			mem_regions[idx].memory_size;
+		regions[idx].memory_size =
+			mem_regions[idx].memory_size;
+		regions[idx].userspace_address =
+			mem_regions[idx].userspace_addr;
+
+		LOG_DEBUG(VHOST_CONFIG, "REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n",
+			idx,
+			(void *)(uintptr_t)regions[idx].guest_phys_address,
+			(void *)(uintptr_t)regions[idx].userspace_address,
+			regions[idx].memory_size);
+
+		/*set the base address mapping*/
+		if (regions[idx].guest_phys_address == 0x0) {
+			base_address =
+				regions[idx].userspace_address;
+			/* Map VM memory file */
+			if (host_memory_map(ctx.pid, base_address,
+				&mapped_address, &mapped_size) != 0) {
+				return -1;
+			}
+		}
+	}
+
+	/* Check that we have a valid base address. */
+	if (base_address == 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to find base address of qemu memory file.\n");
+		return -1;
+	}
+
+	for (idx = 0; idx < nregions; idx++) {
+		regions[idx].address_offset =
+			mapped_address - base_address +
+			regions[idx].userspace_address -
+			regions[idx].guest_phys_address;
+	}
+
+	/*
+	 * NOTE(review): the callback's return value is ignored, as before;
+	 * its prototype is not visible here - confirm whether it can fail.
+	 */
+	ops->set_mem_table(ctx, &regions[0], nregions);
+	return 0;
+}
diff --git a/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h
new file mode 100644
index 0000000..6f98ce8
--- /dev/null
+++ b/lib/librte_vhost/vhost-cuse/virtio-net-cdev.h
@@ -0,0 +1,43 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _VIRTIO_NET_CDEV_H +#define _VIRTIO_NET_CDEV_H +#include <stdint.h> + +#include "vhost-net.h" + +int +cuse_set_mem_table(struct vhost_device_ctx ctx, const struct vhost_memory *mem_regions_addr, + uint32_t nregions); + +#endif diff --git a/lib/librte_vhost/vhost-net-cdev.c b/lib/librte_vhost/vhost-net-cdev.c deleted file mode 100644 index 57c76cb..0000000 --- a/lib/librte_vhost/vhost-net-cdev.c +++ /dev/null @@ -1,389 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include <errno.h> -#include <fuse/cuse_lowlevel.h> -#include <linux/limits.h> -#include <linux/vhost.h> -#include <stdint.h> -#include <string.h> -#include <unistd.h> - -#include <rte_ethdev.h> -#include <rte_log.h> -#include <rte_string_fns.h> -#include <rte_virtio_net.h> - -#include "vhost-net-cdev.h" - -#define FUSE_OPT_DUMMY "\0\0" -#define FUSE_OPT_FORE "-f\0\0" -#define FUSE_OPT_NOMULTI "-s\0\0" - -static const uint32_t default_major = 231; -static const uint32_t default_minor = 1; -static const char cuse_device_name[] = "/dev/cuse"; -static const char default_cdev[] = "vhost-net"; - -static struct fuse_session *session; -static struct vhost_net_device_ops const *ops; - -/* - * Returns vhost_device_ctx from given fuse_req_t. 
The index is populated later - * when the device is added to the device linked list. - */ -static struct vhost_device_ctx -fuse_req_to_vhost_ctx(fuse_req_t req, struct fuse_file_info *fi) -{ - struct vhost_device_ctx ctx; - struct fuse_ctx const *const req_ctx = fuse_req_ctx(req); - - ctx.pid = req_ctx->pid; - ctx.fh = fi->fh; - - return ctx; -} - -/* - * When the device is created in QEMU it gets initialised here and - * added to the device linked list. - */ -static void -vhost_net_open(fuse_req_t req, struct fuse_file_info *fi) -{ - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - int err = 0; - - err = ops->new_device(ctx); - if (err == -1) { - fuse_reply_err(req, EPERM); - return; - } - - fi->fh = err; - - RTE_LOG(INFO, VHOST_CONFIG, - "(%"PRIu64") Device configuration started\n", fi->fh); - fuse_reply_open(req, fi); -} - -/* - * When QEMU is shutdown or killed the device gets released. - */ -static void -vhost_net_release(fuse_req_t req, struct fuse_file_info *fi) -{ - int err = 0; - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - - ops->destroy_device(ctx); - RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n", ctx.fh); - fuse_reply_err(req, err); -} - -/* - * Boilerplate code for CUSE IOCTL - * Implicit arguments: ctx, req, result. - */ -#define VHOST_IOCTL(func) do { \ - result = (func)(ctx); \ - fuse_reply_ioctl(req, result, NULL, 0); \ -} while (0) - -/* - * Boilerplate IOCTL RETRY - * Implicit arguments: req. - */ -#define VHOST_IOCTL_RETRY(size_r, size_w) do { \ - struct iovec iov_r = { arg, (size_r) }; \ - struct iovec iov_w = { arg, (size_w) }; \ - fuse_reply_ioctl_retry(req, &iov_r, \ - (size_r) ? 1 : 0, &iov_w, (size_w) ? 1 : 0);\ -} while (0) - -/* - * Boilerplate code for CUSE Read IOCTL - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. 
- */ -#define VHOST_IOCTL_R(type, var, func) do { \ - if (!in_bufsz) { \ - VHOST_IOCTL_RETRY(sizeof(type), 0);\ - } else { \ - (var) = *(const type*)in_buf; \ - result = func(ctx, &(var)); \ - fuse_reply_ioctl(req, result, NULL, 0);\ - } \ -} while (0) - -/* - * Boilerplate code for CUSE Write IOCTL - * Implicit arguments: ctx, req, result, out_bufsz. - */ -#define VHOST_IOCTL_W(type, var, func) do { \ - if (!out_bufsz) { \ - VHOST_IOCTL_RETRY(0, sizeof(type));\ - } else { \ - result = (func)(ctx, &(var));\ - fuse_reply_ioctl(req, result, &(var), sizeof(type));\ - } \ -} while (0) - -/* - * Boilerplate code for CUSE Read/Write IOCTL - * Implicit arguments: ctx, req, result, in_bufsz, in_buf. - */ -#define VHOST_IOCTL_RW(type1, var1, type2, var2, func) do { \ - if (!in_bufsz) { \ - VHOST_IOCTL_RETRY(sizeof(type1), sizeof(type2));\ - } else { \ - (var1) = *(const type1*) (in_buf); \ - result = (func)(ctx, (var1), &(var2)); \ - fuse_reply_ioctl(req, result, &(var2), sizeof(type2));\ - } \ -} while (0) - -/* - * The IOCTLs are handled using CUSE/FUSE in userspace. Depending on the type - * of IOCTL a buffer is requested to read or to write. This request is handled - * by FUSE and the buffer is then given to CUSE. 
- */ -static void -vhost_net_ioctl(fuse_req_t req, int cmd, void *arg, - struct fuse_file_info *fi, __rte_unused unsigned flags, - const void *in_buf, size_t in_bufsz, size_t out_bufsz) -{ - struct vhost_device_ctx ctx = fuse_req_to_vhost_ctx(req, fi); - struct vhost_vring_file file; - struct vhost_vring_state state; - struct vhost_vring_addr addr; - uint64_t features; - uint32_t index; - int result = 0; - - switch (cmd) { - case VHOST_NET_SET_BACKEND: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_NET_SET_BACKEND\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_file, file, ops->set_backend); - break; - - case VHOST_GET_FEATURES: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_GET_FEATURES\n", ctx.fh); - VHOST_IOCTL_W(uint64_t, features, ops->get_features); - break; - - case VHOST_SET_FEATURES: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_FEATURES\n", ctx.fh); - VHOST_IOCTL_R(uint64_t, features, ops->set_features); - break; - - case VHOST_RESET_OWNER: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_RESET_OWNER\n", ctx.fh); - VHOST_IOCTL(ops->reset_owner); - break; - - case VHOST_SET_OWNER: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_OWNER\n", ctx.fh); - VHOST_IOCTL(ops->set_owner); - break; - - case VHOST_SET_MEM_TABLE: - /*TODO fix race condition.*/ - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh); - static struct vhost_memory mem_temp; - - switch (in_bufsz) { - case 0: - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory), 0); - break; - - case sizeof(struct vhost_memory): - mem_temp = *(const struct vhost_memory *) in_buf; - - if (mem_temp.nregions > 0) { - VHOST_IOCTL_RETRY(sizeof(struct vhost_memory) + - (sizeof(struct vhost_memory_region) * - mem_temp.nregions), 0); - } else { - result = -1; - fuse_reply_ioctl(req, result, NULL, 0); - } - break; - - default: - result = ops->set_mem_table(ctx, - in_buf, mem_temp.nregions); - if (result) - fuse_reply_err(req, EINVAL); - else - 
fuse_reply_ioctl(req, result, NULL, 0); - } - break; - - case VHOST_SET_VRING_NUM: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_NUM\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_state, state, - ops->set_vring_num); - break; - - case VHOST_SET_VRING_BASE: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_BASE\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_state, state, - ops->set_vring_base); - break; - - case VHOST_GET_VRING_BASE: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_GET_VRING_BASE\n", ctx.fh); - VHOST_IOCTL_RW(uint32_t, index, - struct vhost_vring_state, state, ops->get_vring_base); - break; - - case VHOST_SET_VRING_ADDR: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_ADDR\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_addr, addr, - ops->set_vring_addr); - break; - - case VHOST_SET_VRING_KICK: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_KICK\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_file, file, - ops->set_vring_kick); - break; - - case VHOST_SET_VRING_CALL: - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: VHOST_SET_VRING_CALL\n", ctx.fh); - VHOST_IOCTL_R(struct vhost_vring_file, file, - ops->set_vring_call); - break; - - default: - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") IOCTL: DOESN NOT EXIST\n", ctx.fh); - result = -1; - fuse_reply_ioctl(req, result, NULL, 0); - } - - if (result < 0) - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: FAIL\n", ctx.fh); - else - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") IOCTL: SUCCESS\n", ctx.fh); -} - -/* - * Structure handling open, release and ioctl function pointers is populated. - */ -static const struct cuse_lowlevel_ops vhost_net_ops = { - .open = vhost_net_open, - .release = vhost_net_release, - .ioctl = vhost_net_ioctl, -}; - -/* - * cuse_info is populated and used to register the cuse device. - * vhost_net_device_ops are also passed when the device is registered in app. 
- */ -int -rte_vhost_driver_register(const char *dev_name) -{ - struct cuse_info cuse_info; - char device_name[PATH_MAX] = ""; - char char_device_name[PATH_MAX] = ""; - const char *device_argv[] = { device_name }; - - char fuse_opt_dummy[] = FUSE_OPT_DUMMY; - char fuse_opt_fore[] = FUSE_OPT_FORE; - char fuse_opt_nomulti[] = FUSE_OPT_NOMULTI; - char *fuse_argv[] = {fuse_opt_dummy, fuse_opt_fore, fuse_opt_nomulti}; - - if (access(cuse_device_name, R_OK | W_OK) < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "char device %s can't be accessed, maybe not exist\n", - cuse_device_name); - return -1; - } - - /* - * The device name is created. This is passed to QEMU so that it can - * register the device with our application. - */ - snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name); - snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name); - - /* Check if device already exists. */ - if (access(char_device_name, F_OK) != -1) { - RTE_LOG(ERR, VHOST_CONFIG, - "char device %s already exists\n", char_device_name); - return -1; - } - - memset(&cuse_info, 0, sizeof(cuse_info)); - cuse_info.dev_major = default_major; - cuse_info.dev_minor = default_minor; - cuse_info.dev_info_argc = 1; - cuse_info.dev_info_argv = device_argv; - cuse_info.flags = CUSE_UNRESTRICTED_IOCTL; - - ops = get_virtio_net_callbacks(); - - session = cuse_lowlevel_setup(3, fuse_argv, - &cuse_info, &vhost_net_ops, 0, NULL); - if (session == NULL) - return -1; - - return 0; -} - -/** - * The CUSE session is launched allowing the application to receive open, - * release and ioctl calls. - */ -int -rte_vhost_driver_session_start(void) -{ - fuse_session_loop(session); - - return 0; -} diff --git a/lib/librte_vhost/vhost-net-cdev.h b/lib/librte_vhost/vhost-net-cdev.h deleted file mode 100644 index 03a5c57..0000000 --- a/lib/librte_vhost/vhost-net-cdev.h +++ /dev/null @@ -1,113 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _VHOST_NET_CDEV_H_ -#define _VHOST_NET_CDEV_H_ -#include <stdint.h> -#include <stdio.h> -#include <sys/types.h> -#include <unistd.h> -#include <linux/vhost.h> - -#include <rte_log.h> - -/* Macros for printing using RTE_LOG */ -#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 -#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 - -#ifdef RTE_LIBRTE_VHOST_DEBUG -#define VHOST_MAX_PRINT_BUFF 6072 -#define LOG_LEVEL RTE_LOG_DEBUG -#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) -#define PRINT_PACKET(device, addr, size, header) do { \ - char *pkt_addr = (char *)(addr); \ - unsigned int index; \ - char packet[VHOST_MAX_PRINT_BUFF]; \ - \ - if ((header)) \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \ - else \ - snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \ - for (index = 0; index < (size); index++) { \ - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ - "%02hhx ", pkt_addr[index]); \ - } \ - snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ - \ - LOG_DEBUG(VHOST_DATA, "%s", packet); \ -} while (0) -#else -#define LOG_LEVEL RTE_LOG_INFO -#define LOG_DEBUG(log_type, fmt, args...) do {} while (0) -#define PRINT_PACKET(device, addr, size, header) do {} while (0) -#endif - - -/* - * Structure used to identify device context. - */ -struct vhost_device_ctx { - pid_t pid; /* PID of process calling the IOCTL. */ - uint64_t fh; /* Populated with fi->fh to track the device index. */ -}; - -/* - * Structure contains function pointers to be defined in virtio-net.c. These - * functions are called in CUSE context and are used to configure devices. 
- */ -struct vhost_net_device_ops { - int (*new_device)(struct vhost_device_ctx); - void (*destroy_device)(struct vhost_device_ctx); - - int (*get_features)(struct vhost_device_ctx, uint64_t *); - int (*set_features)(struct vhost_device_ctx, uint64_t *); - - int (*set_mem_table)(struct vhost_device_ctx, const void *, uint32_t); - - int (*set_vring_num)(struct vhost_device_ctx, struct vhost_vring_state *); - int (*set_vring_addr)(struct vhost_device_ctx, struct vhost_vring_addr *); - int (*set_vring_base)(struct vhost_device_ctx, struct vhost_vring_state *); - int (*get_vring_base)(struct vhost_device_ctx, uint32_t, struct vhost_vring_state *); - - int (*set_vring_kick)(struct vhost_device_ctx, struct vhost_vring_file *); - int (*set_vring_call)(struct vhost_device_ctx, struct vhost_vring_file *); - - int (*set_backend)(struct vhost_device_ctx, struct vhost_vring_file *); - - int (*set_owner)(struct vhost_device_ctx); - int (*reset_owner)(struct vhost_device_ctx); -}; - - -struct vhost_net_device_ops const *get_virtio_net_callbacks(void); -#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost-user/fd_man.c b/lib/librte_vhost/vhost-user/fd_man.c
new file mode 100644
index 0000000..c7fd3f2
--- /dev/null
+++ b/lib/librte_vhost/vhost-user/fd_man.c
@@ -0,0 +1,204 @@
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <rte_log.h>
+
+#include "fd_man.h"
+
+/**
+ * Returns the index in the fdset for a fd.
+ * If fd is -1, it means to search for a free entry.
+ * @return
+ *   Index for the fd, or -1 if fd isn't in the fdset.
+ */
+static int
+fdset_find_fd(struct fdset *pfdset, int fd)
+{
+	int i;
+
+	for (i = 0; i < pfdset->num; i++) {
+		if (pfdset->fd[i].fd == fd)
+			return i;
+	}
+
+	return -1;
+}
+
+/**
+ * Returns the index of the first empty entry (fd == -1), or -1 when the
+ * fdset is full.
+ */
+static int
+fdset_find_free_slot(struct fdset *pfdset)
+{
+	return fdset_find_fd(pfdset, -1);
+}
+
+/**
+ * Store the fd, its callbacks and its context in entry idx of the fdset.
+ */
+static void
+fdset_add_fd(struct fdset *pfdset, int idx, int fd, fd_cb rcb,
+		fd_cb wcb, uint64_t dat)
+{
+	struct fdentry *pfdentry = &pfdset->fd[idx];
+
+	pfdentry->fd = fd;
+	pfdentry->rcb = rcb;
+	pfdentry->wcb = wcb;
+	pfdentry->dat = dat;
+}
+
+/**
+ * Fill the read/write fd_set with the fds in the fdset.
+ * A fd is put in rfset only if it has a read callback and in wfset only if
+ * it has a write callback.
+ * @return
+ *  the highest fd put in either fd_set, or -1 if nothing was put.
+ */
+static int
+fdset_fill(fd_set *rfset, fd_set *wfset, struct fdset *pfdset)
+{
+	struct fdentry *pfdentry;
+	int i, maxfds = -1;
+
+	for (i = 0; i < MAX_FDS; i++) {
+		int added = 0;
+
+		pfdentry = &pfdset->fd[i];
+		if (pfdentry->fd == -1)
+			continue;
+
+		if (pfdentry->rcb && rfset) {
+			FD_SET(pfdentry->fd, rfset);
+			added = 1;
+		}
+		if (pfdentry->wcb && wfset) {
+			FD_SET(pfdentry->fd, wfset);
+			added = 1;
+		}
+		if (added && pfdentry->fd > maxfds)
+			maxfds = pfdentry->fd;
+	}
+
+	return maxfds;
+}
+
+/**
+ * Mark every entry of the fdset as empty.
+ */
+void
+fdset_init(struct fdset *pfdset)
+{
+	int i;
+
+	for (i = 0; i < MAX_FDS; i++)
+		pfdset->fd[i].fd = -1;
+	pfdset->num = MAX_FDS;
+}
+
+/**
+ * Register the fd in the fdset with its read/write handler and context.
+ * @return
+ *  0 on success, -1 if fd cannot be watched with select(), -2 if the fdset
+ *  is full.
+ */
+int
+fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, uint64_t dat)
+{
+	int i;
+
+	/* FD_SET() is undefined for fds outside [0, FD_SETSIZE). */
+	if (fd < 0 || fd >= FD_SETSIZE)
+		return -1;
+
+	/* Find a free slot in the list. */
+	i = fdset_find_free_slot(pfdset);
+	if (i == -1)
+		return -2;
+
+	fdset_add_fd(pfdset, i, fd, rcb, wcb, dat);
+
+	return 0;
+}
+
+/**
+ * Unregister the fd from the fdset.
+ */
+void
+fdset_del(struct fdset *pfdset, int fd)
+{
+	int i;
+
+	/* -1 is the empty-slot marker, never a registered fd. */
+	if (fd == -1)
+		return;
+
+	i = fdset_find_fd(pfdset, fd);
+	if (i != -1)
+		pfdset->fd[i].fd = -1;
+}
+
+/**
+ * Wait with select() for the registered fds and run their callbacks.
+ * Returns when pfdset is NULL, when no fd with a callback is registered,
+ * or when select() fails with an error other than EINTR.
+ */
+void
+fdset_event_dispatch(struct fdset *pfdset)
+{
+	fd_set rfds, wfds;
+	int i, fd, maxfds;
+	struct fdentry *pfdentry;
+
+	if (pfdset == NULL)
+		return;
+
+	while (1) {
+		FD_ZERO(&rfds);
+		FD_ZERO(&wfds);
+		maxfds = fdset_fill(&rfds, &wfds, pfdset);
+		/* fd management runs in one thread */
+		if (maxfds == -1)
+			return;
+
+		if (select(maxfds + 1, &rfds, &wfds, NULL, NULL) < 0) {
+			/* The fd_sets are unusable after a failed select(). */
+			if (errno == EINTR)
+				continue;
+			return;
+		}
+
+		for (i = 0; i < MAX_FDS; i++) {
+			pfdentry = &pfdset->fd[i];
+			fd = pfdentry->fd;
+			/* FD_ISSET() must not be used on the -1 marker. */
+			if (fd == -1)
+				continue;
+
+			if (FD_ISSET(fd, &rfds)) {
+				/* Clear it so a slot reused in this pass is not hit. */
+				FD_CLR(fd, &rfds);
+				if (pfdentry->rcb)
+					pfdentry->rcb(fd, pfdentry->dat);
+			}
+
+			/* The read callback may have removed this entry. */
+			if (pfdentry->fd != fd)
+				continue;
+
+			if (FD_ISSET(fd, &wfds)) {
+				FD_CLR(fd, &wfds);
+				if (pfdentry->wcb)
+					pfdentry->wcb(fd, pfdentry->dat);
+			}
+		}
+	}
+}
diff --git a/lib/librte_vhost/vhost-user/fd_man.h b/lib/librte_vhost/vhost-user/fd_man.h
new file mode 100644
index 0000000..57cc81d
--- /dev/null
+++ b/lib/librte_vhost/vhost-user/fd_man.h
@@ -0,0 +1,31 @@
+#ifndef _FD_MAN_H_
+#define _FD_MAN_H_
+#include <stdint.h>
+
+#define MAX_FDS 1024
+
+typedef void (*fd_cb)(int fd, uint64_t dat);
+
+struct fdentry {
+	int fd; /* -1 indicates this entry is empty */
+	fd_cb rcb; /* callback when this fd is readable. */
+	fd_cb wcb; /* callback when this fd is writable. */
+	uint64_t dat;	/* fd context */
+};
+
+struct fdset {
+	struct fdentry fd[MAX_FDS];
+	int num;
+};
+
+
+void fdset_init(struct fdset *pfdset);
+
+int fdset_add(struct fdset *pfdset, int fd, fd_cb rcb,
+	fd_cb wcb, uint64_t dat);
+
+void fdset_del(struct fdset *pfdset, int fd);
+
+void fdset_event_dispatch(struct fdset *pfdset);
+
+#endif
diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.c b/lib/librte_vhost/vhost-user/vhost-net-user.c
new file mode 100644
index 0000000..34450f4
--- /dev/null
+++ b/lib/librte_vhost/vhost-user/vhost-net-user.c
@@ -0,0 +1,417 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <errno.h>
+
+#include <rte_log.h>
+#include <rte_virtio_net.h>
+
+#include "fd_man.h"
+#include "vhost-net-user.h"
+#include "vhost-net.h"
+#include "virtio-net-user.h"
+
+static void vserver_new_vq_conn(int fd, uint64_t data);
+static void vserver_message_handler(int fd, uint64_t dat);
+const struct vhost_net_device_ops *ops;
+
+static struct vhost_server *g_vhost_server;
+
+/* Printable name of each request, indexed by VhostUserRequest value. */
+static const char *vhost_message_str[VHOST_USER_MAX] =
+{
+	[VHOST_USER_NONE] = "VHOST_USER_NONE",
+	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+	[VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR"
+};
+
+/**
+ * Create a unix domain socket, bind it to path and start listening on it.
+ * @return
+ *  socket fd or -1 on failure (including a path that does not fit in
+ *  sockaddr_un.sun_path)
+ */
+static int
+uds_socket(const char *path)
+{
+	struct sockaddr_un un;
+	int sockfd;
+	int ret;
+
+	if (path == NULL)
+		return -1;
+
+	/* A truncated path would silently bind to a different name. */
+	if (strlen(path) >= sizeof(un.sun_path)) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"socket path %s is too long\n", path);
+		return -1;
+	}
+
+	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sockfd < 0)
+		return -1;
+	RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
+	ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
+	if (ret == -1)
+		goto err;
+	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+	ret = listen(sockfd, 1);
+	if (ret == -1)
+		goto err;
+
+	return sockfd;
+
+err:
+	close(sockfd);
+	return -1;
+}
+
+
+/**
+ * Receive up to buflen bytes into buf together with up to fd_num file
+ * descriptors passed as SCM_RIGHTS ancillary data.
+ * Every entry of fds for which no descriptor was received is set to -1.
+ * @return
+ *  number of bytes read, 0 if the peer closed the connection, negative on
+ *  failure or when the message was truncated.
+ */
+static int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+	struct iovec iov;
+	struct msghdr msgh = { 0 };
+	size_t fdsize = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+	int i, ret;
+
+	/* Callers must never see stale values for descriptors not sent. */
+	for (i = 0; i < fd_num; i++)
+		fds[i] = -1;
+
+	iov.iov_base = buf;
+	iov.iov_len = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	do {
+		ret = recvmsg(sockfd, &msgh, 0);
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG, "%s failed\n", __func__);
+		return ret;
+	}
+	/* Orderly shutdown by the peer is not an error. */
+	if (ret == 0)
+		return 0;
+
+	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+		RTE_LOG(ERR, VHOST_CONFIG, "%s failed\n", __func__);
+		return -1;
+	}
+
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			/* Copy only the descriptors that really arrived. */
+			size_t len = cmsg->cmsg_len - CMSG_LEN(0);
+
+			if (len > fdsize)
+				len = fdsize;
+			memcpy(fds, CMSG_DATA(cmsg), len);
+			break;
+		}
+	}
+	return ret;
+}
+
+/**
+ * Read one vhost-user message: the fixed-size header (with any passed file
+ * descriptors) followed by msg->size bytes of payload.
+ * @return
+ *  positive on success, 0 if the peer closed the connection, negative on
+ *  failure.
+ */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+	int ret;
+
+	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+	if (ret <= 0)
+		return ret;
+
+	/* msg->size is only meaningful once the whole header is in. */
+	if (ret != (int)(VHOST_USER_HDR_SIZE)) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"%s: short header, read %d bytes\n", __func__, ret);
+		return -1;
+	}
+
+	if (msg->size) {
+		size_t done = 0;
+
+		if (msg->size > sizeof(msg->payload)) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"%s: invalid size:%d\n", __func__, msg->size);
+			return -1;
+		}
+
+		/* A stream socket may deliver the payload in several pieces. */
+		while (done < msg->size) {
+			ssize_t n = read(sockfd, (char *)&msg->payload + done,
+				msg->size - done);
+
+			if (n == 0)
+				return 0;
+			if (n < 0) {
+				if (errno == EINTR)
+					continue;
+				RTE_LOG(ERR, VHOST_CONFIG,
+					"read control message failed\n");
+				return -1;
+			}
+			done += n;
+		}
+		ret = (int)done;
+	}
+
+	return ret;
+}
+
+/**
+ * Send buflen bytes from buf, attaching fd_num file descriptors as
+ * SCM_RIGHTS ancillary data when fds is not NULL and fd_num > 0.
+ * @return
+ *  0 on success, -1 on failure.
+ */
+static int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+	struct iovec iov;
+	struct msghdr msgh = { 0 };
+	size_t fdsize = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+	int ret;
+
+	iov.iov_base = buf;
+	iov.iov_len = buflen;
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+
+	if (fds && fd_num > 0) {
+		/* Do not hand uninitialised padding bytes to the kernel. */
+		memset(control, 0, sizeof(control));
+		msgh.msg_control = control;
+		msgh.msg_controllen = sizeof(control);
+		cmsg = CMSG_FIRSTHDR(&msgh);
+		cmsg->cmsg_len = CMSG_LEN(fdsize);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		memcpy(CMSG_DATA(cmsg), fds, fdsize);
+	} else {
+		msgh.msg_control = NULL;
+		msgh.msg_controllen = 0;
+	}
+
+	do {
+		ret = sendmsg(sockfd, &msgh, 0);
+	} while (ret < 0 && errno == EINTR);
+
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Send msg (header plus msg->size bytes of payload) as a reply: the version
+ * bits are set to VHOST_USER_VERSION and the reply flag is raised.
+ * @return
+ *  0 on success, -1 on failure.
+ */
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+	int ret;
+
+	msg->flags &= ~VHOST_USER_VERSION_MASK;
+	msg->flags |= VHOST_USER_VERSION;
+	msg->flags |= VHOST_USER_REPLY_MASK;
+
+	ret = send_fd_message(sockfd, (char *)msg,
+		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+
+	return ret;
+}
+
+/* callback when there is a new connection.
*/
+static void
+vserver_new_vq_conn(int fd, uint64_t dat)
+{
+	struct vhost_server *vserver = (void *)(uintptr_t)dat;
+	struct vhost_device_ctx vdev_ctx = { 0 };
+	int conn_fd;
+	int fh;
+
+	conn_fd = accept(fd, NULL, NULL);
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"%s: new connection is %d\n", __func__, conn_fd);
+	if (conn_fd < 0)
+		return;
+
+	/*
+	 * NOTE(review): assumes new_device() reports failure with a negative
+	 * value - confirm against the virtio-net implementation.
+	 */
+	fh = ops->new_device(vdev_ctx);
+	if (fh < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG, "failed to create new device\n");
+		close(conn_fd);
+		return;
+	}
+	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh);
+
+	/* Without a slot in the fdset the connection would never be served. */
+	if (fdset_add(&vserver->fdset,
+		conn_fd, vserver_message_handler, NULL, fh) < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to watch connection %d\n", conn_fd);
+		vdev_ctx.fh = fh;
+		ops->destroy_device(vdev_ctx);
+		close(conn_fd);
+	}
+}
+
+/* Close the connection, stop watching it and destroy its device. */
+static void
+vserver_conn_close(int connfd, struct vhost_device_ctx ctx)
+{
+	/*TODO: cleanup */
+	close(connfd);
+	fdset_del(&g_vhost_server->fdset, connfd);
+	ops->destroy_device(ctx);
+}
+
+/*
+ * callback when there is message on the connfd
+ * dat carries the device handle that was registered with the connection.
+ */
+static void
+vserver_message_handler(int connfd, uint64_t dat)
+{
+	struct vhost_device_ctx ctx = { 0 };
+	uint32_t fh = (uint32_t)dat;
+	struct VhostUserMsg msg;
+	uint64_t features = 0;
+	int ret;
+
+	ctx.fh = fh;
+	ret = read_vhost_message(connfd, &msg);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG, "vhost read message failed\n");
+		vserver_conn_close(connfd, ctx);
+		return;
+	} else if (ret == 0) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"vhost peer closed\n");
+		vserver_conn_close(connfd, ctx);
+		return;
+	}
+	/* VHOST_USER_MAX itself is already outside vhost_message_str[]. */
+	if ((unsigned int)msg.request >= VHOST_USER_MAX) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"vhost read incorrect message\n");
+		vserver_conn_close(connfd, ctx);
+		return;
+	}
+
+	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
+		vhost_message_str[msg.request]);
+	switch (msg.request) {
+	case VHOST_USER_GET_FEATURES:
+		/* The reply carries the feature bits, not the return code. */
+		ops->get_features(ctx, &features);
+		msg.payload.u64 = features;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(connfd, &msg);
+		break;
+	case VHOST_USER_SET_FEATURES:
+		/* The requested feature bits arrive in the payload. */
+		features = msg.payload.u64;
+		ops->set_features(ctx, &features);
+		break;
+
+	case VHOST_USER_SET_OWNER:
+		ops->set_owner(ctx);
+		break;
+	case VHOST_USER_RESET_OWNER:
+		ops->reset_owner(ctx);
+		break;
+
+	case VHOST_USER_SET_MEM_TABLE:
+		user_set_mem_table(ctx, &msg);
+		break;
+
+	case VHOST_USER_SET_LOG_BASE:
+	case VHOST_USER_SET_LOG_FD:
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+		break;
+
+	case VHOST_USER_SET_VRING_NUM:
+		ops->set_vring_num(ctx, &msg.payload.state);
+		break;
+	case VHOST_USER_SET_VRING_ADDR:
+		ops->set_vring_addr(ctx, &msg.payload.addr);
+		break;
+	case VHOST_USER_SET_VRING_BASE:
+		ops->set_vring_base(ctx, &msg.payload.state);
+		break;
+
+	case VHOST_USER_GET_VRING_BASE:
+		ret = ops->get_vring_base(ctx, msg.payload.state.index,
+			&msg.payload.state);
+		msg.size = sizeof(msg.payload.state);
+		send_vhost_message(connfd, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_KICK:
+		user_set_vring_kick(ctx, &msg);
+		break;
+	case VHOST_USER_SET_VRING_CALL:
+		user_set_vring_call(ctx, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_ERR:
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+		break;
+
+	default:
+		break;
+
+	}
+}
+
+
+/**
+ * Creates and initialises the vhost server listening on the unix domain
+ * socket at path. Only one server can be registered.
+ * The path pointer is stored as is, so the string must stay valid.
+ * @return
+ *  0 on success, -1 on failure.
+ */
+int
+rte_vhost_driver_register(const char *path)
+{
+
+	struct vhost_server *vserver;
+
+	if (path == NULL || g_vhost_server != NULL)
+		return -1;
+
+	vserver = calloc(1, sizeof(*vserver));
+	/*TODO: all allocation is through DPDK memory allocation */
+	if (vserver == NULL)
+		return -1;
+
+	fdset_init(&vserver->fdset);
+
+	/* Remove a stale socket file left by a previous run. */
+	unlink(path);
+
+	vserver->listenfd = uds_socket(path);
+	if (vserver->listenfd < 0) {
+		free(vserver);
+		return -1;
+	}
+	vserver->path = path;
+
+	if (fdset_add(&vserver->fdset, vserver->listenfd,
+		vserver_new_vq_conn, NULL,
+		(uint64_t)(uintptr_t)vserver) < 0) {
+		close(vserver->listenfd);
+		free(vserver);
+		return -1;
+	}
+
+	ops = get_virtio_net_callbacks();
+
+	g_vhost_server = vserver;
+
+	return 0;
+}
+
+
+/**
+ * Run the event loop of the registered vhost server.
+ * @return
+ *  0 when the loop ends, -1 if no server has been registered.
+ */
+int
+rte_vhost_driver_session_start(void)
+{
+	if (g_vhost_server == NULL)
+		return -1;
+
+	fdset_event_dispatch(&g_vhost_server->fdset);
+	return 0;
+}
+
diff --git a/lib/librte_vhost/vhost-user/vhost-net-user.h b/lib/librte_vhost/vhost-user/vhost-net-user.h
new file mode 100644
index 0000000..c9df9fa
--- /dev/null
+++ b/lib/librte_vhost/vhost-user/vhost-net-user.h
@@ -0,0 +1,74 @@
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "fd_man.h"
+
+struct vhost_server {
+	const char *path; /**< The path the uds is bound to. */
+	int listenfd; /**< The listener sockfd. */
+	struct fdset fdset; /**< The fd list this vhost server manages. */
+};
+
+/*********** FROM hw/virtio/vhost-user.c *************************************/
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+typedef enum VhostUserRequest {
+	VHOST_USER_NONE = 0,
+	VHOST_USER_GET_FEATURES = 1,
+	VHOST_USER_SET_FEATURES = 2,
+	VHOST_USER_SET_OWNER = 3,
+	VHOST_USER_RESET_OWNER = 4,
+	VHOST_USER_SET_MEM_TABLE = 5,
+	VHOST_USER_SET_LOG_BASE = 6,
+	VHOST_USER_SET_LOG_FD = 7,
+	VHOST_USER_SET_VRING_NUM = 8,
+	VHOST_USER_SET_VRING_ADDR = 9,
+	VHOST_USER_SET_VRING_BASE = 10,
+	VHOST_USER_GET_VRING_BASE = 11,
+	VHOST_USER_SET_VRING_KICK = 12,
+	VHOST_USER_SET_VRING_CALL = 13,
+	VHOST_USER_SET_VRING_ERR = 14,
+	VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef struct VhostUserMemoryRegion {
+	uint64_t guest_phys_addr;
+	uint64_t memory_size;
+	uint64_t userspace_addr;
+	uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+	uint32_t nregions;
+	uint32_t padding;
+	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserMsg {
+	VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK (0x3)
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+	uint32_t flags;
+	uint32_t size; /* the following payload size */
+	union {
+#define VHOST_USER_VRING_IDX_MASK (0xff)
+#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
+		uint64_t u64;
+		struct vhost_vring_state state;
+		struct vhost_vring_addr addr;
+		VhostUserMemory memory;
+	} payload;
+	int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute__((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE ((intptr_t)(&((VhostUserMsg *)0)->payload.u64))
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION (0x1)
+
+/*****************************************************************************/ +#endif diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.c b/lib/librte_vhost/vhost-user/virtio-net-user.c new file mode 100644 index 0000000..f38e6cc --- /dev/null +++ b/lib/librte_vhost/vhost-user/virtio-net-user.c @@ -0,0 +1,208 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include <rte_log.h>
+
+#include "virtio-net-user.h"
+#include "vhost-net-user.h"
+#include "vhost-net.h"
+
+extern const struct vhost_net_device_ops *ops;
+
+/**
+ * Handle VHOST_USER_SET_MEM_TABLE: mmap every guest memory region described
+ * in the message and pass the resulting region table to
+ * ops->set_mem_table().
+ *
+ * Region idx is mapped from pmsg->fds[idx]. The file is mapped from offset 0
+ * with a length of memory_size + mmap_offset, and mmap_offset is then added
+ * to the address returned by mmap().
+ *
+ * @return
+ *  0 on success; -1 if the region count is out of range, no region has
+ *  guest physical address 0, or a region cannot be mapped (regions mapped
+ *  so far are unmapped again).
+ */
+int
+user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	unsigned int idx;
+	struct VhostUserMemory memory = pmsg->payload.memory;
+	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
+	void *map_addr[VHOST_MEMORY_MAX_NREGIONS];
+	uint64_t map_size[VHOST_MEMORY_MAX_NREGIONS];
+	uint64_t mapped_address, base_address = 0;
+
+	/* nregions comes from the peer and indexes fixed-size arrays. */
+	if (memory.nregions > VHOST_MEMORY_MAX_NREGIONS) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"invalid number of memory regions: %u\n",
+			memory.nregions);
+		return -1;
+	}
+
+	for (idx = 0; idx < memory.nregions; idx++) {
+		if (memory.regions[idx].guest_phys_addr == 0)
+			base_address = memory.regions[idx].userspace_addr;
+	}
+	if (base_address == 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"couldn't find the mem region whose gpa is 0.\n");
+		return -1;
+	}
+
+	for (idx = 0; idx < memory.nregions; idx++) {
+		regions[idx].guest_phys_address =
+			memory.regions[idx].guest_phys_addr;
+		regions[idx].guest_phys_address_end =
+			memory.regions[idx].guest_phys_addr +
+			memory.regions[idx].memory_size;
+		regions[idx].memory_size = memory.regions[idx].memory_size;
+		regions[idx].userspace_address =
+			memory.regions[idx].userspace_addr;
+
+		/*
+		 * This is ugly: the file is mapped from offset 0 and the
+		 * region offset is added afterwards, instead of passing
+		 * mmap_offset to mmap().
+		 */
+		map_size[idx] = regions[idx].memory_size +
+			memory.regions[idx].mmap_offset;
+		map_addr[idx] = mmap(NULL, map_size[idx],
+			PROT_READ | PROT_WRITE, MAP_SHARED,
+			pmsg->fds[idx], 0);
+		if (map_addr[idx] == MAP_FAILED) {
+			RTE_LOG(ERR, VHOST_CONFIG, " mmap qemu guest failed.\n");
+			goto err_unmap;
+		}
+		RTE_LOG(INFO, VHOST_CONFIG, "mapped to %p\n", map_addr[idx]);
+
+		mapped_address = (uint64_t)(uintptr_t)map_addr[idx] +
+			memory.regions[idx].mmap_offset;
+
+		regions[idx].address_offset = mapped_address -
+			regions[idx].guest_phys_address;
+		LOG_DEBUG(VHOST_CONFIG,
+			"REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n",
+			idx,
+			(void *)(uintptr_t)regions[idx].guest_phys_address,
+			(void *)(uintptr_t)regions[idx].userspace_address,
+			regions[idx].memory_size);
+	}
+	ops->set_mem_table(ctx, regions, memory.nregions);
+	return 0;
+
+err_unmap:
+	/* Release the regions mapped before the failing one. */
+	while (idx-- > 0)
+		munmap(map_addr[idx], map_size[idx]);
+	return -1;
+}
+
+
+/**
+ * Handle VHOST_USER_SET_VRING_CALL: pass the vring index and the call fd
+ * to ops->set_vring_call(). The fd is -1 when the message flags that no
+ * descriptor was attached.
+ */
+void
+user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	struct vhost_vring_file file;
+
+	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	/* With the NOFD bit set fds[0] does not hold a descriptor. */
+	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+		file.fd = -1;
+	else
+		file.fd = pmsg->fds[0];
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring call idx:%d file:%d\n", file.index, file.fd);
+	ops->set_vring_call(ctx, &file);
+}
+
+
+/**
+ * Handle VHOST_USER_SET_VRING_KICK: pass the vring index and the kick fd
+ * to ops->set_vring_kick(). The fd is -1 when the message flags that no
+ * descriptor was attached.
+ */
+void
+user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	struct vhost_vring_file file;
+
+	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	/* With the NOFD bit set fds[0] does not hold a descriptor. */
+	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+		file.fd = -1;
+	else
+		file.fd = pmsg->fds[0];
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring kick idx:%d file:%d\n", file.index, file.fd);
+	ops->set_vring_kick(ctx, &file);
+}
diff --git a/lib/librte_vhost/vhost-user/virtio-net-user.h b/lib/librte_vhost/vhost-user/virtio-net-user.h
new file mode 100644
index 0000000..0969376
--- /dev/null
+++ b/lib/librte_vhost/vhost-user/virtio-net-user.h
@@ -0,0 +1,11 @@
+#ifndef _VIRTIO_NET_USER_H
+#define _VIRTIO_NET_USER_H
+
+#include "vhost-net.h"
+#include "vhost-net-user.h"
+
+int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *);
+void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
+void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *);
+
+#endif
diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c index ccfd82f..8ff0301 100644 --- a/lib/librte_vhost/vhost_rxtx.c +++ b/lib/librte_vhost/vhost_rxtx.c @@ -38,19 +38,14 @@ #include <rte_memcpy.h> #include <rte_virtio_net.h> -#include "vhost-net-cdev.h" +#include "vhost-net.h" -#define MAX_PKT_BURST 32 +#define VHOST_MAX_PKT_BURST 64 +#define VHOST_MAX_MRG_PKT_BURST 64 -/** - * This function adds buffers to the virtio devices RX virtqueue. Buffers can - * be received from the physical port or from another virtio device. A packet - * count is returned to indicate the number of packets that are succesfully - * added to the RX queue. This function works when mergeable is disabled. 
- */ -static inline uint32_t __attribute__((always_inline)) -virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) + +uint32_t +rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count) { struct vhost_virtqueue *vq; struct vring_desc *desc; @@ -59,26 +54,23 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0}; uint64_t buff_addr = 0; uint64_t buff_hdr_addr = 0; - uint32_t head[MAX_PKT_BURST], packet_len = 0; + uint32_t head[VHOST_MAX_PKT_BURST], packet_len = 0; uint32_t head_idx, packet_success = 0; + uint32_t mergeable, mrg_count = 0; uint16_t avail_idx, res_cur_idx; uint16_t res_base_idx, res_end_idx; uint16_t free_entries; uint8_t success = 0; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh); + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s()\n", dev->device_fh, __func__); if (unlikely(queue_id != VIRTIO_RXQ)) { LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); return 0; } vq = dev->virtqueue[VIRTIO_RXQ]; - count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count; - - /* - * As many data cores may want access to available buffers, - * they need to be reserved. - */ + count = (count > VHOST_MAX_PKT_BURST) ? VHOST_MAX_PKT_BURST : count; + /* As many data cores may want access to available buffers, they need to be reserved. */ do { res_base_idx = vq->last_used_idx_res; avail_idx = *((volatile uint16_t *)&vq->avail->idx); @@ -93,21 +85,25 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, res_end_idx = res_base_idx + count; /* vq->last_used_idx_res is atomically updated. */ - /* TODO: Allow to disable cmpset if no concurrency in application. */ + /* TODO: Allow to disable cmpset if no concurrency in application */ success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx, res_end_idx); + /* If there is contention here and failed, try again. 
*/ } while (unlikely(success == 0)); res_cur_idx = res_base_idx; LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", - dev->device_fh, res_cur_idx, res_end_idx); + dev->device_fh, + res_cur_idx, res_end_idx); /* Prefetch available ring to retrieve indexes. */ rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]); + /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */ + mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF); + /* Retrieve all of the head indexes first to avoid caching issues. */ for (head_idx = 0; head_idx < count; head_idx++) - head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & - (vq->size - 1)]; + head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)]; /*Prefetch descriptor index. */ rte_prefetch0(&vq->desc[head[packet_success]]); @@ -123,46 +119,57 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, /* Prefetch buffer address. */ rte_prefetch0((void *)(uintptr_t)buff_addr); - /* Copy virtio_hdr to packet and increment buffer address */ - buff_hdr_addr = buff_addr; - packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; - - /* - * If the descriptors are chained the header and data are - * placed in separate buffers. - */ - if (desc->flags & VRING_DESC_F_NEXT) { - desc->len = vq->vhost_hlen; - desc = &vq->desc[desc->next]; - /* Buffer address translation. */ - buff_addr = gpa_to_vva(dev, desc->addr); - desc->len = rte_pktmbuf_data_len(buff); + if (mergeable && (mrg_count != 0)) { + desc->len = packet_len = rte_pktmbuf_data_len(buff); } else { - buff_addr += vq->vhost_hlen; - desc->len = packet_len; + /* Copy virtio_hdr to packet and increment buffer address */ + buff_hdr_addr = buff_addr; + packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen; + + /* + * If the descriptors are chained the header and data are placed in + * separate buffers. 
+ */ + if (desc->flags & VRING_DESC_F_NEXT) { + desc->len = vq->vhost_hlen; + desc = &vq->desc[desc->next]; + /* Buffer address translation. */ + buff_addr = gpa_to_vva(dev, desc->addr); + desc->len = rte_pktmbuf_data_len(buff); + } else { + buff_addr += vq->vhost_hlen; + desc->len = packet_len; + } } + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0); + /* Update used ring with desc information */ - vq->used->ring[res_cur_idx & (vq->size - 1)].id = - head[packet_success]; + vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success]; vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len; /* Copy mbuf data to buffer */ - /* FIXME for sg mbuf and the case that desc couldn't hold the mbuf data */ - rte_memcpy((void *)(uintptr_t)buff_addr, - rte_pktmbuf_mtod(buff, const void *), - rte_pktmbuf_data_len(buff)); - PRINT_PACKET(dev, (uintptr_t)buff_addr, - rte_pktmbuf_data_len(buff), 0); + /* TODO fixme for sg mbuf and the case that desc couldn't hold the mbuf data */ + rte_memcpy((void *)(uintptr_t)buff_addr, (const void *)buff->pkt.data, rte_pktmbuf_data_len(buff)); res_cur_idx++; packet_success++; - rte_memcpy((void *)(uintptr_t)buff_hdr_addr, - (const void *)&virtio_hdr, vq->vhost_hlen); - - PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); - + /* If mergeable is disabled then a header is required per buffer. */ + if (!mergeable) { + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); + } else { + mrg_count++; + /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. 
*/ + if ((mrg_count == VHOST_MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) { + virtio_hdr.num_buffers = mrg_count; + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers); + rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen); + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1); + mrg_count = 0; + } + } if (res_cur_idx < res_end_idx) { /* Prefetch descriptor index. */ rte_prefetch0(&vq->desc[head[packet_success]]); @@ -184,357 +191,18 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id, return count; } -static inline uint32_t __attribute__((always_inline)) -copy_from_mbuf_to_vring(struct virtio_net *dev, uint16_t res_base_idx, - uint16_t res_end_idx, struct rte_mbuf *pkt) -{ - uint32_t vec_idx = 0; - uint32_t entry_success = 0; - struct vhost_virtqueue *vq; - /* The virtio_hdr is initialised to 0. */ - struct virtio_net_hdr_mrg_rxbuf virtio_hdr = { - {0, 0, 0, 0, 0, 0}, 0}; - uint16_t cur_idx = res_base_idx; - uint64_t vb_addr = 0; - uint64_t vb_hdr_addr = 0; - uint32_t seg_offset = 0; - uint32_t vb_offset = 0; - uint32_t seg_avail; - uint32_t vb_avail; - uint32_t cpy_len, entry_len; - - if (pkt == NULL) - return 0; - - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| " - "End Index %d\n", - dev->device_fh, cur_idx, res_end_idx); - - /* - * Convert from gpa to vva - * (guest physical addr -> vhost virtual addr) - */ - vq = dev->virtqueue[VIRTIO_RXQ]; - vb_addr = - gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); - vb_hdr_addr = vb_addr; - - /* Prefetch buffer address. 
*/ - rte_prefetch0((void *)(uintptr_t)vb_addr); - - virtio_hdr.num_buffers = res_end_idx - res_base_idx; - - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", - dev->device_fh, virtio_hdr.num_buffers); - rte_memcpy((void *)(uintptr_t)vb_hdr_addr, - (const void *)&virtio_hdr, vq->vhost_hlen); - - PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1); - - seg_avail = rte_pktmbuf_data_len(pkt); - vb_offset = vq->vhost_hlen; - vb_avail = - vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen; - - entry_len = vq->vhost_hlen; - - if (vb_avail == 0) { - uint32_t desc_idx = - vq->buf_vec[vec_idx].desc_idx; - vq->desc[desc_idx].len = vq->vhost_hlen; - - if ((vq->desc[desc_idx].flags - & VRING_DESC_F_NEXT) == 0) { - /* Update used ring with desc information */ - vq->used->ring[cur_idx & (vq->size - 1)].id - = vq->buf_vec[vec_idx].desc_idx; - vq->used->ring[cur_idx & (vq->size - 1)].len - = entry_len; - - entry_len = 0; - cur_idx++; - entry_success++; - } - - vec_idx++; - vb_addr = - gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr); - - /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)vb_addr); - vb_offset = 0; - vb_avail = vq->buf_vec[vec_idx].buf_len; - } - - cpy_len = RTE_MIN(vb_avail, seg_avail); - - while (cpy_len > 0) { - /* Copy mbuf data to vring buffer */ - rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset), - (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset), - cpy_len); - - PRINT_PACKET(dev, - (uintptr_t)(vb_addr + vb_offset), - cpy_len, 0); - - seg_offset += cpy_len; - vb_offset += cpy_len; - seg_avail -= cpy_len; - vb_avail -= cpy_len; - entry_len += cpy_len; - - if (seg_avail != 0) { - /* - * The virtio buffer in this vring - * entry reach to its end. - * But the segment doesn't complete. 
- */ - if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags & - VRING_DESC_F_NEXT) == 0) { - /* Update used ring with desc information */ - vq->used->ring[cur_idx & (vq->size - 1)].id - = vq->buf_vec[vec_idx].desc_idx; - vq->used->ring[cur_idx & (vq->size - 1)].len - = entry_len; - entry_len = 0; - cur_idx++; - entry_success++; - } - - vec_idx++; - vb_addr = gpa_to_vva(dev, - vq->buf_vec[vec_idx].buf_addr); - vb_offset = 0; - vb_avail = vq->buf_vec[vec_idx].buf_len; - cpy_len = RTE_MIN(vb_avail, seg_avail); - } else { - /* - * This current segment complete, need continue to - * check if the whole packet complete or not. - */ - pkt = pkt->next; - if (pkt != NULL) { - /* - * There are more segments. - */ - if (vb_avail == 0) { - /* - * This current buffer from vring is - * used up, need fetch next buffer - * from buf_vec. - */ - uint32_t desc_idx = - vq->buf_vec[vec_idx].desc_idx; - vq->desc[desc_idx].len = vb_offset; - - if ((vq->desc[desc_idx].flags & - VRING_DESC_F_NEXT) == 0) { - uint16_t wrapped_idx = - cur_idx & (vq->size - 1); - /* - * Update used ring with the - * descriptor information - */ - vq->used->ring[wrapped_idx].id - = desc_idx; - vq->used->ring[wrapped_idx].len - = entry_len; - entry_success++; - entry_len = 0; - cur_idx++; - } - - /* Get next buffer from buf_vec. */ - vec_idx++; - vb_addr = gpa_to_vva(dev, - vq->buf_vec[vec_idx].buf_addr); - vb_avail = - vq->buf_vec[vec_idx].buf_len; - vb_offset = 0; - } - - seg_offset = 0; - seg_avail = rte_pktmbuf_data_len(pkt); - cpy_len = RTE_MIN(vb_avail, seg_avail); - } else { - /* - * This whole packet completes. 
- */ - uint32_t desc_idx = - vq->buf_vec[vec_idx].desc_idx; - vq->desc[desc_idx].len = vb_offset; - - while (vq->desc[desc_idx].flags & - VRING_DESC_F_NEXT) { - desc_idx = vq->desc[desc_idx].next; - vq->desc[desc_idx].len = 0; - } - - /* Update used ring with desc information */ - vq->used->ring[cur_idx & (vq->size - 1)].id - = vq->buf_vec[vec_idx].desc_idx; - vq->used->ring[cur_idx & (vq->size - 1)].len - = entry_len; - entry_len = 0; - cur_idx++; - entry_success++; - seg_avail = 0; - cpy_len = RTE_MIN(vb_avail, seg_avail); - } - } - } - - return entry_success; -} - -/* - * This function works for mergeable RX. - */ -static inline uint32_t __attribute__((always_inline)) -virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint32_t count) +uint32_t +rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) { - struct vhost_virtqueue *vq; - uint32_t pkt_idx = 0, entry_success = 0; - uint16_t avail_idx, res_cur_idx; - uint16_t res_base_idx, res_end_idx; - uint8_t success = 0; - - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n", - dev->device_fh); - if (unlikely(queue_id != VIRTIO_RXQ)) { - LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n"); - } - - vq = dev->virtqueue[VIRTIO_RXQ]; - count = RTE_MIN((uint32_t)MAX_PKT_BURST, count); - - if (count == 0) - return 0; - - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t secure_len = 0; - uint16_t need_cnt; - uint32_t vec_idx = 0; - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen; - uint16_t i, id; - - do { - /* - * As many data cores may want access to available - * buffers, they need to be reserved. 
- */ - res_base_idx = vq->last_used_idx_res; - res_cur_idx = res_base_idx; - - do { - avail_idx = *((volatile uint16_t *)&vq->avail->idx); - if (unlikely(res_cur_idx == avail_idx)) { - LOG_DEBUG(VHOST_DATA, - "(%"PRIu64") Failed " - "to get enough desc from " - "vring\n", - dev->device_fh); - return pkt_idx; - } else { - uint16_t wrapped_idx = - (res_cur_idx) & (vq->size - 1); - uint32_t idx = - vq->avail->ring[wrapped_idx]; - uint8_t next_desc; - - do { - next_desc = 0; - secure_len += vq->desc[idx].len; - if (vq->desc[idx].flags & - VRING_DESC_F_NEXT) { - idx = vq->desc[idx].next; - next_desc = 1; - } - } while (next_desc); - - res_cur_idx++; - } - } while (pkt_len > secure_len); - - /* vq->last_used_idx_res is atomically updated. */ - success = rte_atomic16_cmpset(&vq->last_used_idx_res, - res_base_idx, - res_cur_idx); - } while (success == 0); - - id = res_base_idx; - need_cnt = res_cur_idx - res_base_idx; - - for (i = 0; i < need_cnt; i++, id++) { - uint16_t wrapped_idx = id & (vq->size - 1); - uint32_t idx = vq->avail->ring[wrapped_idx]; - uint8_t next_desc; - do { - next_desc = 0; - vq->buf_vec[vec_idx].buf_addr = - vq->desc[idx].addr; - vq->buf_vec[vec_idx].buf_len = - vq->desc[idx].len; - vq->buf_vec[vec_idx].desc_idx = idx; - vec_idx++; - - if (vq->desc[idx].flags & VRING_DESC_F_NEXT) { - idx = vq->desc[idx].next; - next_desc = 1; - } - } while (next_desc); - } - - res_end_idx = res_cur_idx; - - entry_success = copy_from_mbuf_to_vring(dev, res_base_idx, - res_end_idx, pkts[pkt_idx]); - - rte_compiler_barrier(); - - /* - * Wait until it's our turn to add our buffer - * to the used ring. - */ - while (unlikely(vq->last_used_idx != res_base_idx)) - rte_pause(); - - *(volatile uint16_t *)&vq->used->idx += entry_success; - vq->last_used_idx = res_end_idx; - - /* Kick the guest if necessary. 
*/ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) - eventfd_write((int)vq->kickfd, 1); - } - - return count; -} - -uint16_t -rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count) -{ - if (unlikely(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))) - return virtio_dev_merge_rx(dev, queue_id, pkts, count); - else - return virtio_dev_rx(dev, queue_id, pkts, count); -} - -uint16_t -rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -{ - struct rte_mbuf *m, *prev; + struct rte_mbuf *mbuf; struct vhost_virtqueue *vq; struct vring_desc *desc; - uint64_t vb_addr = 0; - uint32_t head[MAX_PKT_BURST]; + uint64_t buff_addr = 0; + uint32_t head[VHOST_MAX_PKT_BURST]; uint32_t used_idx; uint32_t i; - uint16_t free_entries, entry_success = 0; + uint16_t free_entries, packet_success = 0; uint16_t avail_idx; if (unlikely(queue_id != VIRTIO_TXQ)) { @@ -549,8 +217,8 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, if (vq->last_used_idx == avail_idx) return 0; - LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, - dev->device_fh); + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") %s(%d->%d)\n", + dev->device_fh, __func__, vq->last_used_idx, avail_idx); /* Prefetch available ring to retrieve head indexes. */ rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]); @@ -558,173 +226,68 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, /*get the number of free entries in the ring*/ free_entries = (avail_idx - vq->last_used_idx); - free_entries = RTE_MIN(free_entries, count); + if (free_entries > count) + free_entries = count; /* Limit to MAX_PKT_BURST. 
*/ - free_entries = RTE_MIN(free_entries, MAX_PKT_BURST); + if (free_entries > VHOST_MAX_PKT_BURST) + free_entries = VHOST_MAX_PKT_BURST; - LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", - dev->device_fh, free_entries); + LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries); /* Retrieve all of the head indexes first to avoid caching issues. */ for (i = 0; i < free_entries; i++) head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)]; /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[head[entry_success]]); + rte_prefetch0(&vq->desc[head[packet_success]]); rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]); - while (entry_success < free_entries) { - uint32_t vb_avail, vb_offset; - uint32_t seg_avail, seg_offset; - uint32_t cpy_len; - uint32_t seg_num = 0; - struct rte_mbuf *cur; - uint8_t alloc_err = 0; - - desc = &vq->desc[head[entry_success]]; + while (packet_success < free_entries) { + desc = &vq->desc[head[packet_success]]; /* Discard first buffer as it is the virtio header */ desc = &vq->desc[desc->next]; /* Buffer address translation. */ - vb_addr = gpa_to_vva(dev, desc->addr); + buff_addr = gpa_to_vva(dev, desc->addr); /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)vb_addr); + rte_prefetch0((void *)(uintptr_t)buff_addr); used_idx = vq->last_used_idx & (vq->size - 1); - if (entry_success < (free_entries - 1)) { + if (packet_success < (free_entries - 1)) { /* Prefetch descriptor index. */ - rte_prefetch0(&vq->desc[head[entry_success+1]]); + rte_prefetch0(&vq->desc[head[packet_success+1]]); rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]); } /* Update used index buffer information. */ - vq->used->ring[used_idx].id = head[entry_success]; + vq->used->ring[used_idx].id = head[packet_success]; vq->used->ring[used_idx].len = 0; - vb_offset = 0; - vb_avail = desc->len; - /* Allocate an mbuf and populate the structure. 
*/ - m = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(m == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - return entry_success; + mbuf = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(mbuf == NULL)) { + RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n"); + return packet_success; } - seg_offset = 0; - seg_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; - cpy_len = RTE_MIN(vb_avail, seg_avail); - - PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0); - - seg_num++; - cur = m; - prev = m; - while (cpy_len != 0) { - rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset), - (void *)((uintptr_t)(vb_addr + vb_offset)), - cpy_len); - - seg_offset += cpy_len; - vb_offset += cpy_len; - vb_avail -= cpy_len; - seg_avail -= cpy_len; - - if (vb_avail != 0) { - /* - * The segment reachs to its end, - * while the virtio buffer in TX vring has - * more data to be copied. - */ - cur->data_len = seg_offset; - m->pkt_len += seg_offset; - /* Allocate mbuf and populate the structure. */ - cur = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(cur == NULL)) { - RTE_LOG(ERR, VHOST_DATA, "Failed to " - "allocate memory for mbuf.\n"); - rte_pktmbuf_free(m); - alloc_err = 1; - break; - } - - seg_num++; - prev->next = cur; - prev = cur; - seg_offset = 0; - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; - } else { - if (desc->flags & VRING_DESC_F_NEXT) { - /* - * There are more virtio buffers in - * same vring entry need to be copied. - */ - if (seg_avail == 0) { - /* - * The current segment hasn't - * room to accomodate more - * data. - */ - cur->data_len = seg_offset; - m->pkt_len += seg_offset; - /* - * Allocate an mbuf and - * populate the structure. 
- */ - cur = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(cur == NULL)) { - RTE_LOG(ERR, - VHOST_DATA, - "Failed to " - "allocate memory " - "for mbuf\n"); - rte_pktmbuf_free(m); - alloc_err = 1; - break; - } - seg_num++; - prev->next = cur; - prev = cur; - seg_offset = 0; - seg_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; - } - - desc = &vq->desc[desc->next]; - - /* Buffer address translation. */ - vb_addr = gpa_to_vva(dev, desc->addr); - /* Prefetch buffer address. */ - rte_prefetch0((void *)(uintptr_t)vb_addr); - vb_offset = 0; - vb_avail = desc->len; - - PRINT_PACKET(dev, (uintptr_t)vb_addr, - desc->len, 0); - } else { - /* The whole packet completes. */ - cur->data_len = seg_offset; - m->pkt_len += seg_offset; - vb_avail = 0; - } - } + mbuf->pkt.data_len = desc->len; + mbuf->pkt.pkt_len = mbuf->pkt.data_len; - cpy_len = RTE_MIN(vb_avail, seg_avail); - } + rte_memcpy((void *) mbuf->pkt.data, + (const void *) buff_addr, mbuf->pkt.data_len); - if (unlikely(alloc_err == 1)) - break; + pkts[packet_success] = mbuf; - m->nb_segs = seg_num; + VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0); - pkts[entry_success] = m; vq->last_used_idx++; - entry_success++; + packet_success++; } rte_compiler_barrier(); - vq->used->idx += entry_success; + vq->used->idx += packet_success; /* Kick guest if required. */ if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) eventfd_write((int)vq->kickfd, 1); - return entry_success; + + return packet_success; } diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c index 852b6d1..516e743 100644 --- a/lib/librte_vhost/virtio-net.c +++ b/lib/librte_vhost/virtio-net.c @@ -31,17 +31,14 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include <dirent.h> -#include <fuse/cuse_lowlevel.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include <stddef.h> #include <stdint.h> #include <stdlib.h> -#include <sys/eventfd.h> -#include <sys/ioctl.h> #include <sys/mman.h> #include <unistd.h> +#include <assert.h> #include <rte_ethdev.h> #include <rte_log.h> @@ -49,10 +46,8 @@ #include <rte_memory.h> #include <rte_virtio_net.h> -#include "vhost-net-cdev.h" -#include "eventfd_link/eventfd_link.h" - -/* +#include "vhost-net.h" +/** * Device linked list structure for configuration. */ struct virtio_net_config_ll { @@ -60,38 +55,15 @@ struct virtio_net_config_ll { struct virtio_net_config_ll *next; /* Next dev on linked list.*/ }; -const char eventfd_cdev[] = "/dev/eventfd-link"; - -/* device ops to add/remove device to/from data core. */ +/* device ops to add/remove device to data core. */ static struct virtio_net_device_ops const *notify_ops; -/* root address of the linked list of managed virtio devices */ +/* root address of the linked list in the configuration core. */ static struct virtio_net_config_ll *ll_root; /* Features supported by this lib. */ -#define VHOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ - (1ULL << VIRTIO_NET_F_CTRL_RX)) +#define VHOST_SUPPORTED_FEATURES (1ULL << VIRTIO_NET_F_MRG_RXBUF) static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES; -/* Line size for reading maps file. */ -static const uint32_t BUFSIZE = PATH_MAX; - -/* Size of prot char array in procmap. */ -#define PROT_SZ 5 - -/* Number of elements in procmap struct. */ -#define PROCMAP_SZ 8 - -/* Structure containing information gathered from maps file. */ -struct procmap { - uint64_t va_start; /* Start virtual address in file. */ - uint64_t len; /* Size of file. */ - uint64_t pgoff; /* Not used. */ - uint32_t maj; /* Not used. */ - uint32_t min; /* Not used. */ - uint32_t ino; /* Not used. */ - char prot[PROT_SZ]; /* Not used. */ - char fname[PATH_MAX]; /* File name. 
*/ -}; /* * Converts QEMU virtual address to Vhost virtual address. This function is @@ -110,199 +82,15 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va) if ((qemu_va >= region->userspace_address) && (qemu_va <= region->userspace_address + region->memory_size)) { - vhost_va = dev->mem->mapped_address + qemu_va - - dev->mem->base_address; + vhost_va = qemu_va + region->guest_phys_address + + region->address_offset - + region->userspace_address; break; } } return vhost_va; } -/* - * Locate the file containing QEMU's memory space and - * map it to our address space. - */ -static int -host_memory_map(struct virtio_net *dev, struct virtio_memory *mem, - pid_t pid, uint64_t addr) -{ - struct dirent *dptr = NULL; - struct procmap procmap; - DIR *dp = NULL; - int fd; - int i; - char memfile[PATH_MAX]; - char mapfile[PATH_MAX]; - char procdir[PATH_MAX]; - char resolved_path[PATH_MAX]; - char *path = NULL; - FILE *fmap; - void *map; - uint8_t found = 0; - char line[BUFSIZE]; - char dlm[] = "- : "; - char *str, *sp, *in[PROCMAP_SZ]; - char *end = NULL; - - /* Path where mem files are located. */ - snprintf(procdir, PATH_MAX, "/proc/%u/fd/", pid); - /* Maps file used to locate mem file. */ - snprintf(mapfile, PATH_MAX, "/proc/%u/maps", pid); - - fmap = fopen(mapfile, "r"); - if (fmap == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to open maps file for pid %d\n", - dev->device_fh, pid); - return -1; - } - - /* Read through maps file until we find out base_address. */ - while (fgets(line, BUFSIZE, fmap) != 0) { - str = line; - errno = 0; - /* Split line into fields. */ - for (i = 0; i < PROCMAP_SZ; i++) { - in[i] = strtok_r(str, &dlm[i], &sp); - if ((in[i] == NULL) || (errno != 0)) { - fclose(fmap); - return -1; - } - str = NULL; - } - - /* Convert/Copy each field as needed. 
*/ - procmap.va_start = strtoull(in[0], &end, 16); - if ((in[0] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.len = strtoull(in[1], &end, 16); - if ((in[1] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.pgoff = strtoull(in[3], &end, 16); - if ((in[3] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.maj = strtoul(in[4], &end, 16); - if ((in[4] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.min = strtoul(in[5], &end, 16); - if ((in[5] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - procmap.ino = strtoul(in[6], &end, 16); - if ((in[6] == '\0') || (end == NULL) || (*end != '\0') || - (errno != 0)) { - fclose(fmap); - return -1; - } - - memcpy(&procmap.prot, in[2], PROT_SZ); - memcpy(&procmap.fname, in[7], PATH_MAX); - - if (procmap.va_start == addr) { - procmap.len = procmap.len - procmap.va_start; - found = 1; - break; - } - } - fclose(fmap); - - if (!found) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to find memory file in pid %d maps file\n", - dev->device_fh, pid); - return -1; - } - - /* Find the guest memory file among the process fds. */ - dp = opendir(procdir); - if (dp == NULL) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Cannot open pid %d process directory\n", - dev->device_fh, pid); - return -1; - } - - found = 0; - - /* Read the fd directory contents. 
*/ - while (NULL != (dptr = readdir(dp))) { - snprintf(memfile, PATH_MAX, "/proc/%u/fd/%s", - pid, dptr->d_name); - path = realpath(memfile, resolved_path); - if ((path == NULL) && (strlen(resolved_path) == 0)) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to resolve fd directory\n", - dev->device_fh); - closedir(dp); - return -1; - } - if (strncmp(resolved_path, procmap.fname, - strnlen(procmap.fname, PATH_MAX)) == 0) { - found = 1; - break; - } - } - - closedir(dp); - - if (found == 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to find memory file for pid %d\n", - dev->device_fh, pid); - return -1; - } - /* Open the shared memory file and map the memory into this process. */ - fd = open(memfile, O_RDWR); - - if (fd == -1) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Failed to open %s for pid %d\n", - dev->device_fh, memfile, pid); - return -1; - } - - map = mmap(0, (size_t)procmap.len, PROT_READ|PROT_WRITE, - MAP_POPULATE|MAP_SHARED, fd, 0); - close(fd); - - if (map == MAP_FAILED) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") Error mapping the file %s for pid %d\n", - dev->device_fh, memfile, pid); - return -1; - } - - /* Store the memory address and size in the device data structure */ - mem->mapped_address = (uint64_t)(uintptr_t)map; - mem->mapped_size = procmap.len; - - LOG_DEBUG(VHOST_CONFIG, - "(%"PRIu64") Mem File: %s->%s - Size: %llu - VA: %p\n", - dev->device_fh, - memfile, resolved_path, - (unsigned long long)mem->mapped_size, map); - - return 0; -} /* * Retrieves an entry from the devices configuration linked list. @@ -376,7 +164,7 @@ add_config_ll_entry(struct virtio_net_config_ll *new_ll_dev) } } - +/*TODO dpdk alloc/free if possible */ /* * Unmap any memory, close any file descriptors and * free any memory owned by a device. 
@@ -389,16 +177,17 @@ cleanup_device(struct virtio_net *dev) munmap((void *)(uintptr_t)dev->mem->mapped_address, (size_t)dev->mem->mapped_size); free(dev->mem); + dev->mem = NULL; } /* Close any event notifiers opened by device. */ - if (dev->virtqueue[VIRTIO_RXQ]->callfd) + if (dev->virtqueue[VIRTIO_RXQ]->callfd > 0) close((int)dev->virtqueue[VIRTIO_RXQ]->callfd); - if (dev->virtqueue[VIRTIO_RXQ]->kickfd) + if (dev->virtqueue[VIRTIO_RXQ]->kickfd > 0) close((int)dev->virtqueue[VIRTIO_RXQ]->kickfd); - if (dev->virtqueue[VIRTIO_TXQ]->callfd) + if (dev->virtqueue[VIRTIO_TXQ]->callfd > 0) close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); - if (dev->virtqueue[VIRTIO_TXQ]->kickfd) + if (dev->virtqueue[VIRTIO_TXQ]->kickfd > 0) close((int)dev->virtqueue[VIRTIO_TXQ]->kickfd); } @@ -522,8 +311,8 @@ new_device(struct vhost_device_ctx ctx) } /* - * Function is called from the CUSE release function. This function will - * cleanup the device and remove it from device configuration linked list. + * Function is called from the CUSE release function. This function will cleanup + * the device and remove it from device configuration linked list. */ static void destroy_device(struct vhost_device_ctx ctx) @@ -569,6 +358,7 @@ set_owner(struct vhost_device_ctx ctx) return -1; return 0; + /* TODO check ctx.fh is meaningful here */ } /* @@ -651,14 +441,12 @@ set_features(struct vhost_device_ctx ctx, uint64_t *pu) * This includes storing offsets used to translate buffer addresses.
*/ static int -set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, - uint32_t nregions) +set_mem_table(struct vhost_device_ctx ctx, + const struct virtio_memory_regions *regions, uint32_t nregions) { struct virtio_net *dev; - struct vhost_memory_region *mem_regions; struct virtio_memory *mem; - uint64_t size = offsetof(struct vhost_memory, regions); - uint32_t regionidx, valid_regions; + uint32_t regionidx; dev = get_device(ctx); if (dev == NULL) @@ -682,107 +470,24 @@ set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, mem->nregions = nregions; - mem_regions = (void *)(uintptr_t) - ((uint64_t)(uintptr_t)mem_regions_addr + size); - for (regionidx = 0; regionidx < mem->nregions; regionidx++) { /* Populate the region structure for each region. */ - mem->regions[regionidx].guest_phys_address = - mem_regions[regionidx].guest_phys_addr; - mem->regions[regionidx].guest_phys_address_end = - mem->regions[regionidx].guest_phys_address + - mem_regions[regionidx].memory_size; - mem->regions[regionidx].memory_size = - mem_regions[regionidx].memory_size; - mem->regions[regionidx].userspace_address = - mem_regions[regionidx].userspace_addr; - - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") REGION: %u - GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", dev->device_fh, - regionidx, - (void *)(uintptr_t)mem->regions[regionidx].guest_phys_address, - (void *)(uintptr_t)mem->regions[regionidx].userspace_address, - mem->regions[regionidx].memory_size); - - /*set the base address mapping*/ + mem->regions[regionidx] = regions[regionidx]; if (mem->regions[regionidx].guest_phys_address == 0x0) { mem->base_address = mem->regions[regionidx].userspace_address; - /* Map VM memory file */ - if (host_memory_map(dev, mem, ctx.pid, - mem->base_address) != 0) { - free(mem); - return -1; - } + mem->mapped_address = + mem->regions[regionidx].address_offset; } } - /* Check that we have a valid base address. 
*/ - if (mem->base_address == 0) { - RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") Failed to find base address of qemu memory file.\n", dev->device_fh); - free(mem); - return -1; - } - - /* - * Check if all of our regions have valid mappings. - * Usually one does not exist in the QEMU memory file. - */ - valid_regions = mem->nregions; - for (regionidx = 0; regionidx < mem->nregions; regionidx++) { - if ((mem->regions[regionidx].userspace_address < - mem->base_address) || - (mem->regions[regionidx].userspace_address > - (mem->base_address + mem->mapped_size))) - valid_regions--; - } - - /* - * If a region does not have a valid mapping, - * we rebuild our memory struct to contain only valid entries. - */ - if (valid_regions != mem->nregions) { - LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") Not all memory regions exist in the QEMU mem file. Re-populating mem structure\n", - dev->device_fh); - - /* - * Re-populate the memory structure with only valid regions. - * Invalid regions are over-written with memmove. - */ - valid_regions = 0; - - for (regionidx = mem->nregions; 0 != regionidx--;) { - if ((mem->regions[regionidx].userspace_address < - mem->base_address) || - (mem->regions[regionidx].userspace_address > - (mem->base_address + mem->mapped_size))) { - memmove(&mem->regions[regionidx], - &mem->regions[regionidx + 1], - sizeof(struct virtio_memory_regions) * - valid_regions); - } else { - valid_regions++; - } - } - } - mem->nregions = valid_regions; + /*TODO addback the logic that remove invalid memory regions */ dev->mem = mem; - /* - * Calculate the address offset for each region. - * This offset is used to identify the vhost virtual address - * corresponding to a QEMU guest physical address. 
- */ - for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) { - dev->mem->regions[regionidx].address_offset = - dev->mem->regions[regionidx].userspace_address - - dev->mem->base_address + - dev->mem->mapped_address - - dev->mem->regions[regionidx].guest_phys_address; - - } return 0; } + /* * Called from CUSE IOCTL: VHOST_SET_VRING_NUM * The virtio device sends us the size of the descriptor ring. @@ -896,38 +601,62 @@ get_vring_base(struct vhost_device_ctx ctx, uint32_t index, /* State->index refers to the queue index. The txq is 1, rxq is 0. */ state->num = dev->virtqueue[state->index]->last_used_idx; - return 0; -} + if (dev->flags & VIRTIO_DEV_RUNNING) { + RTE_LOG(INFO, VHOST_CONFIG, + "get_vring_base message is for release\n"); + notify_ops->destroy_device(dev); + /* + * sync call. + * when it returns, it means it is removed from data core. + */ + } + /* TODO fix all munmap */ + if (dev->mem) { + munmap((void *)(uintptr_t)dev->mem->mapped_address, + (size_t)dev->mem->mapped_size); + free(dev->mem); + dev->mem = NULL; + } -/* - * This function uses the eventfd_link kernel module to copy an eventfd file - * descriptor provided by QEMU in to our process space. - */ -static int -eventfd_copy(struct virtio_net *dev, struct eventfd_copy *eventfd_copy) -{ - int eventfd_link, ret; - /* Open the character device to the kernel module.
*/ - eventfd_link = open(eventfd_cdev, O_RDWR); - if (eventfd_link < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") eventfd_link module is not loaded\n", - dev->device_fh); - return -1; - } + if (dev->virtqueue[VIRTIO_RXQ]->callfd > 0) + close((int)dev->virtqueue[VIRTIO_RXQ]->callfd); + dev->virtqueue[VIRTIO_RXQ]->callfd = -1; + if (dev->virtqueue[VIRTIO_TXQ]->callfd > 0) + close((int)dev->virtqueue[VIRTIO_TXQ]->callfd); + dev->virtqueue[VIRTIO_TXQ]->callfd = -1; + /* We don't clean up kickfd (set by set_vring_call) here as we won't get CALLFD again */ + + dev->virtqueue[VIRTIO_RXQ]->desc = NULL; + dev->virtqueue[VIRTIO_RXQ]->avail = NULL; + dev->virtqueue[VIRTIO_RXQ]->used = NULL; + dev->virtqueue[VIRTIO_RXQ]->last_used_idx = 0; + dev->virtqueue[VIRTIO_RXQ]->last_used_idx_res = 0; + + dev->virtqueue[VIRTIO_TXQ]->desc = NULL; + dev->virtqueue[VIRTIO_TXQ]->avail = NULL; + dev->virtqueue[VIRTIO_TXQ]->used = NULL; + dev->virtqueue[VIRTIO_TXQ]->last_used_idx = 0; + dev->virtqueue[VIRTIO_TXQ]->last_used_idx_res = 0; - /* Call the IOCTL to copy the eventfd.
*/ - ret = ioctl(eventfd_link, EVENTFD_COPY, eventfd_copy); - close(eventfd_link); - if (ret < 0) { - RTE_LOG(ERR, VHOST_CONFIG, - "(%"PRIu64") EVENTFD_COPY ioctl failed\n", - dev->device_fh); - return -1; - } + return 0; +} +static int +virtio_is_ready(struct virtio_net *dev, int index) +{ + struct vhost_virtqueue *vq1, *vq2; + /* mq support in future.*/ + vq1 = dev->virtqueue[index]; + vq2 = dev->virtqueue[index ^ 1]; + if (vq1 && vq2 && vq1->desc && vq2->desc && + (vq1->kickfd > 0) && (vq1->callfd > 0) && + (vq2->kickfd > 0) && (vq2->callfd > 0)) { + LOG_DEBUG(VHOST_CONFIG, "virtio is ready for processing.\n"); + return 1; + } + LOG_DEBUG(VHOST_CONFIG, "virtio isn't ready for processing.\n"); return 0; } @@ -940,7 +669,6 @@ static int set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) { struct virtio_net *dev; - struct eventfd_copy eventfd_kick; struct vhost_virtqueue *vq; dev = get_device(ctx); @@ -953,14 +681,7 @@ set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file) if (vq->kickfd) close((int)vq->kickfd); - /* Populate the eventfd_copy structure and call eventfd_copy. */ - vq->kickfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); - eventfd_kick.source_fd = vq->kickfd; - eventfd_kick.target_fd = file->fd; - eventfd_kick.target_pid = ctx.pid; - - if (eventfd_copy(dev, &eventfd_kick)) - return -1; + vq->kickfd = file->fd; return 0; } @@ -974,7 +695,6 @@ static int set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) { struct virtio_net *dev; - struct eventfd_copy eventfd_call; struct vhost_virtqueue *vq; dev = get_device(ctx); @@ -986,16 +706,11 @@ set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file) if (vq->callfd) close((int)vq->callfd); + vq->callfd = file->fd; - /* Populate the eventfd_copy structure and call eventfd_copy. 
*/ - vq->callfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); - eventfd_call.source_fd = vq->callfd; - eventfd_call.target_fd = file->fd; - eventfd_call.target_pid = ctx.pid; - - if (eventfd_copy(dev, &eventfd_call)) - return -1; - + if (virtio_is_ready(dev, file->index) && + !(dev->flags & VIRTIO_DEV_RUNNING)) + notify_ops->new_device(dev); return 0; } @@ -1024,6 +739,7 @@ set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *file) * If the device isn't already running and both backend fds are set, * we add the device. */ + LOG_DEBUG(VHOST_CONFIG, "%s %d\n", __func__, file->fd); if (!(dev->flags & VIRTIO_DEV_RUNNING)) { if (((int)dev->virtqueue[VIRTIO_TXQ]->backend != VIRTIO_DEV_STOPPED) && ((int)dev->virtqueue[VIRTIO_RXQ]->backend != VIRTIO_DEV_STOPPED)) -- 1.8.1.4