On Fri, Feb 05, 2016 at 07:20:26PM +0800, Jianfeng Tan wrote:
> diff --git a/drivers/net/virtio/vhost.h b/drivers/net/virtio/vhost.h
> new file mode 100644
> index 0000000..73d4f5c
> --- /dev/null
> +++ b/drivers/net/virtio/vhost.h
> @@ -0,0 +1,194 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VHOST_NET_USER_H
> +#define _VHOST_NET_USER_H
> +
> +#include <stdint.h>
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define VHOST_MEMORY_MAX_NREGIONS 8

Don't hard-code this; it's not nice.
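
For the vhost-kernel backend, for instance, the limit could be queried at
run time instead of being baked in. A rough, untested sketch (the sysfs
path and the fallback value of 8 are my assumptions):

#include <stdio.h>

/* Sketch: ask the kernel how many memory regions vhost accepts, falling
 * back to the vhost-user protocol default when the parameter is absent.
 */
static unsigned int
get_vhost_max_mem_regions(void)
{
	FILE *f;
	unsigned int limit = 8; /* assumed fallback */

	f = fopen("/sys/module/vhost/parameters/max_mem_regions", "r");
	if (f != NULL) {
		if (fscanf(f, "%u", &limit) != 1)
			limit = 8;
		fclose(f);
	}
	return limit;
}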

> +
> +struct vhost_vring_state {
> +     unsigned int index;
> +     unsigned int num;
> +};
> +
> +struct vhost_vring_file {
> +     unsigned int index;
> +     int fd;
> +};
> +
> +struct vhost_vring_addr {
> +     unsigned int index;
> +     /* Option flags. */
> +     unsigned int flags;
> +     /* Flag values: */
> +     /* Whether log address is valid. If set enables logging. */
> +#define VHOST_VRING_F_LOG 0
> +
> +     /* Start of array of descriptors (virtually contiguous) */
> +     uint64_t desc_user_addr;
> +     /* Used structure address. Must be 32 bit aligned */
> +     uint64_t used_user_addr;
> +     /* Available structure address. Must be 16 bit aligned */
> +     uint64_t avail_user_addr;
> +     /* Logging support. */
> +     /* Log writes to used structure, at offset calculated from specified
> +      * address. Address must be 32 bit aligned.
> +      */
> +     uint64_t log_guest_addr;
> +};
> +
> +#define VIRTIO_CONFIG_S_DRIVER_OK   4
> +
> +enum vhost_user_request {
> +     VHOST_USER_NONE = 0,
> +     VHOST_USER_GET_FEATURES = 1,
> +     VHOST_USER_SET_FEATURES = 2,
> +     VHOST_USER_SET_OWNER = 3,
> +     VHOST_USER_RESET_OWNER = 4,
> +     VHOST_USER_SET_MEM_TABLE = 5,
> +     VHOST_USER_SET_LOG_BASE = 6,
> +     VHOST_USER_SET_LOG_FD = 7,
> +     VHOST_USER_SET_VRING_NUM = 8,
> +     VHOST_USER_SET_VRING_ADDR = 9,
> +     VHOST_USER_SET_VRING_BASE = 10,
> +     VHOST_USER_GET_VRING_BASE = 11,
> +     VHOST_USER_SET_VRING_KICK = 12,
> +     VHOST_USER_SET_VRING_CALL = 13,
> +     VHOST_USER_SET_VRING_ERR = 14,
> +     VHOST_USER_GET_PROTOCOL_FEATURES = 15,
> +     VHOST_USER_SET_PROTOCOL_FEATURES = 16,
> +     VHOST_USER_GET_QUEUE_NUM = 17,
> +     VHOST_USER_SET_VRING_ENABLE = 18,
> +     VHOST_USER_MAX
> +};
> +
> +struct vhost_memory_region {
> +     uint64_t guest_phys_addr;
> +     uint64_t memory_size; /* bytes */
> +     uint64_t userspace_addr;
> +     uint64_t mmap_offset;
> +};
> +
> +struct vhost_memory_kernel {
> +     uint32_t nregions;
> +     uint32_t padding;
> +     struct vhost_memory_region regions[0];
> +};
> +
> +struct vhost_memory {
> +     uint32_t nregions;
> +     uint32_t padding;
> +     struct vhost_memory_region regions[VHOST_MEMORY_MAX_NREGIONS];
> +};
> +
> +struct vhost_user_msg {
> +     enum vhost_user_request request;
> +
> +#define VHOST_USER_VERSION_MASK     0x3
> +#define VHOST_USER_REPLY_MASK       (0x1 << 2)
> +     uint32_t flags;
> +     uint32_t size; /* the following payload size */
> +     union {
> +#define VHOST_USER_VRING_IDX_MASK   0xff
> +#define VHOST_USER_VRING_NOFD_MASK  (0x1 << 8)
> +             uint64_t u64;
> +             struct vhost_vring_state state;
> +             struct vhost_vring_addr addr;
> +             struct vhost_memory memory;
> +     } payload;
> +     int fds[VHOST_MEMORY_MAX_NREGIONS];
> +} __attribute((packed));
> +
> +#define VHOST_USER_HDR_SIZE offsetof(struct vhost_user_msg, payload.u64)
> +#define VHOST_USER_PAYLOAD_SIZE (sizeof(struct vhost_user_msg) - VHOST_USER_HDR_SIZE)
> +
> +/* The version of the protocol we support */
> +#define VHOST_USER_VERSION    0x1
> +
> +/* ioctls */

Why do you duplicate these ioctls?
Use the definitions from /usr/include/linux/vhost.h, etc.

In fact, whatever isn't coming from Linux here
comes from lib/librte_vhost/vhost_user/vhost-net-user.h.

I think you should reuse that code and avoid the duplication.
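
Something along these lines should be enough (just a sketch; it assumes the
kernel headers are available at build time):

/* Sketch: take the ioctl and TUN definitions from the kernel headers
 * instead of redefining them here.
 */
#include <linux/vhost.h>    /* VHOST_GET_FEATURES, VHOST_SET_MEM_TABLE, ... */
#include <linux/if_tun.h>   /* TUNSETIFF, IFF_TAP, IFF_VNET_HDR, TUN_F_*, ... */

and the vhost-user message bits (the request IDs, version/flags masks,
message layout) could be shared with the equivalent definitions in
lib/librte_vhost/vhost_user/vhost-net-user.h rather than copied.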


> +
> +#define VHOST_VIRTIO 0xAF
> +
> +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
> +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
> +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
> +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
> +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
> +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
> +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
> +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
> +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
> +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
> +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
> +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
> +
> +/*****************************************************************************/
> +
> +/* Ioctl defines */
> +#define TUNSETIFF     _IOW('T', 202, int)
> +#define TUNGETFEATURES _IOR('T', 207, unsigned int)
> +#define TUNSETOFFLOAD  _IOW('T', 208, unsigned int)
> +#define TUNGETIFF      _IOR('T', 210, unsigned int)
> +#define TUNSETSNDBUF   _IOW('T', 212, int)
> +#define TUNGETVNETHDRSZ _IOR('T', 215, int)
> +#define TUNSETVNETHDRSZ _IOW('T', 216, int)
> +#define TUNSETQUEUE  _IOW('T', 217, int)
> +#define TUNSETVNETLE _IOW('T', 220, int)
> +#define TUNSETVNETBE _IOW('T', 222, int)
> +
> +/* TUNSETIFF ifr flags */
> +#define IFF_TAP          0x0002
> +#define IFF_NO_PI        0x1000
> +#define IFF_ONE_QUEUE    0x2000
> +#define IFF_VNET_HDR     0x4000
> +#define IFF_MULTI_QUEUE  0x0100
> +#define IFF_ATTACH_QUEUE 0x0200
> +#define IFF_DETACH_QUEUE 0x0400
> +
> +/* Features for GSO (TUNSETOFFLOAD). */
> +#define TUN_F_CSUM   0x01    /* You can hand me unchecksummed packets. */
> +#define TUN_F_TSO4   0x02    /* I can handle TSO for IPv4 packets */
> +#define TUN_F_TSO6   0x04    /* I can handle TSO for IPv6 packets */
> +#define TUN_F_TSO_ECN        0x08    /* I can handle TSO with ECN bits. */
> +#define TUN_F_UFO    0x10    /* I can handle UFO packets */
> +
> +#define PATH_NET_TUN "/dev/net/tun"
> +
> +#endif
> diff --git a/drivers/net/virtio/vhost_embedded.c b/drivers/net/virtio/vhost_embedded.c
> new file mode 100644
> index 0000000..0073b86
> --- /dev/null
> +++ b/drivers/net/virtio/vhost_embedded.c
> @@ -0,0 +1,809 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <stdio.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <string.h>
> +#include <errno.h>
> +#include <assert.h>
> +#include <sys/mman.h>
> +#include <unistd.h>
> +#include <sys/eventfd.h>
> +#include <sys/ioctl.h>
> +#include <net/if.h>
> +
> +#include <rte_mbuf.h>
> +#include <rte_memory.h>
> +#include <rte_eal_memconfig.h>
> +
> +#include "virtio_pci.h"
> +#include "virtio_logs.h"
> +#include "virtio_ethdev.h"
> +#include "virtqueue.h"
> +#include "vhost.h"
> +
> +static int
> +vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num)
> +{
> +     int r;
> +     struct msghdr msgh;
> +     struct iovec iov;
> +     size_t fd_size = fd_num * sizeof(int);
> +     char control[CMSG_SPACE(fd_size)];
> +     struct cmsghdr *cmsg;
> +
> +     bzero(&msgh, sizeof(msgh));
> +     bzero(control, sizeof(control));
> +
> +     iov.iov_base = (uint8_t *)buf;
> +     iov.iov_len = len;
> +
> +     msgh.msg_iov = &iov;
> +     msgh.msg_iovlen = 1;
> +     msgh.msg_control = control;
> +     msgh.msg_controllen = sizeof(control);
> +
> +     cmsg = CMSG_FIRSTHDR(&msgh);
> +     cmsg->cmsg_len = CMSG_LEN(fd_size);
> +     cmsg->cmsg_level = SOL_SOCKET;
> +     cmsg->cmsg_type = SCM_RIGHTS;
> +     memcpy(CMSG_DATA(cmsg), fds, fd_size);
> +
> +     do {
> +             r = sendmsg(fd, &msgh, 0);
> +     } while (r < 0 && errno == EINTR);
> +
> +     return r;
> +}
> +
> +static int
> +vhost_user_read(int fd, struct vhost_user_msg *msg)
> +{
> +     uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
> +     int ret, sz_hdr = VHOST_USER_HDR_SIZE, sz_payload;
> +
> +     ret = recv(fd, (void *)msg, sz_hdr, 0);
> +     if (ret < sz_hdr) {
> +             PMD_DRV_LOG(ERR, "Failed to recv msg hdr: %d instead of %d.",
> +                         ret, sz_hdr);
> +             goto fail;
> +     }
> +
> +     /* validate msg flags */
> +     if (msg->flags != (valid_flags)) {
> +             PMD_DRV_LOG(ERR, "Failed to recv msg: flags %x instead of %x.",
> +                         msg->flags, valid_flags);
> +             goto fail;
> +     }
> +
> +     sz_payload = msg->size;
> +     if (sz_payload) {
> +             ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0);
> +             if (ret < sz_payload) {
> +                     PMD_DRV_LOG(ERR, "Failed to recv msg payload: %d instead of %d.",
> +                                 ret, msg->size);
> +                     goto fail;
> +             }
> +     }
> +
> +     return 0;
> +
> +fail:
> +     return -1;
> +}
> +
> +static struct vhost_user_msg m __rte_unused;
> +
> +static void
> +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
> +{
> +     int i, num;
> +     struct back_file *huges;
> +     struct vhost_memory_region *mr;
> +
> +     num = rte_eal_get_backfile_info(&huges);
> +
> +     if (num > VHOST_MEMORY_MAX_NREGIONS)
> +             rte_panic("%d files exceed maximum of %d for vhost-user\n",
> +                       num, VHOST_MEMORY_MAX_NREGIONS);
> +
> +     for (i = 0; i < num; ++i) {
> +             mr = &msg->payload.memory.regions[i];
> +             mr->guest_phys_addr = (uint64_t)huges[i].addr; /* use vaddr! */
> +             mr->userspace_addr = (uint64_t)huges[i].addr;
> +             mr->memory_size = huges[i].size;
> +             mr->mmap_offset = 0;
> +             fds[i] = open(huges[i].filepath, O_RDWR);
> +     }
> +
> +     msg->payload.memory.nregions = num;
> +     msg->payload.memory.padding = 0;
> +     free(huges);
> +}
> +
> +static int
> +vhost_user_sock(struct virtio_hw *hw, unsigned long int req, void *arg)
> +{
> +     struct vhost_user_msg msg;
> +     struct vhost_vring_file *file = 0;
> +     int need_reply = 0;
> +     int fds[VHOST_MEMORY_MAX_NREGIONS];
> +     int fd_num = 0;
> +     int i, len;
> +
> +     msg.request = req;
> +     msg.flags = VHOST_USER_VERSION;
> +     msg.size = 0;
> +
> +     switch (req) {
> +     case VHOST_USER_GET_FEATURES:
> +             need_reply = 1;
> +             break;
> +
> +     case VHOST_USER_SET_FEATURES:
> +     case VHOST_USER_SET_LOG_BASE:
> +             msg.payload.u64 = *((__u64 *)arg);
> +             msg.size = sizeof(m.payload.u64);
> +             break;
> +
> +     case VHOST_USER_SET_OWNER:
> +     case VHOST_USER_RESET_OWNER:
> +             break;
> +
> +     case VHOST_USER_SET_MEM_TABLE:
> +             prepare_vhost_memory_user(&msg, fds);
> +             fd_num = msg.payload.memory.nregions;
> +             msg.size = sizeof(m.payload.memory.nregions);
> +             msg.size += sizeof(m.payload.memory.padding);
> +             msg.size += fd_num * sizeof(struct vhost_memory_region);
> +             break;
> +
> +     case VHOST_USER_SET_LOG_FD:
> +             fds[fd_num++] = *((int *)arg);
> +             break;
> +
> +     case VHOST_USER_SET_VRING_NUM:
> +     case VHOST_USER_SET_VRING_BASE:
> +             memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
> +             msg.size = sizeof(m.payload.state);
> +             break;
> +
> +     case VHOST_USER_GET_VRING_BASE:
> +             memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
> +             msg.size = sizeof(m.payload.state);
> +             need_reply = 1;
> +             break;
> +
> +     case VHOST_USER_SET_VRING_ADDR:
> +             memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
> +             msg.size = sizeof(m.payload.addr);
> +             break;
> +
> +     case VHOST_USER_SET_VRING_KICK:
> +     case VHOST_USER_SET_VRING_CALL:
> +     case VHOST_USER_SET_VRING_ERR:
> +             file = arg;
> +             msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
> +             msg.size = sizeof(m.payload.u64);
> +             if (file->fd > 0)
> +                     fds[fd_num++] = file->fd;
> +             else
> +                     msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
> +             break;
> +
> +     default:
> +             PMD_DRV_LOG(ERR, "vhost-user trying to send unhandled msg type");
> +             return -1;
> +     }
> +
> +     len = VHOST_USER_HDR_SIZE + msg.size;
> +     if (vhost_user_write(hw->vhostfd, &msg, len, fds, fd_num) < 0)
> +             return 0;
> +
> +     if (req == VHOST_USER_SET_MEM_TABLE)
> +             for (i = 0; i < fd_num; ++i)
> +                     close(fds[i]);
> +
> +     if (need_reply) {
> +             if (vhost_user_read(hw->vhostfd, &msg) < 0)
> +                     return -1;
> +
> +             if (req != msg.request) {
> +                     PMD_DRV_LOG(ERR, "Received unexpected msg type.");
> +                     return -1;
> +             }
> +
> +             switch (req) {
> +             case VHOST_USER_GET_FEATURES:
> +                     if (msg.size != sizeof(m.payload.u64)) {
> +                             PMD_DRV_LOG(ERR, "Received bad msg size.");
> +                             return -1;
> +                     }
> +                     *((__u64 *)arg) = msg.payload.u64;
> +                     break;
> +             case VHOST_USER_GET_VRING_BASE:
> +                     if (msg.size != sizeof(m.payload.state)) {
> +                             PMD_DRV_LOG(ERR, "Received bad msg size.");
> +                             return -1;
> +                     }
> +                     memcpy(arg, &msg.payload.state,
> +                            sizeof(struct vhost_vring_state));
> +                     break;
> +             default:
> +                     PMD_DRV_LOG(ERR, "Received unexpected msg type.");
> +                     return -1;
> +             }
> +     }
> +
> +     return 0;
> +}
> +
> +static int
> +vhost_kernel_ioctl(struct virtio_hw *hw, unsigned long int req, void *arg)
> +{
> +     return ioctl(hw->vhostfd, req, arg);
> +}
> +
> +enum {
> +     VHOST_MSG_SET_OWNER,
> +     VHOST_MSG_SET_FEATURES,
> +     VHOST_MSG_GET_FEATURES,
> +     VHOST_MSG_SET_VRING_CALL,
> +     VHOST_MSG_SET_VRING_NUM,
> +     VHOST_MSG_SET_VRING_BASE,
> +     VHOST_MSG_GET_VRING_BASE,
> +     VHOST_MSG_SET_VRING_ADDR,
> +     VHOST_MSG_SET_VRING_KICK,
> +     VHOST_MSG_SET_MEM_TABLE,
> +     VHOST_MSG_MAX,
> +};
> +
> +static const char * const vhost_msg_strings[] = {
> +     [VHOST_MSG_SET_OWNER] = "VHOST_MSG_SET_OWNER",
> +     [VHOST_MSG_SET_FEATURES] = "VHOST_MSG_SET_FEATURES",
> +     [VHOST_MSG_GET_FEATURES] = "VHOST_MSG_GET_FEATURES",
> +     [VHOST_MSG_SET_VRING_CALL] = "VHOST_MSG_SET_VRING_CALL",
> +     [VHOST_MSG_SET_VRING_NUM] = "VHOST_MSG_SET_VRING_NUM",
> +     [VHOST_MSG_SET_VRING_BASE] = "VHOST_MSG_SET_VRING_BASE",
> +     [VHOST_MSG_GET_VRING_BASE] = "VHOST_MSG_GET_VRING_BASE",
> +     [VHOST_MSG_SET_VRING_ADDR] = "VHOST_MSG_SET_VRING_ADDR",
> +     [VHOST_MSG_SET_VRING_KICK] = "VHOST_MSG_SET_VRING_KICK",
> +     [VHOST_MSG_SET_MEM_TABLE] = "VHOST_MSG_SET_MEM_TABLE",
> +     NULL,
> +};
> +
> +static unsigned long int vhost_req_map[][2] = {
> +     [VHOST_MSG_SET_OWNER] = {
> +             VHOST_SET_OWNER, VHOST_USER_SET_OWNER
> +     },
> +     [VHOST_MSG_SET_FEATURES] = {
> +             VHOST_SET_FEATURES, VHOST_USER_SET_FEATURES
> +     },
> +     [VHOST_MSG_GET_FEATURES] = {
> +             VHOST_GET_FEATURES, VHOST_USER_GET_FEATURES
> +     },
> +     [VHOST_MSG_SET_VRING_CALL] = {
> +             VHOST_SET_VRING_CALL, VHOST_USER_SET_VRING_CALL
> +     },
> +     [VHOST_MSG_SET_VRING_NUM] = {
> +             VHOST_SET_VRING_NUM, VHOST_USER_SET_VRING_NUM
> +     },
> +     [VHOST_MSG_SET_VRING_BASE] = {
> +             VHOST_SET_VRING_BASE, VHOST_USER_SET_VRING_BASE
> +     },
> +     [VHOST_MSG_GET_VRING_BASE] = {
> +             VHOST_GET_VRING_BASE, VHOST_USER_GET_VRING_BASE
> +     },
> +     [VHOST_MSG_SET_VRING_ADDR] = {
> +             VHOST_SET_VRING_ADDR, VHOST_USER_SET_VRING_ADDR
> +     },
> +     [VHOST_MSG_SET_VRING_KICK] = {
> +             VHOST_SET_VRING_KICK, VHOST_USER_SET_VRING_KICK
> +     },
> +     [VHOST_MSG_SET_MEM_TABLE] = {
> +             VHOST_SET_MEM_TABLE, VHOST_USER_SET_MEM_TABLE
> +     },
> +};
> +
> +static int
> +vhost_call(struct virtio_hw *hw, unsigned long int req_orig, void *arg)
> +{
> +     unsigned long int req_new;
> +     int ret;
> +
> +     if (req_orig >= VHOST_MSG_MAX)
> +             rte_panic("invalid req: %lu\n", req_orig);
> +
> +     PMD_DRV_LOG(INFO, "%s\n", vhost_msg_strings[req_orig]);
> +     req_new = vhost_req_map[req_orig][hw->type];
> +     if (hw->type == VHOST_USER)
> +             ret = vhost_user_sock(hw, req_new, arg);
> +     else
> +             ret = vhost_kernel_ioctl(hw, req_new, arg);
> +
> +     if (ret < 0)
> +             rte_panic("vhost_call %s failed: %s\n",
> +                       vhost_msg_strings[req_orig], strerror(errno));
> +
> +     return ret;
> +}
> +
> +static void
> +kick_one_vq(struct virtio_hw *hw, struct virtqueue *vq, unsigned queue_sel)
> +{
> +     int callfd, kickfd;
> +     struct vhost_vring_file file;
> +     struct vhost_vring_state state;
> +     struct vhost_vring_addr addr = {
> +             .index = queue_sel,
> +             .desc_user_addr = (uint64_t)(uintptr_t)vq->vq_ring.desc,
> +             .avail_user_addr = (uint64_t)(uintptr_t)vq->vq_ring.avail,
> +             .used_user_addr = (uint64_t)(uintptr_t)vq->vq_ring.used,
> +             .log_guest_addr = 0,
> +             .flags = 0, /* disable log */
> +     };
> +
> +     /* or use invalid flag to disable it, but vhost-dpdk uses this to judge
> +      * if dev is alive. so finally we need two real event_fds.
> +      */
> +     /* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL come
> +      * firstly because vhost depends on this msg to allocate virtqueue
> +      * pair.
> +      */
> +     callfd = eventfd(0, O_CLOEXEC | O_NONBLOCK);
> +     if (callfd < 0)
> +             rte_panic("callfd error, %s\n", strerror(errno));
> +
> +     file.index = queue_sel;
> +     file.fd = callfd;
> +     vhost_call(hw, VHOST_MSG_SET_VRING_CALL, &file);
> +     hw->callfds[queue_sel] = callfd;
> +
> +     state.index = queue_sel;
> +     state.num = vq->vq_ring.num;
> +     vhost_call(hw, VHOST_MSG_SET_VRING_NUM, &state);
> +
> +     state.num = 0; /* no reservation */
> +     vhost_call(hw, VHOST_MSG_SET_VRING_BASE, &state);
> +
> +     vhost_call(hw, VHOST_MSG_SET_VRING_ADDR, &addr);
> +
> +     /* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_KICK comes
> +      * lastly because vhost depends on this msg to judge if
> +      * virtio_is_ready().
> +      */
> +     kickfd = eventfd(0, O_CLOEXEC | O_NONBLOCK);
> +     if (kickfd < 0)
> +             rte_panic("kickfd error, %s\n", strerror(errno));
> +
> +     file.fd = kickfd;
> +     vhost_call(hw, VHOST_MSG_SET_VRING_KICK, &file);
> +     hw->kickfds[queue_sel] = kickfd;
> +}
> +
> +/**
> + * Merge those virtually adjacent memsegs into one region.
> + */
> +static void
> +prepare_vhost_memory_kernel(struct vhost_memory_kernel **p_vm)
> +{
> +     unsigned i, j, k = 0;
> +     struct rte_memseg *seg;
> +     struct vhost_memory_region *mr;
> +     struct vhost_memory_kernel *vm;
> +
> +     vm = malloc(sizeof(struct vhost_memory_kernel) +
> +                 RTE_MAX_MEMSEG * sizeof(struct vhost_memory_region));
> +
> +     for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
> +             seg = &rte_eal_get_configuration()->mem_config->memseg[i];
> +             if (!seg->addr)
> +                     break;
> +
> +             int new_region = 1;
> +
> +             for (j = 0; j < k; ++j) {
> +                     mr = &vm->regions[j];
> +
> +                     if (mr->userspace_addr + mr->memory_size ==
> +                         (uint64_t)seg->addr) {
> +                             mr->memory_size += seg->len;
> +                             new_region = 0;
> +                             break;
> +                     }
> +
> +                     if ((uint64_t)seg->addr + seg->len ==
> +                         mr->userspace_addr) {
> +                             mr->guest_phys_addr = (uint64_t)seg->addr;
> +                             mr->userspace_addr = (uint64_t)seg->addr;
> +                             mr->memory_size += seg->len;
> +                             new_region = 0;
> +                             break;
> +                     }
> +             }
> +
> +             if (new_region == 0)
> +                     continue;
> +
> +             mr = &vm->regions[k++];
> +             mr->guest_phys_addr = (uint64_t)seg->addr; /* use vaddr here! */
> +             mr->userspace_addr = (uint64_t)seg->addr;
> +             mr->memory_size = seg->len;
> +             mr->mmap_offset = 0;
> +     }
> +
> +     vm->nregions = k;
> +     vm->padding = 0;
> +     *p_vm = vm;
> +}
> +
> +static void kick_all_vq(struct virtio_hw *hw)
> +{
> +     uint64_t features;
> +     unsigned i, queue_sel, nvqs;
> +     struct rte_eth_dev_data *data = hw->data;
> +
> +     if (hw->type == VHOST_KERNEL) {
> +             struct vhost_memory_kernel *vm = NULL;
> +
> +             prepare_vhost_memory_kernel(&vm);
> +             vhost_call(hw, VHOST_MSG_SET_MEM_TABLE, vm);
> +             free(vm);
> +     } else {
> +             /* construct vhost_memory inside prepare_vhost_memory_user() */
> +             vhost_call(hw, VHOST_MSG_SET_MEM_TABLE, NULL);
> +     }
> +
> +     for (i = 0; i < data->nb_rx_queues; ++i) {
> +             queue_sel = 2 * i + VTNET_SQ_RQ_QUEUE_IDX;
> +             kick_one_vq(hw, data->rx_queues[i], queue_sel);
> +     }
> +     for (i = 0; i < data->nb_tx_queues; ++i) {
> +             queue_sel = 2 * i + VTNET_SQ_TQ_QUEUE_IDX;
> +             kick_one_vq(hw, data->tx_queues[i], queue_sel);
> +     }
> +
> +     /* after setup all virtqueues, we need to set_features again
> +      * so that these features can be set into each virtqueue in
> +      * vhost side.
> +      */
> +     features = hw->guest_features;
> +     features &= ~(1ull << VIRTIO_NET_F_MAC);
> +     vhost_call(hw, VHOST_MSG_SET_FEATURES, &features);
> +     if (hw->type == VHOST_KERNEL)
> +             if (ioctl(hw->backfd, TUNSETVNETHDRSZ,
> +                       &hw->vtnet_hdr_size) == -1)
> +                     rte_panic("TUNSETVNETHDRSZ failed: %s\n",
> +                               strerror(errno));
> +     PMD_DRV_LOG(INFO, "set features:%" PRIx64 "\n", features);
> +
> +     if (hw->type == VHOST_KERNEL) {
> +             struct vhost_vring_file file;
> +
> +             file.fd = hw->backfd;
> +             nvqs = data->nb_rx_queues + data->nb_tx_queues;
> +             for (file.index = 0; file.index < nvqs; ++file.index) {
> +                     if (vhost_kernel_ioctl(hw, VHOST_NET_SET_BACKEND,
> +                                            &file) < 0)
> +                             rte_panic("VHOST_NET_SET_BACKEND failed, %s\n",
> +                                       strerror(errno));
> +             }
> +     }
> +}
> +
> +static void
> +vdev_read_dev_config(struct virtio_hw *hw, uint64_t offset,
> +                  void *dst, int length)
> +{
> +     if (offset == offsetof(struct virtio_net_config, mac) &&
> +         length == ETHER_ADDR_LEN) {
> +             int i;
> +
> +             for (i = 0; i < ETHER_ADDR_LEN; ++i)
> +                     ((uint8_t *)dst)[i] = hw->mac_addr[i];
> +             return;
> +     }
> +
> +     if (offset == offsetof(struct virtio_net_config, status))
> +             *(uint16_t *)dst = hw->status;
> +
> +     if (offset == offsetof(struct virtio_net_config, max_virtqueue_pairs))
> +             *(uint16_t *)dst = hw->max_tx_queues;
> +}
> +
> +static void
> +vdev_write_dev_config(struct virtio_hw *hw, uint64_t offset,
> +                   const void *src, int length)
> +{
> +     int i;
> +
> +     if ((offset == offsetof(struct virtio_net_config, mac)) &&
> +         (length == ETHER_ADDR_LEN))
> +             for (i = 0; i < ETHER_ADDR_LEN; ++i)
> +                     hw->mac_addr[i] = ((const uint8_t *)src)[i];
> +     else
> +             rte_panic("offset=%" PRIu64 ", length=%d\n", offset, length);
> +}
> +
> +static void
> +vdev_set_status(struct virtio_hw *hw, uint8_t status)
> +{
> +     if (status & VIRTIO_CONFIG_S_DRIVER_OK)
> +             kick_all_vq(hw);
> +     hw->status = status;
> +}
> +
> +static void
> +vdev_reset(struct virtio_hw *hw __rte_unused)
> +{
> +     /* do nothing according to qemu vhost user spec */
> +}
> +
> +static uint8_t
> +vdev_get_status(struct virtio_hw *hw)
> +{
> +     return hw->status;
> +}
> +
> +static uint64_t
> +vdev_get_features(struct virtio_hw *hw)
> +{
> +     uint64_t host_features;
> +
> +     vhost_call(hw, VHOST_MSG_GET_FEATURES, &host_features);
> +     if (hw->mac_specified)
> +             host_features |= (1ull << VIRTIO_NET_F_MAC);
> +     /* disable it until we support CQ */
> +     host_features &= ~(1ull << VIRTIO_NET_F_CTRL_VQ);
> +     host_features &= ~(1ull << VIRTIO_NET_F_CTRL_RX);
> +     return host_features;
> +}
> +
> +static void
> +vdev_set_features(struct virtio_hw *hw, uint64_t features)
> +{
> +     features &= ~(1ull << VIRTIO_NET_F_MAC);
> +     vhost_call(hw, VHOST_MSG_SET_FEATURES, &features);
> +}
> +
> +static uint8_t
> +vdev_get_isr(struct virtio_hw *hw __rte_unused)
> +{
> +     rte_panic("");
> +}
> +
> +static uint16_t
> +vdev_set_config_irq(struct virtio_hw *hw __rte_unused,
> +                 uint16_t vec __rte_unused)
> +{
> +     rte_panic("");
> +}
> +
> +static uint16_t
> +vdev_get_queue_num(struct virtio_hw *hw,
> +                uint16_t queue_id __rte_unused)
> +{
> +     return hw->queue_num;
> +}
> +
> +static void
> +vdev_setup_queue(struct virtio_hw *hw __rte_unused,
> +              struct virtqueue *vq __rte_unused)
> +{
> +     /* do nothing */
> +}
> +
> +static void
> +vdev_del_queue(struct virtio_hw *hw __rte_unused,
> +            struct virtqueue *vq)
> +{
> +     struct vhost_vring_state state = {
> +             .index = vq->vq_queue_index,
> +     };
> +
> +     vhost_call(hw, VHOST_MSG_GET_VRING_BASE, &state);
> +     PMD_DRV_LOG(DEBUG, "state.num = %d\n", state.num);
> +}
> +
> +static void
> +vdev_notify_queue(struct virtio_hw *hw, struct virtqueue *vq)
> +{
> +     uint64_t buf = 1;
> +
> +     if (write(hw->kickfds[vq->vq_queue_index],
> +               &buf, sizeof(uint64_t)) == -1)
> +             rte_panic("%s\n", strerror(errno));
> +}
> +
> +static const struct virtio_pci_ops vdev_ops = {
> +     .read_dev_cfg   = vdev_read_dev_config,
> +     .write_dev_cfg  = vdev_write_dev_config,
> +     .reset          = vdev_reset,
> +     .get_status     = vdev_get_status,
> +     .set_status     = vdev_set_status,
> +     .get_features   = vdev_get_features,
> +     .set_features   = vdev_set_features,
> +     .get_isr        = vdev_get_isr,
> +     .set_config_irq = vdev_set_config_irq,
> +     .get_queue_num  = vdev_get_queue_num,
> +     .setup_queue    = vdev_setup_queue,
> +     .del_queue      = vdev_del_queue,
> +     .notify_queue   = vdev_notify_queue,
> +};
> +
> +#define TUN_DEF_SNDBUF       (1ull << 20)
> +
> +static void
> +vhost_kernel_backend_setup(struct virtio_hw *hw, char *ifname)
> +{
> +     int fd;
> +     int len = sizeof(struct virtio_net_hdr);
> +     int req_mq = 0;
> +     int sndbuf = TUN_DEF_SNDBUF;
> +     unsigned int features;
> +     struct ifreq ifr;
> +
> +     /* TODO:
> +      * 1. get/set offload capability, tap_probe_has_ufo, tap_fd_set_offload
> +      * 2. verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len
> +      * 3. get number of memory regions from vhost module parameter
> +      * max_mem_regions, supported in newer version linux kernel
> +      */
> +
> +     fd = open(PATH_NET_TUN, O_RDWR);
> +     if (fd < 0)
> +             rte_panic("open %s error, %s\n", PATH_NET_TUN, strerror(errno));
> +
> +     memset(&ifr, 0, sizeof(ifr));
> +     ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
> +
> +     if (ioctl(fd, TUNGETFEATURES, &features) == -1)
> +             rte_panic("TUNGETFEATURES failed: %s", strerror(errno));
> +
> +     if (features & IFF_ONE_QUEUE)
> +             ifr.ifr_flags |= IFF_ONE_QUEUE;
> +
> +     if (features & IFF_VNET_HDR)
> +             ifr.ifr_flags |= IFF_VNET_HDR;
> +     else
> +             rte_panic("vnet_hdr requested, but kernel does not support\n");
> +
> +     if (req_mq) {
> +             if (features & IFF_MULTI_QUEUE)
> +                     ifr.ifr_flags |= IFF_MULTI_QUEUE;
> +             else
> +                     rte_panic("multiqueue requested, but kernel does not support\n");
> +     }
> +
> +     if (ifname)
> +             strncpy(ifr.ifr_name, ifname, IFNAMSIZ);
> +     else
> +             strncpy(ifr.ifr_name, "tap%d", IFNAMSIZ);
> +     if (ioctl(fd, TUNSETIFF, (void *)&ifr) == -1)
> +             rte_panic("TUNSETIFF failed: %s", strerror(errno));
> +     fcntl(fd, F_SETFL, O_NONBLOCK);
> +
> +     if (ioctl(fd, TUNSETVNETHDRSZ, &len) == -1)
> +             rte_panic("TUNSETVNETHDRSZ failed: %s\n", strerror(errno));
> +
> +     if (ioctl(fd, TUNSETSNDBUF, &sndbuf) == -1)
> +             rte_panic("TUNSETSNDBUF failed: %s", strerror(errno));
> +
> +     hw->backfd = fd;
> +     hw->vhostfd = open(hw->path, O_RDWR);
> +     if (hw->vhostfd < 0)
> +             rte_panic("open %s failed: %s\n", hw->path, strerror(errno));
> +}
> +
> +static void
> +vhost_user_backend_setup(struct virtio_hw *hw)
> +{
> +     int fd;
> +     int flag;
> +     struct sockaddr_un un;
> +
> +     fd = socket(AF_UNIX, SOCK_STREAM, 0);
> +     if (fd < 0)
> +             rte_panic("socket error, %s\n", strerror(errno));
> +
> +     flag = fcntl(fd, F_GETFD);
> +     fcntl(fd, F_SETFD, flag | FD_CLOEXEC);
> +
> +     memset(&un, 0, sizeof(un));
> +     un.sun_family = AF_UNIX;
> +     snprintf(un.sun_path, sizeof(un.sun_path), "%s", hw->path);
> +     if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
> +             PMD_DRV_LOG(ERR, "connect error, %s\n", strerror(errno));
> +             rte_panic("connect error, %s\n", strerror(errno));
> +     }
> +
> +     hw->vhostfd = fd;
> +}
> +
> +void
> +virtio_vdev_init(struct rte_eth_dev_data *data, char *path,
> +              int nb_rx, int nb_tx, int nb_cq __attribute__ ((unused)),
> +              int queue_num, char *mac, char *ifname)
> +{
> +     int i, r;
> +     struct stat s;
> +     uint32_t tmp[ETHER_ADDR_LEN];
> +     struct virtio_hw *hw = data->dev_private;
> +
> +     hw->vtpci_ops = &vdev_ops;
> +     hw->io_base  = 0;
> +     hw->use_msix = 0;
> +     hw->modern   = 0;
> +
> +     hw->data = data;
> +     hw->path = strdup(path);
> +     hw->max_rx_queues = nb_rx;
> +     hw->max_tx_queues = nb_tx;
> +     hw->queue_num = queue_num;
> +     hw->mac_specified = 0;
> +     if (mac) {
> +             r = sscanf(mac, "%x:%x:%x:%x:%x:%x", &tmp[0],
> +                        &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5]);
> +             if (r == ETHER_ADDR_LEN) {
> +                     for (i = 0; i < ETHER_ADDR_LEN; ++i)
> +                             hw->mac_addr[i] = (uint8_t)tmp[i];
> +                     hw->mac_specified = 1;
> +             } else
> +                     PMD_DRV_LOG(WARN, "wrong format of mac: %s", mac);
> +     }
> +
> +     /* TODO: cq */
> +
> +     if (stat(hw->path, &s) < 0)
> +             rte_panic("stat: %s failed, %s\n", hw->path, strerror(errno));
> +
> +     switch (s.st_mode & S_IFMT) {
> +     case S_IFCHR:
> +             hw->type = VHOST_KERNEL;
> +             vhost_kernel_backend_setup(hw, ifname);
> +             break;
> +     case S_IFSOCK:
> +             hw->type = VHOST_USER;
> +             vhost_user_backend_setup(hw);
> +             break;
> +     default:
> +             rte_panic("unknown file type of %s\n", hw->path);
> +     }
> +     if (vhost_call(hw, VHOST_MSG_SET_OWNER, NULL) == -1)
> +             rte_panic("vhost set_owner failed: %s\n", strerror(errno));
> +}
> diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
> index fed9571..fde77ca 100644
> --- a/drivers/net/virtio/virtio_ethdev.h
> +++ b/drivers/net/virtio/virtio_ethdev.h
> @@ -123,5 +123,9 @@ uint16_t virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
>  #define VTNET_LRO_FEATURES (VIRTIO_NET_F_GUEST_TSO4 | \
>                           VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN)
>  
> -
> +#ifdef RTE_VIRTIO_VDEV
> +void virtio_vdev_init(struct rte_eth_dev_data *data, char *path, int nb_rx,
> +                   int nb_tx, int nb_cq, int queue_num, char *mac,
> +                   char *ifname);
> +#endif
>  #endif /* _VIRTIO_ETHDEV_H_ */
> diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> index 0544a07..a8394f8 100644
> --- a/drivers/net/virtio/virtio_pci.h
> +++ b/drivers/net/virtio/virtio_pci.h
> @@ -150,7 +150,6 @@ struct virtqueue;
>   * rest are per-device feature bits.
>   */
>  #define VIRTIO_TRANSPORT_F_START 28
> -#define VIRTIO_TRANSPORT_F_END   32
>  
>  /* The Guest publishes the used index for which it expects an interrupt
>   * at the end of the avail ring. Host should ignore the avail->flags field. */
> @@ -266,6 +265,20 @@ struct virtio_hw {
>       struct virtio_pci_common_cfg *common_cfg;
>       struct virtio_net_config *dev_cfg;
>       const struct virtio_pci_ops *vtpci_ops;
> +#ifdef RTE_VIRTIO_VDEV
> +#define VHOST_KERNEL 0
> +#define VHOST_USER   1
> +     int             type; /* type of backend */
> +     uint32_t        queue_num;
> +     char            *path;
> +     int             mac_specified;
> +     int             vhostfd;
> +     int             backfd; /* tap device used in vhost-net */
> +     int             callfds[VIRTIO_MAX_VIRTQUEUES * 2 + 1];
> +     int             kickfds[VIRTIO_MAX_VIRTQUEUES * 2 + 1];
> +     uint8_t         status;
> +     struct rte_eth_dev_data *data;
> +#endif
>  };
>  
>  /*
> -- 
> 2.1.4
