On Tue, Mar 15, 2016 at 05:17:03PM +0800, w...@redhat.com wrote:
> From: Wei Xu <w...@redhat.com>
> 
> All the data packets in a tcp connection will be cached to a big buffer
> in every receive interval, and will be sent out via a timer, the
> 'virtio_net_rsc_timeout' controls the interval, the value will influent the
> performance and response of tcp connection extremely, 50000(50us) is a
> experience value to gain a performance improvement, since the whql test
> sends packets every 100us, so '300000(300us)' can pass the test case,
> this is also the default value, it's gonna to be tunable.
> The timer will only be triggered if the packets pool is not empty,
> and it'll drain off all the cached packets
> 
> 'NetRscChain' is used to save the segments of different protocols in a
> VirtIONet device.
> 
> The main handler of TCP includes TCP window update, duplicated ACK check
> and the real data coalescing if the new segment passed sanity check
> and is identified as an 'wanted' one.
> 
> An 'wanted' segment means:
> 1. Segment is within current window and the sequence is the expected one.
> 2. ACK of the segment is in the valid window.
> 3. If the ACK in the segment is a duplicated one, then it must less than 2,
>    this is to notify upper layer TCP starting retransmission due to the spec.
> 
> Sanity check includes:
> 1. Incorrect version in IP header
> 2. IP options & IP fragment
> 3. Not a TCP packets
> 4. Sanity size check to prevent buffer overflow attack.
> 
> There maybe more cases should be considered such as ip identification other
> flags, while it broke the test because windows set it to the same even it's
> not a fragment.
> 
> Normally it includes 2 typical ways to handle a TCP control flag, 'bypass'
> and 'finalize', 'bypass' means should be sent out directly, and 'finalize'
> means the packets should also be bypassed, and this should be done
> after searching for the same connection packets in the pool and sending
> all of them out, this is to avoid out of data.
> 
> All the 'SYN' packets will be bypassed since this always begin a new'
> connection, other flags such 'FIN/RST' will trigger a finalization, because
> this normally happens upon a connection is going to be closed, an 'URG' packet
> also finalize current coalescing unit while there maybe protocol difference to
> different OS.
> 
> Statistics can be used to monitor the basic coalescing status, the 'out of 
> order'
> and 'out of window' means how many retransmitting packets, thus describe the
> performance intuitively.
> 
> Signed-off-by: Wei Xu <w...@redhat.com>
> ---
>  hw/net/virtio-net.c            | 486 
> ++++++++++++++++++++++++++++++++++++++++-
>  include/hw/virtio/virtio-net.h |   1 +
>  include/hw/virtio/virtio.h     |  72 ++++++
>  3 files changed, 558 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 5798f87..c23b45f 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -15,10 +15,12 @@
>  #include "qemu/iov.h"
>  #include "hw/virtio/virtio.h"
>  #include "net/net.h"
> +#include "net/eth.h"
>  #include "net/checksum.h"
>  #include "net/tap.h"
>  #include "qemu/error-report.h"
>  #include "qemu/timer.h"
> +#include "qemu/sockets.h"
>  #include "hw/virtio/virtio-net.h"
>  #include "net/vhost_net.h"
>  #include "hw/virtio/virtio-bus.h"
> @@ -38,6 +40,35 @@
>  #define endof(container, field) \
>      (offsetof(container, field) + sizeof(((container *)0)->field))
>  
> +#define ETH_HDR_SZ (sizeof(struct eth_header))
> +#define IP4_HDR_SZ (sizeof(struct ip_header))
> +#define TCP_HDR_SZ (sizeof(struct tcp_header))
> +#define ETH_IP4_HDR_SZ (ETH_HDR_SZ + IP4_HDR_SZ)

It's better to open-code these imho.

> +
> +#define IP4_ADDR_SIZE   8                   /* ipv4 saddr + daddr */
> +#define TCP_PORT_SIZE   4                   /* sport + dport */
> +
> +/* IPv4 max payload, 16 bits in the header */
> +#define MAX_IP4_PAYLOAD (65535 - IP4_HDR_SZ)
> +#define MAX_TCP_PAYLOAD 65535
> +
> +/* max payload with virtio header */
> +#define MAX_VIRTIO_PAYLOAD  (sizeof(struct virtio_net_hdr_mrg_rxbuf) \
> +                                + ETH_HDR_SZ + MAX_TCP_PAYLOAD)
> +
> +#define IP4_HEADER_LEN 5 /* header lenght value in ip header without option 
> */
> +
> +/* Purge coalesced packets timer interval */
> +#define RSC_TIMER_INTERVAL  300000

Pls prefix local macros with VIRTIO_NET_


> +
> +/* Switcher to enable/disable rsc */
> +static bool virtio_net_rsc_bypass = 1;
> +
> +/* This value affects the performance a lot, and should be tuned carefully,
> +   '300000'(300us) is the recommended value to pass the WHQL test, '50000' 
> can
> +   gain 2x netperf throughput with tso/gso/gro 'off'. */

So either tests pass or we get good performance, but not both?
Hmm.

> +static uint32_t virtio_net_rsc_timeout = RSC_TIMER_INTERVAL;


This would beed to be tunable.

> +
>  typedef struct VirtIOFeature {
>      uint32_t flags;
>      size_t end;
> @@ -1089,7 +1120,8 @@ static int receive_filter(VirtIONet *n, const uint8_t 
> *buf, int size)
>      return 0;
>  }
>  
> -static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, 
> size_t size)
> +static ssize_t virtio_net_do_receive(NetClientState *nc,
> +                                  const uint8_t *buf, size_t size)
>  {
>      VirtIONet *n = qemu_get_nic_opaque(nc);
>      VirtIONetQueue *q = virtio_net_get_subqueue(nc);
> @@ -1685,6 +1717,456 @@ static int virtio_net_load_device(VirtIODevice *vdev, 
> QEMUFile *f,
>      return 0;
>  }
>  
> +static void virtio_net_rsc_extract_unit4(NetRscChain *chain,
> +                                         const uint8_t *buf, NetRscUnit* 
> unit)
> +{
> +    uint16_t ip_hdrlen;
> +
> +    unit->ip = (struct ip_header *)(buf + chain->hdr_size + ETH_HDR_SZ);
> +    ip_hdrlen = ((0xF & unit->ip->ip_ver_len) << 2);
> +    unit->ip_plen = &unit->ip->ip_len;
> +    unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip) + ip_hdrlen);
> +    unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
> +    unit->payload = htons(*unit->ip_plen) - ip_hdrlen - unit->tcp_hdrlen;
> +}
> +
> +static void virtio_net_rsc_ipv4_checksum(struct ip_header *ip)
> +{
> +    uint32_t sum;
> +
> +    ip->ip_sum = 0;
> +    sum = net_checksum_add_cont(IP4_HDR_SZ, (uint8_t *)ip, 0);
> +    ip->ip_sum = cpu_to_be16(net_checksum_finish(sum));
> +}
> +
> +static size_t virtio_net_rsc_drain_seg(NetRscChain *chain, NetRscSeg *seg)
> +{
> +    int ret;
> +
> +    virtio_net_rsc_ipv4_checksum(seg->unit.ip);
> +    ret = virtio_net_do_receive(seg->nc, seg->buf, seg->size);
> +    QTAILQ_REMOVE(&chain->buffers, seg, next);
> +    g_free(seg->buf);
> +    g_free(seg);
> +
> +    return ret;
> +}
> +
> +static void virtio_net_rsc_purge(void *opq)
> +{
> +    NetRscChain *chain = (NetRscChain *)opq;
> +    NetRscSeg *seg, *rn;
> +
> +    QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn) {
> +        if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
> +            chain->stat.purge_failed++;
> +            continue;
> +        }
> +    }
> +
> +    chain->stat.timer++;
> +    if (!QTAILQ_EMPTY(&chain->buffers)) {
> +        timer_mod(chain->drain_timer,
> +              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 
> virtio_net_rsc_timeout);
> +    }
> +}
> +
> +static void virtio_net_rsc_cleanup(VirtIONet *n)
> +{
> +    NetRscChain *chain, *rn_chain;
> +    NetRscSeg *seg, *rn_seg;
> +
> +    QTAILQ_FOREACH_SAFE(chain, &n->rsc_chains, next, rn_chain) {
> +        QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn_seg) {
> +            QTAILQ_REMOVE(&chain->buffers, seg, next);
> +            g_free(seg->buf);
> +            g_free(seg);
> +        }
> +
> +        timer_del(chain->drain_timer);
> +        timer_free(chain->drain_timer);
> +        QTAILQ_REMOVE(&n->rsc_chains, chain, next);
> +        g_free(chain);
> +    }
> +}
> +
> +static void virtio_net_rsc_cache_buf(NetRscChain *chain, NetClientState *nc,
> +                                     const uint8_t *buf, size_t size)
> +{
> +    NetRscSeg *seg;
> +
> +    seg = g_malloc(sizeof(NetRscSeg));
> +    seg->buf = g_malloc(MAX_VIRTIO_PAYLOAD);
> +
> +    memmove(seg->buf, buf, size);
> +    seg->size = size;
> +    seg->dup_ack_count = 0;
> +    seg->is_coalesced = 0;
> +    seg->nc = nc;
> +
> +    QTAILQ_INSERT_TAIL(&chain->buffers, seg, next);
> +    chain->stat.cache++;
> +
> +    virtio_net_rsc_extract_unit4(chain, seg->buf, &seg->unit);
> +}
> +
> +static int32_t virtio_net_rsc_handle_ack(NetRscChain *chain, NetRscSeg *seg,
> +                                 const uint8_t *buf, struct tcp_header 
> *n_tcp,
> +                                 struct tcp_header *o_tcp)
> +{
> +    uint32_t nack, oack;
> +    uint16_t nwin, owin;
> +
> +    nack = htonl(n_tcp->th_ack);
> +    nwin = htons(n_tcp->th_win);
> +    oack = htonl(o_tcp->th_ack);
> +    owin = htons(o_tcp->th_win);
> +
> +    if ((nack - oack) >= MAX_TCP_PAYLOAD) {
> +        chain->stat.ack_out_of_win++;
> +        return RSC_FINAL;
> +    } else if (nack == oack) {
> +        /* duplicated ack or window probe */
> +        if (nwin == owin) {
> +            /* duplicated ack, add dup ack count due to whql test up to 1 */
> +            chain->stat.dup_ack++;
> +
> +            if (seg->dup_ack_count == 0) {
> +                seg->dup_ack_count++;
> +                chain->stat.dup_ack1++;
> +                return RSC_COALESCE;
> +            } else {
> +                /* Spec says should send it directly */
> +                chain->stat.dup_ack2++;
> +                return RSC_FINAL;
> +            }
> +        } else {
> +            /* Coalesce window update */
> +            o_tcp->th_win = n_tcp->th_win;
> +            chain->stat.win_update++;
> +            return RSC_COALESCE;
> +        }
> +    } else {
> +        /* pure ack, update ack */
> +        o_tcp->th_ack = n_tcp->th_ack;
> +        chain->stat.pure_ack++;
> +        return RSC_COALESCE;
> +    }
> +}
> +
> +static int32_t virtio_net_rsc_coalesce_data(NetRscChain *chain, NetRscSeg 
> *seg,
> +                                    const uint8_t *buf, NetRscUnit *n_unit)
> +{
> +    void *data;
> +    uint16_t o_ip_len;
> +    uint32_t nseq, oseq;
> +    NetRscUnit *o_unit;
> +
> +    o_unit = &seg->unit;
> +    o_ip_len = htons(*o_unit->ip_plen);
> +    nseq = htonl(n_unit->tcp->th_seq);
> +    oseq = htonl(o_unit->tcp->th_seq);
> +
> +    if (n_unit->tcp_hdrlen > TCP_HDR_SZ) {
> +        /* Log this only for debugging observation */
> +        chain->stat.tcp_option++;
> +    }
> +
> +    /* Ignore packet with more/larger tcp options */
> +    if (n_unit->tcp_hdrlen > o_unit->tcp_hdrlen) {
> +        chain->stat.tcp_larger_option++;
> +        return RSC_FINAL;
> +    }
> +
> +    /* out of order or retransmitted. */
> +    if ((nseq - oseq) > MAX_TCP_PAYLOAD) {
> +        chain->stat.data_out_of_win++;
> +        return RSC_FINAL;
> +    }
> +
> +    data = ((uint8_t *)n_unit->tcp) + n_unit->tcp_hdrlen;
> +    if (nseq == oseq) {
> +        if ((0 == o_unit->payload) && n_unit->payload) {
> +            /* From no payload to payload, normal case, not a dup ack or etc 
> */
> +            chain->stat.data_after_pure_ack++;
> +            goto coalesce;
> +        } else {
> +            return virtio_net_rsc_handle_ack(chain, seg, buf,
> +                                             n_unit->tcp, o_unit->tcp);
> +        }
> +    } else if ((nseq - oseq) != o_unit->payload) {
> +        /* Not a consistent packet, out of order */
> +        chain->stat.data_out_of_order++;
> +        return RSC_FINAL;
> +    } else {
> +coalesce:
> +        if ((o_ip_len + n_unit->payload) > chain->max_payload) {
> +            chain->stat.over_size++;
> +            return RSC_FINAL;
> +        }
> +
> +        /* Here comes the right data, the payload lengh in v4/v6 is 
> different,
> +           so use the field value to update and record the new data len */
> +        o_unit->payload += n_unit->payload; /* update new data len */
> +
> +        /* update field in ip header */
> +        *o_unit->ip_plen = htons(o_ip_len + n_unit->payload);
> +
> +        /* Bring 'PUSH' big, the whql test guide says 'PUSH' can be coalesced
> +           for windows guest, while this may change the behavior for linux
> +           guest. */
> +        o_unit->tcp->th_offset_flags = n_unit->tcp->th_offset_flags;
> +
> +        o_unit->tcp->th_ack = n_unit->tcp->th_ack;
> +        o_unit->tcp->th_win = n_unit->tcp->th_win;
> +
> +        memmove(seg->buf + seg->size, data, n_unit->payload);
> +        seg->size += n_unit->payload;
> +        chain->stat.coalesced++;
> +        return RSC_COALESCE;
> +    }
> +}
> +
> +static int32_t virtio_net_rsc_coalesce4(NetRscChain *chain, NetRscSeg *seg,
> +                        const uint8_t *buf, size_t size, NetRscUnit *unit)
> +{
> +    if ((unit->ip->ip_src ^ seg->unit.ip->ip_src)
> +        || (unit->ip->ip_dst ^ seg->unit.ip->ip_dst)
> +        || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
> +        || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
> +        chain->stat.no_match++;
> +        return RSC_NO_MATCH;
> +    }
> +
> +    return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
> +}
> +
> +/* Pakcets with 'SYN' should bypass, other flag should be sent after drain
> + * to prevent out of order */
> +static int virtio_net_rsc_tcp_ctrl_check(NetRscChain *chain,
> +                                         struct tcp_header *tcp)
> +{
> +    uint16_t tcp_flag;
> +
> +    tcp_flag = htons(tcp->th_offset_flags) & 0x3F;
> +    if (tcp_flag & TH_SYN) {
> +        chain->stat.tcp_syn++;
> +        return RSC_BYPASS;
> +    }
> +
> +    if (tcp_flag & (TH_FIN | TH_URG | TH_RST)) {
> +        chain->stat.tcp_ctrl_drain++;
> +        return RSC_FINAL;
> +    }
> +
> +    return RSC_WANT;
> +}
> +
> +static bool virtio_net_rsc_empty_cache(NetRscChain *chain, NetClientState 
> *nc,
> +                          const uint8_t *buf, size_t size)
> +{
> +    if (QTAILQ_EMPTY(&chain->buffers)) {
> +        chain->stat.empty_cache++;
> +        virtio_net_rsc_cache_buf(chain, nc, buf, size);
> +        timer_mod(chain->drain_timer,
> +              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 
> virtio_net_rsc_timeout);
> +        return 1;
> +    }
> +
> +    return 0;
> +}
> +
> +static size_t virtio_net_rsc_do_coalesce(NetRscChain *chain, NetClientState 
> *nc,
> +                          const uint8_t *buf, size_t size, NetRscUnit *unit)
> +{
> +    int ret;
> +    NetRscSeg *seg, *nseg;
> +
> +    QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
> +        ret = virtio_net_rsc_coalesce4(chain, seg, buf, size, unit);
> +
> +        if (ret == RSC_FINAL) {
> +            if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
> +                /* Send failed */
> +                chain->stat.final_failed++;
> +                return 0;
> +            }
> +
> +            /* Send current packet */
> +            return virtio_net_do_receive(nc, buf, size);
> +        } else if (ret == RSC_NO_MATCH) {
> +            continue;
> +        } else {
> +            /* Coalesced, mark coalesced flag to tell calc cksum for ipv4 */
> +            seg->is_coalesced = 1;
> +            return size;
> +        }
> +    }
> +
> +    chain->stat.no_match_cache++;
> +    virtio_net_rsc_cache_buf(chain, nc, buf, size);
> +    return size;
> +}
> +
> +/* Drain a connection data, this is to avoid out of order segments */
> +static size_t virtio_net_rsc_drain_flow(NetRscChain *chain, NetClientState 
> *nc,
> +                        const uint8_t *buf, size_t size, uint16_t ip_start,
> +                        uint16_t ip_size, uint16_t tcp_port, uint16_t 
> port_size)
> +{
> +    NetRscSeg *seg, *nseg;
> +
> +    QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
> +        if (memcmp(buf + ip_start, seg->buf + ip_start, ip_size)
> +            || memcmp(buf + tcp_port, seg->buf + tcp_port, port_size)) {
> +            continue;
> +        }
> +        if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
> +            chain->stat.drain_failed++;
> +        }
> +
> +        break;
> +    }
> +
> +    return virtio_net_do_receive(nc, buf, size);
> +}
> +
> +static int32_t virtio_net_rsc_sanity_check4(NetRscChain *chain,
> +                        struct ip_header *ip, const uint8_t *buf, size_t 
> size)
> +{
> +    uint16_t ip_len;
> +
> +    if (size < (chain->hdr_size + ETH_IP4_HDR_SZ + TCP_HDR_SZ)) {
> +        return RSC_BYPASS;
> +    }
> +
> +    /* Not an ipv4 one */
> +    if (((0xF0 & ip->ip_ver_len) >> 4) != IP_HEADER_VERSION_4) {
> +        chain->stat.ip_option++;
> +        return RSC_BYPASS;
> +    }
> +
> +    /* Don't handle packets with ip option */
> +    if (IP4_HEADER_LEN != (0xF & ip->ip_ver_len)) {
> +        chain->stat.ip_option++;
> +        return RSC_BYPASS;
> +    }
> +
> +    /* Don't handle packets with ip fragment */
> +    if (!(htons(ip->ip_off) & IP_DF)) {
> +        chain->stat.ip_frag++;
> +        return RSC_BYPASS;
> +    }
> +
> +    if (ip->ip_p != IPPROTO_TCP) {
> +        chain->stat.bypass_not_tcp++;
> +        return RSC_BYPASS;
> +    }
> +
> +    /* Sanity check */
> +    ip_len = htons(ip->ip_len);
> +    if (ip_len < (IP4_HDR_SZ + TCP_HDR_SZ)
> +        || ip_len > (size - chain->hdr_size - ETH_HDR_SZ)) {
> +        chain->stat.ip_hacked++;
> +        return RSC_BYPASS;
> +    }
> +
> +    return RSC_WANT;
> +}
> +
> +static size_t virtio_net_rsc_receive4(void *opq, NetClientState* nc,
> +                                      const uint8_t *buf, size_t size)
> +{
> +    int32_t ret;
> +    NetRscChain *chain;
> +    NetRscUnit unit;
> +
> +    chain = (NetRscChain *)opq;
> +    virtio_net_rsc_extract_unit4(chain, buf, &unit);
> +    if (RSC_WANT != virtio_net_rsc_sanity_check4(chain, unit.ip, buf, size)) 
> {
> +        return virtio_net_do_receive(nc, buf, size);
> +    }
> +
> +    ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
> +    if (ret == RSC_BYPASS) {
> +        return virtio_net_do_receive(nc, buf, size);
> +    } else if (ret == RSC_FINAL) {
> +        return virtio_net_rsc_drain_flow(chain, nc, buf, size,
> +                        ((chain->hdr_size + ETH_HDR_SZ) + 12), IP4_ADDR_SIZE,
> +                        (chain->hdr_size + ETH_IP4_HDR_SZ), TCP_PORT_SIZE);
> +    }
> +
> +    if (virtio_net_rsc_empty_cache(chain, nc, buf, size)) {
> +        return size;
> +    }
> +
> +    return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
> +}
> +
> +static NetRscChain *virtio_net_rsc_lookup_chain(VirtIONet * n,
> +                                            NetClientState *nc, uint16_t 
> proto)
> +{
> +    NetRscChain *chain;
> +
> +    /* Only handle IPv4/6 */
> +    if (proto != (uint16_t)ETH_P_IP) {
> +        return NULL;
> +    }
> +
> +    QTAILQ_FOREACH(chain, &n->rsc_chains, next) {
> +        if (chain->proto == proto) {
> +            return chain;
> +        }
> +    }
> +
> +    chain = g_malloc(sizeof(*chain));
> +    chain->hdr_size = n->guest_hdr_len;
> +    chain->proto = proto;
> +    chain->max_payload = MAX_IP4_PAYLOAD;
> +    chain->drain_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
> +                                      virtio_net_rsc_purge, chain);
> +    memset(&chain->stat, 0, sizeof(chain->stat));
> +
> +    QTAILQ_INIT(&chain->buffers);
> +    QTAILQ_INSERT_TAIL(&n->rsc_chains, chain, next);
> +
> +    return chain;
> +}
> +
> +static ssize_t virtio_net_rsc_receive(NetClientState *nc,
> +                                      const uint8_t *buf, size_t size)
> +{
> +    uint16_t proto;
> +    NetRscChain *chain;
> +    struct eth_header *eth;
> +    VirtIONet *n;
> +
> +    n = qemu_get_nic_opaque(nc);
> +    if (size < (n->guest_hdr_len + ETH_HDR_SZ)) {
> +        return virtio_net_do_receive(nc, buf, size);
> +    }
> +
> +    eth = (struct eth_header *)(buf + n->guest_hdr_len);
> +    proto = htons(eth->h_proto);
> +
> +    chain = virtio_net_rsc_lookup_chain(n, nc, proto);
> +    if (!chain) {
> +        return virtio_net_do_receive(nc, buf, size);
> +    } else {
> +        chain->stat.received++;
> +        return virtio_net_rsc_receive4(chain, nc, buf, size);
> +    }
> +}
> +
> +static ssize_t virtio_net_receive(NetClientState *nc,
> +                                  const uint8_t *buf, size_t size)
> +{
> +    if (virtio_net_rsc_bypass) {
> +        return virtio_net_do_receive(nc, buf, size);
> +    } else {
> +        return virtio_net_rsc_receive(nc, buf, size);
> +    }
> +}
> +
>  static NetClientInfo net_virtio_info = {
>      .type = NET_CLIENT_OPTIONS_KIND_NIC,
>      .size = sizeof(NICState),
> @@ -1814,6 +2296,7 @@ static void virtio_net_device_realize(DeviceState *dev, 
> Error **errp)
>      nc = qemu_get_queue(n->nic);
>      nc->rxfilter_notify_enabled = 1;
>  
> +    QTAILQ_INIT(&n->rsc_chains);
>      n->qdev = dev;
>      register_savevm(dev, "virtio-net", -1, VIRTIO_NET_VM_VERSION,
>                      virtio_net_save, virtio_net_load, n);
> @@ -1848,6 +2331,7 @@ static void virtio_net_device_unrealize(DeviceState 
> *dev, Error **errp)
>      g_free(n->vqs);
>      qemu_del_nic(n->nic);
>      virtio_cleanup(vdev);
> +    virtio_net_rsc_cleanup(n);
>  }
>  
>  static void virtio_net_instance_init(Object *obj)
> diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
> index 0cabdb6..6939e92 100644
> --- a/include/hw/virtio/virtio-net.h
> +++ b/include/hw/virtio/virtio-net.h
> @@ -59,6 +59,7 @@ typedef struct VirtIONet {
>      VirtIONetQueue *vqs;
>      VirtQueue *ctrl_vq;
>      NICState *nic;
> +    QTAILQ_HEAD(, NetRscChain) rsc_chains;
>      uint32_t tx_timeout;
>      int32_t tx_burst;
>      uint32_t has_vnet_hdr;
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index 2b5b248..3b1dfa8 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -128,6 +128,78 @@ typedef struct VirtioDeviceClass {
>      int (*load)(VirtIODevice *vdev, QEMUFile *f, int version_id);
>  } VirtioDeviceClass;
>  
> +/* Coalesced packets type & status */
> +typedef enum {
> +    RSC_COALESCE,           /* Data been coalesced */
> +    RSC_FINAL,              /* Will terminate current connection */
> +    RSC_NO_MATCH,           /* No matched in the buffer pool */
> +    RSC_BYPASS,             /* Packet to be bypass, not tcp, tcp ctrl, etc */
> +    RSC_WANT                /* Data want to be coalesced */
> +} COALESCE_STATUS;
> +
> +typedef struct NetRscStat {
> +    uint32_t received;
> +    uint32_t coalesced;
> +    uint32_t over_size;
> +    uint32_t cache;
> +    uint32_t empty_cache;
> +    uint32_t no_match_cache;
> +    uint32_t win_update;
> +    uint32_t no_match;
> +    uint32_t tcp_syn;
> +    uint32_t tcp_ctrl_drain;
> +    uint32_t dup_ack;
> +    uint32_t dup_ack1;
> +    uint32_t dup_ack2;
> +    uint32_t pure_ack;
> +    uint32_t ack_out_of_win;
> +    uint32_t data_out_of_win;
> +    uint32_t data_out_of_order;
> +    uint32_t data_after_pure_ack;
> +    uint32_t bypass_not_tcp;
> +    uint32_t tcp_option;
> +    uint32_t tcp_larger_option;
> +    uint32_t ip_frag;
> +    uint32_t ip_hacked;
> +    uint32_t ip_option;
> +    uint32_t purge_failed;
> +    uint32_t drain_failed;
> +    uint32_t final_failed;
> +    int64_t  timer;
> +} NetRscStat;
> +
> +/* Rsc unit general info used to checking if can coalescing */
> +typedef struct NetRscUnit {
> +   struct ip_header *ip;   /* ip header */
> +   uint16_t *ip_plen;      /* data len pointer in ip header field */
> +   struct tcp_header *tcp; /* tcp header */
> +   uint16_t tcp_hdrlen;    /* tcp header len */
> +   uint16_t payload;       /* pure payload without virtio/eth/ip/tcp */
> +} NetRscUnit;
> +
> +/* Coalesced segmant */
> +typedef struct NetRscSeg {
> +    QTAILQ_ENTRY(NetRscSeg) next;
> +    void *buf;
> +    size_t size;
> +    uint32_t dup_ack_count;
> +    bool is_coalesced;      /* need recal ipv4 header checksum, mark here */
> +    NetRscUnit unit;
> +    NetClientState *nc;
> +} NetRscSeg;
> +
> +
> +/* Chain is divided by protocol(ipv4/v6) and NetClientInfo */
> +typedef struct NetRscChain {
> +   QTAILQ_ENTRY(NetRscChain) next;
> +   uint16_t hdr_size;
> +   uint16_t proto;
> +   uint16_t max_payload;
> +   QEMUTimer *drain_timer;
> +   QTAILQ_HEAD(, NetRscSeg) buffers;
> +   NetRscStat stat;
> +} NetRscChain;
> +
>  void virtio_instance_init_common(Object *proxy_obj, void *data,
>                                   size_t vdev_size, const char *vdev_name);
>  
> -- 
> 2.5.0

Reply via email to