On Tue, Mar 11, 2014 at 12:45:30PM +0000, anton.iva...@kot-begemot.co.uk wrote: > +/* This protocol number is passed getaddrinfo(), and not > + * used directly. If you give gettaddrinfo() what one is > + * supposed to give - INET, RAW, 0, the result is not > + * set correctly. > + * Setting the args to INET, RAW, L2TPv3 is the "lowest pain" > + * workaround in this case as it allows common raw and udp > + * setup. > + */ > + > +#ifndef IPPROTO_L2TP > +#define IPPROTO_L2TP 0x73 > +#endif
The comment is incorrect. Using IPPROTO_L2TP is not a workaround, it's the expected way to implement an IP-level protocol in userspace. getaddrinfo() returns protocol IPPROTO_L2TP unchanged. Then we call socket(AF_INET, SOCK_RAW, IPPROTO_L2TP). INET, RAW, L2TPv3 tells the kernel that we wish to receive IP packets when the protocol field is equal to IPPROTO_L2TP. If we didn't tell the kernel which IP protocol to listen for then the socket would not receive packets (IPPROTO_RAW is send-only). I suggest changing the comment simply to: /* IANA-assigned IP protocol ID for L2TPv3 */ > +static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc, > + const struct iovec *iov, > + int iovcnt) > +{ > + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); > + > + struct msghdr message; > + int ret; > + > + if (iovcnt > MAX_L2TPV3_IOVCNT - 1) { > + error_report( > + "iovec too long %d > %d, change l2tpv3.h", > + iovcnt, MAX_L2TPV3_IOVCNT > + ); > + return -1; > + } > + l2tpv3_form_header(s); > + memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec)); > + s->vec->iov_base = s->header_buf; > + s->vec->iov_len = s->offset; > + message.msg_name = s->dgram_dst; > + message.msg_namelen = s->dst_size; > + message.msg_iov = s->vec; > + message.msg_iovlen = iovcnt + 1; > + message.msg_control = NULL; > + message.msg_controllen = 0; > + message.msg_flags = 0; > + do { > + ret = sendmsg(s->fd, &message, 0); > + } while ((ret == -1) && (errno == EINTR)); > + if (ret > 0) { > + ret -= s->offset; > + } else if (ret == 0) { > + /* belt and braces - should not occur on DGRAM > + * we should get an error and never a 0 send > + */ > + ret = iov_size(iov, iovcnt); > + } else { > + /* signal upper layer that socket buffer is full */ > + ret = -errno; > + if (ret == -EAGAIN || ret == -ENOBUFS) { > + ret = 0; > + } > + } > + return ret; > +} Returning 0 means the peer (the emulated NIC) will stop sending packets until further notice. Since qemu_flush_queued_packets() is not invoked anywhere in this source file this will result in deadlock! What needs to happen is: 1. We begin monitoring s->fd to see when it becomes writable again. 2. When it becomes writable we call qemu_flush_queued_packets() so the peer can resuming transmission. See net/tap.c or other backends for examples of how this is done. > +static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf) > +{ > + > + uint32_t *session; > + uint64_t cookie; > + > + if ((!s->udp) && (!s->ipv6)) { > + buf += sizeof(struct iphdr) /* fix for ipv4 raw */; > + } > + if (s->cookie) { > + if (s->cookie_is_64) { > + cookie = ldq_be_p(buf + s->cookie_offset); > + } else { > + cookie = ldl_be_p(buf + s->cookie_offset); > + } > + if (cookie != s->rx_cookie) { > + error_report("unknown cookie id\n"); > + return -1; > + } > + } > + session = (uint32_t *) (buf + s->session_offset); > + if (ldl_be_p(session) != s->rx_session) { > + error_report("session mismatch"); > + return -1; > + } > + return 0; > +} > + > +static void net_l2tpv3_send(void *opaque) > +{ > + NetL2TPV3State *s = opaque; > + > + int i, count, offset; > + struct mmsghdr *msgvec; > + struct iovec *vec; > + > + msgvec = s->msgvec; > + offset = s->offset; > + if ((!s->udp) && (!s->ipv6)) { > + offset += sizeof(struct iphdr); > + } > + count = recvmmsg(s->fd, msgvec, MAX_L2TPV3_MSGCNT, MSG_DONTWAIT, NULL); > + for (i = 0; i < count; i++) { > + if (msgvec->msg_len > 0) { > + vec = msgvec->msg_hdr.msg_iov; > + if ((msgvec->msg_len > offset) && > + (l2tpv3_verify_header(s, vec->iov_base) == 0)) { > + vec++; > + qemu_send_packet(&s->nc, vec->iov_base, > + msgvec->msg_len - offset); > + } else { > + error_report("l2tpv3 header verification failed"); This and the error_report() ins l2tpv3_verify_header() could be used as a Denial of Service: a malicious host may be able to send invalid packets to fill up the host's disk with error messages. These are good candidates for rate-limiting or even once-only error messages. The error_report() API currently does not have a way to do that. I don't insist on this but it's something worth changing in the future. > diff --git a/qemu-options.hx b/qemu-options.hx > index 8b94264..ce4d99d 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -1395,19 +1395,28 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > " (default=" DEFAULT_BRIDGE_INTERFACE ") using the > program 'helper'\n" > " (default=" DEFAULT_BRIDGE_HELPER ")\n" > #endif > - "-net > socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" > - " connect the vlan 'n' to another VLAN using a socket > connection\n" > - "-net > socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port[,localaddr=addr]]\n" > - " connect the vlan 'n' to multicast maddr and port\n" > - " use 'localaddr=addr' to specify the host address to > send packets from\n" > - "-net > socket[,vlan=n][,name=str][,fd=h][,udp=host:port][,localaddr=host:port]\n" > - " connect the vlan 'n' to another VLAN using an UDP > tunnel\n" > -#ifdef CONFIG_VDE > - "-net > vde[,vlan=n][,name=str][,sock=socketpath][,port=n][,group=groupname][,mode=octalmode]\n" > - " connect the vlan 'n' to port 'n' of a vde switch > running\n" > - " on host and listening for incoming connections on > 'socketpath'.\n" > - " Use group 'groupname' and mode 'octalmode' to change > default\n" > - " ownership and permissions for communication port.\n" Please leave socket and vde. They are not being deprecated yet.