On 01/14/2014 04:25 PM, Michael S. Tsirkin wrote: > On Tue, Jan 14, 2014 at 02:53:07PM +0800, Jason Wang wrote: >> We used to limit the number of packets queued through tx_queue_length. This >> has several issues: >> >> - tx_queue_length is the control of qdisc queue length, simply reusing it >> to control the packets queued by device may cause confusion. >> - After commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 ("macvtap: Add >> support of packet capture on macvtap device."), an unexpected qdisc >> caused by non-zero tx_queue_length will lead qdisc lock contention for >> multiqueue deivce. >> - What we really want is to limit the total amount of memory occupied not >> the number of packets. >> >> So this patch tries to solve the above issues by using socket rcvbuf to >> limit the packets could be queued for tun/macvtap. This was done by using >> sock_queue_rcv_skb() instead of a direct call to skb_queue_tail(). Also two >> new ioctl() were introduced for userspace to change the rcvbuf like what we >> have done for sndbuf. >> >> With this fix, we can safely change the tx_queue_len of macvtap to >> zero. This will make multiqueue works without extra lock contention. >> >> Cc: Vlad Yasevich <vyase...@redhat.com> >> Cc: Michael S. Tsirkin <m...@redhat.com> >> Cc: John Fastabend <john.r.fastab...@intel.com> >> Cc: Stephen Hemminger <step...@networkplumber.org> >> Cc: Herbert Xu <herb...@gondor.apana.org.au> >> Signed-off-by: Jason Wang <jasow...@redhat.com> > No, I don't think we can change userspace-visible behaviour like that. > > This will break any existing user that tries to control > queue length through sysfs,netlink or device ioctl.
But it looks like a buggy API, since tx_queue_len should be for qdisc queue length instead of device itself. If we really want to preserve the behaviour, how about using a new feature flag and change the behaviour only when the device is created (TUNSETIFF) with the new flag? > > Take a look at my patch in msg ID 20140109071721.gd19...@redhat.com > which gives one way to set tx_queue_len to zero without > breaking userspace. If I read the patch correctly, it will make no way for the user who really want to change the qdisc queue length for tun. > > >> --- >> drivers/net/macvtap.c | 31 ++++++++++++++++++++--------- >> drivers/net/tun.c | 48 >> +++++++++++++++++++++++++++++++++------------ >> include/uapi/linux/if_tun.h | 3 +++ >> 3 files changed, 60 insertions(+), 22 deletions(-) >> >> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c >> index a2c3a89..c429c56 100644 >> --- a/drivers/net/macvtap.c >> +++ b/drivers/net/macvtap.c >> @@ -292,9 +292,6 @@ static rx_handler_result_t macvtap_handle_frame(struct >> sk_buff **pskb) >> if (!q) >> return RX_HANDLER_PASS; >> >> - if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len) >> - goto drop; >> - >> skb_push(skb, ETH_HLEN); >> >> /* Apply the forward feature mask so that we perform segmentation >> @@ -310,8 +307,10 @@ static rx_handler_result_t macvtap_handle_frame(struct >> sk_buff **pskb) >> goto drop; >> >> if (!segs) { >> - skb_queue_tail(&q->sk.sk_receive_queue, skb); >> - goto wake_up; >> + if (sock_queue_rcv_skb(&q->sk, skb)) >> + goto drop; >> + else >> + goto wake_up; >> } >> >> kfree_skb(skb); >> @@ -319,11 +318,17 @@ static rx_handler_result_t macvtap_handle_frame(struct >> sk_buff **pskb) >> struct sk_buff *nskb = segs->next; >> >> segs->next = NULL; >> - skb_queue_tail(&q->sk.sk_receive_queue, segs); >> + if (sock_queue_rcv_skb(&q->sk, segs)) { >> + skb = segs; >> + skb->next = nskb; >> + goto drop; >> + } >> + >> segs = nskb; >> } >> } else { >> - skb_queue_tail(&q->sk.sk_receive_queue, skb); >> + if (sock_queue_rcv_skb(&q->sk, skb)) >> + goto drop; >> } >> >> wake_up: >> @@ -333,7 +338,7 @@ wake_up: >> drop: >> /* Count errors/drops only here, thus don't care about args. */ >> macvlan_count_rx(vlan, 0, 0, 0); >> - kfree_skb(skb); >> + kfree_skb_list(skb); >> return RX_HANDLER_CONSUMED; >> } >> >> @@ -414,7 +419,7 @@ static void macvtap_dellink(struct net_device *dev, >> static void macvtap_setup(struct net_device *dev) >> { >> macvlan_common_setup(dev); >> - dev->tx_queue_len = TUN_READQ_SIZE; >> + dev->tx_queue_len = 0; >> } >> >> static struct rtnl_link_ops macvtap_link_ops __read_mostly = { >> @@ -469,6 +474,7 @@ static int macvtap_open(struct inode *inode, struct file >> *file) >> sock_init_data(&q->sock, &q->sk); >> q->sk.sk_write_space = macvtap_sock_write_space; >> q->sk.sk_destruct = macvtap_sock_destruct; >> + q->sk.sk_rcvbuf = TUN_RCVBUF; >> q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; >> q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); >> >> @@ -1040,6 +1046,13 @@ static long macvtap_ioctl(struct file *file, unsigned >> int cmd, >> q->sk.sk_sndbuf = u; >> return 0; >> >> + case TUNSETRCVBUF: >> + if (get_user(u, up)) >> + return -EFAULT; >> + >> + q->sk.sk_rcvbuf = u; >> + return 0; >> + >> case TUNGETVNETHDRSZ: >> s = q->vnet_hdr_sz; >> if (put_user(s, sp)) >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index 09f6662..7a08fa3 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -177,6 +177,7 @@ struct tun_struct { >> >> int vnet_hdr_sz; >> int sndbuf; >> + int rcvbuf; >> struct tap_filter txflt; >> struct sock_fprog fprog; >> /* protected by rtnl lock */ >> @@ -771,17 +772,6 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, >> struct net_device *dev) >> if (!check_filter(&tun->txflt, skb)) >> goto drop; >> >> - if (tfile->socket.sk->sk_filter && >> - sk_filter(tfile->socket.sk, skb)) >> - goto drop; >> - >> - /* Limit the number of packets queued by dividing txq length with the >> - * number of queues. >> - */ >> - if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) >> - >= dev->tx_queue_len / tun->numqueues) >> - goto drop; >> - >> if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) >> goto drop; >> >> @@ -798,7 +788,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, >> struct net_device *dev) >> nf_reset(skb); >> >> /* Enqueue packet */ >> - skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb); >> + if (sock_queue_rcv_skb(tfile->socket.sk, skb)) >> + goto drop; >> >> /* Notify and wake up reader process */ >> if (tfile->flags & TUN_FASYNC) >> @@ -1668,6 +1659,7 @@ static int tun_set_iff(struct net *net, struct file >> *file, struct ifreq *ifr) >> >> tun->filter_attached = false; >> tun->sndbuf = tfile->socket.sk->sk_sndbuf; >> + tun->rcvbuf = tfile->socket.sk->sk_rcvbuf; >> >> spin_lock_init(&tun->lock); >> >> @@ -1837,6 +1829,17 @@ static void tun_set_sndbuf(struct tun_struct *tun) >> } >> } >> >> +static void tun_set_rcvbuf(struct tun_struct *tun) >> +{ >> + struct tun_file *tfile; >> + int i; >> + >> + for (i = 0; i < tun->numqueues; i++) { >> + tfile = rtnl_dereference(tun->tfiles[i]); >> + tfile->socket.sk->sk_sndbuf = tun->sndbuf; >> + } >> +} >> + >> static int tun_set_queue(struct file *file, struct ifreq *ifr) >> { >> struct tun_file *tfile = file->private_data; >> @@ -1878,7 +1881,7 @@ static long __tun_chr_ioctl(struct file *file, >> unsigned int cmd, >> struct ifreq ifr; >> kuid_t owner; >> kgid_t group; >> - int sndbuf; >> + int sndbuf, rcvbuf; >> int vnet_hdr_sz; >> unsigned int ifindex; >> int ret; >> @@ -2061,6 +2064,22 @@ static long __tun_chr_ioctl(struct file *file, >> unsigned int cmd, >> tun_set_sndbuf(tun); >> break; >> >> + case TUNGETRCVBUF: >> + rcvbuf = tfile->socket.sk->sk_rcvbuf; >> + if (copy_to_user(argp, &rcvbuf, sizeof(rcvbuf))) >> + ret = -EFAULT; >> + break; >> + >> + case TUNSETRCVBUF: >> + if (copy_from_user(&rcvbuf, argp, sizeof(rcvbuf))) { >> + ret = -EFAULT; >> + break; >> + } >> + >> + tun->rcvbuf = rcvbuf; >> + tun_set_rcvbuf(tun); >> + break; >> + >> case TUNGETVNETHDRSZ: >> vnet_hdr_sz = tun->vnet_hdr_sz; >> if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz))) >> @@ -2139,6 +2158,8 @@ static long tun_chr_compat_ioctl(struct file *file, >> case TUNSETTXFILTER: >> case TUNGETSNDBUF: >> case TUNSETSNDBUF: >> + case TUNGETRCVBUF: >> + case TUNSETRCVBUF: >> case SIOCGIFHWADDR: >> case SIOCSIFHWADDR: >> arg = (unsigned long)compat_ptr(arg); >> @@ -2204,6 +2225,7 @@ static int tun_chr_open(struct inode *inode, struct >> file * file) >> >> tfile->sk.sk_write_space = tun_sock_write_space; >> tfile->sk.sk_sndbuf = INT_MAX; >> + tfile->sk.sk_rcvbuf = TUN_RCVBUF; >> >> file->private_data = tfile; >> set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags); >> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h >> index e9502dd..8e04657 100644 >> --- a/include/uapi/linux/if_tun.h >> +++ b/include/uapi/linux/if_tun.h >> @@ -22,6 +22,7 @@ >> >> /* Read queue size */ >> #define TUN_READQ_SIZE 500 >> +#define TUN_RCVBUF (512 * PAGE_SIZE) >> >> /* TUN device flags */ >> #define TUN_TUN_DEV 0x0001 > That's about 16x less than we were able to queue previously > by default. > How can you be sure this won't break any applications? > Ok, we can change it back to 500 * 65535, but I'm not sure whether this or not this value (about 32M) is too big for a single socket. >> @@ -58,6 +59,8 @@ >> #define TUNSETQUEUE _IOW('T', 217, int) >> #define TUNSETIFINDEX _IOW('T', 218, unsigned int) >> #define TUNGETFILTER _IOR('T', 219, struct sock_fprog) >> +#define TUNGETRCVBUF _IOR('T', 220, int) >> +#define TUNSETRCVBUF _IOW('T', 221, int) >> >> /* TUNSETIFF ifr flags */ >> #define IFF_TUN 0x0001 >> -- >> 1.8.3.2 > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/