On Thu, Aug 10, 2006 at 03:32:49PM +0200, Peter Zijlstra ([EMAIL PROTECTED]) 
wrote:
> Hi,

Hello, Peter.

> So I try again, please tell me if I'm still on crack and should go detox.
> However if you do so, I kindly request some words on the how and why of it.

I think in that case you should talk to a doctor, not to kernel
hackers :)

I have some comments about the implementation, not the overall design,
since our points of view there are somewhat diametrically opposed.

...

> --- linux-2.6.orig/net/core/skbuff.c
> +++ linux-2.6/net/core/skbuff.c
> @@ -43,6 +43,7 @@
>  #include <linux/kernel.h>
>  #include <linux/sched.h>
>  #include <linux/mm.h>
> +#include <linux/pagemap.h>
>  #include <linux/interrupt.h>
>  #include <linux/in.h>
>  #include <linux/inet.h>
> @@ -125,6 +126,8 @@ EXPORT_SYMBOL(skb_truesize_bug);
>   *
>   */
>  
> +#define ceiling_log2(x)      fls((x) - 1)
> +
>  /**
>   *   __alloc_skb     -       allocate a network buffer
>   *   @size: size to allocate
> @@ -147,6 +150,59 @@ struct sk_buff *__alloc_skb(unsigned int
>       struct sk_buff *skb;
>       u8 *data;
>  
> +     size = SKB_DATA_ALIGN(size);
> +
> +     if (gfp_mask & __GFP_MEMALLOC) {
> +             /*
> +              * Fallback allocation for memalloc reserves.
> +              *
> +              * the page is populated like so:
> +              *
> +              *   struct sk_buff
> +              *   [ struct sk_buff ]
> +              *   [ atomic_t ]
> +              *   unsigned int
> +              *   struct skb_shared_info
> +              *   char []
> +              *
> +              * We have to do higher order allocations for icky jumbo
> +              * frame drivers :-(. They really should be migrated to
> +              * scatter/gather DMA and use skb fragments.
> +              */
> +             unsigned int data_offset =
> +                     sizeof(struct sk_buff) + sizeof(unsigned int);
> +             unsigned long length = size + data_offset +
> +                     sizeof(struct skb_shared_info);
> +             unsigned int pages;
> +             unsigned int order;
> +             struct page *page;
> +             void *kaddr;
> +
> +             /*
> +              * Force fclone alloc in order to fudge a lacking in skb_clone().
> +              */
> +             fclone = 1;
> +             if (fclone) {
> +                     data_offset += sizeof(struct sk_buff) + sizeof(atomic_t);
> +                     length += sizeof(struct sk_buff) + sizeof(atomic_t);
> +             }
> +             pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +             order = ceiling_log2(pages);
> +             skb = NULL;
> +             if (!(page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order)))
> +                     goto out;
> +
> +             kaddr = pfn_to_kaddr(page_to_pfn(page));
> +             skb = (struct sk_buff *)kaddr;
> +
> +             *((unsigned int *)(kaddr + data_offset -
> +                                     sizeof(unsigned int))) = order;
> +             data = (u8 *)(kaddr + data_offset);
> +

Tricky, but since you are using your own allocator here, you could make
it less aggressive - i.e. do not round the size up to a whole number of
pages.
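
Just to illustrate how aggressive that rounding gets, a rough user-space
sketch (the 600-byte overhead is an assumption, not the real struct
sizes): for a hypothetical 9000-byte jumbo frame you need about three
pages of data, but ceiling_log2() turns that into an order-2 allocation,
so almost 7k of the reserve is burnt per skb:

#include <stdio.h>

#define PAGE_SIZE 4096u

/* same rounding as the fls()-based ceiling_log2() in the patch */
static unsigned int ceiling_log2(unsigned int x)
{
	unsigned int order = 0;

	while ((1u << order) < x)
		order++;
	return order;
}

int main(void)
{
	unsigned int size = 9000;	/* hypothetical jumbo frame */
	unsigned int overhead = 600;	/* assumed skb + shinfo + bookkeeping */
	unsigned int length = size + overhead;
	unsigned int pages = (length + PAGE_SIZE - 1) / PAGE_SIZE;	/* 3 */
	unsigned int order = ceiling_log2(pages);			/* 2 */
	unsigned int allocated = (1u << order) * PAGE_SIZE;		/* 16384 */

	printf("need %u bytes, allocate %u, waste %u\n",
	       length, allocated, allocated - length);
	return 0;
}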

> +             goto allocated;
> +     }
> +
>       cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
>  
>       /* Get the HEAD */
> @@ -155,12 +211,13 @@ struct sk_buff *__alloc_skb(unsigned int
>               goto out;
>  
>       /* Get the DATA. Size must match skb_add_mtu(). */
> -     size = SKB_DATA_ALIGN(size);

Bad sign.

>       data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
>       if (!data)
>               goto nodata;
>  
> +struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
> +             unsigned length, gfp_t gfp_mask)
> +{
> +     struct sk_buff *skb;
> +
> +     WARN_ON(gfp_mask & (__GFP_NOMEMALLOC | __GFP_MEMALLOC));
> +     gfp_mask &= ~(__GFP_NOMEMALLOC | __GFP_MEMALLOC);
> +
> +     skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_NOMEMALLOC);
> +     if (skb)
> +             goto done;
> +
> +     if (atomic_read(&dev->rx_reserve_used) >=
> +                     dev->rx_reserve * dev->memalloc_socks)
> +             goto out;
> +
> +     /*
> +      * pre-inc guards against a race with netdev_wait_memalloc()
> +      */
> +     atomic_inc(&dev->rx_reserve_used);
> +     skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_MEMALLOC);
> +     if (unlikely(!skb)) {
> +             atomic_dec(&dev->rx_reserve_used);
> +             goto out;
> +     }

Since you have already added an atomic operation in that path, you could
take the device's reference counter instead and not have to worry about
the device disappearing.
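
A minimal sketch of what I mean (not a drop-in patch - free_skb_pages()
and netdev_wait_memalloc() would need the matching change): take a
device reference at the point where the reserve is charged and drop it
where the skb is freed, so the skb -> dev mapping can never point at a
device that has already gone away.

	/* in __netdev_alloc_skb(), where the reserve is charged */
	atomic_inc(&dev->rx_reserve_used);
	dev_hold(dev);		/* device cannot be unregistered under us */
	skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_MEMALLOC);
	if (unlikely(!skb)) {
		atomic_dec(&dev->rx_reserve_used);
		dev_put(dev);
		goto out;
	}

	/* in free_skb_pages(), once the reserve is uncharged */
	atomic_dec(&dev->rx_reserve_used);
	dev_put(dev);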

> +done:
> +     skb->dev = dev;
> +out:
> +     return skb;
> +}
> +
>  static void skb_drop_list(struct sk_buff **listp)
>  {
>       struct sk_buff *list = *listp;
> @@ -313,10 +417,35 @@ static void skb_release_data(struct sk_b
>               if (skb_shinfo(skb)->frag_list)
>                       skb_drop_fraglist(skb);
>  
> -             kfree(skb->head);
> +             if (!skb->memalloc)
> +                     kfree(skb->head);
> +             skb->head = NULL;
>       }
>  }
>  
> +/**
> + *   free_skb_pages - frees a memalloced skbuff
> + *   @cache: fake %kmem_cache argument
> + *   @objp: %sk_buff pointer
> + *
> + *   Function is made to look like %kmem_cache_free so we can easily
> + *   substitute the free function in %kfree_skbmem.
> + */
> +static void free_skb_pages(struct kmem_cache *cache, void *objp)
> +{
> +     struct sk_buff *skb = (struct sk_buff *)objp;
> +     /*
> +      * The input_dev is the initial input device;
> +      * we have it pinned by virtue of rx_reserve_used not being zero.
> +      */
> +     struct net_device *dev = skb->input_dev ?: skb->dev;
> +     unsigned int order =
> +             *(unsigned int *)(skb->head - sizeof(unsigned int));
> +     if (!skb->head)
> +             atomic_dec(&dev->rx_reserve_used);
> +     free_pages((unsigned long)skb, order);
> +}
> +
>  /*
>   *   Free an skbuff by memory without cleaning the state.
>   */
> @@ -324,17 +453,21 @@ void kfree_skbmem(struct sk_buff *skb)
>  {
>       struct sk_buff *other;
>       atomic_t *fclone_ref;
> +     void (*free_skb)(struct kmem_cache *, void *);
>  
>       skb_release_data(skb);
> +
> +     free_skb = skb->memalloc ? free_skb_pages : kmem_cache_free;
> +
>       switch (skb->fclone) {
>       case SKB_FCLONE_UNAVAILABLE:
> -             kmem_cache_free(skbuff_head_cache, skb);
> +             free_skb(skbuff_head_cache, skb);
>               break;
>  
>       case SKB_FCLONE_ORIG:
>               fclone_ref = (atomic_t *) (skb + 2);
>               if (atomic_dec_and_test(fclone_ref))
> -                     kmem_cache_free(skbuff_fclone_cache, skb);
> +                     free_skb(skbuff_fclone_cache, skb);
>               break;
>  
>       case SKB_FCLONE_CLONE:
> @@ -347,7 +480,7 @@ void kfree_skbmem(struct sk_buff *skb)
>               skb->fclone = SKB_FCLONE_UNAVAILABLE;
>  
>               if (atomic_dec_and_test(fclone_ref))
> -                     kmem_cache_free(skbuff_fclone_cache, other);
> +                     free_skb(skbuff_fclone_cache, other);
>               break;
>       };
>  }
> @@ -434,6 +567,12 @@ struct sk_buff *skb_clone(struct sk_buff
>               n->fclone = SKB_FCLONE_CLONE;
>               atomic_inc(fclone_ref);
>       } else {
> +             /*
> +              * should we special-case skb->memalloc cloning?
> +              * for now fudge it by forcing fast-clone alloc.
> +              */
> +             BUG_ON(skb->memalloc);
> +
>               n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
>               if (!n)
>                       return NULL;

Ugh... cloning is one of the giant's shoulders the Linux network stack
stands on...

> @@ -686,6 +825,8 @@ int pskb_expand_head(struct sk_buff *skb
>       if (skb_shared(skb))
>               BUG();
>  
> +     BUG_ON(skb->memalloc);
> +
>       size = SKB_DATA_ALIGN(size);
>  
>       data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);

And that is a bug: that operation can happen even during ordinary
receive processing.
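
For example (hypothetical fragment, not from the patch): any code that
has to modify a possibly cloned skb goes through skb_cow(), and
skb_cow() calls pskb_expand_head(), so something as ordinary as
rewriting a header on the forwarding path would now hit that BUG_ON():

	static int touch_header(struct sk_buff *skb)
	{
		/* skb_cow() ends up in pskb_expand_head() whenever the
		 * skb is cloned or lacks headroom - for a memalloc skb
		 * that is now an instant BUG(). */
		if (skb_cow(skb, 0))
			return -ENOMEM;
		skb->nh.iph->ttl--;	/* any in-place header modification */
		return 0;
	}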

> +/**
> + *   dev_adjust_memalloc - adjust the global memalloc reserve for this device
> + *   @dev: device that has memalloc demands
> + *   @nr_socks: number of new %SOCK_MEMALLOC sockets
> + *
> + *   This function adjusts the memalloc reserve based on device
> + *   demand. For each %SOCK_MEMALLOC socket this device will reserve
> + *   2 * %MAX_PHYS_SEGMENTS pages for outbound traffic (assumption:
> + *   each %SOCK_MEMALLOC socket will have a %request_queue associated)
> + *   and @dev->rx_reserve mtu pages.
> + */
> +int dev_adjust_memalloc(struct net_device *dev, int nr_socks)
> +{
> +     unsigned long flags;
> +     unsigned long reserve;
> +     int err;
> +
> +     spin_lock_irqsave(&dev->memalloc_lock, flags);
> +
> +     dev->memalloc_socks += nr_socks;
> +     BUG_ON(dev->memalloc_socks < 0);
> +
> +     reserve = dev->memalloc_socks *
> +             (2 * MAX_PHYS_SEGMENTS +                 /* outbound */
> +              dev->rx_reserve * mtu_pages(dev->mtu)); /* inbound */
> +
> +     err = adjust_memalloc_reserve(reserve - dev->memalloc_reserve);
> +     if (err) {
> +             printk(KERN_WARNING
> +                     "%s: Unable to change RX reserve to: %lu, error: %d\n",
> +                     dev->name, reserve, err);
> +             goto unlock;
> +     }
> +     dev->memalloc_reserve = reserve;
> +
> +unlock:
> +     spin_unlock_irqrestore(&dev->memalloc_lock, flags);
> +     return err;
> +}
> +EXPORT_SYMBOL_GPL(dev_adjust_memalloc);
>  
>  /*
>   *   Device change register/unregister. These are not inline or static
> @@ -2464,6 +2527,9 @@ int dev_set_mtu(struct net_device *dev, 
>               err = dev->change_mtu(dev, new_mtu);
>       else
>               dev->mtu = new_mtu;
> +
> +     dev_adjust_memalloc(dev, 0);
> +
>       if (!err && dev->flags & IFF_UP)
>               raw_notifier_call_chain(&netdev_chain,
>                               NETDEV_CHANGEMTU, dev);
> @@ -2900,6 +2966,7 @@ int register_netdevice(struct net_device
>  #ifdef CONFIG_NET_CLS_ACT
>       spin_lock_init(&dev->ingress_lock);
>  #endif
> +     spin_lock_init(&dev->memalloc_lock);
>  
>       ret = alloc_divert_blk(dev);
>       if (ret)
> @@ -3106,6 +3173,28 @@ static void netdev_wait_allrefs(struct n
>       }
>  }
>  
> +/* netdev_wait_memalloc - wait for all outstanding memalloc skbs
> + *
> + * This is called when unregistering network devices.
> + *
> + * Better make sure the skb -> dev mapping is correct, if we leak
> + * some this will stall forever.
> + */
> +static void netdev_wait_memalloc(struct net_device *dev)
> +{
> +     unsigned long warning_time = jiffies;
> +     while (atomic_read(&dev->rx_reserve_used) != 0) {
> +             msleep(250);
> +             if (time_after(jiffies, warning_time + 10 * HZ)) {
> +                     printk(KERN_EMERG "netdev_wait_memalloc: "
> +                            "waiting for %s to become free. SKB "
> +                            "count = %d\n",
> +                            dev->name, atomic_read(&dev->rx_reserve_used));
> +                     warning_time = jiffies;
> +             }
> +     }
> +}
> +
>  /* The sequence is:
>   *
>   *   rtnl_lock();
> @@ -3165,6 +3254,14 @@ void netdev_run_todo(void)
>  
>               netdev_wait_allrefs(dev);
>  
> +             netdev_wait_memalloc(dev);
> +
> +             /* Get rid of any SOCK_MEMALLOC reserves. */
> +             if (dev->memalloc_reserve) {
> +                     BUG_ON(!dev->memalloc_socks);
> +                     dev_adjust_memalloc(dev, -dev->memalloc_socks);
> +             }
> +
>               /* paranoia */
>               BUG_ON(atomic_read(&dev->refcnt));
>               BUG_TRAP(!dev->ip_ptr);
> Index: linux-2.6/net/ipv4/af_inet.c
> ===================================================================
> --- linux-2.6.orig/net/ipv4/af_inet.c
> +++ linux-2.6/net/ipv4/af_inet.c
> @@ -132,6 +132,14 @@ void inet_sock_destruct(struct sock *sk)
>  {
>       struct inet_sock *inet = inet_sk(sk);
>  
> +     if (sk_is_memalloc(sk)) {
> +             struct net_device *dev = ip_dev_find(inet->rcv_saddr);
> +             if (dev) {
> +                     dev_adjust_memalloc(dev, -1);
> +                     dev_put(dev);
> +             }
> +     }
> +

This looks very strange - you decrement the reference counter both in
the socket destruction code and in the netdevice destruction code.

-- 
        Evgeniy Polyakov