Currently, the IP fragment cache is fragile under overload. With
flow control disabled:

./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08
./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
28.66

Once the overload condition is reached, the system does not
recover until it is almost completely idle:

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; 
done
13.72

This is due to the removal of the fragment cache worker, which
was responsible for freeing some IP fragment cache memory when the
high threshold was reached, allowing the system to cope with the
subsequent fragmented packets.
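
For context, the overload condition is a pure memory-accounting
check: each netns tracks the bytes consumed by its reassembly
queues, and new queue allocation fails once the high threshold is
crossed. A minimal sketch of that check, assuming the 4.17-era
helpers in include/net/inet_frag.h; frag_cache_overloaded() is a
hypothetical wrapper added here for illustration only:

#include <net/inet_frag.h>

static bool frag_cache_overloaded(struct netns_frags *nf)
{
        /* frag_mem_limit() reads the per-netns byte counter; without
         * the worker, once high_thresh is crossed every new fragment
         * queue allocation fails until queues expire via their
         * per-queue timer.
         */
        return !nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh;
}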

This commit re-introduces the worker, on a per-netns basis. Thanks
to the rhashtable walker API, BHs need to be disabled only around
each entry removal.
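
The key property of the walker API is that the walk itself runs in
process context and can be stopped and resumed, so BHs need to be
disabled only while an individual queue is killed. A condensed,
hypothetical sketch of that pattern (the real worker below also
keeps the iterator across runs, restarts the walk at the end of
the table and caps the evictions per run):

#include <linux/rhashtable.h>
#include <net/inet_frag.h>

static void evict_frags_sketch(struct netns_frags *nf)
{
        struct rhashtable_iter iter;
        struct inet_frag_queue *fq;

        rhashtable_walk_enter(&nf->rhashtable, &iter);
        rhashtable_walk_start(&iter);

        while (frag_mem_limit(nf) > nf->low_thresh) {
                fq = rhashtable_walk_next(&iter);
                if (IS_ERR(fq))         /* -EAGAIN: concurrent resize */
                        continue;
                if (!fq)                /* end of table */
                        break;
                if (!refcount_inc_not_zero(&fq->refcnt))
                        continue;       /* queue is going away */

                spin_lock_bh(&fq->lock);        /* BHs off only here */
                inet_frag_kill(fq);
                spin_unlock_bh(&fq->lock);
                inet_frag_put(fq);
        }

        rhashtable_walk_stop(&iter);
        rhashtable_walk_exit(&iter);
}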

After this commit (matching the figures from before the IP frag
worker removal):

./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
8599.77

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; 
done
9623.12

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Paolo Abeni <pab...@redhat.com>
---
Note: tweaking the ipfrag sysctls does not completely solve the issue:
- raising ipfrag_high_thresh increases the number of parallel
  connections required to degrade the throughput, but once the IP
  fragment cache capacity is reached the goodput still drops almost
  to 0; with the worker we get a much nicer behaviour.
- setting ipfrag_time to 2 increases the chance of recovering from
  overload (the recovery test above), but over several runs of that
  test I got on average 50% of the expected throughput, with a very
  large variance; with the worker we always see the expected/line
  rate throughput.
---
 include/net/inet_frag.h  |  8 ++---
 net/ipv4/inet_fragment.c | 72 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index ed07e3786d98..1f12692d7f7d 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -11,6 +11,8 @@ struct netns_frags {
        int                     timeout;
        int                     max_dist;
        struct inet_frags       *f;
+       struct work_struct      frags_work;
+       struct rhashtable_iter  iter;
 
        struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
 
@@ -101,11 +103,7 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int inet_frags_init_net(struct netns_frags *nf)
-{
-       atomic_long_set(&nf->mem, 0);
-       return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
-}
+int inet_frags_init_net(struct netns_frags *nf);
 void inet_frags_exit_net(struct netns_frags *nf);
 
 void inet_frag_kill(struct inet_frag_queue *q);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..0f5b29ce96de 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -88,10 +88,76 @@ static void inet_frags_free_cb(void *ptr, void *arg)
        inet_frag_put(fq);
 }
 
+static void inet_frag_schedule_worker(struct netns_frags *nf)
+{
+       if (unlikely(!work_pending(&nf->frags_work)))
+               schedule_work(&nf->frags_work);
+}
+
+#define INETFRAGS_EVICT_MAX    64
+static void inet_frag_worker(struct work_struct *work)
+{
+       struct netns_frags *nf;
+       bool reschedule;
+       int evicted = 0;
+
+       nf = container_of(work, struct netns_frags, frags_work);
+
+       rhashtable_walk_start(&nf->iter);
+
+       while ((reschedule = (frag_mem_limit(nf) > nf->low_thresh))) {
+               struct inet_frag_queue *fq = rhashtable_walk_next(&nf->iter);
+
+               if (IS_ERR(fq) && PTR_ERR(fq) == -EAGAIN)
+                       continue;
+               if (!fq) {
+                       /* end of table, restart the walk */
+                       rhashtable_walk_stop(&nf->iter);
+                       rhashtable_walk_exit(&nf->iter);
+                       rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+                       rhashtable_walk_start(&nf->iter);
+                       continue;
+               }
+               if (!refcount_inc_not_zero(&fq->refcnt))
+                       continue;
+
+               spin_lock_bh(&fq->lock);
+               inet_frag_kill(fq);
+               spin_unlock_bh(&fq->lock);
+               inet_frag_put(fq);
+
+               /* limit the amount of work we can do before a reschedule,
+        * to avoid starving other queued works
+                */
+               if (++evicted > INETFRAGS_EVICT_MAX)
+                       break;
+       }
+
+       rhashtable_walk_stop(&nf->iter);
+
+       if (reschedule)
+               inet_frag_schedule_worker(nf);
+}
+
+int inet_frags_init_net(struct netns_frags *nf)
+{
+       int ret;
+
+       atomic_long_set(&nf->mem, 0);
+       INIT_WORK(&nf->frags_work, inet_frag_worker);
+       ret = rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
+       if (ret)
+               return ret;
+       rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+       return ret;
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
 void inet_frags_exit_net(struct netns_frags *nf)
 {
        nf->low_thresh = 0; /* prevent creation of new frags */
-
+       cancel_work_sync(&nf->frags_work);
+       rhashtable_walk_exit(&nf->iter);
        rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
@@ -157,8 +223,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
        struct inet_frag_queue *q;
 
-       if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+       if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
+               inet_frag_schedule_worker(nf);
                return NULL;
+       }
 
        q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
        if (!q)
-- 
2.17.1
