Try the attached patch?
From 89b144a95e698df4a844157c55b70cef488548ba Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <riastr...@netbsd.org>
Date: Sun, 22 Jan 2023 15:56:55 +0000
Subject: [PATCH] npf(9): Don't hold table lock across copyout.

Instead, block npf_table_gc during copyout of anything that might be
GC'd, so it can't be freed during copyout, but then drop the lock
across copyout.
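
In outline, the copyout side takes a GC reference under t_lock and only
drops the lock around the copyout itself, while the GC side waits for
that count to drain before freeing anything.  Condensed from the diff
below (arguments and some error handling trimmed):

        /* list/copyout side (table_generic_list) */
        mutex_enter(&t->t_lock);
        t->t_gclock++;                          /* pin entries against GC */
        LIST_FOREACH(ent, &t->t_list, te_listent) {
                mutex_exit(&t->t_lock);         /* copyout may sleep */
                error = table_ent_copyout(..., ubuf, len, &off);
                mutex_enter(&t->t_lock);
                if (error)
                        break;
        }
        if (--t->t_gclock == 0)
                cv_broadcast(&t->t_gclockcv);   /* wake any waiting GC */
        mutex_exit(&t->t_lock);

        /* GC side (npf_table_gc) */
        mutex_enter(&t->t_lock);
        while (t->t_gclock > 0)
                cv_wait(&t->t_gclockcv, &t->t_lock);
        /* ...now safe to pool_cache_put() what is on t_gc... */
        mutex_exit(&t->t_lock);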

Make LPM-type tables use the same GC list as thmap-type (`ipset')
tables, since copyout must block GC the same way.

Note that copyout may block indefinitely, so npf_table_gc may also
block indefinitely as a consequence.  If npf_table_gc is required to
return in bounded time, it can't wait for all copyouts to finish, so
if copyouts are in progress we need to find some way to retry the GC
later -- maybe just issue npf_table_gc when npf_table_list returns.
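
One possible shape for that retry, sketched against whatever ioctl path
drives the listing (hypothetical caller, not part of this patch; npf and
t stand for the handles already in scope at that call site):

        /* hypothetical sketch of the "GC on return" idea */
        error = npf_table_list(t, ubuf, len);
        /*
         * Anything unlinked while the copyout pinned t_gclock is still
         * on t_gc; reap it now rather than leaving it for a later GC.
         */
        npf_table_gc(npf, t);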

PR kern/57136
PR kern/57181
---
 sys/net/npf/npf_tableset.c | 68 +++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/sys/net/npf/npf_tableset.c b/sys/net/npf/npf_tableset.c
index 51d6268143ce..d67f3dee8fef 100644
--- a/sys/net/npf/npf_tableset.c
+++ b/sys/net/npf/npf_tableset.c
@@ -80,10 +80,7 @@ struct npf_table {
         * There are separate trees for IPv4 and IPv6.
         */
        union {
-               struct {
-                       thmap_t *       t_map;
-                       LIST_HEAD(, npf_tblent) t_gc;
-               };
+               thmap_t *               t_map;
                lpm_t *                 t_lpm;
                struct {
                        void *          t_blob;
@@ -97,6 +94,7 @@ struct npf_table {
                };
        } /* C11 */;
        LIST_HEAD(, npf_tblent)         t_list;
+       LIST_HEAD(, npf_tblent)         t_gc;
        unsigned                        t_nitems;
 
        /*
@@ -107,6 +105,9 @@ struct npf_table {
        unsigned                t_id;
        kmutex_t                t_lock;
 
+       uint64_t                t_gclock;
+       kcondvar_t              t_gclockcv;
+
        /* Reference count and table name. */
        unsigned                t_refcnt;
        char                    t_name[NPF_TABLE_MAXNAMELEN];
@@ -413,6 +414,8 @@ npf_table_create(const char *name, u_int tid, int type,
        mutex_init(&t->t_lock, MUTEX_DEFAULT, IPL_NET);
        t->t_type = type;
        t->t_id = tid;
+       t->t_gclock = 0;
+       cv_init(&t->t_gclockcv, "npftabgc");
        return t;
 out:
        kmem_free(t, sizeof(npf_table_t));
@@ -447,6 +450,8 @@ npf_table_destroy(npf_table_t *t)
        default:
                KASSERT(false);
        }
+       KASSERTMSG(t->t_gclock == 0, "t_gclock=%"PRIu64, t->t_gclock);
+       cv_destroy(&t->t_gclockcv);
        mutex_destroy(&t->t_lock);
        kmem_free(t, sizeof(npf_table_t));
 }
@@ -615,7 +620,7 @@ int
 npf_table_remove(npf_table_t *t, const int alen,
     const npf_addr_t *addr, const npf_netmask_t mask)
 {
-       npf_tblent_t *ent = NULL;
+       npf_tblent_t *ent;
        int error;
 
        error = npf_netmask_check(alen, mask);
@@ -630,7 +635,6 @@ npf_table_remove(npf_table_t *t, const int alen,
                if (__predict_true(ent != NULL)) {
                        LIST_REMOVE(ent, te_listent);
                        LIST_INSERT_HEAD(&t->t_gc, ent, te_listent);
-                       ent = NULL; // to be G/C'ed
                        t->t_nitems--;
                } else {
                        error = ENOENT;
@@ -640,6 +644,7 @@ npf_table_remove(npf_table_t *t, const int alen,
                ent = lpm_lookup(t->t_lpm, addr, alen);
                if (__predict_true(ent != NULL)) {
                        LIST_REMOVE(ent, te_listent);
+                       LIST_INSERT_HEAD(&t->t_gc, ent, te_listent);
                        lpm_remove(t->t_lpm, &ent->te_addr,
                            ent->te_alen, ent->te_preflen);
                        t->t_nitems--;
@@ -653,13 +658,9 @@ npf_table_remove(npf_table_t *t, const int alen,
                break;
        default:
                KASSERT(false);
-               ent = NULL;
        }
        mutex_exit(&t->t_lock);
 
-       if (ent) {
-               pool_cache_put(tblent_cache, ent);
-       }
        return error;
 }
 
@@ -762,18 +763,25 @@ table_ent_copyout(const npf_addr_t *addr, const int alen, npf_netmask_t mask,
 }
 
 static int
-table_generic_list(const npf_table_t *t, void *ubuf, size_t len)
+table_generic_list(npf_table_t *t, void *ubuf, size_t len)
 {
        npf_tblent_t *ent;
        size_t off = 0;
        int error = 0;
 
+       mutex_enter(&t->t_lock);
+       t->t_gclock++;
        LIST_FOREACH(ent, &t->t_list, te_listent) {
+               mutex_exit(&t->t_lock);
                error = table_ent_copyout(&ent->te_addr,
                    ent->te_alen, ent->te_preflen, ubuf, len, &off);
+               mutex_enter(&t->t_lock);
                if (error)
                        break;
        }
+       if (--t->t_gclock == 0)
+               cv_broadcast(&t->t_gclockcv);
+       mutex_exit(&t->t_lock);
        return error;
 }
 
@@ -803,7 +811,6 @@ npf_table_list(npf_table_t *t, void *ubuf, size_t len)
 {
        int error = 0;
 
-       mutex_enter(&t->t_lock);
        switch (t->t_type) {
        case NPF_TABLE_IPSET:
                error = table_generic_list(t, ubuf, len);
@@ -820,7 +827,6 @@ npf_table_list(npf_table_t *t, void *ubuf, size_t len)
        default:
                KASSERT(false);
        }
-       mutex_exit(&t->t_lock);
 
        return error;
 }
@@ -855,22 +861,32 @@ npf_table_flush(npf_table_t *t)
 void
 npf_table_gc(npf_t *npf, npf_table_t *t)
 {
-       npf_tblent_t *ent;
-       void *ref;
 
-       if (t->t_type != NPF_TABLE_IPSET || LIST_EMPTY(&t->t_gc)) {
-               return;
-       }
+       if (t->t_type == NPF_TABLE_IPSET) {
+               void *ref = thmap_stage_gc(t->t_map);
 
-       ref = thmap_stage_gc(t->t_map);
-       if (npf) {
-               npf_config_locked_p(npf);
-               npf_config_sync(npf);
+               if (npf) {
+                       npf_config_locked_p(npf);
+                       npf_config_sync(npf);
+               }
+               thmap_gc(t->t_map, ref);
        }
-       thmap_gc(t->t_map, ref);
 
-       while ((ent = LIST_FIRST(&t->t_gc)) != NULL) {
-               LIST_REMOVE(ent, te_listent);
-               pool_cache_put(tblent_cache, ent);
+       if (t->t_type == NPF_TABLE_IPSET || t->t_type == NPF_TABLE_LPM) {
+               npf_tblent_t *ent;
+
+               mutex_enter(&t->t_lock);
+               for (;;) {
+                       if (t->t_gclock > 0) {
+                               cv_wait(&t->t_gclockcv, &t->t_lock);
+                               continue;
+                       }
+                       if ((ent = LIST_FIRST(&t->t_gc)) == NULL)
+                               break;
+                       mutex_exit(&t->t_lock);
+                       pool_cache_put(tblent_cache, ent);
+                       mutex_enter(&t->t_lock);
+               }
+               mutex_exit(&t->t_lock);
        }
 }
