On Thu, Jun 03, 2021 at 01:09:48AM +0200, Alexandr Nedvedicky wrote:
> > pf_purge_expired_states(u_int32_t maxcheck)
> > {
> </snip>
> > -	cur = pf_state_ref(next);
> > +	do {
> > +		if ((cur->timeout == PFTM_UNLINKED) ||
> > +		    (pf_state_expires(cur) <= getuptime())) {
> > +			SLIST_INSERT_HEAD(&gcl, pf_state_ref(cur), gc_list);
>                                                 ^^^^^^^^^^^^
>     I wonder: is the extra reference you are chasing for coming from here?
>     I suspect pf_state_ref(cur) is being called two times, as macro expands.

I think you're right :D :D

This is an updated diff with a fix for this and the explanation. It also
reduces the scope of the NET_LOCK so scanning the state list shouldn't
block the network stack.
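To spell the bug out: SLIST_INSERT_HEAD() from sys/queue.h expands its
element argument twice (once to set the new element's next pointer, once
to update the list head), so an argument with side effects like
pf_state_ref(cur) runs twice for a single insertion and the state ends up
with one reference too many. A stand-alone sketch of the effect; the
struct state/state_ref()/gc_head names below are made up for illustration
and only mimic pf_state_ref(), this is not the real pf code:

	/* double_ref.c -- illustration only */
	#include <sys/queue.h>
	#include <stdio.h>

	struct state {
		int			refcnt;
		SLIST_ENTRY(state)	gc_list;
	};

	SLIST_HEAD(gc_head, state);

	static struct state *
	state_ref(struct state *st)	/* side effect, like pf_state_ref() */
	{
		st->refcnt++;
		return (st);
	}

	int
	main(void)
	{
		struct gc_head gcl = SLIST_HEAD_INITIALIZER(gcl);
		struct state st = { .refcnt = 0 };

		/*
		 * SLIST_INSERT_HEAD(head, elm, field) expands to roughly
		 *	(elm)->field.sle_next = (head)->slh_first;
		 *	(head)->slh_first = (elm);
		 * so the elm expression is evaluated twice.
		 */
		SLIST_INSERT_HEAD(&gcl, state_ref(&st), gc_list);

		/* one insertion, but two references were taken */
		printf("refcnt = %d\n", st.refcnt);	/* prints 2 */

		return (0);
	}

The updated diff below avoids this by taking the reference into a local
variable first (st = pf_state_ref(cur)) and passing only the plain pointer
to the macro.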

Index: pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.1118
diff -u -p -r1.1118 pf.c
--- pf.c	1 Jun 2021 09:57:11 -0000	1.1118
+++ pf.c	3 Jun 2021 06:24:48 -0000
@@ -308,7 +308,7 @@ static __inline void pf_set_protostate(s
 struct pf_src_tree tree_src_tracking;
 
 struct pf_state_tree_id tree_id;
-struct pf_state_queue state_list;
+struct pf_state_list pf_state_list = PF_STATE_LIST_INITIALIZER(pf_state_list);
 
 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
 RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
@@ -440,6 +440,37 @@ pf_check_threshold(struct pf_threshold *
 	return (threshold->count > threshold->limit);
 }
 
+void
+pf_state_list_insert(struct pf_state_list *pfs, struct pf_state *st)
+{
+	/*
+	 * we can always put states on the end of the list.
+	 *
+	 * things reading the list should take a read lock, then
+	 * the mutex, get the head and tail pointers, release the
+	 * mutex, and then they can iterate between the head and tail.
+	 */
+
+	pf_state_ref(st); /* get a ref for the list */
+
+	mtx_enter(&pfs->pfs_mtx);
+	TAILQ_INSERT_TAIL(&pfs->pfs_list, st, entry_list);
+	mtx_leave(&pfs->pfs_mtx);
+}
+
+void
+pf_state_list_remove(struct pf_state_list *pfs, struct pf_state *st)
+{
+	/* states can only be removed when the write lock is held */
+	rw_assert_wrlock(&pfs->pfs_rwl);
+
+	mtx_enter(&pfs->pfs_mtx);
+	TAILQ_REMOVE(&pfs->pfs_list, st, entry_list);
+	mtx_leave(&pfs->pfs_mtx);
+
+	pf_state_unref(st); /* list no longer references the state */
+}
+
 int
 pf_src_connlimit(struct pf_state **state)
 {
@@ -986,7 +1017,7 @@ pf_state_insert(struct pfi_kif *kif, str
 		PF_STATE_EXIT_WRITE();
 		return (-1);
 	}
-	TAILQ_INSERT_TAIL(&state_list, s, entry_list);
+	pf_state_list_insert(&pf_state_list, s);
 	pf_status.fcounters[FCNT_STATE_INSERT]++;
 	pf_status.states++;
 	pfi_kif_ref(kif, PFI_KIF_REF_STATE);
@@ -1247,7 +1278,8 @@ pf_purge_expired_rules(void)
 void
 pf_purge_timeout(void *unused)
 {
-	task_add(net_tq(0), &pf_purge_task);
+	/* XXX move to systqmp to avoid KERNEL_LOCK */
+	task_add(systq, &pf_purge_task);
 }
 
 void
@@ -1255,9 +1287,6 @@ pf_purge(void *xnloops)
 {
 	int *nloops = xnloops;
 
-	KERNEL_LOCK();
-	NET_LOCK();
-
 	/*
 	 * process a fraction of the state table every second
 	 * Note:
@@ -1268,6 +1297,8 @@ pf_purge(void *xnloops)
 	pf_purge_expired_states(1 + (pf_status.states
 	    / pf_default_rule.timeout[PFTM_INTERVAL]));
 
+	NET_LOCK();
+
 	PF_LOCK();
 	/* purge other expired types every PFTM_INTERVAL seconds */
 	if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
@@ -1280,11 +1311,10 @@ pf_purge(void *xnloops)
 	 * Fragments don't require PF_LOCK(), they use their own lock.
 	 */
 	if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
 		pf_purge_expired_fragments();
 		*nloops = 0;
 	}
 	NET_UNLOCK();
-	KERNEL_UNLOCK();
 
 	timeout_add_sec(&pf_purge_to, 1);
 }
@@ -1447,7 +1477,7 @@ pf_free_state(struct pf_state *cur)
 	}
 	pf_normalize_tcp_cleanup(cur);
 	pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
-	TAILQ_REMOVE(&state_list, cur, entry_list);
+	pf_state_list_remove(&pf_state_list, cur);
 	if (cur->tag)
 		pf_tag_unref(cur->tag);
 	pf_state_unref(cur);
@@ -1458,53 +1488,77 @@ pf_free_state(struct pf_state *cur)
 void
 pf_purge_expired_states(u_int32_t maxcheck)
 {
+	/*
+	 * this task/thread/context/whatever is the only thing that
+	 * removes states from the pf_state_list, so the cur reference
+	 * it holds between calls is guaranteed to still be in the
+	 * list.
+	 */
 	static struct pf_state	*cur = NULL;
-	struct pf_state		*next;
-	SLIST_HEAD(pf_state_gcl, pf_state) gcl;
+
+	struct pf_state		*head, *tail;
+	struct pf_state		*st;
+	SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
 
 	PF_ASSERT_UNLOCKED();
-	SLIST_INIT(&gcl);
 
-	PF_STATE_ENTER_READ();
-	while (maxcheck--) {
-		/* wrap to start of list when we hit the end */
-		if (cur == NULL) {
-			cur = pf_state_ref(TAILQ_FIRST(&state_list));
-			if (cur == NULL)
-				break;	/* list empty */
-		}
+	rw_enter_read(&pf_state_list.pfs_rwl);
 
-		/* get next state, as cur may get deleted */
-		next = TAILQ_NEXT(cur, entry_list);
+	mtx_enter(&pf_state_list.pfs_mtx);
+	head = TAILQ_FIRST(&pf_state_list.pfs_list);
+	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	if (head == NULL) {
+		/* the list is empty */
+		rw_exit_read(&pf_state_list.pfs_rwl);
+		return;
+	}
 
-		if ((cur->timeout == PFTM_UNLINKED) ||
-		    (pf_state_expires(cur) <= getuptime()))
-			SLIST_INSERT_HEAD(&gcl, cur, gc_list);
-		else
-			pf_state_unref(cur);
+	/* (re)start at the front of the list */
+	if (cur == NULL)
+		cur = head;
 
-		cur = pf_state_ref(next);
+	do {
+		if ((cur->timeout == PFTM_UNLINKED) ||
+		    (pf_state_expires(cur) <= getuptime())) {
+			st = pf_state_ref(cur);
+			SLIST_INSERT_HEAD(&gcl, st, gc_list);
+		}
 
-		if (cur == NULL)
+		/* don't iterate past the end of our view of the list */
+		if (cur == tail) {
+			cur = NULL;
 			break;
-	}
-	PF_STATE_EXIT_READ();
+		}
+
+		cur = TAILQ_NEXT(cur, entry_list);
+	} while (maxcheck--);
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
+
+	if (SLIST_EMPTY(&gcl))
+		return;
+
+	NET_LOCK();
+	rw_enter_write(&pf_state_list.pfs_rwl);
 	PF_LOCK();
 	PF_STATE_ENTER_WRITE();
-	while ((next = SLIST_FIRST(&gcl)) != NULL) {
-		SLIST_REMOVE_HEAD(&gcl, gc_list);
-		if (next->timeout == PFTM_UNLINKED)
-			pf_free_state(next);
-		else {
-			pf_remove_state(next);
-			pf_free_state(next);
-		}
+	SLIST_FOREACH(st, &gcl, gc_list) {
+		if (st->timeout != PFTM_UNLINKED)
+			pf_remove_state(st);
 
-		pf_state_unref(next);
+		pf_free_state(st);
 	}
 	PF_STATE_EXIT_WRITE();
 	PF_UNLOCK();
+	rw_exit_write(&pf_state_list.pfs_rwl);
+	NET_UNLOCK();
+
+	while ((st = SLIST_FIRST(&gcl)) != NULL) {
+		SLIST_REMOVE_HEAD(&gcl, gc_list);
+		pf_state_unref(st);
+	}
 }
 
 int
Index: pf_ioctl.c
===================================================================
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.364
diff -u -p -r1.364 pf_ioctl.c
--- pf_ioctl.c	2 Jun 2021 07:46:22 -0000	1.364
+++ pf_ioctl.c	3 Jun 2021 06:24:48 -0000
@@ -113,6 +113,8 @@ int	 pf_rule_copyin(struct pf_rule *, 
 u_int16_t	 pf_qname2qid(char *, int);
 void		 pf_qid2qname(u_int16_t, char *);
 void		 pf_qid_unref(u_int16_t);
+int		 pf_states_clr(struct pfioc_state_kill *);
+int		 pf_states_get(struct pfioc_states *);
 
 struct pf_rule		 pf_default_rule, pf_default_rule_new;
 
@@ -206,7 +208,6 @@ pfattach(int num)
 	TAILQ_INIT(&pf_queues[1]);
 	pf_queues_active = &pf_queues[0];
 	pf_queues_inactive = &pf_queues[1];
-	TAILQ_INIT(&state_list);
 
 	/* default rule should never be garbage collected */
 	pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next;
@@ -903,6 +904,122 @@ pf_addr_copyout(struct pf_addr_wrap *add
 }
 
 int
+pf_states_clr(struct pfioc_state_kill *psk)
+{
+	struct pf_state		*s, *nexts;
+	struct pf_state		*head, *tail;
+	u_int			 killed = 0;
+	int			 error;
+
+	NET_LOCK();
+
+	/* lock against the gc removing an item from the list */
+	error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+	if (error != 0)
+		goto unlock;
+
+	/* get a snapshot view of the ends of the list to traverse between */
+	mtx_enter(&pf_state_list.pfs_mtx);
+	head = TAILQ_FIRST(&pf_state_list.pfs_list);
+	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	s = NULL;
+	nexts = head;
+
+	PF_LOCK();
+	PF_STATE_ENTER_WRITE();
+
+	while (s != tail) {
+		s = nexts;
+		nexts = TAILQ_NEXT(s, entry_list);
+
+		if (s->timeout == PFTM_UNLINKED)
+			continue;
+
+		if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
+		    s->kif->pfik_name)) {
+#if NPFSYNC > 0
+			/* don't send out individual delete messages */
+			SET(s->state_flags, PFSTATE_NOSYNC);
+#endif	/* NPFSYNC > 0 */
+			pf_remove_state(s);
+			killed++;
+		}
+	}
+
+	PF_STATE_EXIT_WRITE();
+#if NPFSYNC > 0
+	pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
+#endif	/* NPFSYNC > 0 */
+	PF_UNLOCK();
+	rw_exit(&pf_state_list.pfs_rwl);
+
+	psk->psk_killed = killed;
+unlock:
+	NET_UNLOCK();
+
+	return (error);
+}
+
+int
+pf_states_get(struct pfioc_states *ps)
+{
+	struct pf_state		*head, *tail;
+	struct pf_state		*next, *state;
+	struct pfsync_state	*p, pstore;
+	u_int32_t		 nr = 0;
+	int			 error;
+
+	if (ps->ps_len == 0) {
+		nr = pf_status.states;
+		ps->ps_len = sizeof(struct pfsync_state) * nr;
+		return (0);
+	}
+
+	p = ps->ps_states;
+
+	/* lock against the gc removing an item from the list */
+	error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+	if (error != 0)
+		return (error);
+
+	/* get a snapshot view of the ends of the list to traverse between */
+	mtx_enter(&pf_state_list.pfs_mtx);
+	head = TAILQ_FIRST(&pf_state_list.pfs_list);
+	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	state = NULL;
+	next = head;
+
+	while (state != tail) {
+		state = next;
+		next = TAILQ_NEXT(state, entry_list);
+
+		if (state->timeout == PFTM_UNLINKED)
+			continue;
+
+		if ((nr+1) * sizeof(*p) > ps->ps_len)
+			break;
+
+		pf_state_export(&pstore, state);
+		error = copyout(&pstore, p, sizeof(*p));
+		if (error)
+			goto fail;
+
+		p++;
+		nr++;
+	}
+	ps->ps_len = sizeof(struct pfsync_state) * nr;
+
+fail:
+	rw_exit(&pf_state_list.pfs_rwl);
+
+	return (error);
+}
+
+int
 pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
 {
 	int			 error = 0;
@@ -1545,36 +1662,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
 		break;
 	}
 
-	case DIOCCLRSTATES: {
-		struct pf_state		*s, *nexts;
-		struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
-		u_int			 killed = 0;
-
-		NET_LOCK();
-		PF_LOCK();
-		PF_STATE_ENTER_WRITE();
-		for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) {
-			nexts = RB_NEXT(pf_state_tree_id, &tree_id, s);
-
-			if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
-			    s->kif->pfik_name)) {
-#if NPFSYNC > 0
-				/* don't send out individual delete messages */
-				SET(s->state_flags, PFSTATE_NOSYNC);
-#endif	/* NPFSYNC > 0 */
-				pf_remove_state(s);
-				killed++;
-			}
-		}
-		PF_STATE_EXIT_WRITE();
-		psk->psk_killed = killed;
-#if NPFSYNC > 0
-		pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
-#endif	/* NPFSYNC > 0 */
-		PF_UNLOCK();
-		NET_UNLOCK();
+	case DIOCCLRSTATES:
+		error = pf_states_clr((struct pfioc_state_kill *)addr);
 		break;
-	}
 
 	case DIOCKILLSTATES: {
 		struct pf_state		*s, *nexts;
@@ -1757,50 +1847,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
 		break;
 	}
 
-	case DIOCGETSTATES: {
-		struct pfioc_states	*ps = (struct pfioc_states *)addr;
-		struct pf_state		*state;
-		struct pfsync_state	*p, *pstore;
-		u_int32_t		 nr = 0;
-
-		if (ps->ps_len == 0) {
-			nr = pf_status.states;
-			ps->ps_len = sizeof(struct pfsync_state) * nr;
-			break;
-		}
-
-		pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
-
-		p = ps->ps_states;
-
-		NET_LOCK();
-		PF_STATE_ENTER_READ();
-		state = TAILQ_FIRST(&state_list);
-		while (state) {
-			if (state->timeout != PFTM_UNLINKED) {
-				if ((nr+1) * sizeof(*p) > ps->ps_len)
-					break;
-				pf_state_export(pstore, state);
-				error = copyout(pstore, p, sizeof(*p));
-				if (error) {
-					free(pstore, M_TEMP, sizeof(*pstore));
-					PF_STATE_EXIT_READ();
-					NET_UNLOCK();
-					goto fail;
-				}
-				p++;
-				nr++;
-			}
-			state = TAILQ_NEXT(state, entry_list);
-		}
-		PF_STATE_EXIT_READ();
-		NET_UNLOCK();
-
-		ps->ps_len = sizeof(struct pfsync_state) * nr;
-
-		free(pstore, M_TEMP, sizeof(*pstore));
+	case DIOCGETSTATES:
+		error = pf_states_get((struct pfioc_states *)addr);
 		break;
-	}
 
 	case DIOCGETSTATUS: {
 		struct pf_status *s = (struct pf_status *)addr;
Index: pfvar.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.500
diff -u -p -r1.500 pfvar.h
--- pfvar.h	10 Mar 2021 10:21:48 -0000	1.500
+++ pfvar.h	3 Jun 2021 06:24:48 -0000
@@ -1682,7 +1682,7 @@ RB_HEAD(pf_state_tree_id, pf_state);
 RB_PROTOTYPE(pf_state_tree_id, pf_state, entry_id, pf_state_compare_id);
 extern struct pf_state_tree_id tree_id;
 
-extern struct pf_state_queue state_list;
+extern struct pf_state_list pf_state_list;
 
 TAILQ_HEAD(pf_queuehead, pf_queuespec);
 extern struct pf_queuehead	  pf_queues[2];
Index: pfvar_priv.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar_priv.h,v
retrieving revision 1.6
diff -u -p -r1.6 pfvar_priv.h
--- pfvar_priv.h	9 Feb 2021 14:06:19 -0000	1.6
+++ pfvar_priv.h	3 Jun 2021 06:24:48 -0000
@@ -38,6 +38,115 @@
 #ifdef _KERNEL
 
 #include <sys/rwlock.h>
+#include <sys/mutex.h>
+
+/*
+
+states are linked into a global list to support the following
+functionality:
+
+ - garbage collection
+ - pfsync bulk send operations
+ - bulk state fetches via the DIOCGETSTATES ioctl
+ - bulk state clearing via the DIOCCLRSTATES ioctl
+
+states are inserted into the global pf_state_list once it has also
+been successfully added to the various trees that make up the state
+table. states are only removed from the pf_state_list by the garbage
+collection process.
+
+the pf_state_list head and tail pointers (ie, the pfs_list TAILQ_HEAD
+structure) and the pointers between the entries on the pf_state_list
+are locked separately. at a high level, this allows for insertion
+of new states into the pf_state_list while other contexts (eg, the
+ioctls) are traversing the state items in the list. for garbage
+collection to remove items from the pf_state_list, it has to exclude
+both modifications to the list head and tail pointers, and traversal
+of the links between the states.
+
+the head and tail pointers are protected by a mutex. the pointers
+between states are protected by an rwlock.
+
+because insertions are only made to the end of the list, if we get
+a snapshot of the head and tail of the list and prevent modifications
+to the links between states, we can safely traverse between the
+head and tail entries. subsequent insertions can add entries after
+our view of the tail, but we don't look past our view.
+
+if both locks must be taken, the rwlock protecting the links between
+states is taken before the mutex protecting the head and tail
+pointer.
+
+insertion into the list follows this pattern:
+
+	// serialise list head/tail modifications
+	mtx_enter(&pf_state_list.pfs_mtx);
+	TAILQ_INSERT_TAIL(&pf_state_list.pfs_list, state, entry_list);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+traversal of the list:
+
+	// lock against the gc removing an item from the list
+	rw_enter_read(&pf_state_list.pfs_rwl);
+
+	// get a snapshot view of the ends of the list
+	mtx_enter(&pf_state_list.pfs_mtx);
+	head = TAILQ_FIRST(&pf_state_list.pfs_list);
+	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	state = NULL;
+	next = head;
+
+	while (state != tail) {
+		state = next;
+		next = TAILQ_NEXT(state, entry_list);
+
+		// look at the state
+	}
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
+
+removing an item from the list:
+
+	// wait for iterators (readers) to get out
+	rw_enter_write(&pf_state_list.pfs_rwl);
+
+	// serialise list head/tail modifications
+	mtx_enter(&pf_state_list.pfs_mtx);
+	TAILQ_REMOVE(&pf_state_list.pfs_list, state, entry_list);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	rw_exit_write(&pf_state_list.pfs_rwl);
+
+the lock ordering for pf_state_list locks and the rest of the pf
+locks are:
+
+1. KERNEL_LOCK
+2. NET_LOCK
+3. pf_state_list.pfs_rwl
+4. PF_LOCK
+5. PF_STATE_LOCK
+6. pf_state_list.pfs_mtx
+
+*/
+
+struct pf_state_list {
+	/* the actual list of states in the system */
+	struct pf_state_queue		pfs_list;
+
+	/* serialises insertion of states in the list */
+	struct mutex			pfs_mtx;
+
+	/* coordinate access to the list for deletion */
+	struct rwlock			pfs_rwl;
+};
+
+#define PF_STATE_LIST_INITIALIZER(_pfs) {				\
+	.pfs_list	= TAILQ_HEAD_INITIALIZER(_pfs.pfs_list),	\
+	.pfs_mtx	= MUTEX_INITIALIZER(IPL_SOFTNET),		\
+	.pfs_rwl	= RWLOCK_INITIALIZER("pfstates"),		\
+}
 
 extern struct rwlock pf_lock;
 
Index: if_pfsync.c
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.288
diff -u -p -r1.288 if_pfsync.c
--- if_pfsync.c	10 Mar 2021 10:21:48 -0000	1.288
+++ if_pfsync.c	3 Jun 2021 06:24:48 -0000
@@ -2509,22 +2509,34 @@ pfsync_bulk_start(void)
 {
 	struct pfsync_softc *sc = pfsyncif;
 
+	NET_ASSERT_LOCKED();
+
+	/*
+	 * pf gc via pfsync_state_in_use reads sc_bulk_next and
+	 * sc_bulk_last while exclusively holding the pf_state_list
+	 * rwlock. make sure it can't race with us setting these
+	 * pointers. they basically act as hazards, and borrow the
+	 * lists state reference count.
+	 */
+	rw_enter_read(&pf_state_list.pfs_rwl);
+
+	/* get a consistent view of the list pointers */
+	mtx_enter(&pf_state_list.pfs_mtx);
+	if (sc->sc_bulk_next == NULL)
+		sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
+
+	sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
+
 	DPFPRINTF(LOG_INFO, "received bulk update request");
 
-	if (TAILQ_EMPTY(&state_list))
+	if (sc->sc_bulk_last == NULL)
 		pfsync_bulk_status(PFSYNC_BUS_END);
 	else {
 		sc->sc_ureq_received = getuptime();
-		if (sc->sc_bulk_next == NULL) {
-			PF_STATE_ENTER_READ();
-			sc->sc_bulk_next = TAILQ_FIRST(&state_list);
-			pf_state_ref(sc->sc_bulk_next);
-			PF_STATE_EXIT_READ();
-		}
-		sc->sc_bulk_last = sc->sc_bulk_next;
-		pf_state_ref(sc->sc_bulk_last);
-
 		pfsync_bulk_status(PFSYNC_BUS_START);
 		timeout_add(&sc->sc_bulk_tmo, 0);
 	}
@@ -2534,13 +2546,15 @@ void
 pfsync_bulk_update(void *arg)
 {
 	struct pfsync_softc *sc;
-	struct pf_state *st, *st_next;
+	struct pf_state *st;
 	int i = 0;
 
 	NET_LOCK();
 	sc = pfsyncif;
 	if (sc == NULL)
 		goto out;
+
+	rw_enter_read(&pf_state_list.pfs_rwl);
 	st = sc->sc_bulk_next;
 	sc->sc_bulk_next = NULL;
 
@@ -2552,23 +2566,9 @@ pfsync_bulk_update(void *arg)
 			i++;
 		}
 
-		/*
-		 * I wonder how we prevent infinite bulk update. IMO it can
-		 * happen when sc_bulk_last state expires before we iterate
-		 * through the whole list.
-		 */
-		PF_STATE_ENTER_READ();
-		st_next = TAILQ_NEXT(st, entry_list);
-		pf_state_unref(st);
-		st = st_next;
-		if (st == NULL)
-			st = TAILQ_FIRST(&state_list);
-		pf_state_ref(st);
-		PF_STATE_EXIT_READ();
-
+		st = TAILQ_NEXT(st, entry_list);
 		if ((st == NULL) || (st == sc->sc_bulk_last)) {
 			/* we're done */
-			pf_state_unref(sc->sc_bulk_last);
 			sc->sc_bulk_last = NULL;
 			pfsync_bulk_status(PFSYNC_BUS_END);
 			break;
@@ -2582,6 +2582,8 @@ pfsync_bulk_update(void *arg)
 			break;
 		}
 	}
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
 out:
 	NET_UNLOCK();
 }
@@ -2678,6 +2680,8 @@ pfsync_state_in_use(struct pf_state *st)
 
 	if (sc == NULL)
 		return (0);
+
+	rw_assert_wrlock(&pf_state_list.pfs_rwl);
 
 	if (st->sync_state != PFSYNC_S_NONE ||
 	    st == sc->sc_bulk_next ||