On Thu, Jun 03, 2021 at 01:09:48AM +0200, Alexandr Nedvedicky wrote:
> > pf_purge_expired_states(u_int32_t maxcheck)
> > {
> </snip>
> > -	cur = pf_state_ref(next);
> > +	do {
> > +		if ((cur->timeout == PFTM_UNLINKED) ||
> > +		    (pf_state_expires(cur) <= getuptime())) {
> > +			SLIST_INSERT_HEAD(&gcl, pf_state_ref(cur), gc_list);
>                                                 ^^^^^^^^^^^^
>     I wonder: is the extra reference you are chasing for coming from here?
>     I suspect pf_state_ref(cur) is being called two times, as macro expands.

I think you're right :D :D

This is an updated diff with a fix for this and the explanation. It also
reduces the scope of the NET_LOCK so scanning the state list shouldn't
block the network stack.
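To spell the bug out: SLIST_INSERT_HEAD() from sys/queue.h expands its
element argument twice (once to set the new element's next pointer, once
to update the list head), so an argument with side effects like
pf_state_ref(cur) runs twice for a single insertion and the state ends up
with one reference too many. A stand-alone sketch of the effect; the
struct state/state_ref()/gc_head names below are made up for illustration
and only mimic pf_state_ref(), this is not the real pf code:

	/* double_ref.c -- illustration only */
	#include <sys/queue.h>
	#include <stdio.h>

	struct state {
		int			refcnt;
		SLIST_ENTRY(state)	gc_list;
	};

	SLIST_HEAD(gc_head, state);

	static struct state *
	state_ref(struct state *st)	/* side effect, like pf_state_ref() */
	{
		st->refcnt++;
		return (st);
	}

	int
	main(void)
	{
		struct gc_head gcl = SLIST_HEAD_INITIALIZER(gcl);
		struct state st = { .refcnt = 0 };

		/*
		 * SLIST_INSERT_HEAD(head, elm, field) expands to roughly
		 *	(elm)->field.sle_next = (head)->slh_first;
		 *	(head)->slh_first = (elm);
		 * so the elm expression is evaluated twice.
		 */
		SLIST_INSERT_HEAD(&gcl, state_ref(&st), gc_list);

		/* one insertion, but two references were taken */
		printf("refcnt = %d\n", st.refcnt);	/* prints 2 */

		return (0);
	}

The updated diff below avoids this by taking the reference into a local
variable first (st = pf_state_ref(cur)) and passing only the plain pointer
to the macro.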

Index: pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.1118
diff -u -p -r1.1118 pf.c
--- pf.c	1 Jun 2021 09:57:11 -0000	1.1118
+++ pf.c	3 Jun 2021 06:24:48 -0000
@@ -308,7 +308,7 @@ static __inline void pf_set_protostate(s
 struct pf_src_tree tree_src_tracking;
 
 struct pf_state_tree_id tree_id;
-struct pf_state_queue state_list;
+struct pf_state_list pf_state_list = PF_STATE_LIST_INITIALIZER(pf_state_list);
 
 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
 RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
@@ -440,6 +440,37 @@ pf_check_threshold(struct pf_threshold *
 	return (threshold->count > threshold->limit);
 }
 
+void
+pf_state_list_insert(struct pf_state_list *pfs, struct pf_state *st)
+{
+	/*
+	 * we can always put states on the end of the list.
+	 *
+	 * things reading the list should take a read lock, then
+	 * the mutex, get the head and tail pointers, release the
+	 * mutex, and then they can iterate between the head and tail.
+	 */
+
+	pf_state_ref(st); /* get a ref for the list */
+
+	mtx_enter(&pfs->pfs_mtx);
+	TAILQ_INSERT_TAIL(&pfs->pfs_list, st, entry_list);
+	mtx_leave(&pfs->pfs_mtx);
+}
+
+void
+pf_state_list_remove(struct pf_state_list *pfs, struct pf_state *st)
+{
+	/* states can only be removed when the write lock is held */
+	rw_assert_wrlock(&pfs->pfs_rwl);
+
+	mtx_enter(&pfs->pfs_mtx);
+	TAILQ_REMOVE(&pfs->pfs_list, st, entry_list);
+	mtx_leave(&pfs->pfs_mtx);
+
+	pf_state_unref(st); /* list no longer references the state */
+}
+
 int
 pf_src_connlimit(struct pf_state **state)
 {
@@ -986,7 +1017,7 @@ pf_state_insert(struct pfi_kif *kif, str
 		PF_STATE_EXIT_WRITE();
 		return (-1);
 	}
-	TAILQ_INSERT_TAIL(&state_list, s, entry_list);
+	pf_state_list_insert(&pf_state_list, s);
 	pf_status.fcounters[FCNT_STATE_INSERT]++;
 	pf_status.states++;
 	pfi_kif_ref(kif, PFI_KIF_REF_STATE);
@@ -1247,7 +1278,8 @@ pf_purge_expired_rules(void)
 void
 pf_purge_timeout(void *unused)
 {
-	task_add(net_tq(0), &pf_purge_task);
+	/* XXX move to systqmp to avoid KERNEL_LOCK */
+	task_add(systq, &pf_purge_task);
 }
 
 void
@@ -1255,9 +1287,6 @@ pf_purge(void *xnloops)
 {
 	int *nloops = xnloops;
 
-	KERNEL_LOCK();
-	NET_LOCK();
-
 	/*
 	 * process a fraction of the state table every second
 	 * Note:
@@ -1268,6 +1297,8 @@ pf_purge(void *xnloops)
 	pf_purge_expired_states(1 + (pf_status.states
 	    / pf_default_rule.timeout[PFTM_INTERVAL]));
 
+	NET_LOCK();
+
 	PF_LOCK();
 	/* purge other expired types every PFTM_INTERVAL seconds */
 	if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
@@ -1280,11 +1311,10 @@ pf_purge(void *xnloops)
 	 * Fragments don't require PF_LOCK(), they use their own lock.
 	 */
 	if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
 		pf_purge_expired_fragments();
 		*nloops = 0;
 	}
 	NET_UNLOCK();
-	KERNEL_UNLOCK();
 
 	timeout_add_sec(&pf_purge_to, 1);
 }
@@ -1447,7 +1477,7 @@ pf_free_state(struct pf_state *cur)
 	}
 	pf_normalize_tcp_cleanup(cur);
 	pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
-	TAILQ_REMOVE(&state_list, cur, entry_list);
+	pf_state_list_remove(&pf_state_list, cur);
 	if (cur->tag)
 		pf_tag_unref(cur->tag);
 	pf_state_unref(cur);
@@ -1458,53 +1488,77 @@ pf_free_state(struct pf_state *cur)
 void
 pf_purge_expired_states(u_int32_t maxcheck)
 {
+	/*
+	 * this task/thread/context/whatever is the only thing that
+	 * removes states from the pf_state_list, so the cur reference
+	 * it holds between calls is guaranteed to still be in the
+	 * list.
+	 */
 	static struct pf_state	*cur = NULL;
-	struct pf_state		*next;
-	SLIST_HEAD(pf_state_gcl, pf_state) gcl;
+
+	struct pf_state		*head, *tail;
+	struct pf_state		*st;
+	SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
 
 	PF_ASSERT_UNLOCKED();
-	SLIST_INIT(&gcl);
 
-	PF_STATE_ENTER_READ();
-	while (maxcheck--) {
-		/* wrap to start of list when we hit the end */
-		if (cur == NULL) {
-			cur = pf_state_ref(TAILQ_FIRST(&state_list));
-			if (cur == NULL)
-				break;	/* list empty */
-		}
+	rw_enter_read(&pf_state_list.pfs_rwl);
 
-		/* get next state, as cur may get deleted */
-		next = TAILQ_NEXT(cur, entry_list);
+	mtx_enter(&pf_state_list.pfs_mtx);
+	head = TAILQ_FIRST(&pf_state_list.pfs_list);
+	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	if (head == NULL) {
+		/* the list is empty */
+		rw_exit_read(&pf_state_list.pfs_rwl);
+		return;
+	}
 
-		if ((cur->timeout == PFTM_UNLINKED) ||
-		    (pf_state_expires(cur) <= getuptime()))
-			SLIST_INSERT_HEAD(&gcl, cur, gc_list);
-		else
-			pf_state_unref(cur);
+	/* (re)start at the front of the list */
+	if (cur == NULL)
+		cur = head;
 
-		cur = pf_state_ref(next);
+	do {
+		if ((cur->timeout == PFTM_UNLINKED) ||
+		    (pf_state_expires(cur) <= getuptime())) {
+			st = pf_state_ref(cur);
+			SLIST_INSERT_HEAD(&gcl, st, gc_list);
+		}
 
-		if (cur == NULL)
+		/* don't iterate past the end of our view of the list */
+		if (cur == tail) {
+			cur = NULL;
 			break;
-	}
-	PF_STATE_EXIT_READ();
+		}
+
+		cur = TAILQ_NEXT(cur, entry_list);
+	} while (maxcheck--);
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
+
+	if (SLIST_EMPTY(&gcl))
+		return;
+
+	NET_LOCK();
+	rw_enter_write(&pf_state_list.pfs_rwl);
 	PF_LOCK();
 	PF_STATE_ENTER_WRITE();
-	while ((next = SLIST_FIRST(&gcl)) != NULL) {
-		SLIST_REMOVE_HEAD(&gcl, gc_list);
-		if (next->timeout == PFTM_UNLINKED)
-			pf_free_state(next);
-		else {
-			pf_remove_state(next);
-			pf_free_state(next);
-		}
+	SLIST_FOREACH(st, &gcl, gc_list) {
+		if (st->timeout != PFTM_UNLINKED)
+			pf_remove_state(st);
 
-		pf_state_unref(next);
+		pf_free_state(st);
 	}
 	PF_STATE_EXIT_WRITE();
 	PF_UNLOCK();
+	rw_exit_write(&pf_state_list.pfs_rwl);
+	NET_UNLOCK();
+
+	while ((st = SLIST_FIRST(&gcl)) != NULL) {
+		SLIST_REMOVE_HEAD(&gcl, gc_list);
+		pf_state_unref(st);
+	}
 }
 
 int
Index: pf_ioctl.c
===================================================================
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.364
diff -u -p -r1.364 pf_ioctl.c
--- pf_ioctl.c	2 Jun 2021 07:46:22 -0000	1.364
+++ pf_ioctl.c	3 Jun 2021 06:24:48 -0000
@@ -113,6 +113,8 @@ int	 pf_rule_copyin(struct pf_rule *, 
 u_int16_t	 pf_qname2qid(char *, int);
 void		 pf_qid2qname(u_int16_t, char *);
 void		 pf_qid_unref(u_int16_t);
+int		 pf_states_clr(struct pfioc_state_kill *);
+int		 pf_states_get(struct pfioc_states *);
 
 struct pf_rule		 pf_default_rule, pf_default_rule_new;
 
@@ -206,7 +208,6 @@ pfattach(int num)
 	TAILQ_INIT(&pf_queues[1]);
 	pf_queues_active = &pf_queues[0];
 	pf_queues_inactive = &pf_queues[1];
-	TAILQ_INIT(&state_list);
 
 	/* default rule should never be garbage collected */
 	pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next;
@@ -903,6 +904,122 @@ pf_addr_copyout(struct pf_addr_wrap *add
 }
 
 int
+pf_states_clr(struct pfioc_state_kill *psk)
+{
+	struct pf_state		*s, *nexts;
+	struct pf_state		*head, *tail;
+	u_int			 killed = 0;
+	int			 error;
+
+	NET_LOCK();
+
+	/* lock against the gc removing an item from the list */
+	error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+	if (error != 0)
+		goto unlock;
+
+	/* get a snapshot view of the ends of the list to traverse between */
+	mtx_enter(&pf_state_list.pfs_mtx);
+	head = TAILQ_FIRST(&pf_state_list.pfs_list);
+	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	s = NULL;
+	nexts = head;
+
+	PF_LOCK();
+	PF_STATE_ENTER_WRITE();
+
+	while (s != tail) {
+		s = nexts;
+		nexts = TAILQ_NEXT(s, entry_list);
+
+		if (s->timeout == PFTM_UNLINKED)
+			continue;
+
+		if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
+		    s->kif->pfik_name)) {
+#if NPFSYNC > 0
+			/* don't send out individual delete messages */
+			SET(s->state_flags, PFSTATE_NOSYNC);
+#endif	/* NPFSYNC > 0 */
+			pf_remove_state(s);
+			killed++;
+		}
+	}
+
+	PF_STATE_EXIT_WRITE();
+#if NPFSYNC > 0
+	pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
+#endif	/* NPFSYNC > 0 */
+	PF_UNLOCK();
+	rw_exit(&pf_state_list.pfs_rwl);
+
+	psk->psk_killed = killed;
+unlock:
+	NET_UNLOCK();
+
+	return (error);
+}
+
+int
+pf_states_get(struct pfioc_states *ps)
+{
+	struct pf_state		*head, *tail;
+	struct pf_state		*next, *state;
+	struct pfsync_state	*p, pstore;
+	u_int32_t		 nr = 0;
+	int			 error;
+
+	if (ps->ps_len == 0) {
+		nr = pf_status.states;
+		ps->ps_len = sizeof(struct pfsync_state) * nr;
+		return (0);
+	}
+
+	p = ps->ps_states;
+
+	/* lock against the gc removing an item from the list */
+	error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+	if (error != 0)
+		return (error);
+
+	/* get a snapshot view of the ends of the list to traverse between */
+	mtx_enter(&pf_state_list.pfs_mtx);
+	head = TAILQ_FIRST(&pf_state_list.pfs_list);
+	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	state = NULL;
+	next = head;
+
+	while (state != tail) {
+		state = next;
+		next = TAILQ_NEXT(state, entry_list);
+
+		if (state->timeout == PFTM_UNLINKED)
+			continue;
+
+		if ((nr+1) * sizeof(*p) > ps->ps_len)
+			break;
+
+		pf_state_export(&pstore, state);
+		error = copyout(&pstore, p, sizeof(*p));
+		if (error)
+			goto fail;
+
+		p++;
+		nr++;
+	}
+	ps->ps_len = sizeof(struct pfsync_state) * nr;
+
+fail:
+	rw_exit(&pf_state_list.pfs_rwl);
+
+	return (error);
+}
+
+int
 pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
 {
 	int			 error = 0;
@@ -1545,36 +1662,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
 		break;
 	}
 
-	case DIOCCLRSTATES: {
-		struct pf_state		*s, *nexts;
-		struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
-		u_int			 killed = 0;
-
-		NET_LOCK();
-		PF_LOCK();
-		PF_STATE_ENTER_WRITE();
-		for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) {
-			nexts = RB_NEXT(pf_state_tree_id, &tree_id, s);
-
-			if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
-			    s->kif->pfik_name)) {
-#if NPFSYNC > 0
-				/* don't send out individual delete messages */
-				SET(s->state_flags, PFSTATE_NOSYNC);
-#endif	/* NPFSYNC > 0 */
-				pf_remove_state(s);
-				killed++;
-			}
-		}
-		PF_STATE_EXIT_WRITE();
-		psk->psk_killed = killed;
-#if NPFSYNC > 0
-		pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
-#endif	/* NPFSYNC > 0 */
-		PF_UNLOCK();
-		NET_UNLOCK();
+	case DIOCCLRSTATES:
+		error = pf_states_clr((struct pfioc_state_kill *)addr);
 		break;
-	}
 
 	case DIOCKILLSTATES: {
 		struct pf_state		*s, *nexts;
@@ -1757,50 +1847,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
 		break;
 	}
 
-	case DIOCGETSTATES: {
-		struct pfioc_states	*ps = (struct pfioc_states *)addr;
-		struct pf_state		*state;
-		struct pfsync_state	*p, *pstore;
-		u_int32_t		 nr = 0;
-
-		if (ps->ps_len == 0) {
-			nr = pf_status.states;
-			ps->ps_len = sizeof(struct pfsync_state) * nr;
-			break;
-		}
-
-		pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
-
-		p = ps->ps_states;
-
-		NET_LOCK();
-		PF_STATE_ENTER_READ();
-		state = TAILQ_FIRST(&state_list);
-		while (state) {
-			if (state->timeout != PFTM_UNLINKED) {
-				if ((nr+1) * sizeof(*p) > ps->ps_len)
-					break;
-				pf_state_export(pstore, state);
-				error = copyout(pstore, p, sizeof(*p));
-				if (error) {
-					free(pstore, M_TEMP, sizeof(*pstore));
-					PF_STATE_EXIT_READ();
-					NET_UNLOCK();
-					goto fail;
-				}
-				p++;
-				nr++;
-			}
-			state = TAILQ_NEXT(state, entry_list);
-		}
-		PF_STATE_EXIT_READ();
-		NET_UNLOCK();
-
-		ps->ps_len = sizeof(struct pfsync_state) * nr;
-
-		free(pstore, M_TEMP, sizeof(*pstore));
+	case DIOCGETSTATES:
+		error = pf_states_get((struct pfioc_states *)addr);
 		break;
-	}
 
 	case DIOCGETSTATUS: {
 		struct pf_status *s = (struct pf_status *)addr;
Index: pfvar.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.500
diff -u -p -r1.500 pfvar.h
--- pfvar.h	10 Mar 2021 10:21:48 -0000	1.500
+++ pfvar.h	3 Jun 2021 06:24:48 -0000
@@ -1682,7 +1682,7 @@ RB_HEAD(pf_state_tree_id, pf_state);
 RB_PROTOTYPE(pf_state_tree_id, pf_state, entry_id, pf_state_compare_id);
 extern struct pf_state_tree_id tree_id;
 
-extern struct pf_state_queue state_list;
+extern struct pf_state_list pf_state_list;
 
 TAILQ_HEAD(pf_queuehead, pf_queuespec);
 extern struct pf_queuehead	  pf_queues[2];
Index: pfvar_priv.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar_priv.h,v
retrieving revision 1.6
diff -u -p -r1.6 pfvar_priv.h
--- pfvar_priv.h	9 Feb 2021 14:06:19 -0000	1.6
+++ pfvar_priv.h	3 Jun 2021 06:24:48 -0000
@@ -38,6 +38,115 @@
 #ifdef _KERNEL
 
 #include <sys/rwlock.h>
+#include <sys/mutex.h>
+
+/*
+
+states are linked into a global list to support the following
+functionality:
+
+ - garbage collection
+ - pfsync bulk send operations
+ - bulk state fetches via the DIOCGETSTATES ioctl
+ - bulk state clearing via the DIOCCLRSTATES ioctl
+
+states are inserted into the global pf_state_list once it has also
+been successfully added to the various trees that make up the state
+table. states are only removed from the pf_state_list by the garbage
+collection process.
+
+the pf_state_list head and tail pointers (ie, the pfs_list TAILQ_HEAD
+structure) and the pointers between the entries on the pf_state_list
+are locked separately. at a high level, this allows for insertion
+of new states into the pf_state_list while other contexts (eg, the
+ioctls) are traversing the state items in the list. for garbage
+collection to remove items from the pf_state_list, it has to exclude
+both modifications to the list head and tail pointers, and traversal
+of the links between the states.
+
+the head and tail pointers are protected by a mutex. the pointers
+between states are protected by an rwlock.
+
+because insertions are only made to the end of the list, if we get
+a snapshot of the head and tail of the list and prevent modifications
+to the links between states, we can safely traverse between the
+head and tail entries. subsequent insertions can add entries after
+our view of the tail, but we don't look past our view.
+
+if both locks must be taken, the rwlock protecting the links between
+states is taken before the mutex protecting the head and tail
+pointer.
+
+insertion into the list follows this pattern:
+
+	// serialise list head/tail modifications
+	mtx_enter(&pf_state_list.pfs_mtx);
+	TAILQ_INSERT_TAIL(&pf_state_list.pfs_list, state, entry_list);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+traversal of the list:
+
+	// lock against the gc removing an item from the list
+	rw_enter_read(&pf_state_list.pfs_rwl);
+
+	// get a snapshot view of the ends of the list
+	mtx_enter(&pf_state_list.pfs_mtx);
+	head = TAILQ_FIRST(&pf_state_list.pfs_list);
+	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	state = NULL;
+	next = head;
+
+	while (state != tail) {
+		state = next;
+		next = TAILQ_NEXT(state, entry_list);
+
+		// look at the state
+	}
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
+
+removing an item from the list:
+
+	// wait for iterators (readers) to get out
+	rw_enter_write(&pf_state_list.pfs_rwl);
+
+	// serialise list head/tail modifications
+	mtx_enter(&pf_state_list.pfs_mtx);
+	TAILQ_REMOVE(&pf_state_list.pfs_list, state, entry_list);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	rw_exit_write(&pf_state_list.pfs_rwl);
+
+the lock ordering for pf_state_list locks and the rest of the pf
+locks are:
+
+1. KERNEL_LOCK
+2. NET_LOCK
+3. pf_state_list.pfs_rwl
+4. PF_LOCK
+5. PF_STATE_LOCK
+6. pf_state_list.pfs_mtx
+
+*/
+
+struct pf_state_list {
+	/* the actual list of states in the system */
+	struct pf_state_queue		pfs_list;
+
+	/* serialises insertion of states in the list */
+	struct mutex			pfs_mtx;
+
+	/* coordinate access to the list for deletion */
+	struct rwlock			pfs_rwl;
+};
+
+#define PF_STATE_LIST_INITIALIZER(_pfs) {				\
+	.pfs_list	= TAILQ_HEAD_INITIALIZER(_pfs.pfs_list),	\
+	.pfs_mtx	= MUTEX_INITIALIZER(IPL_SOFTNET),		\
+	.pfs_rwl	= RWLOCK_INITIALIZER("pfstates"),		\
+}
 
 extern struct rwlock pf_lock;
 
Index: if_pfsync.c
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.288
diff -u -p -r1.288 if_pfsync.c
--- if_pfsync.c	10 Mar 2021 10:21:48 -0000	1.288
+++ if_pfsync.c	3 Jun 2021 06:24:48 -0000
@@ -2509,22 +2509,34 @@ pfsync_bulk_start(void)
 {
 	struct pfsync_softc *sc = pfsyncif;
 
+	NET_ASSERT_LOCKED();
+
+	/*
+	 * pf gc via pfsync_state_in_use reads sc_bulk_next and
+	 * sc_bulk_last while exclusively holding the pf_state_list
+	 * rwlock. make sure it can't race with us setting these
+	 * pointers. they basically act as hazards, and borrow the
+	 * lists state reference count.
+	 */
+	rw_enter_read(&pf_state_list.pfs_rwl);
+
+	/* get a consistent view of the list pointers */
+	mtx_enter(&pf_state_list.pfs_mtx);
+	if (sc->sc_bulk_next == NULL)
+		sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
+
+	sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+	mtx_leave(&pf_state_list.pfs_mtx);
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
+
 	DPFPRINTF(LOG_INFO, "received bulk update request");
 
-	if (TAILQ_EMPTY(&state_list))
+	if (sc->sc_bulk_last == NULL)
 		pfsync_bulk_status(PFSYNC_BUS_END);
 	else {
 		sc->sc_ureq_received = getuptime();
-		if (sc->sc_bulk_next == NULL) {
-			PF_STATE_ENTER_READ();
-			sc->sc_bulk_next = TAILQ_FIRST(&state_list);
-			pf_state_ref(sc->sc_bulk_next);
-			PF_STATE_EXIT_READ();
-		}
-		sc->sc_bulk_last = sc->sc_bulk_next;
-		pf_state_ref(sc->sc_bulk_last);
-
 		pfsync_bulk_status(PFSYNC_BUS_START);
 		timeout_add(&sc->sc_bulk_tmo, 0);
 	}
@@ -2534,13 +2546,15 @@ void
 pfsync_bulk_update(void *arg)
 {
 	struct pfsync_softc *sc;
-	struct pf_state *st, *st_next;
+	struct pf_state *st;
 	int i = 0;
 
 	NET_LOCK();
 	sc = pfsyncif;
 	if (sc == NULL)
 		goto out;
+
+	rw_enter_read(&pf_state_list.pfs_rwl);
 	st = sc->sc_bulk_next;
 	sc->sc_bulk_next = NULL;
 
@@ -2552,23 +2566,9 @@ pfsync_bulk_update(void *arg)
 			i++;
 		}
 
-		/*
-		 * I wonder how we prevent infinite bulk update. IMO it can
-		 * happen when sc_bulk_last state expires before we iterate
-		 * through the whole list.
-		 */
-		PF_STATE_ENTER_READ();
-		st_next = TAILQ_NEXT(st, entry_list);
-		pf_state_unref(st);
-		st = st_next;
-		if (st == NULL)
-			st = TAILQ_FIRST(&state_list);
-		pf_state_ref(st);
-		PF_STATE_EXIT_READ();
-
+		st = TAILQ_NEXT(st, entry_list);
 		if ((st == NULL) || (st == sc->sc_bulk_last)) {
 			/* we're done */
-			pf_state_unref(sc->sc_bulk_last);
 			sc->sc_bulk_last = NULL;
 			pfsync_bulk_status(PFSYNC_BUS_END);
 			break;
@@ -2582,6 +2582,8 @@ pfsync_bulk_update(void *arg)
 			break;
 		}
 	}
+
+	rw_exit_read(&pf_state_list.pfs_rwl);
 out:
 	NET_UNLOCK();
 }
@@ -2678,6 +2680,8 @@ pfsync_state_in_use(struct pf_state *st)
 
 	if (sc == NULL)
 		return (0);
+
+	rw_assert_wrlock(&pf_state_list.pfs_rwl);
 
 	if (st->sync_state != PFSYNC_S_NONE ||
 	    st == sc->sc_bulk_next ||