On Wed, Apr 06, 2022 at 05:01:55PM +0200, Alexandr Nedvedicky wrote: > Hello, > > Hrvoje was testing pf(4) and pfsync(4) with parallel forwarding diff > which bluhm@ has shared sometime ago. > > Hrvoje found a bug in my very naive implementation of 'snapshots': > > 1573 void > 1574 pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) > 1575 { > 1576 int q; > 1577 > 1578 sn->sn_sc = sc; > 1579 > 1580 mtx_enter(&sc->sc_st_mtx); > 1581 mtx_enter(&sc->sc_upd_req_mtx); > 1582 mtx_enter(&sc->sc_tdb_mtx); > 1583 > 1584 for (q = 0; q < PFSYNC_S_COUNT; q++) { > 1585 TAILQ_INIT(&sn->sn_qs[q]); > 1586 TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list); > 1587 } > 1588 > 1589 TAILQ_INIT(&sn->sn_upd_req_list); > 1590 TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, > ur_entry); > 1591 > 1592 TAILQ_INIT(&sn->sn_tdb_q); > 1593 TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry); > 1594 > > > the problem with code above is that we just take care about heads of various > queues. However individual objects may get re-inserted to queue on behalf of > state update. This creates a havoc. The proposed change introduces a dedicated > link member for snapshot, so we can move elements from sync_list to > snapshot_list. > > The diff below does not hurt pfsync(4) in current tree, because > we still don't forward packets in parallel. It will just make > things bit easier for Hrvoje et. al. so we can keep smaller diff > against current tree. > > > OK ?
I think there is a use after free in your diff. After you return from pfsync_delete_tdb() you must not access the TDB again. Comments inline. > thanks and > regards > sashan > > --------8<---------------8<---------------8<------------------8<-------- > diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c > index cb0f3fbdf52..161f8c89317 100644 > --- a/sys/net/if_pfsync.c > +++ b/sys/net/if_pfsync.c > @@ -181,6 +181,7 @@ void pfsync_q_del(struct pf_state *); > > struct pfsync_upd_req_item { > TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; > + TAILQ_ENTRY(pfsync_upd_req_item) ur_snap; > struct pfsync_upd_req ur_msg; > }; > TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item); > @@ -295,7 +296,7 @@ void pfsync_bulk_update(void *); > void pfsync_bulk_fail(void *); > > void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); > -void pfsync_drop_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); > +void pfsync_drop_snapshot(struct pfsync_snapshot *); > > void pfsync_send_dispatch(void *); > void pfsync_send_pkt(struct mbuf *); > @@ -1574,6 +1575,9 @@ void > pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) > { > int q; > + struct pf_state *st; > + struct pfsync_upd_req_item *ur; > + struct tdb *tdb; > > sn->sn_sc = sc; > > @@ -1583,14 +1587,33 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, > struct pfsync_softc *sc) > > for (q = 0; q < PFSYNC_S_COUNT; q++) { > TAILQ_INIT(&sn->sn_qs[q]); > - TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list); > + > + while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) { > +#ifdef PFSYNC_DEBUG > + KASSERT(st->snapped == 0); > +#endif > + TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); > + TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap); > + st->snapped = 1; > + } > } > > TAILQ_INIT(&sn->sn_upd_req_list); > - TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry); > + while (!TAILQ_EMPTY(&sc->sc_upd_req_list)) { > + ur = TAILQ_FIRST(&sc->sc_upd_req_list); Other loops have this idiom 
while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { > + TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); > + TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap); > + } > > TAILQ_INIT(&sn->sn_tdb_q); > - TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry); > + while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) { > +#ifdef PFSYNC_DEBUG > + KASSERT(tdb->snapped == 0); > +#endif > + TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry); > + TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap); > + tdb->tdb_snapped = 1; > + } > > sn->sn_len = sc->sc_len; > sc->sc_len = PFSYNC_MINPKT; > @@ -1606,41 +1629,44 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, > struct pfsync_softc *sc) > } > > void > -pfsync_drop_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc * sc) > +pfsync_drop_snapshot(struct pfsync_snapshot *sn) > { > struct pf_state *st; > struct pfsync_upd_req_item *ur; > struct tdb *t; > int q; > > - > for (q = 0; q < PFSYNC_S_COUNT; q++) { > if (TAILQ_EMPTY(&sn->sn_qs[q])) > continue; > > while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) { > - TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list); > #ifdef PFSYNC_DEBUG > KASSERT(st->sync_state == q); > + KASSERT(st->snapped == 1); > #endif > + TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap); > st->sync_state = PFSYNC_S_NONE; > + st->snapped = 0; > pf_state_unref(st); > } > } > > while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) { > - TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry); > + TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap); > pool_put(&sn->sn_sc->sc_pool, ur); > } > > - mtx_enter(&sc->sc_tdb_mtx); > while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) { > - TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry); > + TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap); > mtx_enter(&t->tdb_mtx); > +#ifdef PFSYNC_DEBUG > + KASSERT(t->tdb_snapped == 1); > +#endif > + t->tdb_snapped = 0; > CLR(t->tdb_flags, TDBF_PFSYNC); > mtx_leave(&t->tdb_mtx); > } > - mtx_leave(&sc->sc_tdb_mtx); > } > > int > @@ -1667,7 +1693,7 
@@ pfsync_drop(struct pfsync_softc *sc) > struct pfsync_snapshot sn; > > pfsync_grab_snapshot(&sn, sc); > - pfsync_drop_snapshot(&sn, sc); > + pfsync_drop_snapshot(&sn); > } > > void > @@ -1759,7 +1785,7 @@ pfsync_sendout(void) > if (m == NULL) { > sc->sc_if.if_oerrors++; > pfsyncstat_inc(pfsyncs_onomem); > - pfsync_drop_snapshot(&sn, sc); > + pfsync_drop_snapshot(&sn); > return; > } > > @@ -1769,7 +1795,7 @@ pfsync_sendout(void) > m_free(m); > sc->sc_if.if_oerrors++; > pfsyncstat_inc(pfsyncs_onomem); > - pfsync_drop_snapshot(&sn, sc); > + pfsync_drop_snapshot(&sn); > return; > } > } > @@ -1799,7 +1825,7 @@ pfsync_sendout(void) > > count = 0; > while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) { > - TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry); > + TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap); > > bcopy(&ur->ur_msg, m->m_data + offset, > sizeof(ur->ur_msg)); > @@ -1827,18 +1853,20 @@ pfsync_sendout(void) > subh = (struct pfsync_subheader *)(m->m_data + offset); > offset += sizeof(*subh); > > - mtx_enter(&sc->sc_tdb_mtx); > count = 0; > while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) { > - TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry); > + TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap); I think the TDB in tdb_sync_snap list may be freed. Maybe you should grab a refcount if you store them into a list. > pfsync_out_tdb(t, m->m_data + offset); > offset += sizeof(struct pfsync_tdb); > +#ifdef PFSYNC_DEBUG > + KASSERT(t->tdb_snapped == 1); > +#endif > + t->tdb_snapped = 0; This may also be a use after free. 
> mtx_enter(&t->tdb_mtx); > CLR(t->tdb_flags, TDBF_PFSYNC); > mtx_leave(&t->tdb_mtx); > count++; > } > - mtx_leave(&sc->sc_tdb_mtx); > > bzero(subh, sizeof(*subh)); > subh->action = PFSYNC_ACT_TDB; > @@ -1856,11 +1884,13 @@ pfsync_sendout(void) > > count = 0; > while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) { > - TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list); > + TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap); > #ifdef PFSYNC_DEBUG > KASSERT(st->sync_state == q); > + KASSERT(st->snapped == 1); > #endif > st->sync_state = PFSYNC_S_NONE; > + st->snapped = 0; > pfsync_qs[q].write(st, m->m_data + offset); > offset += pfsync_qs[q].len; > > @@ -2411,8 +2441,9 @@ pfsync_q_ins(struct pf_state *st, int q) > mtx_enter(&sc->sc_st_mtx); > > /* > - * If two threads are competing to insert the same state, then > - * there must be just single winner. > + * There are either two threads trying to update the > + * the same state, or the state is just being processed > + * (is on snapshot queue). > */ > if (st->sync_state != PFSYNC_S_NONE) { > mtx_leave(&sc->sc_st_mtx); > @@ -2450,6 +2481,15 @@ pfsync_q_del(struct pf_state *st) > > mtx_enter(&sc->sc_st_mtx); > q = st->sync_state; > + /* > + * re-check under mutex > + * if state is snapped already, then just bail out, because we came > + * too late, the state is being just processed/dispatched to peer. > + */ > + if ((q == PFSYNC_S_NONE) || (st->snapped)) { > + mtx_leave(&sc->sc_st_mtx); > + return; > + } > atomic_sub_long(&sc->sc_len, pfsync_qs[q].len); > TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); > if (TAILQ_EMPTY(&sc->sc_qs[q])) > @@ -2525,7 +2565,17 @@ pfsync_delete_tdb(struct tdb *t) > > mtx_enter(&sc->sc_tdb_mtx); > > + /* > + * if tdb entry is just being processed (found in snapshot), > + * then it can not be deleted. we just came too late > + */ > + if (t->tdb_snapped) { > + mtx_leave(&sc->sc_tdb_mtx); You must not access the TDB after this point. I think you cannot guarantee that. The memory will be freed after return. 
> + return; > + } > + > TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry); > + > mtx_enter(&t->tdb_mtx); > CLR(t->tdb_flags, TDBF_PFSYNC); > mtx_leave(&t->tdb_mtx); > diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h > index bd7ec1d88e7..558618a0f14 100644 > --- a/sys/net/pfvar.h > +++ b/sys/net/pfvar.h > @@ -749,6 +749,7 @@ struct pf_state { > u_int8_t pad[3]; > > TAILQ_ENTRY(pf_state) sync_list; > + TAILQ_ENTRY(pf_state) sync_snap; > TAILQ_ENTRY(pf_state) entry_list; > SLIST_ENTRY(pf_state) gc_list; > RB_ENTRY(pf_state) entry_id; > @@ -797,6 +798,7 @@ struct pf_state { > pf_refcnt_t refcnt; > u_int16_t delay; > u_int8_t rt; > + u_int8_t snapped; > }; > > /* > diff --git a/sys/netinet/ip_ipsp.h b/sys/netinet/ip_ipsp.h > index c697994047b..ebdb6ada1ae 100644 > --- a/sys/netinet/ip_ipsp.h > +++ b/sys/netinet/ip_ipsp.h > @@ -405,6 +405,7 @@ struct tdb { /* tunnel > descriptor block */ > u_int8_t tdb_wnd; /* Replay window */ > u_int8_t tdb_satype; /* SA type (RFC2367, PF_KEY) */ > u_int8_t tdb_updates; /* pfsync update counter */ > + u_int8_t tdb_snapped; /* dispatched by pfsync(4) */ u_int8_t is not atomic. If you want to change tdb_snapped, you need a mutex that also protects the other fields in the same 32 bit word everywhere. I think a new flag TDBF_PFSYNC_SNAPPED in tdb_flags would be easier. The tdb_flags are protected by tdb_mtx. > union sockaddr_union tdb_dst; /* [N] Destination address */ > union sockaddr_union tdb_src; /* [N] Source address */ > @@ -439,6 +440,7 @@ struct tdb { /* tunnel > descriptor block */ > > TAILQ_HEAD(tdb_policy_head, ipsec_policy) tdb_policy_head; /* [p] */ > TAILQ_ENTRY(tdb) tdb_sync_entry; > + TAILQ_ENTRY(tdb) tdb_sync_snap; > }; > > enum tdb_counters {