Hello, Hrvoje was testing pf(4) and pfsync(4) with the parallel forwarding diff which bluhm@ shared some time ago.
Hrvoje found a bug in my very naive implementation of 'snapshots': 1573 void 1574 pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) 1575 { 1576 int q; 1577 1578 sn->sn_sc = sc; 1579 1580 mtx_enter(&sc->sc_st_mtx); 1581 mtx_enter(&sc->sc_upd_req_mtx); 1582 mtx_enter(&sc->sc_tdb_mtx); 1583 1584 for (q = 0; q < PFSYNC_S_COUNT; q++) { 1585 TAILQ_INIT(&sn->sn_qs[q]); 1586 TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list); 1587 } 1588 1589 TAILQ_INIT(&sn->sn_upd_req_list); 1590 TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry); 1591 1592 TAILQ_INIT(&sn->sn_tdb_q); 1593 TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry); 1594 The problem with the code above is that we only take care of the heads of the various queues. However, individual objects may get re-inserted into a queue on behalf of a state update. This creates havoc. The proposed change introduces a dedicated link member for the snapshot, so we can move elements from sync_list to snapshot_list. The diff below does not hurt pfsync(4) in the current tree, because we still don't forward packets in parallel. It will just make things a bit easier for Hrvoje et al. so we can keep a smaller diff against the current tree. OK ? 
thanks and regards sashan --------8<---------------8<---------------8<------------------8<-------- diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c index cb0f3fbdf52..161f8c89317 100644 --- a/sys/net/if_pfsync.c +++ b/sys/net/if_pfsync.c @@ -181,6 +181,7 @@ void pfsync_q_del(struct pf_state *); struct pfsync_upd_req_item { TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; + TAILQ_ENTRY(pfsync_upd_req_item) ur_snap; struct pfsync_upd_req ur_msg; }; TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item); @@ -295,7 +296,7 @@ void pfsync_bulk_update(void *); void pfsync_bulk_fail(void *); void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); -void pfsync_drop_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); +void pfsync_drop_snapshot(struct pfsync_snapshot *); void pfsync_send_dispatch(void *); void pfsync_send_pkt(struct mbuf *); @@ -1574,6 +1575,9 @@ void pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) { int q; + struct pf_state *st; + struct pfsync_upd_req_item *ur; + struct tdb *tdb; sn->sn_sc = sc; @@ -1583,14 +1587,33 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) for (q = 0; q < PFSYNC_S_COUNT; q++) { TAILQ_INIT(&sn->sn_qs[q]); - TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list); + + while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) { +#ifdef PFSYNC_DEBUG + KASSERT(st->snapped == 0); +#endif + TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); + TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap); + st->snapped = 1; + } } TAILQ_INIT(&sn->sn_upd_req_list); - TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry); + while (!TAILQ_EMPTY(&sc->sc_upd_req_list)) { + ur = TAILQ_FIRST(&sc->sc_upd_req_list); + TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); + TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap); + } TAILQ_INIT(&sn->sn_tdb_q); - TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry); + while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) { +#ifdef PFSYNC_DEBUG + 
KASSERT(tdb->tdb_snapped == 0); +#endif + TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry); + TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap); + tdb->tdb_snapped = 1; + } sn->sn_len = sc->sc_len; sc->sc_len = PFSYNC_MINPKT; @@ -1606,41 +1629,44 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) } void -pfsync_drop_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc * sc) +pfsync_drop_snapshot(struct pfsync_snapshot *sn) { struct pf_state *st; struct pfsync_upd_req_item *ur; struct tdb *t; int q; - for (q = 0; q < PFSYNC_S_COUNT; q++) { if (TAILQ_EMPTY(&sn->sn_qs[q])) continue; while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) { - TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list); #ifdef PFSYNC_DEBUG KASSERT(st->sync_state == q); + KASSERT(st->snapped == 1); #endif + TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap); st->sync_state = PFSYNC_S_NONE; + st->snapped = 0; pf_state_unref(st); } } while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry); + TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap); pool_put(&sn->sn_sc->sc_pool, ur); } - mtx_enter(&sc->sc_tdb_mtx); while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) { - TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry); + TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap); mtx_enter(&t->tdb_mtx); +#ifdef PFSYNC_DEBUG + KASSERT(t->tdb_snapped == 1); +#endif + t->tdb_snapped = 0; CLR(t->tdb_flags, TDBF_PFSYNC); mtx_leave(&t->tdb_mtx); } - mtx_leave(&sc->sc_tdb_mtx); } int @@ -1667,7 +1693,7 @@ pfsync_drop(struct pfsync_softc *sc) struct pfsync_snapshot sn; pfsync_grab_snapshot(&sn, sc); - pfsync_drop_snapshot(&sn, sc); + pfsync_drop_snapshot(&sn); } void @@ -1759,7 +1785,7 @@ pfsync_sendout(void) if (m == NULL) { sc->sc_if.if_oerrors++; pfsyncstat_inc(pfsyncs_onomem); - pfsync_drop_snapshot(&sn, sc); + pfsync_drop_snapshot(&sn); return; } @@ -1769,7 +1795,7 @@ pfsync_sendout(void) m_free(m); sc->sc_if.if_oerrors++; pfsyncstat_inc(pfsyncs_onomem); - 
pfsync_drop_snapshot(&sn, sc); + pfsync_drop_snapshot(&sn); return; } } @@ -1799,7 +1825,7 @@ pfsync_sendout(void) count = 0; while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry); + TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap); bcopy(&ur->ur_msg, m->m_data + offset, sizeof(ur->ur_msg)); @@ -1827,18 +1853,20 @@ pfsync_sendout(void) subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); - mtx_enter(&sc->sc_tdb_mtx); count = 0; while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) { - TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry); + TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap); pfsync_out_tdb(t, m->m_data + offset); offset += sizeof(struct pfsync_tdb); mtx_enter(&t->tdb_mtx); +#ifdef PFSYNC_DEBUG + KASSERT(t->tdb_snapped == 1); +#endif + t->tdb_snapped = 0; CLR(t->tdb_flags, TDBF_PFSYNC); mtx_leave(&t->tdb_mtx); count++; } - mtx_leave(&sc->sc_tdb_mtx); bzero(subh, sizeof(*subh)); subh->action = PFSYNC_ACT_TDB; @@ -1856,11 +1884,13 @@ pfsync_sendout(void) count = 0; while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) { - TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list); + TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap); #ifdef PFSYNC_DEBUG KASSERT(st->sync_state == q); + KASSERT(st->snapped == 1); #endif st->sync_state = PFSYNC_S_NONE; + st->snapped = 0; pfsync_qs[q].write(st, m->m_data + offset); offset += pfsync_qs[q].len; @@ -2411,8 +2441,9 @@ pfsync_q_ins(struct pf_state *st, int q) mtx_enter(&sc->sc_st_mtx); /* - * If two threads are competing to insert the same state, then - * there must be just single winner. + * There are either two threads trying to update + * the same state, or the state is just being processed + * (is on the snapshot queue). 
*/ if (st->sync_state != PFSYNC_S_NONE) { mtx_leave(&sc->sc_st_mtx); @@ -2450,6 +2481,15 @@ pfsync_q_del(struct pf_state *st) mtx_enter(&sc->sc_st_mtx); q = st->sync_state; + /* + * Re-check under the mutex: if the state is snapped already, + * just bail out, because we came too late; the state is just + * being processed/dispatched to the peer. + */ + if ((q == PFSYNC_S_NONE) || (st->snapped)) { + mtx_leave(&sc->sc_st_mtx); + return; + } atomic_sub_long(&sc->sc_len, pfsync_qs[q].len); TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); if (TAILQ_EMPTY(&sc->sc_qs[q])) @@ -2525,7 +2565,17 @@ pfsync_delete_tdb(struct tdb *t) mtx_enter(&sc->sc_tdb_mtx); + /* + * If the tdb entry is just being processed (found in a snapshot), + * then it cannot be deleted; we just came too late. + */ + if (t->tdb_snapped) { + mtx_leave(&sc->sc_tdb_mtx); + return; + } + TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry); + mtx_enter(&t->tdb_mtx); CLR(t->tdb_flags, TDBF_PFSYNC); mtx_leave(&t->tdb_mtx); diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index bd7ec1d88e7..558618a0f14 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -749,6 +749,7 @@ struct pf_state { u_int8_t pad[3]; TAILQ_ENTRY(pf_state) sync_list; + TAILQ_ENTRY(pf_state) sync_snap; TAILQ_ENTRY(pf_state) entry_list; SLIST_ENTRY(pf_state) gc_list; RB_ENTRY(pf_state) entry_id; @@ -797,6 +798,7 @@ struct pf_state { pf_refcnt_t refcnt; u_int16_t delay; u_int8_t rt; + u_int8_t snapped; }; /* diff --git a/sys/netinet/ip_ipsp.h b/sys/netinet/ip_ipsp.h index c697994047b..ebdb6ada1ae 100644 --- a/sys/netinet/ip_ipsp.h +++ b/sys/netinet/ip_ipsp.h @@ -405,6 +405,7 @@ struct tdb { /* tunnel descriptor block */ u_int8_t tdb_wnd; /* Replay window */ u_int8_t tdb_satype; /* SA type (RFC2367, PF_KEY) */ u_int8_t tdb_updates; /* pfsync update counter */ + u_int8_t tdb_snapped; /* dispatched by pfsync(4) */ union sockaddr_union tdb_dst; /* [N] Destination address */ union sockaddr_union tdb_src; /* [N] Source address */ @@ -439,6 +440,7 @@ struct tdb { /* tunnel 
descriptor block */ TAILQ_HEAD(tdb_policy_head, ipsec_policy) tdb_policy_head; /* [p] */ TAILQ_ENTRY(tdb) tdb_sync_entry; + TAILQ_ENTRY(tdb) tdb_sync_snap; }; enum tdb_counters {