Hello, below is a diff which hopes to fix the issue. Although diff is fairly large the change itself is kind of straightforward. Let me briefly explain what's going on here. Diff introduces a mutex to pf_state, which protects array of keys (pf_state::key) bound to state.
The panic which diff below hopes to fix is caused by a race between timer thread, which expires state and pfsync dispatch task, which updates a peer. According to data provided by Hrvoje we panic due to NULL pointer dereference in pf_state_export(), which finds sk->key[] to be NULL. This may happen because purge state mechanism detaches state key from state under protection of PF_STATE_LOCK, while pfsync dispatch task just keeps a reference to state without using a PF_STATE_LOCK to access a state instance. In order to synchronize access to pf_statey::key between purge thread and pfsync dispatch task diff below introduces pf_state::mtx. pfsync uses pf_state::mtx to attempt to grab references to keys bound to state, while purge task uses mtx to safely invalidate state keys in pf_detach_state(). Such change requires pfsync(4) to deal with situation when state got detached while waiting in dispatch queue to update a peer. We have to a .write() operation on sync-queue to indicate a failure so pfsync_sendout() will just skip the state when processing dispatch queue. Also diff changes pf_state_key_detach() such caller must pass pointer to state key instead of key index to be detached from state. It also requires caller to invalidate a state key entry in pf_state::key member. I've just smoked tested the diff _without_ pfsync. thanks and regards sashan --------8<---------------8<---------------8<------------------8<-------- diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c index c78ca62766e..02d45673e20 100644 --- a/sys/net/if_pfsync.c +++ b/sys/net/if_pfsync.c @@ -157,16 +157,16 @@ struct { }; struct pfsync_q { - void (*write)(struct pf_state *, void *); + int (*write)(struct pf_state *, void *); size_t len; u_int8_t action; }; /* we have one of these for every PFSYNC_S_ */ -void pfsync_out_state(struct pf_state *, void *); -void pfsync_out_iack(struct pf_state *, void *); -void pfsync_out_upd_c(struct pf_state *, void *); -void pfsync_out_del(struct pf_state *, void *); +int pfsync_out_state(struct pf_state *, void *); +int pfsync_out_iack(struct pf_state *, void *); +int pfsync_out_upd_c(struct pf_state *, void *); +int pfsync_out_del(struct pf_state *, void *); struct pfsync_q pfsync_qs[] = { { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, @@ -516,10 +516,10 @@ pfsync_alloc_scrub_memory(struct pfsync_state_peer *s, return (0); } -void +int pfsync_state_export(struct pfsync_state *sp, struct pf_state *st) { - pf_state_export(sp, st); + return (pf_state_export(sp, st)); } int @@ -1529,24 +1529,25 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) return (0); } -void +int pfsync_out_state(struct pf_state *st, void *buf) { struct pfsync_state *sp = buf; - pfsync_state_export(sp, st); + return (pfsync_state_export(sp, st)); } -void +int pfsync_out_iack(struct pf_state *st, void *buf) { struct pfsync_ins_ack *iack = buf; iack->id = st->id; iack->creatorid = st->creatorid; + return (0); } -void +int pfsync_out_upd_c(struct pf_state *st, void *buf) { struct pfsync_upd_c *up = buf; @@ -1557,9 +1558,10 @@ pfsync_out_upd_c(struct pf_state *st, void *buf) pf_state_peer_hton(&st->dst, &up->dst); up->creatorid = st->creatorid; up->timeout = st->timeout; + return (0); } -void +int pfsync_out_del(struct pf_state *st, void *buf) { struct pfsync_del_c *dp = buf; @@ -1568,6 +1570,7 @@ pfsync_out_del(struct pf_state *st, void *buf) dp->creatorid = st->creatorid; SET(st->state_flags, PFSTATE_NOSYNC); + return (0); } void @@ -1881,8 +1884,8 @@ pfsync_sendout(void) KASSERT(st->snapped == 1); st->sync_state = PFSYNC_S_NONE; st->snapped = 0; - pfsync_qs[q].write(st, m->m_data + offset); - offset += pfsync_qs[q].len; + if (pfsync_qs[q].write(st, m->m_data + offset) == 0) + offset += pfsync_qs[q].len; pf_state_unref(st); count++; diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h index bee6c77f228..4440a561854 100644 --- a/sys/net/if_pfsync.h +++ b/sys/net/if_pfsync.h @@ -326,7 +326,7 @@ int pfsync_sysctl(int *, u_int, void *, size_t *, #define PFSYNC_SI_CKSUM 0x02 #define PFSYNC_SI_ACK 0x04 int pfsync_state_import(struct pfsync_state *, int); -void pfsync_state_export(struct pfsync_state *, +int pfsync_state_export(struct pfsync_state *, struct pf_state *); void pfsync_insert_state(struct pf_state *); diff --git a/sys/net/pf.c b/sys/net/pf.c index 93fe5702625..c76f4c71263 100644 --- a/sys/net/pf.c +++ b/sys/net/pf.c @@ -185,7 +185,8 @@ int pf_translate_icmp_af(struct pf_pdesc*, int, void *); void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, int, sa_family_t, struct pf_rule *, u_int); void pf_detach_state(struct pf_state *); -void pf_state_key_detach(struct pf_state *, int); +void pf_state_key_detach(struct pf_state *, + struct pf_state_key *); u_int32_t pf_tcp_iss(struct pf_pdesc *); void pf_rule_to_actions(struct pf_rule *, struct pf_rule_actions *); @@ -260,6 +261,9 @@ void pf_state_key_unlink_inpcb(struct pf_state_key *); void pf_inpcb_unlink_state_key(struct inpcb *); void pf_pktenqueue_delayed(void *); int32_t pf_state_expires(const struct pf_state *, uint8_t); +void pf_state_keys_take(struct pf_state *, + struct pf_state_key **); +void pf_state_keys_rele(struct pf_state_key **); #if NPFLOG > 0 void pf_log_matches(struct pf_pdesc *, struct pf_rule *, @@ -778,7 +782,8 @@ pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx) s->key[idx] = sk; if ((si = pool_get(&pf_state_item_pl, PR_NOWAIT)) == NULL) { - pf_state_key_detach(s, idx); + pf_state_key_detach(s, s->key[idx]); + s->key[idx] = NULL; return (-1); } si->s = s; @@ -798,42 +803,50 @@ pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx) void pf_detach_state(struct pf_state *s) { - if (s->key[PF_SK_WIRE] == s->key[PF_SK_STACK]) - s->key[PF_SK_WIRE] = NULL; + struct pf_state_key *key[2]; + + mtx_enter(&s->mtx); + key[PF_SK_WIRE] = s->key[PF_SK_WIRE]; + key[PF_SK_STACK] = s->key[PF_SK_STACK]; + s->key[PF_SK_WIRE] = NULL; + s->key[PF_SK_STACK] = NULL; + mtx_leave(&s->mtx); + + if (key[PF_SK_WIRE] == key[PF_SK_STACK]) + key[PF_SK_WIRE] = NULL; - if (s->key[PF_SK_STACK] != NULL) - pf_state_key_detach(s, PF_SK_STACK); + if (key[PF_SK_STACK] != NULL) + pf_state_key_detach(s, key[PF_SK_STACK]); - if (s->key[PF_SK_WIRE] != NULL) - pf_state_key_detach(s, PF_SK_WIRE); + if (key[PF_SK_WIRE] != NULL) + pf_state_key_detach(s, key[PF_SK_WIRE]); } void -pf_state_key_detach(struct pf_state *s, int idx) +pf_state_key_detach(struct pf_state *s, struct pf_state_key *key) { struct pf_state_item *si; - struct pf_state_key *sk; - if (s->key[idx] == NULL) + PF_STATE_ASSERT_LOCKED(); + + if (key == NULL) return; - si = TAILQ_FIRST(&s->key[idx]->states); + si = TAILQ_FIRST(&key->states); while (si && si->s != s) si = TAILQ_NEXT(si, entry); if (si) { - TAILQ_REMOVE(&s->key[idx]->states, si, entry); + TAILQ_REMOVE(&key->states, si, entry); pool_put(&pf_state_item_pl, si); } - sk = s->key[idx]; - s->key[idx] = NULL; - if (TAILQ_EMPTY(&sk->states)) { - RB_REMOVE(pf_state_tree, &pf_statetbl, sk); - sk->removed = 1; - pf_state_key_unlink_reverse(sk); - pf_state_key_unlink_inpcb(sk); - pf_state_key_unref(sk); + if (TAILQ_EMPTY(&key->states)) { + RB_REMOVE(pf_state_tree, &pf_statetbl, key); + key->removed = 1; + pf_state_key_unlink_reverse(key); + pf_state_key_unlink_inpcb(key); + pf_state_key_unref(key); } } @@ -996,7 +1009,9 @@ pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw, } *skw = s->key[PF_SK_WIRE]; if (pf_state_key_attach(*sks, s, PF_SK_STACK)) { - pf_state_key_detach(s, PF_SK_WIRE); + pf_state_key_detach(s, s->key[PF_SK_WIRE]); + s->key[PF_SK_WIRE] = NULL; + *skw = NULL; PF_STATE_EXIT_WRITE(); return (-1); } @@ -1185,30 +1200,35 @@ pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more) return (ret ? ret->s : NULL); } -void +int pf_state_export(struct pfsync_state *sp, struct pf_state *st) { int32_t expire; + struct pf_state_key *key[2] = { NULL, NULL }; memset(sp, 0, sizeof(struct pfsync_state)); /* copy from state key */ - sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0]; - sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1]; - sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0]; - sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1]; - sp->key[PF_SK_WIRE].rdomain = htons(st->key[PF_SK_WIRE]->rdomain); - sp->key[PF_SK_WIRE].af = st->key[PF_SK_WIRE]->af; - sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0]; - sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1]; - sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0]; - sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1]; - sp->key[PF_SK_STACK].rdomain = htons(st->key[PF_SK_STACK]->rdomain); - sp->key[PF_SK_STACK].af = st->key[PF_SK_STACK]->af; + pf_state_keys_take(st, key); + if ((key[PF_SK_WIRE] == NULL) || (key[PF_SK_STACK] == NULL)) + return (-1); + + sp->key[PF_SK_WIRE].addr[0] = key[PF_SK_WIRE]->addr[0]; + sp->key[PF_SK_WIRE].addr[1] = key[PF_SK_WIRE]->addr[1]; + sp->key[PF_SK_WIRE].port[0] = key[PF_SK_WIRE]->port[0]; + sp->key[PF_SK_WIRE].port[1] = key[PF_SK_WIRE]->port[1]; + sp->key[PF_SK_WIRE].rdomain = htons(key[PF_SK_WIRE]->rdomain); + sp->key[PF_SK_WIRE].af = key[PF_SK_WIRE]->af; + sp->key[PF_SK_STACK].addr[0] = key[PF_SK_STACK]->addr[0]; + sp->key[PF_SK_STACK].addr[1] = key[PF_SK_STACK]->addr[1]; + sp->key[PF_SK_STACK].port[0] = key[PF_SK_STACK]->port[0]; + sp->key[PF_SK_STACK].port[1] = key[PF_SK_STACK]->port[1]; + sp->key[PF_SK_STACK].rdomain = htons(key[PF_SK_STACK]->rdomain); + sp->key[PF_SK_STACK].af = key[PF_SK_STACK]->af; sp->rtableid[PF_SK_WIRE] = htonl(st->rtableid[PF_SK_WIRE]); sp->rtableid[PF_SK_STACK] = htonl(st->rtableid[PF_SK_STACK]); - sp->proto = st->key[PF_SK_WIRE]->proto; - sp->af = st->key[PF_SK_WIRE]->af; + sp->proto = key[PF_SK_WIRE]->proto; + sp->af = key[PF_SK_WIRE]->af; /* copy from state */ strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname)); @@ -1255,6 +1275,10 @@ pf_state_export(struct pfsync_state *sp, struct pf_state *st) sp->set_tos = st->set_tos; sp->set_prio[0] = st->set_prio[0]; sp->set_prio[1] = st->set_prio[1]; + + pf_state_keys_rele(key); + + return (0); } /* END state table stuff */ @@ -4108,6 +4132,7 @@ pf_create_state(struct pf_pdesc *pd, struct pf_rule *r, struct pf_rule *a, * pf_state_inserts() grabs reference for pfsync! */ refcnt_init(&s->refcnt); + mtx_init(&s->mtx, IPL_NET); switch (pd->proto) { case IPPROTO_TCP: @@ -7789,3 +7814,19 @@ pf_pktenqueue_delayed(void *arg) pool_put(&pf_pktdelay_pl, pdy); } + +void +pf_state_keys_take(struct pf_state *st, struct pf_state_key *keys[]) +{ + mtx_enter(&st->mtx); + keys[PF_SK_WIRE] = pf_state_key_ref(st->key[PF_SK_WIRE]); + keys[PF_SK_STACK] = pf_state_key_ref(st->key[PF_SK_STACK]); + mtx_leave(&st->mtx); +} + +void +pf_state_keys_rele(struct pf_state_key *keys[]) +{ + pf_state_key_unref(keys[PF_SK_WIRE]); + pf_state_key_unref(keys[PF_SK_STACK]); +} diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h index 93394e6da5c..4604869ede3 100644 --- a/sys/net/pfvar.h +++ b/sys/net/pfvar.h @@ -763,6 +763,7 @@ struct pf_state { struct pf_sn_head src_nodes; struct pf_state_key *key[2]; /* addresses stack and wire */ struct pfi_kif *kif; + struct mutex mtx; u_int64_t packets[2]; u_int64_t bytes[2]; int32_t creation; @@ -1737,7 +1738,7 @@ void pf_state_rm_src_node(struct pf_state *, extern struct pf_state *pf_find_state_byid(struct pf_state_cmp *); extern struct pf_state *pf_find_state_all(struct pf_state_key_cmp *, u_int, int *); -extern void pf_state_export(struct pfsync_state *, +extern int pf_state_export(struct pfsync_state *, struct pf_state *); extern void pf_print_state(struct pf_state *); extern void pf_print_flags(u_int8_t);