oh, not again... I'm sorry for not attaching the patch to the first email.
On Tue, May 30, 2017 at 02:34:32PM +0200, Alexandr Nedvedicky wrote:
> Hello,
>
> the patch delivers two changes to PF:
>
> it adds PF_LOCK() et al. At the moment PF_LOCK() more or less
> duplicates the current NET_LOCK(). It essentially synchronizes packet
> processing with ioctl(2) and with the timer thread, which purges
> states. Future work is going to break PF_LOCK into smaller locks,
> each of which will protect a relevant part of PF. Think of
> pf_state_lock, pf_rule_lock, ...
>
> The other change introduced here is a mutex for the IP reassembly
> done by PF. The mutex synchronizes fragmented packets with the timer
> thread, which expires incomplete packets from the fragment cache.
>
> O.K.?
>
> thanks and
> regards
> sasha
> --------8<---------------8<---------------8<------------------8<--------
diff -r 21414694ee7a .hgtags
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgtags	Tue May 30 14:25:01 2017 +0200
@@ -0,0 +1,1 @@
+d545881e2652dbc0c057691a39a095bce92f441f pf-lock.baseline
diff -r 21414694ee7a src/sys/net/pf.c
--- a/src/sys/net/pf.c	Tue May 30 10:55:41 2017 +0200
+++ b/src/sys/net/pf.c	Tue May 30 14:25:01 2017 +0200
@@ -923,7 +923,7 @@
 pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw,
     struct pf_state_key **sks, struct pf_state *s)
 {
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	s->kif = kif;
 	if (*skw == *sks) {
@@ -1186,7 +1186,7 @@
 {
 	struct pf_rule		*r;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	if (SLIST_EMPTY(&pf_rule_gcl))
 		return;
@@ -1207,6 +1207,7 @@
 		tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
 
 		NET_LOCK(s);
+		PF_LOCK(s);
 
 		/* process a fraction of the state table every second */
 		pf_purge_expired_states(1 + (pf_status.states
@@ -1214,13 +1215,20 @@
 
 		/* purge other expired types every PFTM_INTERVAL seconds */
 		if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
-			pf_purge_expired_fragments();
 			pf_purge_expired_src_nodes(0);
 			pf_purge_expired_rules();
 			nloops = 0;
 		}
 
+		PF_UNLOCK(s);
 		NET_UNLOCK(s);
+
+		/*
+		 * Fragments don't require PF_LOCK(), they use their own mutex.
+		 */
+		if (nloops >= pf_default_rule.timeout[PFTM_INTERVAL])
+			pf_purge_expired_fragments();
+
 	}
 }
 
@@ -1267,7 +1275,7 @@
 {
 	struct pf_src_node	*cur, *next;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	for (cur = RB_MIN(pf_src_tree, &tree_src_tracking); cur; cur = next) {
 		next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur);
@@ -1303,7 +1311,7 @@
 void
 pf_remove_state(struct pf_state *cur)
 {
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	/* handle load balancing related tasks */
 	pf_postprocess_addr(cur);
@@ -1322,9 +1330,17 @@
 #if NPFLOW > 0
 	if (cur->state_flags & PFSTATE_PFLOW) {
 		/* XXXSMP breaks atomicity */
+		/*
+		 * The only place that frees states is pf_purge_thread(),
+		 * and it frees only states already marked PFTM_UNLINKED.
+		 * This state will therefore still be around once we
+		 * re-acquire the netlock.
+		 */
+		rw_exit_write(&pf_lock);
 		rw_exit_write(&netlock);
 		export_pflow(cur);
 		rw_enter_write(&netlock);
+		rw_enter_write(&pf_lock);
 	}
 #endif /* NPFLOW > 0 */
 #if NPFSYNC > 0
@@ -1354,7 +1370,7 @@
 {
 	struct pf_rule_item	*ri;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 #if NPFSYNC > 0
 	if (pfsync_state_in_use(cur))
@@ -1390,7 +1406,7 @@
 	static struct pf_state	*cur = NULL;
 	struct pf_state		*next;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	while (maxcheck--) {
 		/* wrap to start of list when we hit the end */
@@ -3142,13 +3158,13 @@
 	case IPPROTO_TCP:
 		sport = pd->hdr.tcp.th_sport;
 		dport = pd->hdr.tcp.th_dport;
-		NET_ASSERT_LOCKED();
+		PF_ASSERT_LOCKED();
 		tb = &tcbtable;
 		break;
 	case IPPROTO_UDP:
 		sport = pd->hdr.udp.uh_sport;
 		dport = pd->hdr.udp.uh_dport;
-		NET_ASSERT_LOCKED();
+		PF_ASSERT_LOCKED();
 		tb = &udbtable;
 		break;
 	default:
@@ -6611,6 +6627,7 @@
 	struct pf_pdesc		 pd;
 	int			 dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
 	u_int32_t		 qid, pqid = 0;
+	int			 spl;
 
 	if (!pf_status.running)
 		return (PF_PASS);
@@ -6669,6 +6686,7 @@
 		/* if packet sits in reassembly queue, return without error */
 		if (pd.m == NULL)
 			return PF_PASS;
+
 		if (action != PF_PASS) {
 #if NPFLOG > 0
 			pd.pflog |= PF_LOG_FORCE;
@@ -6688,6 +6706,9 @@
 	}
 	pd.m->m_pkthdr.pf.flags |= PF_TAG_PROCESSED;
 
+	/* lock the look-up/write section of pf_test() */
+	PF_LOCK(spl);
+
 	switch (pd.virtual_proto) {
 
 	case PF_VPROTO_FRAGMENT: {
@@ -6707,7 +6728,7 @@
 			REASON_SET(&reason, PFRES_NORM);
 			DPFPRINTF(LOG_NOTICE,
 			    "dropping IPv6 packet with ICMPv4 payload");
-			goto done;
+			goto unlock;
 		}
 		action = pf_test_state_icmp(&pd, &s, &reason);
 		if (action == PF_PASS || action == PF_AFRT) {
@@ -6732,7 +6753,7 @@
 			REASON_SET(&reason, PFRES_NORM);
 			DPFPRINTF(LOG_NOTICE,
 			    "dropping IPv4 packet with ICMPv6 payload");
-			goto done;
+			goto unlock;
 		}
 		action = pf_test_state_icmp(&pd, &s, &reason);
 		if (action == PF_PASS || action == PF_AFRT) {
@@ -6756,8 +6777,9 @@
 			if ((pd.hdr.tcp.th_flags & TH_ACK) && pd.p_len == 0)
 				pqid = 1;
 			action = pf_normalize_tcp(&pd);
-			if (action == PF_DROP)
-				goto done;
+			if (action == PF_DROP) {
+				goto unlock;
+			}
 		}
 		action = pf_test_state(&pd, &s, &reason);
 		if (action == PF_PASS || action == PF_AFRT) {
@@ -6784,6 +6806,16 @@
 		break;
 	}
 
+unlock:
+	PF_UNLOCK(spl);
+	/*
+	 * We've just left the look-up section of pf_test(). The code further
+	 * down assumes that all objects (state, rule, anchor, ...) will stay
+	 * around. That is a fair assumption as long as NET_LOCK() keeps the
+	 * items collected above from being removed. Once NET_LOCK() is gone,
+	 * we must have reference counting in place.
+	 */
+
 done:
 	if (action != PF_DROP) {
 		if (s) {
@@ -6987,6 +7019,7 @@
 	}
 
 	*m0 = pd.m;
+
 	return (action);
 }
 
diff -r 21414694ee7a src/sys/net/pf_ioctl.c
--- a/src/sys/net/pf_ioctl.c	Tue May 30 10:55:41 2017 +0200
+++ b/src/sys/net/pf_ioctl.c	Tue May 30 14:25:01 2017 +0200
@@ -129,6 +129,8 @@
 TAILQ_HEAD(pf_tags, pf_tagname)	pf_tags = TAILQ_HEAD_INITIALIZER(pf_tags),
 				pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids);
 
+struct rwlock		 pf_lock = RWLOCK_INITIALIZER("pf_lock");
+
 #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
 #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
 #endif
@@ -999,6 +1001,7 @@
 	}
 
 	NET_LOCK(s);
+	PF_LOCK(s);
 	switch (cmd) {
 
 	case DIOCSTART:
@@ -2452,6 +2455,7 @@
 		break;
 	}
 fail:
+	PF_UNLOCK(s);
 	NET_UNLOCK(s);
 	return (error);
 }
diff -r 21414694ee7a src/sys/net/pf_norm.c
--- a/src/sys/net/pf_norm.c	Tue May 30 10:55:41 2017 +0200
+++ b/src/sys/net/pf_norm.c	Tue May 30 14:25:01 2017 +0200
@@ -39,6 +39,7 @@
 #include <sys/time.h>
 #include <sys/pool.h>
 #include <sys/syslog.h>
+#include <sys/mutex.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
@@ -135,6 +136,12 @@
 struct pool		 pf_state_scrub_pl;
 int			 pf_nfrents;
 
+struct mutex		 pf_frag_mtx;
+
+#define PF_FRAG_MTX_INIT()	mtx_init(&pf_frag_mtx, IPL_SOFTNET)
+#define PF_FRAG_MTX_ENTER()	mtx_enter(&pf_frag_mtx)
+#define PF_FRAG_MTX_LEAVE()	mtx_leave(&pf_frag_mtx)
+
 void
 pf_normalize_init(void)
 {
@@ -149,6 +156,8 @@
 	pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
 
 	TAILQ_INIT(&pf_fragqueue);
+
+	PF_FRAG_MTX_INIT();
 }
 
 static __inline int
@@ -176,15 +185,18 @@
 	struct pf_fragment	*frag;
 	int32_t			 expire;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_UNLOCKED();
 
 	expire = time_uptime - pf_default_rule.timeout[PFTM_FRAG];
+
+	PF_FRAG_MTX_ENTER();
 
 	while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
 		if (frag->fr_timeout > expire)
 			break;
 
 		DPFPRINTF(LOG_NOTICE, "expiring %d(%p)", frag->fr_id, frag);
 		pf_free_fragment(frag);
 	}
+	PF_FRAG_MTX_LEAVE();
 }
@@ -533,14 +545,19 @@
 	key.fr_id = ip->ip_id;
 	key.fr_direction = dir;
 
-	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
+	PF_FRAG_MTX_ENTER();
+	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+		PF_FRAG_MTX_LEAVE();
 		return (PF_DROP);
+	}
 
 	/* The mbuf is part of the fragment entry, no direct free or access */
 	m = *m0 = NULL;
 
-	if (!pf_isfull_fragment(frag))
+	if (!pf_isfull_fragment(frag)) {
+		PF_FRAG_MTX_LEAVE();
 		return (PF_PASS);	/* drop because *m0 is NULL, no error */
+	}
 
 	/* We have all the data */
 	frent = TAILQ_FIRST(&frag->fr_queue);
@@ -564,6 +581,7 @@
 	ip->ip_off &= ~(IP_MF|IP_OFFMASK);
 
 	if (hdrlen + total > IP_MAXPACKET) {
+		PF_FRAG_MTX_LEAVE();
 		DPFPRINTF(LOG_NOTICE, "drop: too big: %d", total);
 		ip->ip_len = 0;
 		REASON_SET(reason, PFRES_SHORT);
@@ -572,6 +590,7 @@
 	}
 
 	DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip->ip_len));
+	PF_FRAG_MTX_LEAVE();
 	return (PF_PASS);
 }
 
@@ -610,14 +629,19 @@
 	key.fr_id = fraghdr->ip6f_ident;
 	key.fr_direction = dir;
 
-	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
+	PF_FRAG_MTX_ENTER();
+	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+		PF_FRAG_MTX_LEAVE();
 		return (PF_DROP);
+	}
 
 	/* The mbuf is part of the fragment entry, no direct free or access */
 	m = *m0 = NULL;
 
-	if (!pf_isfull_fragment(frag))
+	if (!pf_isfull_fragment(frag)) {
+		PF_FRAG_MTX_LEAVE();
 		return (PF_PASS);	/* drop because *m0 is NULL, no error */
+	}
 
 	/* We have all the data */
 	extoff = frent->fe_extoff;
@@ -671,17 +695,20 @@
 	ip6->ip6_nxt = proto;
 
 	if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
+		PF_FRAG_MTX_LEAVE();
 		DPFPRINTF(LOG_NOTICE, "drop: too big: %d", total);
 		ip6->ip6_plen = 0;
 		REASON_SET(reason, PFRES_SHORT);
 		/* PF_DROP requires a valid mbuf *m0 in pf_test6() */
 		return (PF_DROP);
 	}
 
+	PF_FRAG_MTX_LEAVE();
 	DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip6->ip6_plen));
 	return (PF_PASS);
 
 fail:
+	PF_FRAG_MTX_LEAVE();
 	REASON_SET(reason, PFRES_MEMORY);
 	/* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later */
 	return (PF_DROP);
diff -r 21414694ee7a src/sys/net/pfvar_priv.h
--- a/src/sys/net/pfvar_priv.h	Tue May 30 10:55:41 2017 +0200
+++ b/src/sys/net/pfvar_priv.h	Tue May 30 14:25:01 2017 +0200
@@ -37,6 +37,10 @@
 
 #ifdef _KERNEL
 
+#include <sys/rwlock.h>
+
+extern struct rwlock		 pf_lock;
+
 struct pf_pdesc {
 	struct {
 		int	 done;
@@ -94,6 +98,32 @@
 	} hdr;
 };
 
+extern struct rwlock		 pf_lock;
+
+#define PF_LOCK(s)		do {			\
+		NET_ASSERT_LOCKED();			\
+		rw_enter_write(&pf_lock);		\
+		s = splsoftnet();			\
+	} while (0)
+
+#define PF_UNLOCK(s)		do {			\
+		PF_ASSERT_LOCKED();			\
+		splx(s);				\
+		rw_exit_write(&pf_lock);		\
+	} while (0)
+
+#define PF_ASSERT_LOCKED()	do {			\
+		if (rw_status(&pf_lock) != RW_WRITE)	\
+			splassert_fail(RW_WRITE,	\
+			    rw_status(&pf_lock), __func__);\
+		splsoftassert(IPL_SOFTNET);		\
+	} while (0)
+
+#define PF_ASSERT_UNLOCKED()	do {			\
+		if (rw_status(&pf_lock) == RW_WRITE)	\
+			splassert_fail(0, rw_status(&pf_lock), __func__);\
+	} while (0)
+
 #endif /* _KERNEL */
 
 #endif /* _NET_PFVAR_PRIV_H_ */
--------8<---------------8<---------------8<------------------8<--------
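
P.S. for readers who want the intended locking hierarchy at a glance
without walking the whole diff: below is a small userland sketch of
mine (NOT kernel code, not part of the diff) that models the same
discipline with pthreads. pf_lock nests strictly inside the netlock,
and the fragment cache is protected only by its own mutex, so the
purge thread can expire fragments without holding pf_lock. Only the
lock names mirror the patch; the pthread primitives and the toy work
functions are made up for the illustration.

	/*
	 * Userland model of the lock hierarchy introduced by the diff:
	 * pf_lock is always taken with the netlock already held, while
	 * the fragment cache is serialized by pf_frag_mtx alone.
	 */
	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	static pthread_rwlock_t netlock = PTHREAD_RWLOCK_INITIALIZER;
	static pthread_rwlock_t pf_lock = PTHREAD_RWLOCK_INITIALIZER;
	static pthread_mutex_t  pf_frag_mtx = PTHREAD_MUTEX_INITIALIZER;

	static int pf_nfragments;	/* toy stand-in for the fragment cache */

	/* packet/ioctl path: netlock first, then pf_lock */
	static void
	pf_test_model(void)
	{
		pthread_rwlock_wrlock(&netlock);
		pthread_rwlock_wrlock(&pf_lock);	/* PF_LOCK() */
		/* state/rule look-up and insertion happen here */
		pthread_rwlock_unlock(&pf_lock);	/* PF_UNLOCK() */
		pthread_rwlock_unlock(&netlock);
	}

	/* reassembly path: the fragment cache only needs its mutex */
	static void
	pf_reassemble_model(void)
	{
		pthread_mutex_lock(&pf_frag_mtx);	/* PF_FRAG_MTX_ENTER() */
		pf_nfragments++;
		pthread_mutex_unlock(&pf_frag_mtx);	/* PF_FRAG_MTX_LEAVE() */
	}

	/* purge thread: states under pf_lock, fragments under the mutex */
	static void *
	pf_purge_thread_model(void *arg)
	{
		(void)arg;
		for (int i = 0; i < 3; i++) {
			pthread_rwlock_wrlock(&netlock);
			pthread_rwlock_wrlock(&pf_lock);
			/* expire states, src nodes, rules here */
			pthread_rwlock_unlock(&pf_lock);
			pthread_rwlock_unlock(&netlock);

			/* fragments don't need pf_lock, only their mutex */
			pthread_mutex_lock(&pf_frag_mtx);
			pf_nfragments = 0;
			pthread_mutex_unlock(&pf_frag_mtx);

			usleep(1000);
		}
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, pf_purge_thread_model, NULL);
		pf_reassemble_model();
		pf_test_model();
		pthread_join(t, NULL);
		printf("fragments left: %d\n", pf_nfragments);
		return (0);
	}

It builds with "cc -pthread" (pick any file name); it only demonstrates
the ordering rules and does not try to model splsoftnet(), sleeping, or
anything else the kernel side has to care about.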