Hello,

</snip>
>
> Could you make 2 definitions for the lock? It doesn't make sense
> to enable them by default for now. I'd like to see your diff committed
> now with empty defines and an easy way to enable it.
>
> That means ok mpi@ if the defines do not take/release locks by default.
>
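In other words, the request is for the usual compile-time shim: when the
option is off, the lock macros expand to no-ops and the kernel behaves
exactly as it does today. A rough sketch of that pattern only (simplified,
without the assertions; the real definitions are in the diffs below):

	#ifdef WITH_PF_LOCK
	extern struct rwlock	pf_lock;
	#define PF_LOCK()	rw_enter_write(&pf_lock)
	#define PF_UNLOCK()	rw_exit_write(&pf_lock)
	#else	/* !WITH_PF_LOCK */
	#define PF_LOCK()	(void)(0)
	#define PF_UNLOCK()	(void)(0)
	#endif	/* WITH_PF_LOCK */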
I introduce a WITH_PF_LOCK compile-time option. To enable PF_LOCK, one has to
add 'option WITH_PF_LOCK' to the kernel build configuration (see config(8)
for details). PF_LOCK is disabled by default. The same goes for the mutex
that protects the fragments: if WITH_PF_LOCK is not defined, the code builds
without pf_frag_mtx.

This is the diff against the pf-lock.diff I sent earlier [1]; the updated
diff I'm going to commit is attached to this email.

--------8<---------------8<---------------8<------------------8<--------
diff -r cf7bdb0a6054 src/sys/net/pf_ioctl.c
--- a/src/sys/net/pf_ioctl.c	Tue May 30 20:27:39 2017 +0200
+++ b/src/sys/net/pf_ioctl.c	Wed May 31 09:40:33 2017 +0200
@@ -129,7 +129,9 @@ struct {
 TAILQ_HEAD(pf_tags, pf_tagname)	pf_tags = TAILQ_HEAD_INITIALIZER(pf_tags),
 				pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids);
 
+#ifdef WITH_PF_LOCK
 struct rwlock		 pf_lock = RWLOCK_INITIALIZER("pf_lock");
+#endif /* WITH_PF_LOCK */
 
 #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
 #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
diff -r cf7bdb0a6054 src/sys/net/pf_norm.c
--- a/src/sys/net/pf_norm.c	Tue May 30 20:27:39 2017 +0200
+++ b/src/sys/net/pf_norm.c	Wed May 31 09:40:33 2017 +0200
@@ -136,11 +136,17 @@ struct pool		 pf_frent_pl, pf_frag_pl;
 struct pool		 pf_state_scrub_pl;
 int			 pf_nfrents;
 
+#ifdef WITH_PF_LOCK
 struct mutex		 pf_frag_mtx;
 
 #define PF_FRAG_LOCK_INIT()	mtx_init(&pf_frag_mtx, IPL_SOFTNET)
 #define PF_FRAG_LOCK()		mtx_enter(&pf_frag_mtx)
 #define PF_FRAG_UNLOCK()	mtx_leave(&pf_frag_mtx)
+#else /* !WITH_PF_LOCK */
+#define PF_FRAG_LOCK_INIT()	(void)(0)
+#define PF_FRAG_LOCK()		(void)(0)
+#define PF_FRAG_UNLOCK()	(void)(0)
+#endif /* WITH_PF_LOCK */
 
 void
 pf_normalize_init(void)
diff -r cf7bdb0a6054 src/sys/net/pfvar_priv.h
--- a/src/sys/net/pfvar_priv.h	Tue May 30 20:27:39 2017 +0200
+++ b/src/sys/net/pfvar_priv.h	Wed May 31 09:40:33 2017 +0200
@@ -98,6 +98,7 @@ struct pf_pdesc {
 	} hdr;
 };
 
+#ifdef WITH_PF_LOCK
 extern struct rwlock	pf_lock;
 
 #define PF_LOCK()		do {			\
@@ -120,6 +121,12 @@ extern struct rwlock	pf_lock;
 		if (rw_status(&pf_lock) == RW_WRITE)	\
 			splassert_fail(0, rw_status(&pf_lock), __func__);\
 	} while (0)
+#else /* !WITH_PF_LOCK */
+#define PF_LOCK()		(void)(0)
+#define PF_UNLOCK()		(void)(0)
+#define PF_ASSERT_LOCKED()	(void)(0)
+#define PF_ASSERT_UNLOCKED()	(void)(0)
+#endif /* WITH_PF_LOCK */
 
 #endif /* _KERNEL */
--------8<---------------8<---------------8<------------------8<--------

thanks and
regards
sasha

[1] http://openbsd-archive.7691.n7.nabble.com/let-s-add-PF-LOCK-td319624.html#a319661
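P.S. To spell out the enable step mentioned above: a custom kernel
configuration that turns the lock on could look roughly like this (the file
name is made up, amd64 taken as an example; see config(8)):

	# sys/arch/amd64/conf/CUSTOM.PFLOCK -- illustrative only
	include "arch/amd64/conf/GENERIC"
	option	WITH_PF_LOCK	# take/release pf_lock and pf_frag_mtx

then configure and build the kernel the usual way, as described in config(8).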
diff -r 85b6b6ce74cd .hgtags
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgtags	Wed May 31 09:42:36 2017 +0200
@@ -0,0 +1,1 @@
+d545881e2652dbc0c057691a39a095bce92f441f pf-lock.baseline
diff -r 85b6b6ce74cd src/sys/net/pf.c
--- a/src/sys/net/pf.c	Tue May 30 20:11:44 2017 +0200
+++ b/src/sys/net/pf.c	Wed May 31 09:42:36 2017 +0200
@@ -923,7 +923,7 @@ int
 pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw,
     struct pf_state_key **sks, struct pf_state *s)
 {
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	s->kif = kif;
 	if (*skw == *sks) {
@@ -1186,7 +1186,7 @@ pf_purge_expired_rules(void)
 {
 	struct pf_rule	*r;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	if (SLIST_EMPTY(&pf_rule_gcl))
 		return;
@@ -1207,6 +1207,7 @@ pf_purge_thread(void *v)
 		tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
 
 		NET_LOCK(s);
+		PF_LOCK();
 
 		/* process a fraction of the state table every second */
 		pf_purge_expired_states(1 + (pf_status.states
@@ -1214,13 +1215,20 @@ pf_purge_thread(void *v)
 
 		/* purge other expired types every PFTM_INTERVAL seconds */
 		if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
-			pf_purge_expired_fragments();
 			pf_purge_expired_src_nodes(0);
 			pf_purge_expired_rules();
+		}
+
+		PF_UNLOCK();
+		NET_UNLOCK(s);
+
+		/*
+		 * Fragments don't require PF_LOCK(), they use their own mutex.
+		 */
+		if (nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
+			pf_purge_expired_fragments();
 			nloops = 0;
 		}
-
-		NET_UNLOCK(s);
 	}
 }
@@ -1267,7 +1275,7 @@ pf_purge_expired_src_nodes(void)
 {
 	struct pf_src_node		*cur, *next;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	for (cur = RB_MIN(pf_src_tree, &tree_src_tracking); cur; cur = next) {
 		next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur);
@@ -1303,7 +1311,7 @@ pf_src_tree_remove_state(struct pf_state
 void
 pf_remove_state(struct pf_state *cur)
 {
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	/* handle load balancing related tasks */
 	pf_postprocess_addr(cur);
@@ -1320,7 +1328,6 @@ pf_remove_state(struct pf_state *cur)
 	}
 	RB_REMOVE(pf_state_tree_id, &tree_id, cur);
 #if NPFLOW > 0
-	if (cur->state_flags & PFSTATE_PFLOW)
 		export_pflow(cur);
 #endif	/* NPFLOW > 0 */
 #if NPFSYNC > 0
@@ -1350,7 +1357,7 @@ pf_free_state(struct pf_state *cur)
 {
 	struct pf_rule_item *ri;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 #if NPFSYNC > 0
 	if (pfsync_state_in_use(cur))
@@ -1386,7 +1393,7 @@ pf_purge_expired_states(u_int32_t maxche
 	static struct pf_state	*cur = NULL;
 	struct pf_state		*next;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_LOCKED();
 
 	while (maxcheck--) {
 		/* wrap to start of list when we hit the end */
@@ -3146,13 +3153,13 @@ pf_socket_lookup(struct pf_pdesc *pd)
 	case IPPROTO_TCP:
 		sport = pd->hdr.tcp.th_sport;
 		dport = pd->hdr.tcp.th_dport;
-		NET_ASSERT_LOCKED();
+		PF_ASSERT_LOCKED();
 		tb = &tcbtable;
 		break;
 	case IPPROTO_UDP:
 		sport = pd->hdr.udp.uh_sport;
 		dport = pd->hdr.udp.uh_dport;
-		NET_ASSERT_LOCKED();
+		PF_ASSERT_LOCKED();
 		tb = &udbtable;
 		break;
 	default:
@@ -6673,6 +6680,7 @@ pf_test(sa_family_t af, int fwdir, struc
 	/* if packet sits in reassembly queue, return without error */
 	if (pd.m == NULL)
 		return PF_PASS;
+
 	if (action != PF_PASS) {
 #if NPFLOG > 0
 		pd.pflog |= PF_LOG_FORCE;
@@ -6692,6 +6700,9 @@ pf_test(sa_family_t af, int fwdir, struc
 	}
 	pd.m->m_pkthdr.pf.flags |= PF_TAG_PROCESSED;
 
+	/* lock the look-up/write section of pf_test() */
+	PF_LOCK();
+
 	switch (pd.virtual_proto) {
 
 	case PF_VPROTO_FRAGMENT: {
@@ -6711,7 +6722,7 @@ pf_test(sa_family_t af, int fwdir, struc
 			REASON_SET(&reason, PFRES_NORM);
 			DPFPRINTF(LOG_NOTICE,
 			    "dropping IPv6 packet with ICMPv4 payload");
-			goto done;
+			goto unlock;
 		}
 		action = pf_test_state_icmp(&pd, &s, &reason);
 		if (action == PF_PASS || action == PF_AFRT) {
@@ -6736,7 +6747,7 @@ pf_test(sa_family_t af, int fwdir, struc
 			REASON_SET(&reason, PFRES_NORM);
 			DPFPRINTF(LOG_NOTICE,
 			    "dropping IPv4 packet with ICMPv6 payload");
-			goto done;
+			goto unlock;
 		}
 		action = pf_test_state_icmp(&pd, &s, &reason);
 		if (action == PF_PASS || action == PF_AFRT) {
@@ -6761,7 +6772,7 @@ pf_test(sa_family_t af, int fwdir, struc
 			pqid = 1;
 			action = pf_normalize_tcp(&pd);
 			if (action == PF_DROP)
-				goto done;
+				goto unlock;
 		}
 		action = pf_test_state(&pd, &s, &reason);
 		if (action == PF_PASS || action == PF_AFRT) {
@@ -6788,6 +6799,15 @@ pf_test(sa_family_t af, int fwdir, struc
 		break;
 	}
 
+unlock:
+	PF_UNLOCK();
+
+	/*
+	 * At the moment, we rely on NET_LOCK() to prevent removal of items
+	 * we've collected above ('s', 'r', 'anchor' and 'ruleset'). They'll
+	 * have to be refcounted when NET_LOCK() is gone.
+	 */
+
 done:
 	if (action != PF_DROP) {
 		if (s) {
@@ -6991,6 +7011,7 @@ done:
 	}
 
 	*m0 = pd.m;
+
 	return (action);
 }
diff -r 85b6b6ce74cd src/sys/net/pf_ioctl.c
--- a/src/sys/net/pf_ioctl.c	Tue May 30 20:11:44 2017 +0200
+++ b/src/sys/net/pf_ioctl.c	Wed May 31 09:42:36 2017 +0200
@@ -129,6 +129,10 @@ struct {
 TAILQ_HEAD(pf_tags, pf_tagname)	pf_tags = TAILQ_HEAD_INITIALIZER(pf_tags),
 				pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids);
 
+#ifdef WITH_PF_LOCK
+struct rwlock		 pf_lock = RWLOCK_INITIALIZER("pf_lock");
+#endif /* WITH_PF_LOCK */
+
 #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
 #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
 #endif
@@ -999,6 +1003,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
 	}
 
 	NET_LOCK(s);
+	PF_LOCK();
 	switch (cmd) {
 
 	case DIOCSTART:
@@ -2452,6 +2457,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
 		break;
 	}
 fail:
+	PF_UNLOCK();
 	NET_UNLOCK(s);
 	return (error);
 }
diff -r 85b6b6ce74cd src/sys/net/pf_norm.c
--- a/src/sys/net/pf_norm.c	Tue May 30 20:11:44 2017 +0200
+++ b/src/sys/net/pf_norm.c	Wed May 31 09:42:36 2017 +0200
@@ -39,6 +39,7 @@
 #include <sys/time.h>
 #include <sys/pool.h>
 #include <sys/syslog.h>
+#include <sys/mutex.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
@@ -135,6 +136,18 @@ struct pool		 pf_frent_pl, pf_frag_pl;
 struct pool		 pf_state_scrub_pl;
 int			 pf_nfrents;
 
+#ifdef WITH_PF_LOCK
+struct mutex		 pf_frag_mtx;
+
+#define PF_FRAG_LOCK_INIT()	mtx_init(&pf_frag_mtx, IPL_SOFTNET)
+#define PF_FRAG_LOCK()		mtx_enter(&pf_frag_mtx)
+#define PF_FRAG_UNLOCK()	mtx_leave(&pf_frag_mtx)
+#else /* !WITH_PF_LOCK */
+#define PF_FRAG_LOCK_INIT()	(void)(0)
+#define PF_FRAG_LOCK()		(void)(0)
+#define PF_FRAG_UNLOCK()	(void)(0)
+#endif /* WITH_PF_LOCK */
+
 void
 pf_normalize_init(void)
 {
@@ -149,6 +162,8 @@ pf_normalize_init(void)
 	pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
 
 	TAILQ_INIT(&pf_fragqueue);
+
+	PF_FRAG_LOCK_INIT();
 }
 
 static __inline int
@@ -176,15 +191,18 @@ pf_purge_expired_fragments(void)
 	struct pf_fragment	*frag;
 	int32_t			 expire;
 
-	NET_ASSERT_LOCKED();
+	PF_ASSERT_UNLOCKED();
 
 	expire = time_uptime - pf_default_rule.timeout[PFTM_FRAG];
+
+	PF_FRAG_LOCK();
 	while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
 		if (frag->fr_timeout > expire)
 			break;
 
 		DPFPRINTF(LOG_NOTICE, "expiring %d(%p)", frag->fr_id, frag);
 		pf_free_fragment(frag);
 	}
+	PF_FRAG_UNLOCK();
 }
 
 /*
@@ -533,14 +551,19 @@ pf_reassemble(struct mbuf **m0, int dir,
 	key.fr_id = ip->ip_id;
 	key.fr_direction = dir;
 
-	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
+	PF_FRAG_LOCK();
+	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+		PF_FRAG_UNLOCK();
 		return (PF_DROP);
+	}
 
 	/* The mbuf is part of the fragment entry, no direct free or access */
 	m = *m0 = NULL;
 
-	if (!pf_isfull_fragment(frag))
+	if (!pf_isfull_fragment(frag)) {
+		PF_FRAG_UNLOCK();
 		return (PF_PASS);	/* drop because *m0 is NULL, no error */
+	}
 
 	/* We have all the data */
 	frent = TAILQ_FIRST(&frag->fr_queue);
@@ -564,6 +587,7 @@ pf_reassemble(struct mbuf **m0, int dir,
 	ip->ip_off &= ~(IP_MF|IP_OFFMASK);
 
 	if (hdrlen + total > IP_MAXPACKET) {
+		PF_FRAG_UNLOCK();
 		DPFPRINTF(LOG_NOTICE, "drop: too big: %d", total);
 		ip->ip_len = 0;
 		REASON_SET(reason, PFRES_SHORT);
@@ -571,6 +595,7 @@ pf_reassemble(struct mbuf **m0, int dir,
 		return (PF_DROP);
 	}
 
+	PF_FRAG_UNLOCK();
 	DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip->ip_len));
 	return (PF_PASS);
 }
@@ -610,14 +635,19 @@ pf_reassemble6(struct mbuf **m0, struct 
 	key.fr_id = fraghdr->ip6f_ident;
 	key.fr_direction = dir;
 
-	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
+	PF_FRAG_LOCK();
+	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+		PF_FRAG_UNLOCK();
 		return (PF_DROP);
+	}
 
 	/* The mbuf is part of the fragment entry, no direct free or access */
 	m = *m0 = NULL;
 
-	if (!pf_isfull_fragment(frag))
+	if (!pf_isfull_fragment(frag)) {
+		PF_FRAG_UNLOCK();
 		return (PF_PASS);	/* drop because *m0 is NULL, no error */
+	}
 
 	/* We have all the data */
 	extoff = frent->fe_extoff;
@@ -671,17 +701,20 @@ pf_reassemble6(struct mbuf **m0, struct 
 	ip6->ip6_nxt = proto;
 
 	if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
+		PF_FRAG_UNLOCK();
 		DPFPRINTF(LOG_NOTICE, "drop: too big: %d", total);
 		ip6->ip6_plen = 0;
 		REASON_SET(reason, PFRES_SHORT);
 		/* PF_DROP requires a valid mbuf *m0 in pf_test6() */
 		return (PF_DROP);
 	}
+	PF_FRAG_UNLOCK();
 
 	DPFPRINTF(LOG_INFO, "complete: %p(%d)", m, ntohs(ip6->ip6_plen));
 	return (PF_PASS);
 
 fail:
+	PF_FRAG_UNLOCK();
 	REASON_SET(reason, PFRES_MEMORY);
 	/* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later */
 	return (PF_DROP);
diff -r 85b6b6ce74cd src/sys/net/pfvar_priv.h
--- a/src/sys/net/pfvar_priv.h	Tue May 30 20:11:44 2017 +0200
+++ b/src/sys/net/pfvar_priv.h	Wed May 31 09:42:36 2017 +0200
@@ -37,6 +37,10 @@
 
 #ifdef _KERNEL
 
+#include <sys/rwlock.h>
+
+extern struct rwlock	pf_lock;
+
 struct pf_pdesc {
 	struct {
 		int	 done;
@@ -94,6 +98,36 @@ struct pf_pdesc {
 	} hdr;
 };
 
+#ifdef WITH_PF_LOCK
+extern struct rwlock	pf_lock;
+
+#define PF_LOCK()		do {			\
+		NET_ASSERT_LOCKED();			\
+		rw_enter_write(&pf_lock);		\
+	} while (0)
+
+#define PF_UNLOCK()		do {			\
+		PF_ASSERT_LOCKED();			\
+		rw_exit_write(&pf_lock);		\
+	} while (0)
+
+#define PF_ASSERT_LOCKED()	do {			\
+		if (rw_status(&pf_lock) != RW_WRITE)	\
+			splassert_fail(RW_WRITE,	\
+			    rw_status(&pf_lock),__func__);\
+	} while (0)
+
+#define PF_ASSERT_UNLOCKED()	do {			\
+		if (rw_status(&pf_lock) == RW_WRITE)	\
+			splassert_fail(0, rw_status(&pf_lock), __func__);\
+	} while (0)
+#else /* !WITH_PF_LOCK */
+#define PF_LOCK()		(void)(0)
+#define PF_UNLOCK()		(void)(0)
+#define PF_ASSERT_LOCKED()	(void)(0)
+#define PF_ASSERT_UNLOCKED()	(void)(0)
+#endif /* WITH_PF_LOCK */
+
 #endif /* _KERNEL */
 
 #endif /* _NET_PFVAR_PRIV_H_ */