from:"Alexandr Nedvedicky"

[alexandr.nedvedi...@oracle.com: unused argument at pf_translate() function]

2014-06-21 Thread Alexandr Nedvedicky

Hello,

resending to better alias as recommended by p...@benzedrine.cx subscribers.

regards
sasha

- Forwarded message from Alexandr Nedvedicky 
alexandr.nedvedi...@oracle.com -

From: Alexandr Nedvedicky alexandr.nedvedi...@oracle.com
To: p...@benzedrine.cx
Subject: unused argument at pf_translate() function

Hello,

I'm not sure this alias is the right place to submit patches.

Lint on Solaris complains that there is an unused argument `m` in the function
pf_translate(). The patch below makes lint silent. The cvs diff is against
the CURRENT branch.

kind regards
sasha

 cut here to get patch --


Index: if_pflog.c
===
RCS file: /cvs/src/sys/net/if_pflog.c,v
retrieving revision 1.58
diff -u -r1.58 if_pflog.c
--- if_pflog.c  16 Nov 2013 00:36:01 -  1.58
+++ if_pflog.c  10 Jun 2014 15:32:13 -
@@ -440,7 +440,7 @@
if (pd.virtual_proto != PF_VPROTO_FRAGMENT 
(pfloghdr-rewritten = pf_translate(pd, pfloghdr-saddr,
pfloghdr-sport, pfloghdr-daddr, pfloghdr-dport, 0,
-   pfloghdr-dir, pd.m))) {
+   pfloghdr-dir))) {
m_copyback(pd.m, pd.off, min(pd.m-m_len - pd.off, pd.hdrlen),
pd.hdr.any, M_NOWAIT);
 #if INET  INET6
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.878
diff -u -r1.878 pf.c
--- pf.c20 May 2014 11:03:13 -  1.878
+++ pf.c10 Jun 2014 15:32:13 -
@@ -3417,7 +3417,7 @@
sk-port[pd-af == pd-naf ? pd-sidx : pd-didx],
sk-addr[pd-af == pd-naf ? pd-didx : pd-sidx],
sk-port[pd-af == pd-naf ? pd-didx : pd-sidx],
-   virtual_type, icmp_dir, pd-m);
+   virtual_type, icmp_dir);
}
} else {
while ((ri = SLIST_FIRST(rules))) {
@@ -3657,7 +3657,7 @@
 int
 pf_translate(struct pf_pdesc *pd, struct pf_addr *saddr, u_int16_t sport,
 struct pf_addr *daddr, u_int16_t dport, u_int16_t virtual_type,
-int icmp_dir, struct mbuf *m)
+int icmp_dir)
 {
/*
 * when called from bpf_mtap_pflog, there are extra constraints:
Index: pfvar.h
===
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.399
diff -u -r1.399 pfvar.h
--- pfvar.h 22 Apr 2014 14:41:03 -  1.399
+++ pfvar.h 10 Jun 2014 15:32:13 -
@@ -1840,7 +1840,7 @@
 void   pf_pkt_addr_changed(struct mbuf *);
 intpf_state_key_attach(struct pf_state_key *, struct pf_state *, int);
 intpf_translate(struct pf_pdesc *, struct pf_addr *, u_int16_t,
-   struct pf_addr *, u_int16_t, u_int16_t, int, struct mbuf *);
+   struct pf_addr *, u_int16_t, u_int16_t, int);
 intpf_translate_af(struct pf_pdesc *);
 void   pf_route(struct mbuf **, struct pf_rule *, int,
struct ifnet *, struct pf_state *);


Index: if_pflog.c
===
RCS file: /cvs/src/sys/net/if_pflog.c,v
retrieving revision 1.58
diff -u -r1.58 if_pflog.c
--- if_pflog.c  16 Nov 2013 00:36:01 -  1.58
+++ if_pflog.c  10 Jun 2014 15:32:13 -
@@ -440,7 +440,7 @@
if (pd.virtual_proto != PF_VPROTO_FRAGMENT 
(pfloghdr-rewritten = pf_translate(pd, pfloghdr-saddr,
pfloghdr-sport, pfloghdr-daddr, pfloghdr-dport, 0,
-   pfloghdr-dir, pd.m))) {
+   pfloghdr-dir))) {
m_copyback(pd.m, pd.off, min(pd.m-m_len - pd.off, pd.hdrlen),
pd.hdr.any, M_NOWAIT);
 #if INET  INET6
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.878
diff -u -r1.878 pf.c
--- pf.c20 May 2014 11:03:13 -  1.878
+++ pf.c10 Jun 2014 15:32:13 -
@@ -3417,7 +3417,7 @@
sk-port[pd-af == pd-naf ? pd-sidx : pd-didx],
sk-addr[pd-af == pd-naf ? pd-didx : pd-sidx],
sk-port[pd-af == pd-naf ? pd-didx : pd-sidx],
-   virtual_type, icmp_dir, pd-m);
+   virtual_type, icmp_dir);
}
} else {
while ((ri = SLIST_FIRST(rules))) {
@@ -3657,7 +3657,7 @@
 int
 pf_translate(struct pf_pdesc *pd, struct pf_addr *saddr, u_int16_t sport,
 struct pf_addr *daddr, u_int16_t dport, u_int16_t virtual_type,
-int icmp_dir, struct mbuf *m)
+int icmp_dir)
 {
/*
 * when called from bpf_mtap_pflog, there are extra constraints:
Index: pfvar.h
===
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.399
diff -u -r1.399 pfvar.h
--- pfvar.h 22 Apr 2014 14:41:03 -  1.399
+++ pfvar.h 10

[alexandr.nedvedi...@oracle.com: PF Once rules are not removed from main anchor]

2014-06-21 Thread Alexandr Nedvedicky

Hello,

resending to better alias as recommended by p...@benzedrine.cx subscribers.

regards
sasha

- Forwarded message from Alexandr Nedvedicky 
alexandr.nedvedi...@oracle.com -

From: Alexandr Nedvedicky alexandr.nedvedi...@oracle.com
To: p...@benzedrine.cx
Subject: PF Once rules are not removed from main anchor

Hello,

I'm not sure it is the right place to submit patches. Let me know if there is
better/more appropriate address for this.

during our testing we've found that the once rules are not removed
when used in the main anchor.

during debugging we found that the rules in the main anchor have the member anchor set to
NULL (pf_rule::anchor). This makes the pf_purge_rule() function bail out
too early without removing the rule from the ruleset.

patch below fixed problem for us.

regards
sasha


 cut here to get patch ---
Index: pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.272
diff -u -r1.272 pf_ioctl.c
--- pf_ioctl.c  22 Apr 2014 14:41:03 -  1.272
+++ pf_ioctl.c  20 Jun 2014 14:26:22 -
@@ -312,7 +312,7 @@
 {
u_int32_tnr;
 
-   if (ruleset == NULL || ruleset-anchor == NULL)
+   if (ruleset == NULL)
return;
 
pf_rm_rule(ruleset-rules.active.ptr, rule);
@@ -325,7 +325,10 @@
ruleset-rules.active.ticket++;
 
pf_calc_skip_steps(ruleset-rules.active.ptr);
-   pf_remove_if_empty_ruleset(ruleset);
+
+   if (ruleset != pf_main_ruleset) {
+   pf_remove_if_empty_ruleset(ruleset);
+   }
 }
 
 u_int16_t




Index: pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.272
diff -u -r1.272 pf_ioctl.c
--- pf_ioctl.c  22 Apr 2014 14:41:03 -  1.272
+++ pf_ioctl.c  20 Jun 2014 14:26:22 -
@@ -312,7 +312,7 @@
 {
u_int32_tnr;

-   if (ruleset == NULL || ruleset-anchor == NULL)
+   if (ruleset == NULL)
return;

pf_rm_rule(ruleset-rules.active.ptr, rule);
@@ -325,7 +325,10 @@
ruleset-rules.active.ticket++;

pf_calc_skip_steps(ruleset-rules.active.ptr);
-   pf_remove_if_empty_ruleset(ruleset);
+
+   if (ruleset != pf_main_ruleset) {
+   pf_remove_if_empty_ruleset(ruleset);
+   }
 }

 u_int16_t



- End forwarded message -
Index: pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.272
diff -u -r1.272 pf_ioctl.c
--- pf_ioctl.c  22 Apr 2014 14:41:03 -  1.272
+++ pf_ioctl.c  20 Jun 2014 14:26:22 -
@@ -312,7 +312,7 @@
 {
u_int32_tnr;

-   if (ruleset == NULL || ruleset-anchor == NULL)
+   if (ruleset == NULL)
return;

pf_rm_rule(ruleset-rules.active.ptr, rule);
@@ -325,7 +325,10 @@
ruleset-rules.active.ticket++;

pf_calc_skip_steps(ruleset-rules.active.ptr);
-   pf_remove_if_empty_ruleset(ruleset);
+
+   if (ruleset != pf_main_ruleset) {
+   pf_remove_if_empty_ruleset(ruleset);
+   }
 }

 u_int16_t

Re: PF Once rules are not removed from main anchor

2014-07-02 Thread Alexandr Nedvedicky

Hello,

thanks for clarifying things.

 However, this solution is not correct for us.  Perhaps you have some
 other changes in your tree to make it work.
 

yes, that's correct. We had to make PF SMP friendly. We don't want packet to
remove the ONCE rule from its ruleset. Instead the pf_test_rule(), marks rule
as deleted and schedules it for garbage collection, so the pf_purge_rule() is
never executed by pf_test_rule(). 

looks like your patch still fits well with our implementation, I'll give it a
try.


thanks and regards
sasha

unused argument in pfr_create_kentry()

2014-09-30 Thread Alexandr Nedvedicky

Hello,

while working with PF code we've found the arg1 (flags) of pfr_create_kentry()
is unused.

the patch is trivial, just in case you are interested.

regards
sasha

 cut here to get patch --

Index: pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.103
diff -u -r1.103 pf_table.c
--- pf_table.c  8 Sep 2014 06:24:13 -   1.103
+++ pf_table.c  30 Sep 2014 16:01:47 -
@@ -148,7 +148,7 @@
 voidpfr_mark_addrs(struct pfr_ktable *);
 struct pfr_kentry  *pfr_lookup_addr(struct pfr_ktable *,
struct pfr_addr *, int);
-struct pfr_kentry  *pfr_create_kentry(struct pfr_addr *, u_int32_t);
+struct pfr_kentry  *pfr_create_kentry(struct pfr_addr *);
 voidpfr_destroy_kentries(struct pfr_kentryworkq *);
 voidpfr_destroy_kentry(struct pfr_kentry *);
 voidpfr_insert_kentries(struct pfr_ktable *,
@@ -306,7 +306,7 @@
ad.pfra_fback = PFR_FB_NONE;
}
if (p == NULL  q == NULL) {
-   p = pfr_create_kentry(ad, kt-pfrkt_flags);
+   p = pfr_create_kentry(ad);
if (p == NULL)
senderr(ENOMEM);
if (pfr_route_kentry(tmpkt, p)) {
@@ -482,7 +482,7 @@
ad.pfra_fback = PFR_FB_DUPLICATE;
goto _skip;
}
-   p = pfr_create_kentry(ad, kt-pfrkt_flags);
+   p = pfr_create_kentry(ad);
if (p == NULL)
senderr(ENOMEM);
if (pfr_route_kentry(tmpkt, p)) {
@@ -820,7 +820,7 @@
 }
 
 struct pfr_kentry *
-pfr_create_kentry(struct pfr_addr *ad, u_int32_t flags)
+pfr_create_kentry(struct pfr_addr *ad)
 {
struct pfr_kentry_all   *ke;
 
@@ -917,7 +917,7 @@
p = pfr_lookup_addr(kt, ad, 1);
if (p != NULL)
return (0);
-   p = pfr_create_kentry(ad, kt-pfrkt_flags);
+   p = pfr_create_kentry(ad);
if (p == NULL)
return (EINVAL);
 
@@ -1614,7 +1614,7 @@
senderr(EINVAL);
if (pfr_lookup_addr(shadow, ad, 1) != NULL)
continue;
-   p = pfr_create_kentry(ad, kt-pfrkt_flags);
+   p = pfr_create_kentry(ad);
if (p == NULL)
senderr(ENOMEM);
if (pfr_route_kentry(shadow, p)) {
Index: pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.103
diff -u -r1.103 pf_table.c
--- pf_table.c  8 Sep 2014 06:24:13 -   1.103
+++ pf_table.c  30 Sep 2014 16:01:47 -
@@ -148,7 +148,7 @@
 voidpfr_mark_addrs(struct pfr_ktable *);
 struct pfr_kentry  *pfr_lookup_addr(struct pfr_ktable *,
struct pfr_addr *, int);
-struct pfr_kentry  *pfr_create_kentry(struct pfr_addr *, u_int32_t);
+struct pfr_kentry  *pfr_create_kentry(struct pfr_addr *);
 voidpfr_destroy_kentries(struct pfr_kentryworkq *);
 voidpfr_destroy_kentry(struct pfr_kentry *);
 voidpfr_insert_kentries(struct pfr_ktable *,
@@ -306,7 +306,7 @@
ad.pfra_fback = PFR_FB_NONE;
}
if (p == NULL  q == NULL) {
-   p = pfr_create_kentry(ad, kt-pfrkt_flags);
+   p = pfr_create_kentry(ad);
if (p == NULL)
senderr(ENOMEM);
if (pfr_route_kentry(tmpkt, p)) {
@@ -482,7 +482,7 @@
ad.pfra_fback = PFR_FB_DUPLICATE;
goto _skip;
}
-   p = pfr_create_kentry(ad, kt-pfrkt_flags);
+   p = pfr_create_kentry(ad);
if (p == NULL)
senderr(ENOMEM);
if (pfr_route_kentry(tmpkt, p)) {
@@ -820,7 +820,7 @@
 }
 
 struct pfr_kentry *
-pfr_create_kentry(struct pfr_addr *ad, u_int32_t flags)
+pfr_create_kentry(struct pfr_addr *ad)
 {
struct pfr_kentry_all   *ke;
 
@@ -917,7 +917,7 @@
p = pfr_lookup_addr(kt, ad, 1);
if (p != NULL)
return (0);
-   p = pfr_create_kentry(ad, kt-pfrkt_flags);
+   p = pfr_create_kentry(ad);
if (p == NULL)
return (EINVAL);
 
@@ -1614,7 +1614,7 @@
senderr(EINVAL);
if (pfr_lookup_addr(shadow, ad, 1) != NULL)
continue;
-   p = pfr_create_kentry(ad, kt-pfrkt_flags);
+   p =

mismatch for ICMP state created by inbound response

2015-05-18 Thread Alexandr Nedvedicky

Hello,

during our testing we've discovered small glitch in ICMP state handling.
we use simple rule as follows:

# pfctl -sr
pass in on vnet2 all flags S/SA

next we create a local outbound traffic using ping to arbitrary destination
over vnet2 interface. This is what we get:

# ping 172.16.1.2
PING 172.16.1.2 (172.16.1.2): 56 data bytes
64 bytes from 172.16.1.2: icmp_seq=0 ttl=255 time=0.718 ms
ping: sendto: No route to host
ping: wrote 172.16.1.2 64 chars, ret=-1
ping: sendto: No route to host
ping: wrote 172.16.1.2 64 chars, ret=-1
/snip
64 bytes from 172.16.1.2: icmp_seq=20 ttl=255 time=0.587 ms
ping: sendto: No route to host
ping: wrote 172.16.1.2 64 chars, ret=-1

it looks like the state created by the icmp_seq=0 response must expire first, before
the firewall is able to put the next packet on the wire.

It took me a while to figure out what's going on here. I think the problem is
PF keeps packet direction in pf_state::direction, when state gets created, while
the pf_icmp_state_lookup() uses icmp_dir to verify whether packet is valid or
invalid for given state.

The idea of the fix is straightforward:

remember ICMP direction in pf_pdesc, so it can be passed to newly created
state for ICMP packet.

the straightforward fix is bit cluttered by change, which switches icmp_dir to
u_int8_t.

as soon as the fix gets applied the ping command works; all ICMP probes leave
the firewall host.

patch is attached.

regards
sasha
? icmp-state.patch
? pf_table.c.diff
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.913
diff -u -r1.913 pf.c
--- pf.c11 May 2015 12:22:14 -  1.913
+++ pf.c18 May 2015 17:07:14 -
@@ -148,8 +148,8 @@
struct pf_state_peer *);
 voidpf_change_a6(struct pf_pdesc *, struct pf_addr *a,
struct pf_addr *an);
-int pf_icmp_mapping(struct pf_pdesc *, u_int8_t, int *,
-   int *, u_int16_t *, u_int16_t *);
+int pf_icmp_mapping(struct pf_pdesc *, u_int8_t,
+   u_int8_t *, int *, u_int16_t *, u_int16_t *);
 voidpf_change_icmp(struct pf_pdesc *, struct pf_addr *,
u_int16_t *, struct pf_addr *, struct pf_addr *,
u_int16_t, sa_family_t);
@@ -197,7 +197,7 @@
u_short *);
 int pf_icmp_state_lookup(struct pf_pdesc *,
struct pf_state_key_cmp *, struct pf_state **,
-   u_int16_t, u_int16_t, int, int *, int, int);
+   u_int16_t, u_int16_t, u_int8_t, int *, int, int);
 int pf_test_state_icmp(struct pf_pdesc *,
struct pf_state **, u_short *);
 u_int8_tpf_get_wscale(struct pf_pdesc *);
@@ -1689,8 +1689,8 @@
 #endif /* INET6 */
 
 int
-pf_icmp_mapping(struct pf_pdesc *pd, u_int8_t type, int *icmp_dir, int *multi,
-u_int16_t *virtual_id, u_int16_t *virtual_type)
+pf_icmp_mapping(struct pf_pdesc *pd, u_int8_t type, u_int8_t *icmp_dir,
+int *multi, u_int16_t *virtual_id, u_int16_t *virtual_type)
 {
/*
 * ICMP types marked with PF_OUT are typically responses to
@@ -3081,9 +3081,10 @@
int  tag = -1;
int  asd = 0;
int  match = 0;
-   int  state_icmp = 0, icmp_dir, multi;
+   int  state_icmp = 0, multi;
u_int16_tvirtual_type, virtual_id;
u_int8_t icmptype = 0, icmpcode = 0;
+   u_int8_t icmp_dir = (u_int8_t)-1;
 
bzero(act, sizeof(act));
bzero(sns, sizeof(sns));
@@ -3124,7 +3125,10 @@
}
break;
 #endif /* INET6 */
+   default:
+   icmp_dir = (u_int8_t)-1;
}
+   pd-icmp_dir = icmp_dir;
 
ruleset = pf_main_ruleset;
r = TAILQ_FIRST(pf_main_ruleset.rules.active.ptr);
@@ -3549,7 +3553,7 @@
goto csfailed;
}
}
-   s-direction = pd-dir;
+   s-direction = (pd-icmp_dir == (u_int8_t)-1) ? pd-dir : pd-icmp_dir;
 
if (pf_state_key_setup(pd, skw, sks, act-rtableid)) {
REASON_SET(reason, PFRES_MEMORY);
@@ -3627,7 +3631,7 @@
 int
 pf_translate(struct pf_pdesc *pd, struct pf_addr *saddr, u_int16_t sport,
 struct pf_addr *daddr, u_int16_t dport, u_int16_t virtual_type,
-int icmp_dir)
+u_int8_t icmp_dir)
 {
/*
 * when called from bpf_mtap_pflog, there are extra constraints:
@@ -4416,7 +4420,7 @@
 int
 pf_icmp_state_lookup(struct pf_pdesc *pd, struct pf_state_key_cmp *key,
 struct pf_state **state, u_int16_t icmpid, u_int16_t type,
-int icmp_dir, int

Re: mismatch for ICMP state created by inbound response

2015-05-18 Thread Alexandr Nedvedicky

Hello,

 Thanks for the patch, we'll be investigating this further.
my deep apologies, I was too fast on the send trigger. the patch
is toxic. It breaks the opposite case:

pass out on vnet2 all flags S/SA

once rule above is used with patch applied we drop the first
ICMP reply, so ping stops to work completely.

as you've said: 
 This needs a much closer look, but it might be a result of a bad

I need to study PF source code more...

sorry for extra noise.
regards
sasha

copy'n'paste like typo in pf.c

2015-04-05 Thread Alexandr Nedvedicky

Hello,

when we ran PF sources through coverity we got an error
as follows:

8310   if (ri-r-dst.addr.type == PF_ADDR_TABLE)
8311   pfr_update_stats(ri-r-dst.addr.p.tbl,
8312  s-key[(s-direction == PF_IN)]-
8313  addr[(s-direction == PF_IN)],


CID 38100 (#1 of 1): Copy-paste error (COPY_PASTE_ERROR)copy_paste_error: src
in ri-r-src.neg looks like a copy-paste error.

Should it say dst instead?
8314pd, ri-r-action, ri-r-src.neg);
8315}
8316}

(note: line numbers won't match line numbers in OpenBSD).

It seems to me Coverity is right. Patch against CURRENT is attached.

kind regards
sasha


- cut here to get patch --
Warning: Permanently added 'anoncvs.ca.openbsd.org' (ECDSA) to the list of 
known hosts.
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.909
diff -u -r1.909 pf.c
--- pf.c18 Mar 2015 12:23:15 -  1.909
+++ pf.c5 Apr 2015 09:46:46 -
@@ -6277,7 +6277,7 @@
pfr_update_stats(ri-r-dst.addr.p.tbl,
s-key[(s-direction == PF_IN)]-
addr[(s-direction == PF_IN)],
-   pd, ri-r-action, ri-r-src.neg);
+   pd, ri-r-action, ri-r-dst.neg);
}
}
if (r-src.addr.type == PF_ADDR_TABLE)

Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.909
diff -u -r1.909 pf.c
--- pf.c18 Mar 2015 12:23:15 -  1.909
+++ pf.c5 Apr 2015 09:47:18 -
@@ -6277,7 +6277,7 @@
pfr_update_stats(ri-r-dst.addr.p.tbl,
s-key[(s-direction == PF_IN)]-
addr[(s-direction == PF_IN)],
-   pd, ri-r-action, ri-r-src.neg);
+   pd, ri-r-action, ri-r-dst.neg);
}
}
if (r-src.addr.type == PF_ADDR_TABLE)

pfi_kif leaks for PBR rules

2015-04-05 Thread Alexandr Nedvedicky

Hello,

while testing PBR on Solaris we found out that the pfi_kif instances
are not removed from the pfi_ifs table. We took a look at a crashdump
and saw that the pfik_route counter on those objects is still
non-zero, while all rules were gone.

looking at sources we can see the 'pfik_route' (PFI_KIF_REF_ROUTE)
reference is being grabbed in pfr_create_kentry():

840 case PFRKE_ROUTE:
841 if (ad-pfra_ifname[0])
842 ke-pfrke_rkif = pfi_kif_get(ad-pfra_ifname);
843 if (ke-pfrke_rkif)
844 pfi_kif_ref(ke-pfrke_rkif, PFI_KIF_REF_ROUTE);
845 break;
846 default:
847 panic(unknown pfrke_type %d, ke-pfrke_type);
848 break;

however we have not found any matching pfi_kif_unref() call, which
would remove the reference created by pfr_create_kentry(). It seems
to us the call to

pfi_kif_unref(ke-pfrke_rkif, PFI_KIF_REF_ROUTE)

is missing at pfr_destroy_kentry(). We created patch against OpenBSD CURRENT.
We have no OpenBSD boxes around, where we could verify our fix.

also for your info: IPF in Solaris is on its death row. PF in 11.3
release will be available as optional firewall. We hope to make PF
default (and only firewall) in Solaris 12. You've made excellent job,
your PF is crystal-clear design.

kind regards
sasha

--- cut here to get a patch ---

Index: pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.106
diff -u -r1.106 pf_table.c
--- pf_table.c  14 Mar 2015 03:38:51 -  1.106
+++ pf_table.c  5 Apr 2015 09:59:58 -
@@ -877,6 +877,17 @@
 {
if (ke-pfrke_counters)
pool_put(pfr_kcounters_pl, ke-pfrke_counters);
+
+   switch (ke-pfrke_type) {
+   case PFRKE_COST:
+   /* FALLTHROUGH */
+   case PFRKE_ROUTE:
+   if (ke-pfrke_rkif != NULL) {
+   pfi_kif_unref(ke-pfrke_rkif, PFI_KIF_REF_ROUTE);
+   }
+   break;
+   default:
+   }
pool_put(pfr_kentry_pl[ke-pfrke_type], ke);
 }



Index: pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.106
diff -u -r1.106 pf_table.c
--- pf_table.c  14 Mar 2015 03:38:51 -  1.106
+++ pf_table.c  5 Apr 2015 10:00:07 -
@@ -877,6 +877,17 @@
 {
if (ke-pfrke_counters)
pool_put(pfr_kcounters_pl, ke-pfrke_counters);
+
+   switch (ke-pfrke_type) {
+   case PFRKE_COST:
+   /* FALLTHROUGH */
+   case PFRKE_ROUTE:
+   if (ke-pfrke_rkif != NULL) {
+   pfi_kif_unref(ke-pfrke_rkif, PFI_KIF_REF_ROUTE);
+   }
+   break;
+   default:
+   }
pool_put(pfr_kentry_pl[ke-pfrke_type], ke);
 }

Re: mismatch for ICMP state created by inbound response

2015-05-21 Thread Alexandr Nedvedicky

Hello,

 On Tue, May 19, 2015 at 14:07 +0200, Alexandr Nedvedicky wrote:
  Hello Mike,
  
  I've reworked patch from yesterday. I've done some quick testing
  to see if it fixes problem. It looks like it works. I have not
  tested NAT-64 yet. Also I'd like to come up with test case, which
  will show the state check is still able to block invalid ICMP packet
  (invalid with respect to state).
  
  The idea of fix is to keep icmp_dir in state as well. The icmp_dir
  indicates whether state got created by ICMP request or response.
  This is useful later in pf_icmp_state_lookup() to check whether
  ICMP request/response matches state direction.
  
 
 This feels slightly convoluted... check my diff out! (:

nice, I like your XOR Magic! comment. Looks like I was trying to
fix the other end... your patch is minimalistic and correct as far
as I can tell.

  P.S. I took discussion off-line not to create extra noise on 
  tech@openbsd.org
  feel free go get the alias back to loop.
 
 Nah, that's what tech@ is for!
O.K. I won't do it again...


regards
sasha

SMP steroids for PF

2015-06-25 Thread Alexandr Nedvedicky

Hello,

attached is an SMP patch for PF. consider it a toxic proof of concept, as it has
panicked my amd64 system (see attached phone-shot). I have to figure out how to
debug it yet. The problem is the USB keyboard has died, so I had no chance to
type anything.  fortunately the issue is 100% reproducible.

The patch compiles in .MP and non-MP version. As you'll see more work is
needed to stabilize it and get full SMP support of PF. Those PF
features are not covered by SMP changes:
- packet queues
- packet logging
- pf-sync

patch is attached.

regards
sasha


pf_smp.amd64.2015-06-25.diff.gz
Description: application/gunzip

Re: SMP steroids for PF

2015-06-26 Thread Alexandr Nedvedicky

Sorry, fingers were faster than head (again...)

regards
sasha

On Thu, Jun 25, 2015 at 01:39:29PM -0700, Chris Cappuccio wrote:
 You should re-post as a diff -u !!
 
 Alexandr Nedvedicky [alexandr.nedvedi...@oracle.com] wrote:
  Hello,
  
  attached is SMP patch for PF. consider it as toxic proof of concept as it 
  has
  paniced my amd64 system (see attached phone-shot). I have to figure out how 
  to
  debug it yet. The problem is the USB keyboard has died, so I had no chance 
  to
  type anything.  fortunately the issue is 100% reproducible.
  
  The patch compiles in .MP and non-MP version. As you'll see more work is
  needed to stabilize it and get full SMP support of PF. Those PF
  features are not covered by SMP changes:
  - packet queues
  - packet logging
  - pf-sync
  
  patch is attached.
  
  regards
  sasha
 
 
 
 -- 
 The bums will always lose


pf_smp.amd64.2015-06-26.diff.gz
Description: application/gunzip

Re: SMP steroids for PF

2015-06-26 Thread Alexandr Nedvedicky

On Fri, Jun 26, 2015 at 04:34:06PM +0200, Martin Pieuchot wrote:
 On 26/06/15(Fri) 16:00, Alexandr Nedvedicky wrote:
  Hello Martin,
  
  I accept all your comments. I just have a few quick notes/questions now.
  
   2)  I saw that you found some ALTQ leftovers, you have some Solaris
  (2) I think ALTQs leftovers are still in CVS repo, will double check
  anyway. Stack alignment is not Solaris compatibility hack it's sparc
  compatibility. May be your C compiler takes care of this and grants
  16/32/64 bit stack alignment. I have not examined build process
  that closely yet.
 
 By Solaris compatibility I'm referring to the size of ``sa_family_t''
 and the corresponding changes in struct pfr_table.
 
I see. sa_family_t is kind of surprise it's defined as uint16_t on Solaris.
PF at various places mixes sa_family_t with u_int8_t, so all af variables
on Solaris had to be turned to sa_family_t. Some of those changes leaked
back during the merge to current.

  (3)
   use atomic operations rather than per-CPU counters or any other
   solution?  I'm also raising this question because some counters are
  can you point me to manual page or source code sample so I can have a look 
  how
  to use per-CPU counter?
 
 There's no such manual.  I was more asking about the reason for using
 atomic operations.  Is it because you're trying to use existing APIs?
I'm taking what's available.

   5)  I'm not sure to understand the goal of the new pf_refcnt_t type
  (5) Solaris defines pf_refcnt_t as 64-bit unsigned integer, pf_refcnt_t 
  hopes
  to make porting easier. It can be defined as 32-bit on 32-bit machines.
 
 Using a long on OpenBSD will guarantee that the value fits in a register,
 so it should be fine.
O.K. thanks for clarification.

 
   7)  The PF_SMP_INSERT_WQ() macro to replace SLIST_INSERT() seems over-
  PF_SMP_INSERT_WQ() purpose of those is to allow every CPU/thread to
  operate on its own work-queue of ktables/kentries. The current pf
  uses 'intrusive' link members pfrkt_workq/pfrke_workq in 
  pfr_ktable/pfr_kentry.
  The only idea is to stay as much close to current version as possible.
 
 I understand that you want to stay close to the current version.  I'm
 just saying that we can also modify the current version to reduce the
 size of your diff.

I understand. I'll try to restructure the patches to make them easier
for review.
 
 
 Regards,
 Martin

Re: SMP steroids for PF

2015-06-26 Thread Alexandr Nedvedicky

Hello Martin,

I accept all your comments. I just have a few quick notes/questions now.

 2)  I saw that you found some ALTQ leftovers, you have some Solaris
(2) I think ALTQs leftovers are still in CVS repo, will double check
anyway. Stack alignment is not Solaris compatibility hack it's sparc
compatibility. May be your C compiler takes care of this and grants
16/32/64 bit stack alignment. I have not examined build process
that closely yet.

(3)
 use atomic operations rather than per-CPU counters or any other
 solution?  I'm also raising this question because some counters are
can you point me to manual page or source code sample so I can have a look how
to use per-CPU counter?

 5)  I'm not sure to understand the goal of the new pf_refcnt_t type
(5) Solaris defines pf_refcnt_t as 64-bit unsigned integer, pf_refcnt_t hopes
to make porting easier. It can be defined as 32-bit on 32-bit machines.

 7)  The PF_SMP_INSERT_WQ() macro to replace SLIST_INSERT() seems over-
PF_SMP_INSERT_WQ() purpose of those is to allow every CPU/thread to
operate on its own work-queue of ktables/kentries. The current pf
uses 'intrusive' link members pfrkt_workq/pfrke_workq in pfr_ktable/pfr_kentry.
The only idea is to stay as much close to current version as possible.


I'll try to break the patch into smaller chunks of changes. And post them.

regards
sasha




On Fri, Jun 26, 2015 at 02:36:38PM +0200, Martin Pieuchot wrote:
 Hello Sasha,
 
 Alexandr Nedvedicky [alexandr.nedvedi...@oracle.com] wrote:
  Hello,
  
  attached is SMP patch for PF. consider it as toxic proof of concept as it 
  has
  paniced my amd64 system (see attached phone-shot). I have to figure out how 
  to
  debug it yet. The problem is the USB keyboard has died, so I had no chance 
  to
  type anything.  fortunately the issue is 100% reproducible.
  
  The patch compiles in .MP and non-MP version. As you'll see more work is
  needed to stabilize it and get full SMP support of PF. Those PF
  features are not covered by SMP changes:
  - packet queues
  - packet logging
  - pf-sync
 
 This is an impressive diff, wow!  I started to look at it and my first
 impression is that it is too big.  You should really try to split it
 in smaller pieces to get proper reviews.
 
 Anyway here are some comments about splitting/cleaning this diff, I'll
 need more time to be really able to comment on your work.  I'd just
 want to say wow again.  This is amazing!
 
 1)  Your diff includes a lot of cleanups which are not directly
 related to your SMP work.  By cleanups I'm talking about the
 FALLTHROUGH, #ifdef, comments, (void), etc that your added in
 various places.  I'd suggest submitting a first diff including
 all these cleanups.  It can be easily reviewed and committed and
 this will reduce the noise of your SMP work (and size of the diff).
 
 2)  I saw that you found some ALTQ leftovers, you have some Solaris
 compatibility goo, stack alignment tricks or when sometimes you
 need to return a variable to (de)reference it (ie pf_get_sport).
 These could also be single-shot easy-to-review diffs.
 
 3)  A lot of chunks in your diff are related to counter modifications.
 This could be a diff in itself.  I'm a bit afraid by the number of
 different macro to deal with counters.  Then why did you choose to
 use atomic operations rather than per-CPU counters or any other
 solution?  I'm also raising this question because some counters are
 64bit long and there's no atomic operation to modify such value on
 32bit architectures.
 
 4)  Regarding reference counting around pool-allocated objects, I'd
 suggest wrapping the pool_{get,put} into their own functions; this
 would greatly reduce the #ifdef _PF_SMP_/#else dances like:
 
   +#ifdef _PF_SMP_
   +   pf_rule_smp_rele(rule);
   +#else  /* !_PF_SMP_ */
   pool_put(pf_rule_pl, rule);
   +#endif /* !_PF_SMP_ */
   
 
 5)  I'm not sure to understand the goal of the new pf_refcnt_t type
 but using a long (why not unsigned?) makes sense with regards to
 atomic operations.  Note however that your comment describing it
 is incorrect.  I'd rather delete the comment.  It's a good
 explanation for an email but the intent is quite obvious.
 
 6)  In pf_osfp.c rather than changing the signature of some functions
 in the _PF_SMP_ case only, I'd suggest to adapt the existing code.
 Having fewer #ifdef _PF_SMP_/#else makes it easier to understand,
 review and work with the code 8)  This comment also applies to
 pfr_pool_get().
 
 7)  The PF_SMP_INSERT_WQ() macro to replace SLIST_INSERT() seems over-
 generic to me.  Do you plan to use it with a different allocator?
 Can't we use it for the SP version of pf_table too or at least 
 create a macro/function that behave differently for the SMP version
 to reduce the #ifdef dances...
 
 8)  Your protection of the pfi_ifhead RB-tree

Re: pf_create_state() is sometimes better to use pf_unlink_state()

2015-05-28 Thread Alexandr Nedvedicky

/snip
 
  But we'll drop this reference in pf_src_tree_remove_state,
  then how will sns[PF_SN_NAT] and sns[PF_SN_ROUTE] be different?
 
  I think I should take PF class again ;-) I've just realized there
  is a test in pf_remove_src_node():
 
  572 if (sn-states  0 || sn-expire  time_uptime)
  573 return;
 
  so it will do the right thing. This is the piece I was missing.
 
 
  sns[] array was prepared for this state so if we can't insert
  the state, sns entries must be cleaned up.  pf_remove_src_node
  checks the number of associated states and if source node should
  expire some time later.
 
  yes, it seems more clear to me now.
 
 
 Good! At least I wasn't blind this time! (:
 
 
  Speaking of PF_SN_ROUTE, pf_set_rt_ifp should be probably called
  before we insert the state for the very same reason, plus it
  should check the pf_map_addr return value and do the cleanup.
 
  I don't feel entirely qualified now, to discuss the matter ;-)
 
 Hey, don't worry about it.  Half of the people reading this have
 zero clue as to what the hell are we talking about.
 
  however
  pf_set_rt_ifp() should indeed test return value of pf_map_addr(), In case of
  failure the error should be thrown further up, so pf_create_state() can 
  handle
  it. Probably jumping to csfailed: should be sufficient.
 
 
 You can't just jump to csfailed unless you do a pf_set_rt_ifp
 before the pf_state_insert, but then it needs an attached key
 to only get it's address family.

true, because once pf_state_insert() succeeds, it's no longer job for csfailed:
branch to clean up a state.  I currently have no better suggestion than taking
the easiest move:

add  af argument to pf_set_rt_ifp() and fetch it from skw,
in pf_create_state().

Actually there is an option: 
remove KASSERT() at 665 in pf_state_key_attach().
659 pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int 
idx)
660 {
661 struct pf_state_item*si;
662 struct pf_state_key *cur;
663 struct pf_state *olds = NULL;
664
665 KASSERT(s-key[idx] == NULL);
Then we can simply do:
 s-key[PF_SK_WIRE] = skw;
in pf_create_state(), so pf_set_rt_ifp() will get what it expects.

adding an af argument to pf_set_rt_ifp() seems the cleaner approach to me.
below is your patch with reshuffled pf_create_state(), so pf_set_rt_ifp()
gets called before, pf_state_insert(). I've introduced a new reason:
PFRES_NOROUTE
however I'm not sure if it is descriptive enough...

regards
sasha


? create_state.diff
? pf.c.diff
? pf.c.patch
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.916
diff -u -r1.916 pf.c
--- pf.c26 May 2015 16:17:51 -  1.916
+++ pf.c28 May 2015 21:38:09 -
@@ -204,8 +204,8 @@
 u_int16_t   pf_get_mss(struct pf_pdesc *);
 u_int16_t   pf_calc_mss(struct pf_addr *, sa_family_t, int,
u_int16_t);
-voidpf_set_rt_ifp(struct pf_state *,
-   struct pf_addr *);
+int pf_set_rt_ifp(struct pf_state *,
+   struct pf_addr *, sa_family_t);
 struct pf_divert   *pf_get_divert(struct mbuf *);
 int pf_walk_option6(struct pf_pdesc *, struct ip6_hdr *,
int, int, u_short *);
@@ -2958,32 +2958,39 @@
return (mss);
 }
 
-void
-pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
+int
+pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr, sa_family_t af)
 {
struct pf_rule *r = s-rule.ptr;
struct pf_src_node *sns[PF_SN_MAX];
+   int rv;
 
s-rt_kif = NULL;
if (!r-rt)
-   return;
+   return (0);
+
bzero(sns, sizeof(sns));
-   switch (s-key[PF_SK_WIRE]-af) {
+   switch (af) {
case AF_INET:
-   pf_map_addr(AF_INET, r, saddr, s-rt_addr, NULL, sns,
+   rv = pf_map_addr(AF_INET, r, saddr, s-rt_addr, NULL, sns,
r-route, PF_SN_ROUTE);
-   s-rt_kif = r-route.kif;
-   s-natrule.ptr = r;
break;
 #ifdef INET6
case AF_INET6:
-   pf_map_addr(AF_INET6, r, saddr, s-rt_addr, NULL, sns,
+   rv = pf_map_addr(AF_INET6, r, saddr, s-rt_addr, NULL, sns,
r-route, PF_SN_ROUTE);
-   s-rt_kif = r-route.kif;
-   s-natrule.ptr = r;
break;
 #endif /* INET6 */
+   default:
+   rv = 1;
}
+
+   if (rv == 0) {
+   s-rt_kif = r-route.kif;
+   s-natrule.ptr = r;
+   }
+
+   return (rv);
 }
 
 u_int32_t
@@ -3557,16 +3564,6 @@
goto csfailed;
}
 
-   if (pf_state_insert(BOUND_IFACE(r, pd-kif), skw, sks, s)) {
-

Re: pf_create_state() is sometimes better to use pf_unlink_state()

2015-05-28 Thread Alexandr Nedvedicky

On Thu, May 28, 2015 at 11:43:02AM +0200, Mike Belopuhov wrote:
 On Thu, May 28, 2015 at 01:17 +0200, Alexandr Nedvedicky wrote:
  Hello,
  
  
  On Wed, May 27, 2015 at 07:44:15PM +0200, Mike Belopuhov wrote:
   On Wed, May 27, 2015 at 10:39 +0200, Alexandr Nedvedicky wrote:
Hello,

  -   if (pf_state_insert(BOUND_IFACE(r, pd-kif), skw, sks, s)) {
  -   pf_state_key_detach(s, PF_SK_STACK);
  -   pf_state_key_detach(s, PF_SK_WIRE);
 
 This bug is not yours, but doing two pf_state_key_detach is wrong
 and results in all kinds of protection fault fireworks.  The right
 thing is to do pf_detach_state that would handle the case where
 PF_SK_STACK == PF_SK_WIRE (i.e. we need only one invocation).
good catch.

 
 In csfailed case we do pf_remove_src_node and then call
 pf_src_tree_remove_state.  With your diff this means that
 pf_src_tree_remove_state will be dereferencing sni-sn that
 might be pool_put by the pf_remove_src_node.  Therefore their
 order needs to be reversed.

I see. Another option to fix it would be to do:

sni-sn = sns[i];
sns[i] = NULL;
   
   
   I'm not sure I follow.  Where do you want to do it?
   If it's when we pool_get sni's, then this would mean
   we won't run pf_remove_src_node and cleanup source
   nodes that might have been created.
   
  
  sorry I was too brief. The snippet below comes from
  pf_create_state() with your patch applied:
  
 3560 for (i = 0; i  PF_SN_MAX; i++)
 3561 if (sns[i] != NULL) {
 3562 struct pf_sn_item   *sni;
 3563
 3564 sni = pool_get(pf_sn_item_pl, PR_NOWAIT);
 3565 if (sni == NULL) {
 3566 REASON_SET(reason, PFRES_MEMORY);
 3567 goto csfailed;
 3568 }
 3569 sni-sn = sns[i];
 3570 sns[i] = NULL;
 3571 SLIST_INSERT_HEAD(s-src_nodes, sni, next);
 3572 sni-sn-states++;
 3573 }
 3574
  
  the point of my suggestion is to transfer ownership of source node from
  local variable sns[] to state. so the check performed later in csfailed
  at line 3617, will find NULL pointer in sns[i] field and won't attempt
  to call pf_remove_src_node().
  
 3610 csfailed:
 3611 if (s) {
 3612 pf_normalize_tcp_cleanup(s);/* safe even w/o 
  init */
 3613 pf_src_tree_remove_state(s);
 3614 }
 3615
 3616 for (i = 0; i  PF_SN_MAX; i++)
 3617 if (sns[i] != NULL)
 3618 pf_remove_src_node(sns[i]);
 3619
  
  
  your patch updated by my suggestion is further below.
  
 
 But that introduces a potential leak, as I said in my mail:
 we won't run pf_remove_src_node and cleanup source nodes that
 might have been created.
 
 sns[] is an array of pointers to independently tracked objects
 (inserted into the pf_src_tree).
 

yes you are right, I was completely wrong. Still my only concern is what
happens in case we fail to allocate source node item (sni) for let's say
PF_SN_NAT type, while there is PF_SN_ROUTE type in sns[] to be processed. 

I'm referring to line numbers above. So we fail to allocate PF_SN_NAT
source node and taking goto at 3567, arriving to 3610.

We do the right thing for state clean up (lines 3612, 3613). And now
arriving to for loop 3616, which will remove all source nodes from
sns[], including PF_SN_ROUTE. I completely agree we should do
pf_remove_src_node(sns[PF_SN_NAT]) as we took reference at 3572 for it.
I'm not sure if we should be calling pf_remove_src_node(sns[PF_SN_ROUTE])
here since line 3572 was not executed for it, for loop at 3560 terminated
prematurely. Probably introducing new for-loop counter 'j' in csfailed:
will fix it:
 3616 for (j = 0; j < i; j++)


regards
sasha

Re: pf_create_state() is sometimes better to use pf_unlink_state()

2015-05-27 Thread Alexandr Nedvedicky

Hello,


On Wed, May 27, 2015 at 07:44:15PM +0200, Mike Belopuhov wrote:
 On Wed, May 27, 2015 at 10:39 +0200, Alexandr Nedvedicky wrote:
  Hello,
  
-   if (pf_state_insert(BOUND_IFACE(r, pd-kif), skw, sks, s)) {
-   pf_state_key_detach(s, PF_SK_STACK);
-   pf_state_key_detach(s, PF_SK_WIRE);
   
   This bug is not yours, but doing two pf_state_key_detach is wrong
   and results in all kinds of protection fault fireworks.  The right
   thing is to do pf_detach_state that would handle the case where
   PF_SK_STACK == PF_SK_WIRE (i.e. we need only one invocation).
  good catch.
  
   
   In csfailed case we do pf_remove_src_node and then call
   pf_src_tree_remove_state.  With your diff this means that
   pf_src_tree_remove_state will be dereferencing sni-sn that
   might be pool_put by the pf_remove_src_node.  Therefore their
   order needs to be reversed.
  
  I see. Another option to fix it would be to do:
  
  sni-sn = sns[i];
  sns[i] = NULL;
 
 
 I'm not sure I follow.  Where do you want to do it?
 If it's when we pool_get sni's, then this would mean
 we won't run pf_remove_src_node and cleanup source
 nodes that might have been created.
 

sorry I was too brief. The snippet below comes from
pf_create_state() with your patch applied:

   3560 for (i = 0; i < PF_SN_MAX; i++)
   3561 if (sns[i] != NULL) {
   3562 struct pf_sn_item   *sni;
   3563
   3564 sni = pool_get(pf_sn_item_pl, PR_NOWAIT);
   3565 if (sni == NULL) {
   3566 REASON_SET(reason, PFRES_MEMORY);
   3567 goto csfailed;
   3568 }
   3569 sni->sn = sns[i];
   3570 sns[i] = NULL;
   3571 SLIST_INSERT_HEAD(s->src_nodes, sni, next);
   3572 sni->sn->states++;
   3573 }
   3574

the point of my suggestion is to transfer ownership of source node from
local variable sns[] to state. so the check performed later in csfailed
at line 3617, will find NULL pointer in sns[i] field and won't attempt
to call pf_remove_src_node().

   3610 csfailed:
   3611 if (s) {
   3612 pf_normalize_tcp_cleanup(s);/* safe even w/o init */
   3613 pf_src_tree_remove_state(s);
   3614 }
   3615
   3616 for (i = 0; i < PF_SN_MAX; i++)
   3617 if (sns[i] != NULL)
   3618 pf_remove_src_node(sns[i]);
   3619


your patch updated by my suggestion is further below.

regards
sasha

? pf.c.diff
? pf.c.patch
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.916
diff -u -r1.916 pf.c
--- pf.c26 May 2015 16:17:51 -  1.916
+++ pf.c27 May 2015 22:59:32 -
@@ -3557,16 +3557,6 @@
goto csfailed;
}
 
-   if (pf_state_insert(BOUND_IFACE(r, pd-kif), skw, sks, s)) {
-   pf_state_key_detach(s, PF_SK_STACK);
-   pf_state_key_detach(s, PF_SK_WIRE);
-   *sks = *skw = NULL;
-   REASON_SET(reason, PFRES_STATEINS);
-   goto csfailed;
-   } else
-   *sm = s;
-
-   /* attach src nodes late, otherwise cleanup on error nontrivial */
for (i = 0; i  PF_SN_MAX; i++)
if (sns[i] != NULL) {
struct pf_sn_item   *sni;
@@ -3574,16 +3564,22 @@
sni = pool_get(pf_sn_item_pl, PR_NOWAIT);
if (sni == NULL) {
REASON_SET(reason, PFRES_MEMORY);
-   pf_src_tree_remove_state(s);
-   STATE_DEC_COUNTERS(s);
-   pool_put(pf_state_pl, s);
-   return (PF_DROP);
+   goto csfailed;
}
sni-sn = sns[i];
+   sns[i] = NULL;
SLIST_INSERT_HEAD(s-src_nodes, sni, next);
sni-sn-states++;
}
 
+   if (pf_state_insert(BOUND_IFACE(r, pd-kif), skw, sks, s)) {
+   pf_detach_state(s);
+   *sks = *skw = NULL;
+   REASON_SET(reason, PFRES_STATEINS);
+   goto csfailed;
+   } else
+   *sm = s;
+
pf_set_rt_ifp(s, pd-src);  /* needs s-state_key set */
if (tag  0) {
pf_tag_ref(tag);
@@ -3612,12 +3608,16 @@
return (PF_PASS);
 
 csfailed:
+   if (s) {
+   pf_normalize_tcp_cleanup(s);/* safe even w/o init */
+   pf_src_tree_remove_state(s);
+   }
+
for (i = 0; i  PF_SN_MAX; i++)
if (sns[i] != NULL

Re: mismatch for ICMP state created by inound response

2015-05-24 Thread Alexandr Nedvedicky

Hello,

  I have no objections, just a small wish: can you set icmp_dir to -1
  if we are not dealing with ICMP? There is a tool we use in Solaris
  which yells at us because of an uninitialized variable. I know it's
  a false positive, but I've given up on explaining...
  
  patch below combines your fix with my 'wish'.
  
 if you don't mind, i'd rather set it to 0 (PF_IN is 1) right
 where it's declared than add an additional case, like so:

absolutely not, 0 is O.K. thank you very much.

regards
sasha

 diff --git sys/net/pf.c sys/net/pf.c
 index 39d5cb6..5d44f43 100644
 --- sys/net/pf.c
 +++ sys/net/pf.c
 @@ -3075,11 +3075,11 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule 
 **rm, struct pf_state **sm,
   u_short  reason;
   int  rewrite = 0;
   int  tag = -1;
   int  asd = 0;
   int  match = 0;
 - int  state_icmp = 0, icmp_dir;
 + int  state_icmp = 0, icmp_dir = 0;
   u_int16_tvirtual_type, virtual_id;
   u_int8_t icmptype = 0, icmpcode = 0;
  
   bzero(act, sizeof(act));
   bzero(sns, sizeof(sns));
 @@ -3201,10 +3201,15 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule 
 **rm, struct pf_state **sm,
   PF_TEST_ATTRIB((r-type  r-type != icmptype + 1),
   TAILQ_NEXT(r, entries));
   /* icmp only. type always 0 in other cases */
   PF_TEST_ATTRIB((r-code  r-code != icmpcode + 1),
   TAILQ_NEXT(r, entries));
 + /* icmp only. don't create states on replies */
 + PF_TEST_ATTRIB((r-keep_state  !state_icmp 
 + (r-rule_flag  PFRULE_STATESLOPPY) == 0 
 + icmp_dir != PF_IN),
 + TAILQ_NEXT(r, entries));
   break;
  
   default:
   break;
   }

IPv6 reassembly does not work with PBR

2015-05-25 Thread Alexandr Nedvedicky

Hello,

It looks like IPv6 fragments are not forwarded properly when PBR
rules are used. My test rules look as follows:

pass in quick on vnet1 from any to self
pass in quick on vnet2 from any to self
pass in on vnet1 from any to any route-to 2006::2@vnet2 no state
pass in on vnet2 from any to any route-to 2005::2@vnet1 no state

(I've just stolen the 2006::/64 @vnet2 & 2005::/64 @vnet1 prefixes since my test
network lives in LDOMs, which don't talk to the outside world...)

The attached patch makes sure pf_route6() uses pf_refragment6() before it puts
reassembled packet to destination NIC. It looks like the proposed
fix works for me. I have not tried it with NAT-64 yet.

I also did not shrink the MTU on vnet2 (destination NIC) to see what happens when

MTU(vnet1) > MTU(vnet2)

it's quite possible the patch is not complete yet.

regards
sasha
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.915
diff -u -p -u -r1.915 pf.c
--- pf.c22 May 2015 14:18:55 -  1.915
+++ pf.c25 May 2015 16:12:23 -
@@ -5598,6 +5598,7 @@ pf_route6(struct mbuf **m, struct pf_rul
struct ifnet*ifp = NULL;
struct pf_addr   naddr;
struct pf_src_node  *sns[PF_SN_MAX];
+   struct m_tag*mtag;
 
if (m == NULL || *m == NULL || r == NULL ||
(dir != PF_IN  dir != PF_OUT) || oifp == NULL)
@@ -5632,6 +5633,9 @@ pf_route6(struct mbuf **m, struct pf_rul
dst-sin6_addr = ip6-ip6_dst;
 
if (!r-rt) {
+   /*
+* XXX how we can be so sure m0 fits to wire?
+*/
m0-m_pkthdr.pf.flags |= PF_TAG_GENERATED;
ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
return;
@@ -5672,13 +5676,16 @@ pf_route6(struct mbuf **m, struct pf_rul
 
in6_proto_cksum_out(m0, ifp);
 
-   /*
-* If the packet is too large for the outgoing interface,
-* send back an icmp6 error.
-*/
if (IN6_IS_SCOPE_EMBED(dst-sin6_addr))
dst-sin6_addr.s6_addr16[1] = htons(ifp-if_index);
-   if ((u_long)m0-m_pkthdr.len = ifp-if_mtu) {
+
+   /*
+* If packet has been reassembled by PF earlier, we have to
+* use pf_refragment6() here to turn it back to fragments.
+*/
+   if ((mtag = m_tag_find(m0, PACKET_TAG_PF_REASSEMBLED, NULL))) {
+   (void) pf_refragment6(m0, mtag, PF_OUT, dst, ifp);
+   } else if ((u_long)m0-m_pkthdr.len = ifp-if_mtu) {
nd6_output(ifp, m0, dst, NULL);
} else {
in6_ifstat_inc(ifp, ifs6_in_toobig);
@@ -6593,7 +6600,7 @@ done:
struct m_tag*mtag;
 
if ((mtag = m_tag_find(*m0, PACKET_TAG_PF_REASSEMBLED, NULL)))
-   action = pf_refragment6(m0, mtag, fwdir);
+   action = pf_refragment6(m0, mtag, fwdir, NULL, NULL);
}
 #endif
if (s  action != PF_DROP) {
Index: pf_norm.c
===
RCS file: /cvs/src/sys/net/pf_norm.c,v
retrieving revision 1.178
diff -u -p -u -r1.178 pf_norm.c
--- pf_norm.c   5 May 2015 23:27:47 -   1.178
+++ pf_norm.c   25 May 2015 16:12:23 -
@@ -57,6 +57,8 @@
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/nd6.h>
 #endif /* INET6 */
 
 #include <net/pfvar.h>
@@ -680,7 +682,8 @@ fail:
 }
 
 int
-pf_refragment6(struct mbuf **m0, struct m_tag *mtag, int dir)
+pf_refragment6(struct mbuf **m0, struct m_tag *mtag, int dir,
+struct sockaddr_in6 *dst, struct ifnet *ifp)
 {
struct mbuf *m = *m0, *t;
struct pf_fragment_tag  *ftag = (struct pf_fragment_tag *)(mtag + 1);
@@ -743,10 +746,14 @@ pf_refragment6(struct mbuf **m0, struct 
t = m-m_nextpkt;
m-m_nextpkt = NULL;
m-m_pkthdr.pf.flags |= PF_TAG_REFRAGMENTED;
-   if (error == 0)
-   ip6_forward(m, 0);
-   else
+   if (error == 0) {
+   if (ifp == NULL)
+   ip6_forward(m, 0);
+   else
+   nd6_output(ifp, m, dst, NULL);
+   } else {
m_freem(m);
+   }
}
 
return (action);
Index: pfvar.h
===
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.414
diff -u -p -u -r1.414 pfvar.h
--- pfvar.h 11 Apr 2015 13:00:12 -  1.414
+++ pfvar.h 25 May 2015 16:12:23 -
@@ -1826,7 +1826,8 @@ int   pf_match_port(u_int8_t, u_int16_t, u
 intpf_match_uid(u_int8_t, uid_t, uid_t, uid_t);
 intpf_match_gid(u_int8_t, gid_t, gid_t, gid_t);
 
-intpf_refragment6(struct mbuf **, struct m_tag

Re: mismatch for ICMP state created by inound response

2015-05-21 Thread Alexandr Nedvedicky

Hello,

 
 Well, not entirely (:  I did it while exploring the code and sent
 out to provoke further discussion.  Today I've talked to reyk@ and
 we think that it's better to go down a different road: make sure we
 don't create states on reply packets in the first place.
 
that's actually very wise approach as replies can be spoofed...

 I've tested this with ICMP, ICMPv6 and NAT64 (slightly).  Any OKs?
 Objections?

I have no objections, just a small wish: can you set icmp_dir to -1
if we are not dealing with ICMP? There is a tool we use in Solaris
which yells at us because of an uninitialized variable. I know it's
a false positive, but I've given up on explaining...

patch below combines your fix with my 'wish'.

thanks a lot
regards
sasha

Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.913
diff -u -r1.913 pf.c
--- pf.c11 May 2015 12:22:14 -  1.913
+++ pf.c21 May 2015 18:59:29 -
@@ -3124,6 +3124,8 @@
}
break;
 #endif /* INET6 */
+   default:
+   icmp_dir = -1;
}
 
ruleset = pf_main_ruleset;
@@ -3206,6 +3208,11 @@
TAILQ_NEXT(r, entries));
/* icmp only. type always 0 in other cases */
PF_TEST_ATTRIB((r-code  r-code != icmpcode + 1),
TAILQ_NEXT(r, entries));
+   /* icmp only. don't create states on replies */
+   PF_TEST_ATTRIB((r-keep_state  !state_icmp 
+   (r-rule_flag  PFRULE_STATESLOPPY) == 0 
+   icmp_dir != PF_IN),
TAILQ_NEXT(r, entries));
break;

Re: pf_create_state() is sometimes better to use pf_unlink_state()

2015-05-21 Thread Alexandr Nedvedicky

Hello,

On Thu, May 21, 2015 at 07:43:51PM +0200, Mike Belopuhov wrote:
 On Thu, May 21, 2015 at 17:34 +0200, Alexandr Nedvedicky wrote:
  Hello,
 
 
 Hi,
 
  snippet below comes from pf_create_state():
  
 3559 if (pf_state_insert(BOUND_IFACE(r, pd-kif), skw, sks, s)) {
 3560 pf_state_key_detach(s, PF_SK_STACK);
 3561 pf_state_key_detach(s, PF_SK_WIRE);
 3562 *sks = *skw = NULL;
 3563 REASON_SET(reason, PFRES_STATEINS);
 3564 goto csfailed;
 3565 } else
 3566 *sm = s;
 3567 
 3568 /* attach src nodes late, otherwise cleanup on error 
  nontrivial */
 3569 for (i = 0; i  PF_SN_MAX; i++)
 3570 if (sns[i] != NULL) {
 3571 struct pf_sn_item   *sni;
 3572 
 3573 sni = pool_get(pf_sn_item_pl, PR_NOWAIT);
 3574 if (sni == NULL) {
 3575 REASON_SET(reason, PFRES_MEMORY);
 3576 pf_src_tree_remove_state(s);
 3577 STATE_DEC_COUNTERS(s);
 3578 pool_put(pf_state_pl, s);
 3579 return (PF_DROP);
 3580 }
 3581 sni-sn = sns[i];
 3582 SLIST_INSERT_HEAD(s-src_nodes, sni, next);
 3583 sni-sn-states++;
 3584 }
  
  at line 3559 PF inserts state to table. If insert operation succeeds, then
  state can no longer be killed using simple pool_put() as it currently
  happens at line 3578. I think PF should go for pf_unlink_state() instead.
  
  patch below should kill the bug.
 
 
 Indeed.  But I don't like the comment stating that we're attaching
 src nodes late because the cleanup on error nontrivial.  Perhaps
 we should do a pf_state_insert afterwards?  This might simplify
 locking later on.

perhaps swapping the for loop block with pf_state_insert() will work.
We can then bail out using goto csfailed then (see patch below...)

 we should do a pf_state_insert afterwards?  This might simplify
 locking later on.

speaking of locking... everything is more complicated, however
in this particular case it makes things easier for sure. Basically, in
our current SMP model, once we insert a state into the table, the only way
to remove it is to leave the job to the garbage collector thread...

 
  also one more off-topic question:
  
  would you be interested in SMP patch for PF?
  it basically introduces fine locking and reference counting
  on PF data objects, so firewall can handle more packets at
  single instance of time.
 
 
 We would definitely be interested in such a diff, but at the moment
 there's no use for it as pf_test is called directly from the IP stack
 and hence IP stack needs to support parallel execution first.  This
 doesn't mean that we can't start integrating bits and pieces, esp.
 if they're presented as separate patches.

we use a compile time switch to enable SMP code (-D_PF_SMP_).  This is
something I'd like to keep around anyway. I'll start on syncing up our changes
with CURRENT. It will take a couple of weeks (ENOCYCLES as usual). Once
I have something to show I'll share it and you'll see what can be done
with it.

 
 Thanks for your quality diffs btw, help is always much appreciated.
 
I'm just trying to be useful. It's a pleasure to work with the PF sources.

thanks and
regards
sasha

===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.913
diff -u -r1.913 pf.c
--- pf.c11 May 2015 12:22:14 -  1.913
+++ pf.c21 May 2015 19:18:04 -
@@ -3556,15 +3556,6 @@
goto csfailed;
}

-   if (pf_state_insert(BOUND_IFACE(r, pd-kif), skw, sks, s)) {
-   pf_state_key_detach(s, PF_SK_STACK);
-   pf_state_key_detach(s, PF_SK_WIRE);
-   *sks = *skw = NULL;
-   REASON_SET(reason, PFRES_STATEINS);
-   goto csfailed;
-   } else
-   *sm = s;
-
/* attach src nodes late, otherwise cleanup on error nontrivial */
for (i = 0; i  PF_SN_MAX; i++)
if (sns[i] != NULL) {
@@ -3572,16 +3563,21 @@ 

sni = pool_get(pf_sn_item_pl, PR_NOWAIT);
if (sni == NULL) {
-   REASON_SET(reason, PFRES_MEMORY);
-   pf_src_tree_remove_state(s);
-   STATE_DEC_COUNTERS(s);
-   pool_put(pf_state_pl, s);
-   return (PF_DROP);
+   goto csfailed;
}
sni-sn = sns[i

pf_create_state() is sometimes better to use pf_unlink_state()

2015-05-21 Thread Alexandr Nedvedicky

Hello,

snippet below comes from pf_create_state():

   3559 if (pf_state_insert(BOUND_IFACE(r, pd-kif), skw, sks, s)) {
   3560 pf_state_key_detach(s, PF_SK_STACK);
   3561 pf_state_key_detach(s, PF_SK_WIRE);
   3562 *sks = *skw = NULL;
   3563 REASON_SET(reason, PFRES_STATEINS);
   3564 goto csfailed;
   3565 } else
   3566 *sm = s;
   3567 
   3568 /* attach src nodes late, otherwise cleanup on error nontrivial 
*/
   3569 for (i = 0; i  PF_SN_MAX; i++)
   3570 if (sns[i] != NULL) {
   3571 struct pf_sn_item   *sni;
   3572 
   3573 sni = pool_get(pf_sn_item_pl, PR_NOWAIT);
   3574 if (sni == NULL) {
   3575 REASON_SET(reason, PFRES_MEMORY);
   3576 pf_src_tree_remove_state(s);
   3577 STATE_DEC_COUNTERS(s);
   3578 pool_put(pf_state_pl, s);
   3579 return (PF_DROP);
   3580 }
   3581 sni-sn = sns[i];
   3582 SLIST_INSERT_HEAD(s-src_nodes, sni, next);
   3583 sni-sn-states++;
   3584 }

at line 3559 PF inserts state to table. If insert operation succeeds, then
state can no longer be killed using simple pool_put() as it currently
happens at line 3578. I think PF should go for pf_unlink_state() instead.

patch below should kill the bug.

also one more off-topic question:

would you be interested in SMP patch for PF?
it basically introduces fine locking and reference counting
on PF data objects, so firewall can handle more packets at
single instance of time.

regards
sasha

===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.913
diff -u -r1.913 pf.c
--- pf.c11 May 2015 12:22:14 -  1.913
+++ pf.c21 May 2015 15:20:01 -
@@ -3573,9 +3573,7 @@   
sni = pool_get(pf_sn_item_pl, PR_NOWAIT);
if (sni == NULL) {
REASON_SET(reason, PFRES_MEMORY);
-   pf_src_tree_remove_state(s);
-   STATE_DEC_COUNTERS(s);
-   pool_put(pf_state_pl, s);
+   pf_unlink_state(s);
return (PF_DROP);
}
sni-sn = sns[i];

Re: PF SMP: making anchor stack multithreaded

2015-08-08 Thread Alexandr Nedvedicky

Hello,

I've reworked the anchor handling so the traversal uses true recursion now.
Using recursion here will allow us to implement ruleset locking in nicer
fashion.  The idea is to split current pf_test_rule() into two functions:
pf_test_rule() and pf_match_rule().

pf_step_into_anchor() is changed to drive recursive anchor traversal. It calls
pf_match_rule() to match rules in nested rulesets.  pf_step_out_of_anchor() has
been merged into new pf_step_into_anchor()

To minimize stack frame size a pf_test_ctx is introduced. Its members are
various variables, which used to be local at former pf_test_rule().  The
pf_test_ctx instance is a local variable of new pf_test_rule().
pf_match_rule() receives pointer to pf_test_ctx as its argument so it can reach
all variables it needs. The goal is to move out as many local variables from
pf_match_rule() and pf_step_into_anchor() as possible to save memory.

To minimize the amount of differences, macros to access members in pf_test_ctx
are introduced. Once consensus on the proposed approach is reached, we
can polish the patch a bit.

I did some basic testing with rules as follows:

pass all
anchor ap self to 10.0.0.0/8 {
block proto tcp from self to 10.0.0.138 port 23
pass proto tcp from self to 10.0.0.138 port 23 once
}

and wildcard variant. It seems to me it works, but I'll be glad for any
further testing tips.

regards
sasha

-88---8

Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.935
diff -u -p -r1.935 pf.c
--- pf.c21 Jul 2015 02:32:04 -  1.935
+++ pf.c8 Aug 2015 09:44:01 -
@@ -114,13 +114,36 @@ u_char pf_tcp_secret[16];
 int pf_tcp_secret_init;
 int pf_tcp_iss_off;
 
-struct pf_anchor_stackframe {
-   struct pf_ruleset   *rs;
-   struct pf_rule  *r;
-   struct pf_anchor_node   *parent;
-   struct pf_anchor*child;
-} pf_anchor_stack[64];
+struct pf_test_ctx {
+   int _test_status;
+   struct pf_pdesc *_pd;
+   struct pf_rule_actions  _act;
+   u_int8_t_icmpcode;
+   u_int8_t_icmptype;
+   int _icmp_dir;
+   int _state_icmp;
+   int _tag;
+   u_short _reason;
+   struct pf_rule_item *_ri;
+   struct pf_src_node  *_sns[PF_SN_MAX];
+   struct pf_rule_slist_rules;
+   struct pf_rule  *_nr;
+   struct pf_rule  **_rm;
+   struct pf_rule  *_a;
+   struct pf_rule  **_am;
+   struct pf_ruleset   **_rsm;
+   struct pf_ruleset   *_arsm;
+   struct pf_ruleset   *_aruleset;
+   int  _depth;
+};
+
+#definePF_ANCHOR_STACK_MAX 64
 
+enum {
+   PF_FAIL = -1,
+   PF_OK,
+   PF_QUICK
+};
 /*
  * Cannot fold into pf_pdesc directly, unknown storage size outside pf.c.
  * Keep in sync with union pf_headers in pflog_bpfcopy() in if_pflog.c.
@@ -225,11 +248,8 @@ struct pf_state*pf_find_state(struct p
struct pf_state_key_cmp *, u_int, struct mbuf *);
 int pf_src_connlimit(struct pf_state **);
 int pf_match_rcvif(struct mbuf *, struct pf_rule *);
-voidpf_step_into_anchor(int *, struct pf_ruleset **,
-   struct pf_rule **, struct pf_rule **);
-int pf_step_out_of_anchor(int *, struct pf_ruleset **,
-struct pf_rule **, struct pf_rule **,
-int *);
+int pf_step_into_anchor(struct pf_test_ctx *, struct 
pf_rule *);
+int pf_match_rule(struct pf_test_ctx *, struct pf_ruleset 
*);
 voidpf_counters_inc(int, struct pf_pdesc *,
struct pf_state *, struct pf_rule *,
struct pf_rule *);
@@ -2626,74 +2646,44 @@ pf_tag_packet(struct mbuf *m, int tag, i
m-m_pkthdr.ph_rtableid = (u_int)rtableid;
 }
 
-void
-pf_step_into_anchor(int *depth, struct pf_ruleset **rs,
-struct pf_rule **r, struct pf_rule **a)
+int
+pf_step_into_anchor(struct pf_test_ctx *tctx, struct pf_rule *r)
 {
-   struct pf_anchor_stackframe *f;
+#definedepth   (tctx-_depth)
+#defineam  (tctx-_am)
+#defineparent  r-anchor-children
+   int rv;
 
-   if (*depth = sizeof(pf_anchor_stack) /
-   sizeof(pf_anchor_stack[0])) {
+   if (depth = PF_ANCHOR_STACK_MAX) {
log(LOG_ERR, pf_step_into_anchor: stack overflow\n);
-   *r = TAILQ_NEXT(*r,

patch fixes IPv6 reassembly when packet hits PBR rule

2015-08-17 Thread Alexandr Nedvedicky

Hello,

I'm sending finalized patch. The final shape has been discussed with bluhm@,
who was kind enough to do thorough testing on OpenBSD.

The patch solves the problem for deployment as follows:


Client  -- MTU_1 -- PF -- MTU_1 -- Router -- MTU_2 -- Server

MTU_2 = MTU_1/2

PF does IPv6 reassembly for PBRed packets (packets forwarded on behalf of
route-to action). Let's assume client sends packet of MTU_1 size. PF forwards
packet to destination. Router discards packet, because it does not
fit to wire. It sends packet too big suggesting MTU_2.

Client retries with a fragmented packet this time. In our scenario it uses
MTU_1/2. PF reassembles the packet and is going to forward it to the destination
as ordered by the route-to action. The snippet below comes from pf_route6():

5710/*
5711 * If the packet is too large for the outgoing interface,
5712 * send back an icmp6 error.
5713 */
5714if (IN6_IS_SCOPE_EMBED(dst-sin6_addr))
5715dst-sin6_addr.s6_addr16[1] = htons(ifp-if_index);
5716if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
5717nd6_output(ifp, m0, dst, NULL);
5718} else {
5719in6_ifstat_inc(ifp, ifs6_in_toobig);
5720if (r-rt != PF_DUPTO)
5721icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, 
ifp-if_mtu);
5722else
5723goto bad;
5724}
 
test at line 5716 is invalid for our scenario. In our case condition
is met, so PF puts IPv6 reassembled packet to wire, while it should turn
the packet back to fragments as sent by client.

The fix is straightforward. Whenever pf_route6() deals with reassembled
packet it must use pf_refragment6() to turn it back to fragments.
pf_refragment6() must use nd6_output() to put fragment to wire, if it is
invoked on behalf of pf_route6().

bluhm@ exchanged couple of emails with me to finalize the patch I intend to
commit. These are the two points I'd like to highlight:

- PF should report ICMP packet too big for every fragment
  if it fails to put to wire (1)

- the dup-to actions should not be treated as a special case;
  the current version does not send ICMP packet too big for duped
  packets. The proposed patch changes that (2)

Ad (1) the earlier patch version I sent in June sends only one ICMP packet
too big message, for the first fragment which exceeds the MTU; all other
fragments are just discarded. This would be a deviation from a regular
IPv6 router, which sends MTU exceeded for every packet which exceeds the MTU.
The patch proposed here makes PF act as a regular router.

Ad (2) dup-to actions: we assume the dup-to is used to pass duped packets to
IDS deployed somewhere on network. So if duped packet exceeds MTU we let PF to
send ICMP packet too big back to packet source, so it will somewhat enforce
correct fragment size to keep IDS in picture. Still the dup-to usecase is kind
of blurry to me, if someone can point me to yet another use case for dup-to
I'll be glad.

I believe what we have is the optimal fix.

regards
sasha

8---8---8--8

Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.935
diff -u -p -r1.935 pf.c
--- pf.c21 Jul 2015 02:32:04 -  1.935
+++ pf.c17 Aug 2015 17:58:55 -
@@ -5633,6 +5633,7 @@ pf_route6(struct mbuf **m, struct pf_rul
struct ifnet*ifp = NULL;
struct pf_addr   naddr;
struct pf_src_node  *sns[PF_SN_MAX];
+   struct m_tag*mtag;
 
if (m == NULL || *m == NULL || r == NULL ||
(dir != PF_IN  dir != PF_OUT) || oifp == NULL)
@@ -5707,20 +5708,20 @@ pf_route6(struct mbuf **m, struct pf_rul
 
in6_proto_cksum_out(m0, ifp);
 
-   /*
-* If the packet is too large for the outgoing interface,
-* send back an icmp6 error.
-*/
if (IN6_IS_SCOPE_EMBED(dst-sin6_addr))
dst-sin6_addr.s6_addr16[1] = htons(ifp-if_index);
-   if ((u_long)m0-m_pkthdr.len = ifp-if_mtu) {
+
+   /*
+* If packet has been reassembled by PF earlier, we have to
+* use pf_refragment6() here to turn it back to fragments.
+*/
+   if ((mtag = m_tag_find(m0, PACKET_TAG_PF_REASSEMBLED, NULL))) {
+   (void) pf_refragment6(m0, mtag, dst, ifp);
+   } else if ((u_long)m0-m_pkthdr.len = ifp-if_mtu) {
nd6_output(ifp, m0, dst, NULL);
} else {
in6_ifstat_inc(ifp, ifs6_in_toobig);
-   if (r-rt != PF_DUPTO)
-   icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp-if_mtu);
-   else
-   goto bad;
+   icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp-if_mtu);
}
 
 done:

potential memory leak when pf_create_state() fails

2015-07-16 Thread Alexandr Nedvedicky

Hello,

It seems to me PF might leak rule items when pf_create_state() fails to create
state for matching packet.

The scenario is as follows:

packet matches couple of 'match' rules before it hits a 'pass' rule
match rules are kept in `rules` single list, which is a local variable
of pf_test_rule()

3238 if (r-action == PF_MATCH) {
3239 if ((ri = pool_get(pf_rule_item_pl,
3240 PR_NOWAIT)) == NULL) {
3241 REASON_SET(reason, PFRES_MEMORY);
3242 goto cleanup;
3243 }
3244 ri-r = r;
3245 /* order is irrelevant */
3246 SLIST_INSERT_HEAD(rules, ri, entry);
3247 pf_rule_to_actions(r, act);


as soon as pf_test_rule() is done with rules it proceeds to state creation,
we are passing the `rules` as arg9

3373 
3374 action = pf_create_state(pd, r, a, nr, skw, sks, 
rewrite,
3375 sm, tag, rules, act, sns);
3376 
3377 if (action != PF_PASS)
3378 return (action);
3379 if (sks != skw) {

note we are doing return at line 3378, when pf_create_state() fails to create
state and does not return PF_PASS. pf_create_state() assumes `rules` are either
bound to state or released by pf_create_state().

So let's see what happens in pf_create_state():

3451 s = pool_get(pf_state_pl, PR_NOWAIT | PR_ZERO);
3452 if (s == NULL) {
3453 REASON_SET(reason, PFRES_MEMORY);
3454 goto csfailed;
3455 }
3456 s-rule.ptr = r;
3457 s-anchor.ptr = a;
3458 s-natrule.ptr = nr;
3459 memcpy(s-match_rules, rules, sizeof(s-match_rules));

if we fail to allocate state we jump right to csfailed returning PF_DROP

3614 csfailed:
3615 if (s) {
3616 pf_normalize_tcp_cleanup(s);/* safe even w/o init */
3617 pf_src_tree_remove_state(s);
3618 pool_put(pf_state_pl, s);
3619 }
3620 
3621 for (i = 0; i  PF_SN_MAX; i++)
3622 if (sns[i] != NULL)
3623 pf_remove_src_node(sns[i]);
3624 
3625 return (PF_DROP);

without releasing `rules` (list of pf_rule_item)

The easiest way to fix it is to make sure pf_test_rule() releases `rules`
whenever pf_create_state() fails. Patches (-p, -u) are attached.

regards
sasha


? create_state.diff
? meleak.diff-p
? meleak.diff-u
? pbr-ipv6-reass.diff-p
? pbr-ipv6-reass.diff-u
? pf.c.diff
? pf.c.patch
? sa_family_t.diff
? sa_family_t.diff-p
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.924
diff -p -r1.924 pf.c
*** pf.c16 Jul 2015 21:14:21 -  1.924
--- pf.c16 Jul 2015 22:56:17 -
*** pf_test_rule(struct pf_pdesc *pd, struct
*** 3055,3060 
--- 3055,3061 
int  state_icmp = 0, icmp_dir = 0;
u_int16_tvirtual_type, virtual_id;
u_int8_t icmptype = 0, icmpcode = 0;
+   int  action = PF_DROP;
  
bzero(act, sizeof(act));
bzero(sns, sizeof(sns));
*** pf_test_rule(struct pf_pdesc *pd, struct
*** 3330,3336 
  
if (pd-virtual_proto != PF_VPROTO_FRAGMENT
 !state_icmp  r-keep_state) {
-   int action;
  
if (r-rule_flag  PFRULE_SRCTRACK 
pf_insert_src_node(sns[PF_SN_NONE], r, PF_SN_NONE, pd-af,
--- 3331,3336 
*** pf_test_rule(struct pf_pdesc *pd, struct
*** 3349,3355 
sm, tag, rules, act, sns);
  
if (action != PF_PASS)
!   return (action);
if (sks != skw) {
struct pf_state_key *sk;
  
--- 3349,3355 
sm, tag, rules, act, sns);
  
if (action != PF_PASS)
!   goto cleanup;
if (sks != skw) {
struct pf_state_key *sk;
  
*** cleanup:
*** 3407,3413 
pool_put(pf_rule_item_pl, ri);
}
  
!   return (PF_DROP);
  }
  
  static __inline int
--- 3407,3413 
pool_put(pf_rule_item_pl, ri);
}
  
!   return (action);
  }
  
  static __inline int
*** pf_create_state(struct pf_pdesc *pd, str
*** 3430,3436 
s-rule.ptr = r;
s-anchor.ptr = a;
s-natrule.ptr = nr;
-   memcpy(s-match_rules, rules, sizeof(s-match_rules));
if (r-allow_opts)
s-state_flags |= PFSTATE_ALLOWOPTS;
if (r-rule_flag  PFRULE_STATESLOPPY)
---

sa_family_t is not always equal to u_int8_t

2015-07-16 Thread Alexandr Nedvedicky

Hello,

We hit this problem while building PF on Solaris, where sizeof(sa_family_t) == 2.
The patch below fixes the problem for Solaris.

regards
sasha

cvs diff -p output:

--8--8--8--

Index: pfvar.h
===
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.416
diff -p -r1.416 pfvar.h
*** pfvar.h 16 Jul 2015 21:14:21 -  1.416
--- pfvar.h 16 Jul 2015 22:10:10 -
*** extern void  pf_print_flags(u_int8_t);
*** 1791,1797 
  extern struct ifnet   *sync_ifp;
  extern struct pf_rule  pf_default_rule;
  extern voidpf_addrcpy(struct pf_addr *, struct pf_addr *,
!   u_int8_t);
  void   pf_rm_rule(struct pf_rulequeue *,
struct pf_rule *);
  void   pf_purge_rule(struct pf_ruleset *,
--- 1791,1797 
  extern struct ifnet   *sync_ifp;
  extern struct pf_rule  pf_default_rule;
  extern voidpf_addrcpy(struct pf_addr *, struct pf_addr *,
!   sa_family_t);
  void   pf_rm_rule(struct pf_rulequeue *,
struct pf_rule *);
  void   pf_purge_rule(struct pf_ruleset *,
*** struct pf_os_fingerprint *
*** 1984,1990 
pf_osfp_validate(void);
  
  #ifdef _KERNEL
! void   pf_print_host(struct pf_addr *, u_int16_t, u_int8_t);
  
  intpf_get_transaddr(struct pf_rule *, struct pf_pdesc *,
struct pf_src_node **, struct pf_rule **);
--- 1984,1990 
pf_osfp_validate(void);
  
  #ifdef _KERNEL
! void   pf_print_host(struct pf_addr *, u_int16_t, 
sa_family_t);
  
  intpf_get_transaddr(struct pf_rule *, struct pf_pdesc *,
struct pf_src_node **, struct pf_rule **);

potential memory leak in DIOCADDRULE

2015-07-16 Thread Alexandr Nedvedicky

Hello,

PF can leak memory in the DIOCADDRULE code path if something goes wrong during
rule creation.

Our story begins when we do pf_rule_copyin():

1070 if ((error = pf_rule_copyin(pr-rule, rule, 
ruleset))) {
1071 pool_put(pf_rule_pl, rule);
1072 break;
1073 }
1074 rule-cuid = p-p_ucred-cr_ruid;


Note that pf_rule_copyin() takes a couple of references to various PF objects:

2362 if (pf_kif_setup(to-ifname, to-kif))
2363 return (EINVAL);
2364 if (pf_kif_setup(to-rcv_ifname, to-rcv_kif))
2365 return (EINVAL);
2366 if (to-overload_tblname[0]) {
2367 if ((to-overload_tbl = pfr_attach_table(ruleset,
2368 to-overload_tblname, 0)) == NULL)
2369 return (EINVAL);

Those references are never released if PF uses pool_put() to release the rule
at line 1071. We must tell PF to use pf_rm_rule(NULL, rule) instead. The patch is
further below.

regards
sasha

8---

Index: pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.285
diff -u -p -r1.285 pf_ioctl.c
--- pf_ioctl.c  11 Apr 2015 13:00:12 -  1.285
+++ pf_ioctl.c  16 Jul 2015 23:21:48 -
@@ -1068,7 +1068,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
break;
}
if ((error = pf_rule_copyin(pr-rule, rule, ruleset))) {
-   pool_put(pf_rule_pl, rule);
+   pf_rm_rule(NULL, rule);
+   rule = NULL;
break;
}
rule-cuid = p-p_ucred-cr_ruid;
@@ -1084,7 +1085,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
break;
 #endif /* INET6 */
default:
-   pool_put(pf_rule_pl, rule);
+   pf_rm_rule(NULL, rule);
+   rule = NULL;
error = EAFNOSUPPORT;
goto fail;
}

Re: sa_family_t is not always equal to u_int8_t

2015-07-16 Thread Alexandr Nedvedicky

On Thu, Jul 16, 2015 at 11:10:06PM +, Miod Vallat wrote:
  cvs diff -p output:
 
 Please send unified diffs (diff -u). The easiest way is to have a
   diff -up
 line in your ~/.cvsrc file. Or diff -uNp if you want cvs diff to show
 new files as well.
 
 Miod


Sorry, now I got it..
regards
sasha

Index: pfvar.h
===
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.416
diff -u -p -r1.416 pfvar.h
--- pfvar.h 16 Jul 2015 21:14:21 -  1.416
+++ pfvar.h 16 Jul 2015 23:39:21 -
@@ -1791,7 +1791,7 @@ extern void
pf_print_flags(u_int8_t);
 extern struct ifnet*sync_ifp;
 extern struct pf_rule   pf_default_rule;
 extern void pf_addrcpy(struct pf_addr *, struct pf_addr *,
-   u_int8_t);
+   sa_family_t);
 voidpf_rm_rule(struct pf_rulequeue *,
struct pf_rule *);
 voidpf_purge_rule(struct pf_ruleset *,
@@ -1984,7 +1984,7 @@ struct pf_os_fingerprint *
pf_osfp_validate(void);

 #ifdef _KERNEL
-voidpf_print_host(struct pf_addr *, u_int16_t, u_int8_t);
+voidpf_print_host(struct pf_addr *, u_int16_t, 
sa_family_t);

 int pf_get_transaddr(struct pf_rule *, struct pf_pdesc *,
struct pf_src_node **, struct pf_rule **);

Re: potential memory leak when pf_create_state() fails

2015-07-19 Thread Alexandr Nedvedicky

On Mon, Jul 20, 2015 at 04:27:45AM +0900, Ryan McBride wrote:
 ok mcbride@
 
err

I took a look at the patch one more time. I've realized PF must bind the rules
to the state before STATE_INC_COUNTERS() gets called. Not doing so makes PF
play games with dangling pointers from the state to the rules. The state still
references the rules via the match_rules list, but it has not actually bumped
states_cur for them.

From the caller's point of view, PF_SYNPROXY_DROP actually means success:
we have a state in the table, but we still want to drop the packet because we
are proxying the handshake.

updated patch is below.

sorry for extra round trip...
regards
sasha

-888---8

Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.931
diff -u -p -r1.931 pf.c
--- pf.c19 Jul 2015 05:48:11 -  1.931
+++ pf.c19 Jul 2015 21:08:31 -
@@ -3068,6 +3068,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
int  state_icmp = 0, icmp_dir = 0;
u_int16_tvirtual_type, virtual_id;
u_int8_t icmptype = 0, icmpcode = 0;
+   int  action = PF_DROP;
 
bzero(act, sizeof(act));
bzero(sns, sizeof(sns));
@@ -3351,7 +3352,6 @@ pf_test_rule(struct pf_pdesc *pd, struct
 
if (pd-virtual_proto != PF_VPROTO_FRAGMENT
 !state_icmp  r-keep_state) {
-   int action;
 
if (r-rule_flag  PFRULE_SRCTRACK 
pf_insert_src_node(sns[PF_SN_NONE], r, PF_SN_NONE, pd-af,
@@ -3370,7 +3370,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
sm, tag, rules, act, sns);
 
if (action != PF_PASS)
-   return (action);
+   goto cleanup;
if (sks != skw) {
struct pf_state_key *sk;
 
@@ -3428,7 +3428,7 @@ cleanup:
pool_put(pf_rule_item_pl, ri);
}
 
-   return (PF_DROP);
+   return (action);
 }
 
 static __inline int
@@ -3451,7 +3451,6 @@ pf_create_state(struct pf_pdesc *pd, str
s-rule.ptr = r;
s-anchor.ptr = a;
s-natrule.ptr = nr;
-   memcpy(s-match_rules, rules, sizeof(s-match_rules));
if (r-allow_opts)
s-state_flags |= PFSTATE_ALLOWOPTS;
if (r-rule_flag  PFRULE_STATESLOPPY)
@@ -3580,6 +3579,11 @@ pf_create_state(struct pf_pdesc *pd, str
} else
*sm = s;
 
+   /*
+* Make state responsible for rules it binds here.
+*/
+   memcpy(s-match_rules, rules, sizeof(s-match_rules));
+   bzero(rules, sizeof(*rules));
STATE_INC_COUNTERS(s);
 
if (tag  0) {

Re: preparing pfi_kif to MP world

2015-10-29 Thread Alexandr Nedvedicky

Hello,

On Fri, Oct 16, 2015 at 01:51:31PM +0200, Alexandr Nedvedicky wrote:
> On Fri, Oct 16, 2015 at 01:41:50PM +0200, Mike Belopuhov wrote:
> > On 16 October 2015 at 13:28, Alexandr Nedvedicky
> > <alexandr.nedvedi...@oracle.com> wrote:
> > >
> > > may be it's kind of bike shading...
> > > How about make kifs to stick to convention we see for other objects
> > > such as rulesets/anchors:
> > >
> > > pfi_kif_find()
> > > pfi_kif_find_or_create()
> > >
> > 
> > Personally I don't like "_or_create" style of function naming and
> > I would rather see those renamed to something else
> > 
> 

Just a flash of an idea before I fall asleep, so I'm using email to remember it...

we should eventually rename all pf_*_find_or_create() functions to
pf_*_create() and that's it, no change to current behavior.

regards
sasha

Patch 1/3 - make DIOCRADDADDRS to accept one IP address per ioctl() call

2015-10-28 Thread Alexandr Nedvedicky

Hello,

This is the first patch in a series of three. All patches modify the PF radix
table API so that the ioctl() functions accept one IP address per call.
The idea was proposed by Claudio at Varazdin.

I still have to untangle pfr_commit_ktable() and DIOCRSETADDRS ioctl.  Both
seem to be more complicated than DIOCRADDADDRS and DIOCRDELADDRS (subject of
next patch).

This patch changes the DIOCRADDADDRS ioctl to DIOCRADDADDR, which accepts only
one IP address per ioctl(2) call. The patch updates the kernel and pfctl(8) only.
Other components that happen to use DIOCRADDADDRS will be updated by a separate
patch.

thanks and
regards
sasha

P.S. I still need to update the pf(4) manpage; I'll do it as soon as we
reach agreement on the proposed patches.

8<---8<---8<--8<

Index: sbin/pfctl/pfctl_radix.c
===
RCS file: /cvs/src/sbin/pfctl/pfctl_radix.c,v
retrieving revision 1.32
diff -u -p -r1.32 pfctl_radix.c
--- sbin/pfctl/pfctl_radix.c21 Jan 2015 21:50:33 -  1.32
+++ sbin/pfctl/pfctl_radix.c27 Oct 2015 23:24:59 -
@@ -184,6 +184,7 @@ pfr_add_addrs(struct pfr_table *tbl, str
 int *nadd, int flags)
 {
struct pfioc_table io;
+   int i, rv, add = 0;
 
if (tbl == NULL || size < 0 || (size && addr == NULL)) {
errno = EINVAL;
@@ -192,14 +193,18 @@ pfr_add_addrs(struct pfr_table *tbl, str
bzero(, sizeof io);
io.pfrio_flags = flags;
io.pfrio_table = *tbl;
-   io.pfrio_buffer = addr;
io.pfrio_esize = sizeof(*addr);
-   io.pfrio_size = size;
-   if (ioctl(dev, DIOCRADDADDRS, ))
-   return (-1);
-   if (nadd != NULL)
-   *nadd = io.pfrio_nadd;
-   return (0);
+   io.pfrio_size = 1;  /* TODO: check .pfrio_size is needed */
+   for (i = 0; (i < size) && (rv == 0); i++) {
+   io.pfrio_buffer = addr++;
+   rv = ioctl(dev, DIOCRADDADDR, );
+   add++;
+   }
+
+   if ((rv == 0) && (nadd != NULL))
+   *nadd = add;
+
+   return (rv);
 }
 
 int
Index: sys/net/pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.291
diff -u -p -r1.291 pf_ioctl.c
--- sys/net/pf_ioctl.c  13 Oct 2015 19:32:31 -  1.291
+++ sys/net/pf_ioctl.c  27 Oct 2015 23:25:23 -
@@ -834,7 +834,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
case DIOCRGETTSTATS:
case DIOCRCLRTSTATS:
case DIOCRCLRADDRS:
-   case DIOCRADDADDRS:
+   case DIOCRADDADDR:
case DIOCRDELADDRS:
case DIOCRSETADDRS:
case DIOCRGETASTATS:
@@ -887,7 +887,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
case DIOCRDELTABLES:
case DIOCRCLRTSTATS:
case DIOCRCLRADDRS:
-   case DIOCRADDADDRS:
+   case DIOCRADDADDR:
case DIOCRDELADDRS:
case DIOCRSETADDRS:
case DIOCRSETTFLAGS:
@@ -1816,16 +1816,15 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
break;
}
 
-   case DIOCRADDADDRS: {
+   case DIOCRADDADDR: {
struct pfioc_table *io = (struct pfioc_table *)addr;
 
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
-   error = pfr_add_addrs(>pfrio_table, io->pfrio_buffer,
-   io->pfrio_size, >pfrio_nadd, io->pfrio_flags |
-   PFR_FLAG_USERIOCTL);
+   error = pfr_add_addr(>pfrio_table, io->pfrio_buffer,
+   io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
 
Index: sys/net/pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.115
diff -u -p -r1.115 pf_table.c
--- sys/net/pf_table.c  7 Oct 2015 11:57:44 -   1.115
+++ sys/net/pf_table.c  27 Oct 2015 23:25:26 -
@@ -266,6 +266,54 @@ pfr_clr_addrs(struct pfr_table *tbl, int
 }
 
 int
+pfr_add_addr(struct pfr_table *tbl, struct pfr_addr *addr, int size, int flags)
+{
+   struct pfr_ktable   *kt;
+   struct pfr_kentry   *p;
+   struct pfr_addr  ad;
+   int  rv;
+   time_t   tzero = time_second;
+
+   ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+   if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+   return (EINVAL);
+   kt = pfr_lookup_table(tbl);
+   if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+   return (ESRCH);
+   if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+   return (EPERM);
+   if (COPYIN(addr, , sizeof(ad), flags))
+

Patch 2/3 - make DIOCRDELADDRS to accept one IP address per ioctl() call

2015-10-28 Thread Alexandr Nedvedicky

Hello,

this is the second patch in 3-patch series.

This patch changes the DIOCRDELADDRS ioctl to DIOCRDELADDR, which accepts only
one IP address per ioctl(2) call. The patch updates the kernel and pfctl(8) only.
Other components that happen to use DIOCRDELADDRS will be updated by a separate
patch.

thanks and
regards
sasha

8<---8<---8<--8<

Index: sbin/pfctl/pfctl_radix.c
===
RCS file: /cvs/src/sbin/pfctl/pfctl_radix.c,v
retrieving revision 1.32
diff -u -p -r1.32 pfctl_radix.c
--- sbin/pfctl/pfctl_radix.c21 Jan 2015 21:50:33 -  1.32
+++ sbin/pfctl/pfctl_radix.c27 Oct 2015 22:56:54 -
@@ -207,6 +207,7 @@ pfr_del_addrs(struct pfr_table *tbl, str
 int *ndel, int flags)
 {
struct pfioc_table io;
+   int i, rv, del = 0;
 
if (tbl == NULL || size < 0 || (size && addr == NULL)) {
errno = EINVAL;
@@ -215,14 +216,18 @@ pfr_del_addrs(struct pfr_table *tbl, str
bzero(, sizeof io);
io.pfrio_flags = flags;
io.pfrio_table = *tbl;
-   io.pfrio_buffer = addr;
io.pfrio_esize = sizeof(*addr);
-   io.pfrio_size = size;
-   if (ioctl(dev, DIOCRDELADDRS, ))
-   return (-1);
-   if (ndel != NULL)
-   *ndel = io.pfrio_ndel;
-   return (0);
+   io.pfrio_size = 1;
+   for (i = 0; (i < size) && (rv == 0); i++) {
+   io.pfrio_buffer = addr++;
+   rv = ioctl(dev, DIOCRDELADDR, );
+   del++;
+   }
+
+   if ((rv == 0) && (ndel != NULL))
+   *ndel = del;
+
+   return (rv);
 }
 
 int
Index: sys/net/pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.291
diff -u -p -r1.291 pf_ioctl.c
--- sys/net/pf_ioctl.c  13 Oct 2015 19:32:31 -  1.291
+++ sys/net/pf_ioctl.c  27 Oct 2015 22:57:20 -
@@ -835,7 +835,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
case DIOCRCLRTSTATS:
case DIOCRCLRADDRS:
case DIOCRADDADDRS:
-   case DIOCRDELADDRS:
+   case DIOCRDELADDR:
case DIOCRSETADDRS:
case DIOCRGETASTATS:
case DIOCRCLRASTATS:
@@ -888,7 +888,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
case DIOCRCLRTSTATS:
case DIOCRCLRADDRS:
case DIOCRADDADDRS:
-   case DIOCRDELADDRS:
+   case DIOCRDELADDR:
case DIOCRSETADDRS:
case DIOCRSETTFLAGS:
if (((struct pfioc_table *)addr)->pfrio_flags &
@@ -1829,16 +1829,15 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
break;
}
 
-   case DIOCRDELADDRS: {
+   case DIOCRDELADDR: {
struct pfioc_table *io = (struct pfioc_table *)addr;
 
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
-   error = pfr_del_addrs(>pfrio_table, io->pfrio_buffer,
-   io->pfrio_size, >pfrio_ndel, io->pfrio_flags |
-   PFR_FLAG_USERIOCTL);
+   error = pfr_del_addr(>pfrio_table, io->pfrio_buffer,
+   io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
 
Index: sys/net/pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.115
diff -u -p -r1.115 pf_table.c
--- sys/net/pf_table.c  7 Oct 2015 11:57:44 -   1.115
+++ sys/net/pf_table.c  27 Oct 2015 22:57:24 -
@@ -152,6 +152,8 @@ void pfr_destroy_kentries(struct 
pfr_
 voidpfr_destroy_kentry(struct pfr_kentry *);
 voidpfr_insert_kentries(struct pfr_ktable *,
struct pfr_kentryworkq *, time_t);
+voidpfr_remove_kentry(struct pfr_ktable *,
+   struct pfr_kentry *);
 voidpfr_remove_kentries(struct pfr_ktable *,
struct pfr_kentryworkq *);
 voidpfr_clstats_kentries(struct pfr_kentryworkq *, time_t,
@@ -343,14 +345,12 @@ _bad:
 }
 
 int
-pfr_del_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
-int *ndel, int flags)
+pfr_del_addr(struct pfr_table *tbl, struct pfr_addr *addr, int size, int flags)
 {
struct pfr_ktable   *kt;
-   struct pfr_kentryworkq   workq;
struct pfr_kentry   *p;
struct pfr_addr  ad;
-   int  i, rv, xdel = 0, log = 1;
+   int  rv;
 
ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
@@ -360,70 +360,29 @@

Patch 3/3 - update userland to reflect DIOCRADDADDRS/DIOCRDELADDRS changes

2015-10-28 Thread Alexandr Nedvedicky

Hello,

This is the third patch in the first batch of PF radix changes. It requires the
earlier patches to be in place; otherwise compilation will fail.

The patch updates various userland tools to match the new PF radix table changes:
s/DIOCRADDADDRS/DIOCRADDADDR
s/DIOCRDELADDRS/DIOCRDELADDR   
It is also no longer possible to pass more than one IP address per ioctl(2) call.
The patch updates these tools:
usr.sbin/authpf/authpf.c
usr.sbin/bgpd/pftable.c
usr.sbin/dhcpd/pfutils.c

thanks and
regards
sasha

8<---8<---8<--8<

Index: usr.sbin/authpf/authpf.c
===
RCS file: /cvs/src/usr.sbin/authpf/authpf.c,v
retrieving revision 1.123
diff -u -p -r1.123 authpf.c
--- usr.sbin/authpf/authpf.c21 Jan 2015 21:50:32 -  1.123
+++ usr.sbin/authpf/authpf.c27 Oct 2015 23:54:48 -
@@ -872,7 +872,7 @@ change_table(int add, const char *ipsrc)
return (-1);
}
 
-   if (ioctl(dev, add ? DIOCRADDADDRS : DIOCRDELADDRS, ) &&
+   if (ioctl(dev, add ? DIOCRADDADDR : DIOCRDELADDR, ) &&
errno != ESRCH) {
syslog(LOG_ERR, "cannot %s %s from table %s: %s",
add ? "add" : "remove", ipsrc, tablename,
Index: usr.sbin/bgpd/pftable.c
===
RCS file: /cvs/src/usr.sbin/bgpd/pftable.c,v
retrieving revision 1.8
diff -u -p -r1.8 pftable.c
--- usr.sbin/bgpd/pftable.c 21 Jan 2015 21:50:32 -  1.8
+++ usr.sbin/bgpd/pftable.c 27 Oct 2015 23:54:49 -
@@ -57,6 +57,8 @@ pftable_change(struct pf_table *pft)
 {
struct pfioc_table tio;
int ret;
+   int i;
+   struct pfr_addr *addr;
 
if (pft->naddrs == 0 || pft->what == 0)
return (0);
@@ -67,11 +69,15 @@ pftable_change(struct pf_table *pft)
bzero(, sizeof(tio));
strlcpy(tio.pfrio_table.pfrt_name, pft->name,
sizeof(tio.pfrio_table.pfrt_name));
-   tio.pfrio_buffer = pft->worklist;
tio.pfrio_esize = sizeof(*pft->worklist);
-   tio.pfrio_size = pft->naddrs;
+   tio.pfrio_size = 1;
 
ret = ioctl(devpf, pft->what, );
+   addr = pft->worklist;
+   for (i = 0; (i < pft->naddrs) && (ret == 0); i++) {
+   tio.pfrio_buffer = addr++;
+   ret = ioctl(devpf, pft->what, );
+   }
 
/* bad prefixes shouldn't cause us to die */
if (ret == -1) {
@@ -193,7 +199,7 @@ pftable_add_work(const char *table, stru
}
 
/* Only one type of work on the list at a time */
-   what = del ? DIOCRDELADDRS : DIOCRADDADDRS;
+   what = del ? DIOCRDELADDR : DIOCRADDADDR;
if (pft->naddrs != 0 && pft->what != what)
fatal("attempt to mix pf table additions/deletions");
 
Index: usr.sbin/dhcpd/pfutils.c
===
RCS file: /cvs/src/usr.sbin/dhcpd/pfutils.c,v
retrieving revision 1.13
diff -u -p -r1.13 pfutils.c
--- usr.sbin/dhcpd/pfutils.c5 Feb 2015 09:42:52 -   1.13
+++ usr.sbin/dhcpd/pfutils.c27 Oct 2015 23:54:51 -
@@ -154,7 +154,7 @@ pf_change_table(int fd, int op, struct i
addr.pfra_af = AF_INET;
addr.pfra_net = 32;
 
-   if (ioctl(fd, op ? DIOCRADDADDRS : DIOCRDELADDRS, ) &&
+   if (ioctl(fd, op ? DIOCRADDADDR : DIOCRDELADDR, ) &&
errno != ESRCH) {
warning( "DIOCR%sADDRS on table %s: %s",
op ? "ADD" : "DEL", table, strerror(errno));

Re: preparing pfi_kif to MP world

2015-10-28 Thread Alexandr Nedvedicky

Hello Mike,

just a quick question:

are you going to commit your pfi_kif_find() et. al.?
or more work is needed there?

thanks a lot
regards
sasha

> 
> Turns out this is a rather simple issue that got slightly
> complicated by the code diverging quite a bit since the
> inception.  Essentially the clr->ifname comes from the
> interface specification in the "pfctl -i foo0 -Fs" for
> if-bound states (floating states use fake interface "any").
> 
> Previously states have been hanging off of kif nodes but it's
> long gone and we can simply iterate over the state table tree
> (or even a state list like it's done in the DIOCGETSTATES in
> pf_ioctl).
> 
> Calling pf_kif_get here wouldn't be prudent because spawning
> new objects while disposing of the other ones seems somewhat
> counterproductive.
> 
diff --git sys/net/if_pfsync.c sys/net/if_pfsync.c
index 7d633db..fcaf5f5 100644
--- sys/net/if_pfsync.c
+++ sys/net/if_pfsync.c
@@ -752,46 +752,28 @@ done:
 
 int
 pfsync_in_clr(caddr_t buf, int len, int count, int flags)
 {
struct pfsync_clr *clr;
-   int i;
-
struct pf_state *st, *nexts;
-   struct pf_state_key *sk, *nextsk;
-   struct pf_state_item *si;
+   struct pfi_kif *kif = NULL;
u_int32_t creatorid;
+   int i;
 
for (i = 0; i < count; i++) {
clr = (struct pfsync_clr *)buf + len * i;
creatorid = clr->creatorid;
+   if (strlen(clr->ifname) &&
+   (kif = pfi_kif_find(clr->ifname)) == NULL)
+   continue;
 
-   if (clr->ifname[0] == '\0') {
-   for (st = RB_MIN(pf_state_tree_id, _id);
-   st; st = nexts) {
-   nexts = RB_NEXT(pf_state_tree_id, _id, st);
-   if (st->creatorid == creatorid) {
-   SET(st->state_flags, PFSTATE_NOSYNC);
-   pf_unlink_state(st);
-   }
-   }
-   } else {
-   if (pfi_kif_get(clr->ifname) == NULL)
-   continue;
-
-   /* XXX correct? */
-   for (sk = RB_MIN(pf_state_tree, _statetbl);
-   sk; sk = nextsk) {
-   nextsk = RB_NEXT(pf_state_tree,
-   _statetbl, sk);
-   TAILQ_FOREACH(si, >states, entry) {
-   if (si->s->creatorid == creatorid) {
-   SET(si->s->state_flags,
-   PFSTATE_NOSYNC);
-   pf_unlink_state(si->s);
-   }
-   }
+   for (st = RB_MIN(pf_state_tree_id, _id); st; st = nexts) {
+   nexts = RB_NEXT(pf_state_tree_id, _id, st);
+   if (st->creatorid == creatorid &&
+   ((kif && st->kif == kif) || !kif)) {
+   SET(st->state_flags, PFSTATE_NOSYNC);
+   pf_unlink_state(st);
}
}
}
 
return (0);
diff --git sys/net/pf_if.c sys/net/pf_if.c
index caaf9f9..bf77184 100644
--- sys/net/pf_if.c
+++ sys/net/pf_if.c
@@ -97,18 +97,25 @@ pfi_initialize(void)
if ((pfi_all = pfi_kif_get(IFG_ALL)) == NULL)
panic("pfi_kif_get for pfi_all failed");
 }
 
 struct pfi_kif *
-pfi_kif_get(const char *kif_name)
+pfi_kif_find(const char *kif_name)
 {
-   struct pfi_kif  *kif;
struct pfi_kif_cmp   s;
 
bzero(, sizeof(s));
strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name));
-   if ((kif = RB_FIND(pfi_ifhead, _ifs, (struct pfi_kif *))) != NULL)
+   return (RB_FIND(pfi_ifhead, _ifs, (struct pfi_kif *)));
+}
+
+struct pfi_kif *
+pfi_kif_get(const char *kif_name)
+{
+   struct pfi_kif  *kif;
+
+   if ((kif = pfi_kif_find(kif_name)))
return (kif);
 
/* create new one */
if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT|M_ZERO)) == NULL)
return (NULL);
diff --git sys/net/pfvar.h sys/net/pfvar.h
index cdb2f7f..76a98a9 100644
--- sys/net/pfvar.h
+++ sys/net/pfvar.h
@@ -1808,10 +1808,11 @@ int pfr_ina_define(struct pfr_table *, struct 
pfr_addr *, int, int *,
int *, u_int32_t, int);
 
 extern struct pfi_kif  *pfi_all;
 
 voidpfi_initialize(void);
+struct pfi_kif *pfi_kif_find(const char *);
 struct pfi_kif *pfi_kif_get(const char *);
 voidpfi_kif_ref(struct pfi_kif *, enum pfi_kif_refs);
 voidpfi_kif_unref(struct pfi_kif *, enum pfi_kif_refs);
 int pfi_kif_match(struct pfi_kif *, struct pfi_kif *);
 void

Re: Patch 2/3 - make DIOCRDELADDRS to accept one IP address per ioctl() call

2015-10-28 Thread Alexandr Nedvedicky

Hello,

> > Index: sbin/pfctl/pfctl_radix.c
> > +   io.pfrio_size = 1;
> 
> in 1/3 you have annotated like this
> 
> + io.pfrio_size = 1;  /* TODO: check .pfrio_size is needed */
> 

Sorry, this leaked out of my internal repo. The .pfrio_size member will be
dropped as soon as I'm done with the rest of the PF radix table ioctls.

thanks for catching that.

regards
sasha

patch saves some cycles by extending pfr_walktree() a bit

2015-10-28 Thread Alexandr Nedvedicky

Hello,

This is yet another patch that 'scratches the surface', this time in pf_table.c.
As briefly discussed in Varazdin, the plan is to clean up pf_table.c a bit to
make the implementation of reference handling and further MP work a bit easier.

I've noticed a sub-optimal way of processing table entries in two or three places.
The code typically looks as follows:

xxx pfr_enqueue_addrs(kt, , NULL, 0);
yyy SLIST_FOREACH(p, , pfrke_workq)
yyy pfr_ktable_winfo_update(kt, p);

At xxx we traverse the whole table using the underlying pfr_walktree() function,
which links the whole table into a singly linked list (a.k.a. work queue). At yyy
we walk through the whole list, applying pfr_ktable_winfo_update() to every address
(kentry). This requires 2 x n steps. The patch below improves that to n steps by
extending pfr_walktree() so that it can call the desired function itself.

Also the patch is part of my effort to kill work queues in radix tables.

thanks and
regards
sasha

8<---8<---8<--8<

Index: pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.115
diff -u -p -r1.115 pf_table.c
--- pf_table.c  7 Oct 2015 11:57:44 -   1.115
+++ pf_table.c  27 Oct 2015 21:40:18 -
@@ -107,7 +107,9 @@ struct pfr_walktree {
PFRW_GET_ADDRS,
PFRW_GET_ASTATS,
PFRW_POOL_GET,
-   PFRW_DYNADDR_UPDATE
+   PFRW_DYNADDR_UPDATE,
+   PFRW_WININFO_UPDATE,
+   PFRW_CLSTAT
}pfrw_op;
union {
struct pfr_addr *pfrw1_addr;
@@ -115,15 +117,22 @@ struct pfr_walktree {
struct pfr_kentryworkq  *pfrw1_workq;
struct pfr_kentry   *pfrw1_kentry;
struct pfi_dynaddr  *pfrw1_dyn;
+   struct {
+   time_t  tzero;
+   int negchange;
+   }pfrw1_clstat;
}pfrw_1;
int  pfrw_free;
int  pfrw_flags;
+   struct pfr_ktable   *pfrw_kt;
 };
 #define pfrw_addr  pfrw_1.pfrw1_addr
 #define pfrw_astatspfrw_1.pfrw1_astats
 #define pfrw_workq pfrw_1.pfrw1_workq
 #define pfrw_kentrypfrw_1.pfrw1_kentry
 #define pfrw_dyn   pfrw_1.pfrw1_dyn
+#definepfrw_tzero  pfrw_1.pfrw1_clstat.tzero
+#definepfrw_negchange  pfrw_1.pfrw1_clstat.negchange
 #define pfrw_cnt   pfrw_free
 
 #define senderr(e) do { rv = (e); goto _bad; } while (0)
@@ -156,6 +165,8 @@ void pfr_remove_kentries(struct 
pfr_k
struct pfr_kentryworkq *);
 voidpfr_clstats_kentries(struct pfr_kentryworkq *, time_t,
int);
+voidpfr_clstats_kentries_pfrw(struct pfr_ktable *, time_t,
+   int);
 voidpfr_reset_feedback(struct pfr_addr *, int, int);
 voidpfr_prepare_network(union sockaddr_union *, int, int);
 int pfr_route_kentry(struct pfr_ktable *,
@@ -181,6 +192,7 @@ int  pfr_ktable_compare(struct pfr_kta
struct pfr_ktable *);
 voidpfr_ktable_winfo_update(struct pfr_ktable *,
struct pfr_kentry *);
+voidpfr_ktable_winfo_update_sum(struct pfr_ktable *);
 struct pfr_ktable  *pfr_lookup_table(struct pfr_table *);
 voidpfr_clean_node_mask(struct pfr_ktable *,
struct pfr_kentryworkq *);
@@ -189,6 +201,8 @@ int  pfr_skip_table(struct pfr_table *
struct pfr_ktable *, int);
 struct pfr_kentry  *pfr_kentry_byidx(struct pfr_ktable *, int, int);
 int pfr_islinklocal(sa_family_t, struct pf_addr *);
+voidpfr_walk(struct pfr_ktable *, struct pfr_walktree *,
+   const char *);
 
 RB_PROTOTYPE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare);
 RB_GENERATE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare);
@@ -631,7 +645,6 @@ pfr_get_astats(struct pfr_table *tbl, st
 {
struct pfr_ktable   *kt;
struct pfr_walktree  w;
-   struct pfr_kentryworkq   workq;
int  rv;
time_t   tzero = time_second;
 
@@ -653,10 +666,8 @@ pfr_get_astats(struct pfr_table *tbl, st
rv = rn_walktree(kt->pfrkt_ip4, pfr_walktree, );
if (!rv)
rv = rn_walktree(kt->pfrkt_ip6, pfr_walktree, );
-   if (!rv && (flags & PFR_FLAG_CLSTATS)) {
-   pfr_enqueue_addrs(kt, , NULL, 0);
-   pfr_clstats_kentries(, tzero, 0);
-   }
+   if (!rv && (flags & PFR_FLAG_CLSTATS))
+   pfr_clstats_kentries_pfrw(kt, tzero,

Re: preparing pfi_kif to MP world

2015-10-29 Thread Alexandr Nedvedicky

On Thu, Oct 29, 2015 at 02:49:40AM +0100, Mike Belopuhov wrote:
> On 28 October 2015 at 18:41, Alexandr Nedvedicky
> <alexandr.nedvedi...@oracle.com> wrote:
> > Hello Mike,
> >
> > just a quick question:
> >
> > are you going to commit your pfi_kif_find() et. al.?
> > or more work is needed there?
> >
> 
> I need OKs
> 

OK for pf_if.c; I'm still not sure what's going on in if_pfsync.c.


thanks and
regards
sasha

Re: Patch 1/3 - make DIOCRADDADDRS to accept one IP address per ioctl() call

2015-11-09 Thread Alexandr Nedvedicky

Hello,

> On Wed, Oct 28, 2015 at 06:19:48PM +0100, Alexandr Nedvedicky wrote:
> > The idea has been proposed by Claudio at Varazdin.
> 
> I guess the idea is to eliminate the workq.  Or is ther naother
> reason to change it?

the primary goal is to kill work queues.

> 
> Comments inline
> 
thank you very much for very good code review.

> > -   *nadd = io.pfrio_nadd;
> > -   return (0);
> > +   io.pfrio_size = 1;  /* TODO: check .pfrio_size is needed */
> > +   for (i = 0; (i < size) && (rv == 0); i++) {
> 
> rv is unitialized in the first interation

it's fixed by change below:

@@ -184,7 +184,8 @@
 int *nadd, int flags)
 {
struct pfioc_table io;
-   int i, rv, add = 0;
+   int i, add = 0;
+   int rv = 0; 
 
> 
> > +   io.pfrio_buffer = addr++;
> > +   rv = ioctl(dev, DIOCRADDADDR, );
> 
> I would suggest to return (-1) if ioctl fails...
I'll write my response in email answering your note on 'illusion of atomicity'

> 
> pfr_add_addr() handles exactly 1 address, don't pass io->pfrio_size.
> 

yes that's true...

error = pfr_add_addr(>pfrio_table, io->pfrio_buffer,
-   io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+   io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;

> >  
> >  int
> > +pfr_add_addr(struct pfr_table *tbl, struct pfr_addr *addr, int size, int 
> > flags)
> 
> Do not pass size.

-pfr_add_addr(struct pfr_table *tbl, struct pfr_addr *addr, int size, int flags)
+pfr_add_addr(struct pfr_table *tbl, struct pfr_addr *addr, int flags)

> 
> Should you check for !(flags & PFR_FLAG_DUMMY) here?  It was done
> in the old code before pfr_insert_kentries().
> 
> > +   rv = pfr_insert_kentry(kt, , tzero);
> 
> The old code ignored wether pfr_insert_kentries() succeeded.
> 

thanks for catching this.

@@ -288,20 +288,20 @@
senderr(EINVAL);
p = pfr_lookup_addr(kt, , 1);
if (p == NULL) {
-   rv = pfr_insert_kentry(kt, , tzero);
+   if (!(flags & PFR_FLAG_DUMMY))
+   rv = pfr_insert_kentry(kt, , tzero);
+   else
+   rv = 0;
}

> 
> No { } for one line if block.

There is no longer a one-line if block after fixing the PFR_FLAG_DUMMY handling.

> 
> PFR_FB_DUPLICATE was used when there were two identical addresses
> in the passed list.  This cannot happen anymore.  It should be
> PFR_FB_NONE in this case.
> 
> > +   } else if (rv != 0) {
> > +   ad.pfra_fback = PFR_FB_NONE;
> > +   } else {
> > +   ad.pfra_fback = PFR_FB_ADDED;
> > +   }
> 
> Perhaps write this block as
> 
>   if (p == NULL)
>   ad.pfra_fback = PFR_FB_ADDED;
>   else if ((p->pfrke_flags & PFRKE_FLAG_NOT) != ad.pfra_not))
>   ad.pfra_fback = PFR_FB_CONFLICT;
>   else
>   ad.pfra_fback = PFR_FB_NONE;
> 

I think we must still check the rv coming from pfr_insert_kentry():

if (flags & PFR_FLAG_FEEDBACK) {
-   if (p != NULL) {
-   if ((p->pfrke_flags & PFRKE_FLAG_NOT) != ad.pfra_not)
-   ad.pfra_fback = PFR_FB_CONFLICT;
-   else
-   ad.pfra_fback = PFR_FB_DUPLICATE;
-   } else if (rv != 0) {
+   if (p == NULL)
+   ad.pfra_fback = (rv == 0) ? PFR_FB_ADDED : PFR_FB_NONE;
+   else if ((p->pfrke_flags & PFRKE_FLAG_NOT) != ad.pfra_not)
+   ad.pfra_fback = PFR_FB_CONFLICT;
+   else
ad.pfra_fback = PFR_FB_NONE;
-   } else {
-   ad.pfra_fback = PFR_FB_ADDED;
-   }

 
> > +_bad:
> > +   if (flags & PFR_FLAG_FEEDBACK)
> > +   pfr_reset_feedback(addr, size, flags);
> 
> Don't use size, it must be 1.

the size argument got dropped.

> 
> > +   return (rv);
> 
> rv may be unitialized

thanks for catching that.

> 
> pfr_add_addrs() is not used anymore, remove it.

pfr_add_addrs() is gone in new patch.

thanks and
regards
sasha

8<---8<-8<

Index: sbin/pfctl/pfctl_radix.c
===
RCS file: /cvs/src/sbin/pfctl/pfctl_radix.c,v
retrieving revision 1.32
diff -u -p -r1.32 pfctl_radix.c
--- sbin/pfctl/pfctl_radix.c21 Jan 2015 21:50:33 -  1.32
+++ sbin/pfctl/pfctl_radix.c9 Nov 2015 20:43:52 -
@@ -184,6 +184,8 @@ pfr_add_addrs(struct p

Re: Patch 1/3 - make DIOCRADDADDRS to accept on IP address per ioctl() call

2015-11-09 Thread Alexandr Nedvedicky

On Sun, Nov 08, 2015 at 01:18:22PM +0100, Alexander Bluhm wrote:
> On Sun, Nov 08, 2015 at 02:37:58AM +0100, Alexander Bluhm wrote:
> > > + for (i = 0; (i < size) && (rv == 0); i++) {
> > 
> > rv is uninitialized in the first iteration
> > 
> > > + io.pfrio_buffer = addr++;
> > > + rv = ioctl(dev, DIOCRADDADDR, );
> > 
> > I would suggest to return (-1) if ioctl fails...
> > 
> > > + add++;
> > > + }
> 
> To keep the illusion of an atomic operation, we could remove the
> addresses we just added before the one add failed.
> 

actually pfctl_radix.c is just the tip of the iceberg; there are other tools besides
pfctl which manipulate PF tables:
authpf
bgpd
pfutils

The more I think about s/DIOCRADDADDRS/DIOCRADDADDR the less I like it.  I
feel good about s/pfr_add_addrs/pfr_add_addr. The pfr_add_addr() should be a
back end for the DIOCRADDADDRS ioctl operation, which I think should come back.  The
ioctl in the kernel will iterate over the array of addresses coming from userland.
That seems to me to be the more convenient approach. I'm working on a prototype; I
hope I'll send updated patches soon.

thanks and
regards
sasha

Re: patch saves some cycles by extending pfr_walktree() a bit

2015-11-09 Thread Alexandr Nedvedicky

Hello,

> 
> For now the code grows, as we have two ways to iterate over the
> tree.  When you remove the additional work queues I expect many -
> diffs.  So if this code duplication is temporary, this approach is
> fine for me.

yes code duplication is temporary, I'd like to kill work queues.

> >  #define pfrw_dyn   pfrw_1.pfrw1_dyn
> > +#definepfrw_tzero  pfrw_1.pfrw1_clstat.tzero
> > +#definepfrw_negchange  pfrw_1.pfrw1_clstat.negchange
> >  #define pfrw_cnt   pfrw_free
> 
> Use space instead of tab after the define.  Otherwise it is inconsistent
> and the diff looks ugly.
thanks for catching that.

I will postpone the commit for a few more days, just to give other folks a chance
to give OK / !OK

I also would like to finish prototype of pfr_set_addrs(), which still prevents
me from removing work queues in my OpenBSD branch.

thanks and
regards
sasha

8<---8<---8<--8<
Index: pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.116
diff -u -p -r1.116 pf_table.c
--- pf_table.c  3 Nov 2015 22:10:33 -   1.116
+++ pf_table.c  9 Nov 2015 19:24:52 -
@@ -107,7 +107,9 @@ struct pfr_walktree {
PFRW_GET_ADDRS,
PFRW_GET_ASTATS,
PFRW_POOL_GET,
-   PFRW_DYNADDR_UPDATE
+   PFRW_DYNADDR_UPDATE,
+   PFRW_WININFO_UPDATE,
+   PFRW_CLSTAT
}pfrw_op;
union {
struct pfr_addr *pfrw1_addr;
@@ -115,15 +117,22 @@ struct pfr_walktree {
struct pfr_kentryworkq  *pfrw1_workq;
struct pfr_kentry   *pfrw1_kentry;
struct pfi_dynaddr  *pfrw1_dyn;
+   struct {
+   time_t  tzero;
+   int negchange;
+   }pfrw1_clstat;
}pfrw_1;
int  pfrw_free;
int  pfrw_flags;
+   struct pfr_ktable   *pfrw_kt;
 };
 #define pfrw_addr  pfrw_1.pfrw1_addr
 #define pfrw_astatspfrw_1.pfrw1_astats
 #define pfrw_workq pfrw_1.pfrw1_workq
 #define pfrw_kentrypfrw_1.pfrw1_kentry
 #define pfrw_dyn   pfrw_1.pfrw1_dyn
+#define pfrw_tzero pfrw_1.pfrw1_clstat.tzero
+#define pfrw_negchange pfrw_1.pfrw1_clstat.negchange
 #define pfrw_cnt   pfrw_free
 
 #define senderr(e) do { rv = (e); goto _bad; } while (0)
@@ -156,6 +165,8 @@ void pfr_remove_kentries(struct 
pfr_k
struct pfr_kentryworkq *);
 voidpfr_clstats_kentries(struct pfr_kentryworkq *, time_t,
int);
+voidpfr_clstats_kentries_pfrw(struct pfr_ktable *, time_t,
+   int);
 voidpfr_reset_feedback(struct pfr_addr *, int, int);
 voidpfr_prepare_network(union sockaddr_union *, int, int);
 int pfr_route_kentry(struct pfr_ktable *,
@@ -181,6 +192,7 @@ int  pfr_ktable_compare(struct pfr_kta
struct pfr_ktable *);
 voidpfr_ktable_winfo_update(struct pfr_ktable *,
struct pfr_kentry *);
+voidpfr_ktable_winfo_update_sum(struct pfr_ktable *);
 struct pfr_ktable  *pfr_lookup_table(struct pfr_table *);
 voidpfr_clean_node_mask(struct pfr_ktable *,
struct pfr_kentryworkq *);
@@ -189,6 +201,8 @@ int  pfr_skip_table(struct pfr_table *
struct pfr_ktable *, int);
 struct pfr_kentry  *pfr_kentry_byidx(struct pfr_ktable *, int, int);
 int pfr_islinklocal(sa_family_t, struct pf_addr *);
+voidpfr_walk(struct pfr_ktable *, struct pfr_walktree *,
+   const char *);
 
 RB_PROTOTYPE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare);
 RB_GENERATE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare);
@@ -631,7 +645,6 @@ pfr_get_astats(struct pfr_table *tbl, st
 {
struct pfr_ktable   *kt;
struct pfr_walktree  w;
-   struct pfr_kentryworkq   workq;
int  rv;
time_t   tzero = time_second;
 
@@ -653,10 +666,8 @@ pfr_get_astats(struct pfr_table *tbl, st
rv = rn_walktree(kt->pfrkt_ip4, pfr_walktree, );
if (!rv)
rv = rn_walktree(kt->pfrkt_ip6, pfr_walktree, );
-   if (!rv && (flags & PFR_FLAG_CLSTATS)) {
-   pfr_enqueue_addrs(kt, , NULL, 0);
-   pfr_clstats_kentries(, tzero, 0);
-   }
+   if (!rv && (flags & PFR_FLAG_CLSTATS))
+   pfr_clstats_kentries_pfrw(kt, tzero, 0);
if (rv)
return (rv);
 
@@ -954,7 +965,6 @@ pfr_remove_kentries(struct pfr_ktable *k
 struct

Re: Patch 1/3 - make DIOCRADDADDRS to accept on IP address per ioctl() call

2015-11-09 Thread Alexandr Nedvedicky

> 
> I'm wondering - how does it affect tools that load several thousands of IPs
> into a table?  Like spamd, bgpd (for spam lists etc.), or pfctl for IP black
> lists (as distributed by ET).
> 
> There are valid use cases with HUGE tables, but I have to admit that I didn't
> test your diff yet. Just a concern that loading IPs one after another might
> take forever.
> 

I could measure no difference on sample of 1 unique IPv4 addresses.
Both (pfr_add_addrs/pfr_add_addr) could load them within 1sec.

pfr_add_addrs:
# wc -l test.table.pf ; date ; pfctl -t test -T add -f test.table.pf ; date
10 test.table.pf
Mon Nov  9 18:21:18 CET 2015
1 table created.
10/10 addresses added.
Mon Nov  9 18:21:19 CET 2015


pfr_add_addr:
Mon Nov  9 18:31:27 CET 2015
# wc -l test.table.pf ; date ; pfctl -t test -T add -f test.table.pf ; date
10 test.table.pf
Mon Nov  9 18:31:27 CET 2015
1 table created.  10/10 addresses added.
Mon Nov  9 18:31:28 CET 2015

My test machine is Toshiba Tecra with Centrino 2.

regards
sasha

Re: patch - potential use-after-free pfr_set_addrs()

2015-11-03 Thread Alexandr Nedvedicky

On Tue, Nov 03, 2015 at 10:09:49PM +0100, Alexander Bluhm wrote:
> On Tue, Nov 03, 2015 at 09:40:38PM +0100, Alexandr Nedvedicky wrote:
> > I think the
> > right thing is to use goto _skip; in that branch to avoid 499 et. al.
> > completely.
> 
> Yes
> 
> > @@ -491,6 +491,7 @@ pfr_set_addrs(struct pfr_table *tbl, str
> > if (pfr_route_kentry(tmpkt, p)) {
> > pfr_destroy_kentry(p);
> > ad.pfra_fback = PFR_FB_NONE;
> > +   goto _skip;
> > } else {
> > SLIST_INSERT_HEAD(, p, pfrke_workq);
> > ad.pfra_fback = PFR_FB_ADDED;
> 
> I would prefer not to have an else block after a goto.  You can
> remove the else and move the tree following lines one tab to the
> left.

makes sense to me. So this is the patch I'm going to commit.
thanks and
regards
sasha

8<---8<-8<
Index: pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.115
diff -u -p -r1.115 pf_table.c
--- pf_table.c  7 Oct 2015 11:57:44 -   1.115
+++ pf_table.c  3 Nov 2015 21:23:49 -
@@ -491,11 +491,11 @@ pfr_set_addrs(struct pfr_table *tbl, str
if (pfr_route_kentry(tmpkt, p)) {
pfr_destroy_kentry(p);
ad.pfra_fback = PFR_FB_NONE;
-   } else {
-   SLIST_INSERT_HEAD(, p, pfrke_workq);
-   ad.pfra_fback = PFR_FB_ADDED;
-   xadd++;
+   goto _skip;
}
+   SLIST_INSERT_HEAD(, p, pfrke_workq);
+   ad.pfra_fback = PFR_FB_ADDED;
+   xadd++;
if (p->pfrke_type == PFRKE_COST)
kt->pfrkt_refcntcost++;
pfr_ktable_winfo_update(kt, p);

patch - potential use-after-free pfr_set_addrs()

2015-11-03 Thread Alexandr Nedvedicky

Hello,

Patch fixes potential use-after-free in pf_table.c:pfr_set_addrs():

 463 for (i = 0; i < size; i++) {
 ...
 483 q = pfr_lookup_addr(tmpkt, , 1);
 484 if (q != NULL) {
 485 ad.pfra_fback = PFR_FB_DUPLICATE;
 486 goto _skip;
 487 }
 488 p = pfr_create_kentry();
 489 if (p == NULL)
 490 senderr(ENOMEM);
 491 if (pfr_route_kentry(tmpkt, p)) {
 492 pfr_destroy_kentry(p);
 493 ad.pfra_fback = PFR_FB_NONE;
 494 } else {
 495 SLIST_INSERT_HEAD(, p, pfrke_workq);
 496 ad.pfra_fback = PFR_FB_ADDED;
 497 xadd++;
 498 }
 499 if (p->pfrke_type == PFRKE_COST)
 500 kt->pfrkt_refcntcost++;
 501 pfr_ktable_winfo_update(kt, p);
 502 }
 503 _skip:
 504 if (flags & PFR_FLAG_FEEDBACK)
 505 if (COPYOUT(, addr+i, sizeof(ad), flags))
 506 senderr(EFAULT);
 507 }


Things start to go downhill if PF takes the branch at 492, where `p` gets
freed. The code then continues to 499 and 501, where PF dereferences the dead
pointer. I think the right thing is to use goto _skip; in that branch to avoid
499 et al. completely.

patch is below.

OK?

thanks and
regards
sasha

8<---8<---8<--8<

Index: pf_table.c
===
RCS file: /cvs/src/sys/net/pf_table.c,v
retrieving revision 1.115
diff -u -p -r1.115 pf_table.c
--- pf_table.c  7 Oct 2015 11:57:44 -   1.115
+++ pf_table.c  3 Nov 2015 20:15:26 -
@@ -491,6 +491,7 @@ pfr_set_addrs(struct pfr_table *tbl, str
if (pfr_route_kentry(tmpkt, p)) {
pfr_destroy_kentry(p);
ad.pfra_fback = PFR_FB_NONE;
+   goto _skip;
} else {
SLIST_INSERT_HEAD(, p, pfrke_workq);
ad.pfra_fback = PFR_FB_ADDED;

Re: preparing pfi_kif to MP world

2015-10-16 Thread Alexandr Nedvedicky

On Fri, Oct 16, 2015 at 01:41:50PM +0200, Mike Belopuhov wrote:
> On 16 October 2015 at 13:28, Alexandr Nedvedicky
> <alexandr.nedvedi...@oracle.com> wrote:
> >
> > maybe it's kind of bikeshedding...
> > How about making kifs stick to the convention we see for other objects
> > such as rulesets/anchors:
> >
> > pfi_kif_find()
> > pfi_kif_find_or_create()
> >
> 
> Personally I don't like "_or_create" style of function naming and
> I would rather see those renamed to something else
> 

yes, '... naming is hard'. another option would be to
keep _find() around and add an argument - a create flag, which orders create if
it does not exist. Then all _or_create() can get traded for create flag.
I really don't care that much for now...

I agree we should stick to your patch.

I'm OK with your changes.

sasha

Re: preparing pfi_kif to MP world

2015-10-16 Thread Alexandr Nedvedicky

> 
> Turns out this is a rather simple issue that got slightly
> complicated by the code diverging quite a bit since the
> inception.  Essentially the clr->ifname comes from the
> interface specification in the "pfctl -i foo0 -Fs" for
> if-bound states (floating states use fake interface "any").
> 
> Previously states have been hanging off of kif nodes but it's
> long gone and we can simply iterate over the state table tree
> (or even a state list like it's done in the DIOCGETSTATES in
> pf_ioctl).
> 
> Calling pf_kif_get here wouldn't be prudent because spawning
> new objects while disposing of the other ones seems somewhat
> counterproductive.
> 
> diff --git sys/net/if_pfsync.c sys/net/if_pfsync.c
> index 7d633db..fcaf5f5 100644
> --- sys/net/if_pfsync.c
> +++ sys/net/if_pfsync.c
> @@ -752,46 +752,28 @@ done:
>  
>  int
>  pfsync_in_clr(caddr_t buf, int len, int count, int flags)
>  {
>   struct pfsync_clr *clr;
> - int i;
> -
>   struct pf_state *st, *nexts;
> - struct pf_state_key *sk, *nextsk;
> - struct pf_state_item *si;
> + struct pfi_kif *kif = NULL;
>   u_int32_t creatorid;
> + int i;
>  
>   for (i = 0; i < count; i++) {
>   clr = (struct pfsync_clr *)buf + len * i;
>   creatorid = clr->creatorid;
> + if (strlen(clr->ifname) &&
> + (kif = pfi_kif_find(clr->ifname)) == NULL)
> + continue;
>  
> - if (clr->ifname[0] == '\0') {
> - for (st = RB_MIN(pf_state_tree_id, _id);
> - st; st = nexts) {
> - nexts = RB_NEXT(pf_state_tree_id, _id, st);
> - if (st->creatorid == creatorid) {
> - SET(st->state_flags, PFSTATE_NOSYNC);
> - pf_unlink_state(st);
> - }
> - }
> - } else {
> - if (pfi_kif_get(clr->ifname) == NULL)
> - continue;
> -
> - /* XXX correct? */
> - for (sk = RB_MIN(pf_state_tree, _statetbl);
> - sk; sk = nextsk) {
> - nextsk = RB_NEXT(pf_state_tree,
> - _statetbl, sk);
> - TAILQ_FOREACH(si, >states, entry) {
> - if (si->s->creatorid == creatorid) {
> - SET(si->s->state_flags,
> - PFSTATE_NOSYNC);
> - pf_unlink_state(si->s);
> - }
> - }
> + for (st = RB_MIN(pf_state_tree_id, _id); st; st = nexts) {
> + nexts = RB_NEXT(pf_state_tree_id, _id, st);
> + if (st->creatorid == creatorid &&
> + ((kif && st->kif == kif) || !kif)) {
> + SET(st->state_flags, PFSTATE_NOSYNC);
> + pf_unlink_state(st);
>   }
>   }
>   }
>  
>   return (0);
> diff --git sys/net/pf_if.c sys/net/pf_if.c
> index caaf9f9..bf77184 100644
> --- sys/net/pf_if.c
> +++ sys/net/pf_if.c
> @@ -97,18 +97,25 @@ pfi_initialize(void)
>   if ((pfi_all = pfi_kif_get(IFG_ALL)) == NULL)
>   panic("pfi_kif_get for pfi_all failed");
>  }
>  
>  struct pfi_kif *
> -pfi_kif_get(const char *kif_name)
> +pfi_kif_find(const char *kif_name)
>  {
> - struct pfi_kif  *kif;
>   struct pfi_kif_cmp   s;
>  
>   bzero(, sizeof(s));
>   strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name));
> - if ((kif = RB_FIND(pfi_ifhead, _ifs, (struct pfi_kif *))) != NULL)
> + return (RB_FIND(pfi_ifhead, _ifs, (struct pfi_kif *)));
> +}
> +
> +struct pfi_kif *
> +pfi_kif_get(const char *kif_name)

maybe it's kind of bikeshedding...
How about making kifs stick to the convention we see for other objects
such as rulesets/anchors:

pfi_kif_find()
pfi_kif_find_or_create()

and kill pfi_kif_get() completely, just to avoid confusion/surprise.

everything else makes sense to me.

regards
sasha

patch for two nits around pf_insert_src_node() et. al.

2015-10-10 Thread Alexandr Nedvedicky

Hello,

Patch fixes two small nits related to source node table in PF (a.k.a.
pf_src_tree_tracking).

The first issue comes to `global` argument of pf_insert_src_node(). It is
always 0 everywhere in source code. The `global` is supposed to indicate
whether particular state is bound to global/main rule or to rule created on
behalf of admin. However in reality PF always uses 0 for `global` everywhere.

The second issue is related to the pf_remove_src_node() function, which refuses
to remove a source node from the table (pf_src_tree_tracking) if the node to be
removed is not bound to a rule (sn->rule.ptr == NULL). Such a node would hang in
the tree forever. I think we never hit this problem, since a source node is
always bound to a rule.

OK?

thanks and
regards
sasha

8<---8<---8<--8<

Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.946
diff -u -p -r1.946 pf.c
--- pf.c8 Oct 2015 11:36:51 -   1.946
+++ pf.c10 Oct 2015 16:49:40 -
@@ -501,7 +501,7 @@ pf_src_connlimit(struct pf_state **state
 int
 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
 enum pf_sn_types type, sa_family_t af, struct pf_addr *src,
-struct pf_addr *raddr, int global)
+struct pf_addr *raddr)
 {
struct pf_src_node  k;
 
@@ -509,10 +509,7 @@ pf_insert_src_node(struct pf_src_node **
k.af = af;
k.type = type;
PF_ACPY(, src, af);
-   if (global)
-   k.rule.ptr = NULL;
-   else
-   k.rule.ptr = rule;
+   k.rule.ptr = rule;
pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
*sn = RB_FIND(pf_src_tree, _src_tracking, );
}
@@ -531,10 +528,7 @@ pf_insert_src_node(struct pf_src_node **
 
(*sn)->type = type;
(*sn)->af = af;
-   if (global)
-   (*sn)->rule.ptr = NULL;
-   else
-   (*sn)->rule.ptr = rule;
+   (*sn)->rule.ptr = rule;
PF_ACPY(&(*sn)->addr, src, af);
if (raddr)
PF_ACPY(&(*sn)->raddr, raddr, af);
@@ -570,16 +564,14 @@ pf_remove_src_node(struct pf_src_node *s
if (sn->states > 0 || sn->expire > time_uptime)
return;
 
-   if (sn->rule.ptr != NULL) {
-   sn->rule.ptr->src_nodes--;
-   if (sn->rule.ptr->states_cur == 0 &&
-   sn->rule.ptr->src_nodes == 0)
-   pf_rm_rule(NULL, sn->rule.ptr);
-   RB_REMOVE(pf_src_tree, _src_tracking, sn);
-   pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
-   pf_status.src_nodes--;
-   pool_put(_src_tree_pl, sn);
-   }
+   sn->rule.ptr->src_nodes--;
+   if (sn->rule.ptr->states_cur == 0 &&
+   sn->rule.ptr->src_nodes == 0)
+   pf_rm_rule(NULL, sn->rule.ptr);
+   RB_REMOVE(pf_src_tree, _src_tracking, sn);
+   pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
+   pf_status.src_nodes--;
+   pool_put(_src_tree_pl, sn);
 }
 
 struct pf_src_node *
@@ -3381,7 +3373,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
 
if (r->rule_flag & PFRULE_SRCTRACK &&
pf_insert_src_node([PF_SN_NONE], r, PF_SN_NONE, pd->af,
-   pd->src, NULL, 0) != 0) {
+   pd->src, NULL) != 0) {
REASON_SET(, PFRES_SRCLIMIT);
goto cleanup;
}
Index: pf_lb.c
===
RCS file: /cvs/src/sys/net/pf_lb.c,v
retrieving revision 1.49
diff -u -p -r1.49 pf_lb.c
--- pf_lb.c 3 Aug 2015 13:33:12 -   1.49
+++ pf_lb.c 10 Oct 2015 16:49:41 -
@@ -621,8 +621,7 @@ pf_map_addr(sa_family_t af, struct pf_ru
pf_remove_src_node(sns[type]);
sns[type] = NULL;
}
-   if (pf_insert_src_node([type], r, type, af, saddr, naddr,
-   0))
+   if (pf_insert_src_node([type], r, type, af, saddr, naddr))
return (1);
}
 
Index: pfvar.h
===
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.420
diff -u -p -r1.420 pfvar.h
--- pfvar.h 19 Aug 2015 21:22:41 -  1.420
+++ pfvar.h 10 Oct 2015 16:49:43 -
@@ -1681,7 +1681,7 @@ extern int pf_state_insert(struct 
pfi
 int pf_insert_src_node(struct pf_src_node **,
struct pf_rule *, enum pf_sn_types,
sa_family_t, struct pf_addr *,
-   struct pf_addr *, int);
+

Re: patch for two nits around pf_insert_src_node() et. al.

2015-10-12 Thread Alexandr Nedvedicky

Hello,

The updated patch addresses additional nit found by mpi:
> > Here can't you also change:
> > 
> >   if ((*sn)->rule.ptr != NULL)
> >   (*sn)->rule.ptr->src_nodes++;
> > 
> > into:
> > 
> > (*sn)->rule.ptr->src_nodes++;

> > 
> > I don't know enough to say if it's correct or not, but I'd suggest
> > sending another diff for that dealing with all the NULL checks :)
> > What about pf_state_export() for example?
> >
> 
> I think you might be confusing state rule pointers and source node
> rule pointers.  I think Sasha has got all of the latter ones (albeit
> with your correction), but on the other hand I would love to know
> how st->rule.ptr can be NULL in the pf_state_export as pf_rm_rule is
> not supposed to remove a rule with active states.

thanks Mike, I was looking at if_pfsync.c and related stuff in PF.  It looks
like the source node table is not updated at all by PF-sync. I'm still not sure
about the rules themselves, but it seems to me there are other means than PF-sync
to sync the rules between the firewall nodes. I could not spot anything related
to 'rule transfer' in the if_pfsync.c/pf sources.

thanks and
regards
sasha

8<---8<-8<
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.946
diff -u -p -r1.946 pf.c
--- pf.c8 Oct 2015 11:36:51 -   1.946
+++ pf.c12 Oct 2015 15:52:47 -
@@ -501,7 +501,7 @@ pf_src_connlimit(struct pf_state **state
 int
 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
 enum pf_sn_types type, sa_family_t af, struct pf_addr *src,
-struct pf_addr *raddr, int global)
+struct pf_addr *raddr)
 {
struct pf_src_node  k;
 
@@ -509,10 +509,7 @@ pf_insert_src_node(struct pf_src_node **
k.af = af;
k.type = type;
PF_ACPY(, src, af);
-   if (global)
-   k.rule.ptr = NULL;
-   else
-   k.rule.ptr = rule;
+   k.rule.ptr = rule;
pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
*sn = RB_FIND(pf_src_tree, _src_tracking, );
}
@@ -531,10 +528,7 @@ pf_insert_src_node(struct pf_src_node **
 
(*sn)->type = type;
(*sn)->af = af;
-   if (global)
-   (*sn)->rule.ptr = NULL;
-   else
-   (*sn)->rule.ptr = rule;
+   (*sn)->rule.ptr = rule;
PF_ACPY(&(*sn)->addr, src, af);
if (raddr)
PF_ACPY(&(*sn)->raddr, raddr, af);
@@ -550,8 +544,7 @@ pf_insert_src_node(struct pf_src_node **
return (-1);
}
(*sn)->creation = time_uptime;
-   if ((*sn)->rule.ptr != NULL)
-   (*sn)->rule.ptr->src_nodes++;
+   (*sn)->rule.ptr->src_nodes++;
pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
pf_status.src_nodes++;
} else {
@@ -570,16 +563,14 @@ pf_remove_src_node(struct pf_src_node *s
if (sn->states > 0 || sn->expire > time_uptime)
return;
 
-   if (sn->rule.ptr != NULL) {
-   sn->rule.ptr->src_nodes--;
-   if (sn->rule.ptr->states_cur == 0 &&
-   sn->rule.ptr->src_nodes == 0)
-   pf_rm_rule(NULL, sn->rule.ptr);
-   RB_REMOVE(pf_src_tree, _src_tracking, sn);
-   pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
-   pf_status.src_nodes--;
-   pool_put(_src_tree_pl, sn);
-   }
+   sn->rule.ptr->src_nodes--;
+   if (sn->rule.ptr->states_cur == 0 &&
+   sn->rule.ptr->src_nodes == 0)
+   pf_rm_rule(NULL, sn->rule.ptr);
+   RB_REMOVE(pf_src_tree, _src_tracking, sn);
+   pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
+   pf_status.src_nodes--;
+   pool_put(_src_tree_pl, sn);
 }
 
 struct pf_src_node *
@@ -3381,7 +3372,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
 
if (r->rule_flag & PFRULE_SRCTRACK &&
pf_insert_src_node([PF_SN_NONE], r, PF_SN_NONE, pd->af,
-   pd->src, NULL, 0) != 0) {
+   pd->src, NULL) != 0) {
REASON_SET(, PFRES_SRCLIMIT);
goto cleanup;
}
Index: pf_lb.c
===
RCS file: /cvs/src/sys/net/pf_lb.c,v
retrieving revision 1.49
diff -u -p -r1.49 pf_lb.c
--- pf_lb.c 3 Aug 2015 13:33:12 -   1.49
+++ pf_lb.c 12 Oct 2015 15:52:48 -
@@ -621,8 +621,7 @@ pf_map_addr(sa_family_t af, struct pf_ru
pf_remove_src_node(sns[type]);
sns[type] = NULL;
}
-   if (pf_insert_src_node([type], r, type, af,

Re: patch for two nits around pf_insert_src_node() et. al.

2015-10-12 Thread Alexandr Nedvedicky

Hello,

Richard Procter came back to me in private email with one more nit to fix:

we can get rid of

if (sn->rule.ptr != NULL)
test condition in pfioctl() function as well.

The relevant snippet looks as follows:

2188 p = psn->psn_src_nodes;
2189 RB_FOREACH(n, pf_src_tree, _src_tracking) {

2198 pstore->kif = NULL;
2199 if (n->rule.ptr != NULL)
2200 pstore->rule.nr = n->rule.ptr->nr;

need one more OK for Richard's suggestion. Updated patch is below.
Complete email from Richard follows the patch.

thanks and
regards
sasha

8<---8<---8<--8<

Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.946
diff -u -p -r1.946 pf.c
--- pf.c8 Oct 2015 11:36:51 -   1.946
+++ pf.c12 Oct 2015 20:20:17 -
@@ -501,7 +501,7 @@ pf_src_connlimit(struct pf_state **state
 int
 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
 enum pf_sn_types type, sa_family_t af, struct pf_addr *src,
-struct pf_addr *raddr, int global)
+struct pf_addr *raddr)
 {
struct pf_src_node  k;
 
@@ -509,10 +509,7 @@ pf_insert_src_node(struct pf_src_node **
k.af = af;
k.type = type;
PF_ACPY(, src, af);
-   if (global)
-   k.rule.ptr = NULL;
-   else
-   k.rule.ptr = rule;
+   k.rule.ptr = rule;
pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
*sn = RB_FIND(pf_src_tree, _src_tracking, );
}
@@ -531,10 +528,7 @@ pf_insert_src_node(struct pf_src_node **
 
(*sn)->type = type;
(*sn)->af = af;
-   if (global)
-   (*sn)->rule.ptr = NULL;
-   else
-   (*sn)->rule.ptr = rule;
+   (*sn)->rule.ptr = rule;
PF_ACPY(&(*sn)->addr, src, af);
if (raddr)
PF_ACPY(&(*sn)->raddr, raddr, af);
@@ -550,8 +544,7 @@ pf_insert_src_node(struct pf_src_node **
return (-1);
}
(*sn)->creation = time_uptime;
-   if ((*sn)->rule.ptr != NULL)
-   (*sn)->rule.ptr->src_nodes++;
+   (*sn)->rule.ptr->src_nodes++;
pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
pf_status.src_nodes++;
} else {
@@ -570,16 +563,14 @@ pf_remove_src_node(struct pf_src_node *s
if (sn->states > 0 || sn->expire > time_uptime)
return;
 
-   if (sn->rule.ptr != NULL) {
-   sn->rule.ptr->src_nodes--;
-   if (sn->rule.ptr->states_cur == 0 &&
-   sn->rule.ptr->src_nodes == 0)
-   pf_rm_rule(NULL, sn->rule.ptr);
-   RB_REMOVE(pf_src_tree, _src_tracking, sn);
-   pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
-   pf_status.src_nodes--;
-   pool_put(_src_tree_pl, sn);
-   }
+   sn->rule.ptr->src_nodes--;
+   if (sn->rule.ptr->states_cur == 0 &&
+   sn->rule.ptr->src_nodes == 0)
+   pf_rm_rule(NULL, sn->rule.ptr);
+   RB_REMOVE(pf_src_tree, _src_tracking, sn);
+   pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
+   pf_status.src_nodes--;
+   pool_put(_src_tree_pl, sn);
 }
 
 struct pf_src_node *
@@ -3381,7 +3372,7 @@ pf_test_rule(struct pf_pdesc *pd, struct
 
if (r->rule_flag & PFRULE_SRCTRACK &&
pf_insert_src_node([PF_SN_NONE], r, PF_SN_NONE, pd->af,
-   pd->src, NULL, 0) != 0) {
+   pd->src, NULL) != 0) {
REASON_SET(, PFRES_SRCLIMIT);
goto cleanup;
}
Index: pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.290
diff -u -p -r1.290 pf_ioctl.c
--- pf_ioctl.c  4 Sep 2015 21:40:25 -   1.290
+++ pf_ioctl.c  12 Oct 2015 20:20:20 -
@@ -2175,8 +2175,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
bzero(>entry, sizeof(pstore->entry));
pstore->rule.ptr = NULL;
pstore->kif = NULL;
-   if (n->rule.ptr != NULL)
-   pstore->rule.nr = n->rule.ptr->nr;
+   pstore->rule.nr = n->rule.ptr->nr;
pstore->creation = secs - pstore->creation;
if (pstore->expire > secs)
pstore->expire -= secs;
Index: pf_lb.c
===
RCS file: /cvs/src/sys/net/pf_lb.c,v
retrieving

Re: preparing pfi_kif to MP world

2015-10-13 Thread Alexandr Nedvedicky

Hello,

> > > Furthermore existing functions pfi_kif_ref()/pfi_kif_unref() are thrown 
> > > away
> > > in favor of pfi_kif_take()/pfi_kif_rele(), which follow naming convention
> > > set by refcnt_init(9).  Patch also removes kif reference types (enum
> > > pfi_kif_refs).
> > >
> [snip]
> > > @@ -810,3 +762,28 @@ pfi_unmask(void *addr)
> > >   return (b);
> > > }
> > > 
> > > +struct pfi_kif *
> > > +pfi_kif_take(struct pfi_kif *kif)
> > > +{
> > > + if (kif != pfi_all) {
> > > + PF_REF_TAKE(kif->pfik_refcnt);
> > > + }
> 
> No need for curly braces here --^
> 

yes, that's true, will fix that...

> 
> Even the typedef? (-:
> 

sorry, I forgot that OpenBSD hates typedefs ;-)
I'll keep in mind to kill all typedefs in my patches next time.

> but I'd prefer the former, e.g. _k instead of _kif_.  Double

O.K. I'll stick to _k style.

> 
> PFI_KIF_TAKE and PFI_KIF_RELE macros add another level of
> complexity for almost zero gain.  PFI_KIF_TAKE is not even
> used.  I'd say scrap them.

it's true in case of pfi_kif objects we can scrap them.

on the other hand the macros illustrate the way I'd like to do reference counting
for other objects. I'll stick to the pfi_kif just as an illustration.

if I use lowercase function such as:
r->kif = pfi_kif_take(kif);
then I'm sure the kif instance exists, if it does not exist the thing
will blow up with NULL pointer dereference and I know I'm wrong. On the
other hand if I do:
r->kif = PFI_KIF_TAKE(kif);
then it means kif is optional here, but if it exists, then I must have
a reference for it here.

The way we have it in PF on Solaris these days looks as follows:
_take(x)
{
if (x != NULL)
PF_INC_REF(x->refcnt);
return (x);
}
This way we assume 'x' is always optional. And this is sometimes quite
wrong, as we might lay a mine that we trip over later.

The uppercase macros allows us to easily annotate cases, where the
object is optional and vice versa: the lower case functions annotate 
cases, where we assume the object must exist. This could save us few 
days
of debugging.

So I'd like to make sure if you hate that approach in general or in the
scope of this particular patch. I admit the pfi_kif is not the right
guy for the uppercase version as the pfi_kif instance always must exist.

I'll remove those macros from proposed patch, but I'd like to know
how do you feel about them for other objects such as tables, anchors,
states, 

> I would also say that the PF_REF_* should stay under _KERNEL
> as they simply cannot be used in the userland and moved some-
> where before "extern struct pf_status pf_status" (after the
> prototypes chunk) or after the "extern struct pf_pool_limit"
> right before the "#endif /* _KERNEL */".
> 

yes that's very good point. (How I could miss it?)

thanks and
regards
sasha

Re: the very first step towards MULTIPROCESSOR friendly PF

2015-08-27 Thread Alexandr Nedvedicky

/large snip

  Assuming the locking in MULTIPROCESSOR goes like:
   interrupt grabs splsoftnet - ip_input - PF grabs KERNEL_LOCK()
  We need to take care of the ioctl() call path and the purge thread. Those need
  to get synchronized with packets using KERNEL_LOCK(). They should not mess
  with splsoftnet() any more in the MULTIPROCESSOR version.
 
 As long as we are using (soft-)interrupts to process incoming packets
 they should.
 
 If you don't raise the SPL level to softnet while CPU0 is executing some
 code in ioctl() path and it takes an interrupt at this point it might end
 up executing ip_input() *before* returning to the ioctl() (process)
 context.  This scenario might corrupt the states you're trying to protect.
 

just to verify I got everything right here. I assume I've got a MULTIPROCESSOR
kernel with my broken patch, which basically trades all splsoftnet() locks for
KERNEL_LOCK()...

We have ioctl() operation, which attempts to insert a rule into a global
ruleset, ioctl() grabs KERNEL_LOCK() only and gets to work. (yes my broken
patch does not grab splsoftnet()).

while ioctl() is working there is an interrupt, which delivers a packet from the
NIC. since no one holds splsoftnet() on the CPU, the packet is free to go and
enters the firewall (pf_test()).  And that's very bad: the interrupt tries to grab
KERNEL_LOCK(), which is still held by the ioctl() the packet has just interrupted.
One of the two bad things will happen:

a) deadlock, we will be waiting for KERNEL_LOCK(), which is still held
   by interrupted ioctl() operation.

b) since KERNEL_LOCK() is recursive, the pf_test() will be free to
   go possibly finding a list of rules in inconsistent state. In this
   case no deadlock happens, but PF plays game with pointers...

as Kettenis pointed out the KERNEL_LOCK() is recursive, I have an itchy feeling
we are taking b) option. Is that right?

  Does that idea sound right? If not what piece I'm still missing in puzzle?
 
 I think you're assuming that the softnet code path is running in a
 thread which is not (yet) the case.

yes, my context is not fully switched from Solaris to OpenBSD yet...

thanks and
regards
sasha

the very first step towards MULTIPROCESSOR friendly PF

2015-08-26 Thread Alexandr Nedvedicky

Hello,

I'm not sure I got everything right in Calgary. So this patch should roughly
illustrate how I think we should start moving forward to make PF MULTIPROCESSOR
friendly. It's quite likely my proposal/way is completely off, I'll be happy if
you put me back to ground.

The brief summary of what patch is trying to achieve is as follows:

patch trades all splsoftnet() for KERNEL_LOCK() when it gets compiled
with MULTIPROCESSOR option on.

if MULTIPROCESSOR option is off, the compiler produces PF, which uses
splsoftnet.

To achieve this the patch introduces macros PF_LOCK()/PF_UNLOCK(),
which expand to KERNEL_LOCK()/KERNEL_UNLOCK(), when MULTIPROCESSOR is 
on.
On the other hand if MULTIPROCESSOR is off the PF_*LOCK() macros become
splsoftnet()/splx()

Skip to =breakage= if you don't care about details/future plans. Currently PF
must synchronize all those guys:

- packets, which are running through pf_test(). IP stack already
  serializes calls to pf_test() (there is always one running pf_test()
  instance at most)

- ioctl() operations on PF driver with packets and with each other
  (it looks like there might be more processes, which read state table,
  those are allowed to run in parallel). To serialize ioctl() operations
  with each other PF uses pf_consistency_lock (which is an RW-lock).

  If a particular ioctl() operation must be synchronized with packets it
  must take splsoftnet.

- purge thread, which expires states. purge thread must grab
  pf_consistency_lock and splsoftnet.

The desired state is to break a giant pf_consistency_lock into few more
RW-locks.  Which will protect various data PF keeps. Those RW-locks will
also synchronize packets. The list of locks, which I have on mind is as follows:

- pf_state_rw

- pf_anchors_rw (packets don't need to grab it as they grab rw-locks
  bound to individual rulesets)

- pf_tables_rw (packets don't need to grab it as they grab rw-locks
  bound to table instances).

The first major milestone in this effort is to introduce pf_state_rw. The patch
I'm proposing here buys us enough freedom to relatively safely decompose the
pf_consistency_lock and make pf_test() parallel for packets.

=breakage=
The proposed patch breaks 'return-*' action, when PF gets compiled with
MULTIPROCESSOR on. I think it is unsafe to call icmp_err*() functions, while
holding a KERNEL_LOCK(). And it is risky to give up KERNEL_LOCK(), execute
a send operation on response packet and re-grab KERNEL_LOCK() again as we
would arrive to different world (different in sense the pointer we remember
might be invalid now). To fix that we must introduce a reference counting
for objects, so it will become safe to drop and re-grab KERNEL_LOCK(), while
holding a reference.

The problem has been solved for pf_route*() functions, so PBR works in
MULTIPROCESSOR friendly PF.

My patch does not touch if_pfsync.c at all. The PF_SYNC support in
MULTIPROCESSOR PF will have to come in some later phase. You should consider it
to be broken in MULTIPROCESSOR version.

There should be no breakage in PF for GENERIC kernel.

regards
sasha

88---8-8

Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.936
diff -u -p -r1.936 pf.c
--- pf.c19 Aug 2015 21:22:41 -  1.936
+++ pf.c26 Aug 2015 14:11:17 -
@@ -906,7 +906,7 @@ int
 pf_state_insert(struct pfi_kif *kif, struct pf_state_key **skw,
 struct pf_state_key **sks, struct pf_state *s)
 {
-   splsoftassert(IPL_SOFTNET);
+   PF_ASSERT_LOCKED(nothing);
 
s-kif = kif;
if (*skw == *sks) {
@@ -1150,12 +1150,13 @@ pf_state_export(struct pfsync_state *sp,
 void
 pf_purge_thread(void *v)
 {
-   int nloops = 0, s;
+   int nloops = 0;
+   PF_LOCK_INSTANCE(s);
 
for (;;) {
tsleep(pf_purge_thread, PWAIT, pftm, 1 * hz);
 
-   s = splsoftnet();
+   PF_LOCK(s);
 
/* process a fraction of the state table every second */
pf_purge_expired_states(1 + (pf_status.states
@@ -1168,7 +1169,7 @@ pf_purge_thread(void *v)
nloops = 0;
}
 
-   splx(s);
+   PF_UNLOCK(s);
}
 }
 
@@ -1259,7 +1260,7 @@ pf_src_tree_remove_state(struct pf_state
 void
 pf_unlink_state(struct pf_state *cur)
 {
-   splsoftassert(IPL_SOFTNET);
+   PF_ASSERT_LOCKED(nothing);
 
/* handle load balancing related tasks */
pf_postprocess_addr(cur);
@@ -1294,7 +1295,7 @@ pf_free_state(struct pf_state *cur)
 {
struct pf_rule_item *ri;
 
-   splsoftassert(IPL_SOFTNET);
+   PF_ASSERT_LOCKED(nothing);
 
 #if NPFSYNC  0
if

Re: the very first step towards MULTIPROCESSOR friendly PF

2015-08-27 Thread Alexandr Nedvedicky

On Wed, Aug 26, 2015 at 06:12:10PM +0200, Mark Kettenis wrote:
  Date: Wed, 26 Aug 2015 17:30:14 +0200
  From: Alexandr Nedvedicky alexandr.nedvedi...@oracle.com

  Hello,

  I'm not sure I got everything right in Calgary. So this patch should
  roughly illustrates how I think we should start moving forward to
  make PF MULTIPROCESSOR friendly. It's quite likely my proposal/way
  is completely off, I'll be happy if you put me back to ground.

  The brief summary of what patch is trying to achieve is as follows:

  patch trades all splsoftnet() for KERNEL_LOCK() when it gets compiled
  with MULTIPROCESSOR option on.

  if MULTIPROCESSOR option is off, the compiler produces PF, which uses
  splsoftnet.

  To achieve this the patch introduces macros PF_LOCK()/PF_UNLOCK(),
  which expand to KERNEL_LOCK()/KERNEL_UNLOCK(), when MULTIPROCESSOR is 
  on.
  On the other hand if MULTIPROCESSOR is off the PF_*LOCK() macros become
  splsoftnet()/splx()

 I don't think this will work.  Even on MULTIPROCESSOR kernels you'll
 need to raise the spl to prevent soft interrupts from running on the
 same CPU.  KERNEL_LOCK() will not prevent this from happening as it is
 a recursive lock.  This is why OpenBSD's mutexes (spinning locks)
 raise the spl.

 So I think you'll have to define PF_LOCK()/PF_UNLOCK() to do the spl
 stuff even for MULTIPROCESSOR kernels.

Thanks for putting my feet back on the ground. I'm trying to make some sense of that.
So if I understand splsoftnet() right, in a MULTIPROCESSOR kernel it prevents
the CPU from handling another network interrupt until the first one is
handled. So splsoftnet should be raised by the network driver, before
the packet reaches PF (the pf_test() function).

the pf_test() must protect consistency of firewall data, which are shared
between CPUs (?can we say kernel threads?). So for the first phase, we opt for
KERNEL_LOCK() being grabbed by pf_test() as soon as it gets called. It makes PF
to still process single packet at time. All other CPUs will have to wait on
KERNEL_LOCK(), holding their splsoftnets(). Do I still follow the concept, or
am I wrong again?

Assuming the locking in MULTIPROCESSOR goes like:
 interrupt grabs splsoftnet - ip_input - PF grabs KERNEL_LOCK()
We need to take care of the ioctl() call path and the purge thread. Those need to
get synchronized with packets using KERNEL_LOCK(). They should not mess with
splsoftnet() any more in the MULTIPROCESSOR version.

Does that idea sound right? If not what piece I'm still missing in puzzle?

thanks and
regards
sasha

Re: [PATCH] PF: cksum modification & refactor [0/24]

2015-08-31 Thread Alexandr Nedvedicky

Hello,

I'm fine with this change. It certainly poses no threat to the SMP branch...

> * Initialise pd->pcksum for icmp6 
> 
>   - ensures pcksum is set for all known checksummed protocols   
>   - later patches rely on this 
may be this patch/change should be folded to patch, which
really expects pd->pcksum to be set to address of icmp6_cksum field.
just to keep things sane in CVS tree.

regards
sasha

PF ignores block action when rule contains route-to/dup-to action

2015-08-31 Thread Alexandr Nedvedicky

Hello,

Dilli Paudel in Oracle was playing with PF enough to find funny glitch.
He used rule as follows:

block in on vnic4 from 192.168.1.0/24 to any route-to 172.16.1.1@vnic5

Many people expect the route-to action is somewhat futile as 'block' action
takes precedence here, so packet gets always dropped. Well the reality is
very different (and still makes a sort of sense) from PF point of view.
The snippet comes from pf_test():

6586 switch (action) {
6587 case PF_SYNPROXY_DROP:

6593 case PF_DIVERT:
6594 switch (pd.af) {
6595 case AF_INET:

6610 case PF_AFRT:
6611 if (pf_translate_af()) {

6624 #endif /* INET6 */
6625 default:
6626 /* pf_route can free the mbuf causing *m0 to become 
NULL */
6627 if (r->rt) {
6628 switch (pd.af) {
6629 case AF_INET:
6630 pf_route(m0, r, pd.dir, 
pd.kif->pfik_ifp, s);
6631 break;

the action comes from matching rule. It's PF_DROP in case of Dilli's rule.  As
you can see there is no case-branch for PF_DROP in switch statement at line
6586, so a default: is executed. For route-to action the r->rt is set and PF
executes the route_to*().

Dilli suggests introducing a PF_DROP case branch to the switch() at line 6586. The
issue has also been discussed on Friday over icb, where Mr. bluhm further suggested
we should try to add some sanity check to parse.y.

As a side effect the patch breaks block rules with dup-to action. dup-to
action as a part of block rule might make some sense... So if there is
someone, who really needs block ... dup-to he should opt for equivalent
rule using pass ... route-to 

Also there is one more question:

shall we implement similar sanity checks for nat-to/rdr-to/... actions?

no one should expect those in block rule, so making pfctl to refuse such rules
loudly sounds like a right thing to do...

regards
sasha

8<---8<---8<--8<

Index: sbin/pfctl/parse.y
===
RCS file: /cvs/src/sbin/pfctl/parse.y,v
retrieving revision 1.648
diff -u -p -r1.648 parse.y
--- sbin/pfctl/parse.y  21 Apr 2015 16:34:59 -  1.648
+++ sbin/pfctl/parse.y  31 Aug 2015 20:30:56 -
@@ -3997,8 +3997,11 @@ rule_consistent(struct pf_rule *r, int a
problems++;
}
 
-   /* match rules rules */
-   if (r->action == PF_MATCH) {
+   /* 
+* Basic rule sanity check.
+*/
+   switch (r->action) {
+   case PF_MATCH:
if (r->divert.port) {
yyerror("divert is not supported on match rules");
problems++;
@@ -4016,6 +4019,15 @@ rule_consistent(struct pf_rule *r, int a
yyerror("af-to is not supported on match rules");
problems++;
}
+   break;
+   case PF_DROP:
+   if (r->rt) {
+   yyerror("route-to, reply-to and dup-to "
+  "must not be used on block rules");
+   problems++;
+   }
+   break;
+   default:;
}
return (-problems);
 }
Index: sys/net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.936
diff -u -p -r1.936 pf.c
--- sys/net/pf.c19 Aug 2015 21:22:41 -  1.936
+++ sys/net/pf.c31 Aug 2015 20:31:02 -
@@ -6622,6 +6622,10 @@ done:
action = PF_PASS;
break;
 #endif /* INET6 */
+   case PF_DROP:
+   m_freem(*m0);
+   *m0 = NULL;
+   break;
default:
/* pf_route can free the mbuf causing *m0 to become NULL */
if (r->rt) {

Re: [PATCH] PF: cksum modification & refactor [3/24]

2015-08-31 Thread Alexandr Nedvedicky

Hello Richard,

the code in the patch looks good at first glance. However it seems to me
the newly introduced pf_cksum_fixup*() are not called yet.  Do you think you
can reshuffle changes between your set of patches a bit, so the newly
introduced functions will become alive (get called)?

Also I think your patch 0/24, which you sent earlier, can be folded in here (setting
pd->pcksum to point to icmp6 header chksum field). 

thanks a lot
regards
sasha

Re: PF ignores block action when rule contains route-to/dup-to action

2015-09-01 Thread Alexandr Nedvedicky

Hello,


> > As a side effect the patch breaks block rules with dup-to action. dup-to
> > action as a part of block rule might make some sense... So if there is
> > someone, who really needs block ... dup-to he should opt for equivalent
> > rule using pass ... route-to 
> > 
> > Also there is one more question:
> > 
> >shall we implement similar sanity checks for nat-to/rdr-to/... actions?
> 
> IMHO, yes that would make sense.
> 

I'll try to keep it on my todo list...

> Some bike shedding inline below, apart from that, ok jung@

I love bike shedding, so let's go and pick some colors ;-)

> 
> > -   if (r->action == PF_MATCH) {
> > +   /* 
> > +* Basic rule sanity check.
> > +*/
> 
> A single line comment is enough here, isn’t it?
> 

O.K. new patch has single line comment.

> > +   switch (r->action) {
> > +   case PF_MATCH:
> > if (r->divert.port) {
> > yyerror("divert is not supported on match rules");
> > problems++;
> > @@ -4016,6 +4019,15 @@ rule_consistent(struct pf_rule *r, int a
> > yyerror("af-to is not supported on match rules");
> > problems++;
> > }
> > +   break;
> > +   case PF_DROP:
> > +   if (r->rt) {
> > +   yyerror("route-to, reply-to and dup-to "
> > +  "must not be used on block rules”);
> 
> The other error messages say “is not supported” instead of “must no be used”.
> I do not care which wording you chose, but maybe take the chance and unify 
> it, 
> to be more consistent here? 
> 

the question is which consistency do you want. The patch does not show them,
but there are two options, actually three. Let me show the current code without
patch applied:

4000 /* match rules rules */
4001 if (r->action == PF_MATCH) {
4002 if (r->divert.port) {
4003 yyerror("divert is not supported on match 
rules");
4004 problems++;
4005 }
4006 if (r->divert_packet.port) {
4007 yyerror("divert is not supported on match 
rules");
4008 problems++;
4009 }
4010 if (r->rt) {
4011 yyerror("route-to, reply-to and dup-to "
4012"must not be used on match rules");
4013 problems++;
4014 }
4015 if (r->rule_flag & PFRULE_AFTO) {
4016 yyerror("af-to is not supported on match 
rules");
4017 problems++;
4018 }
4019 }

as you can see there are two colors to choose from:

color A:
... is not supported on ... rules (used at 4003, 4007, 4016)

color B:
... must not be used on match rules (used at lines 4011-4012)

we have three options:

1) leave it as it is (both colors will be used)

2) use color A

3) use color B

IMO consistency is good here. I prefer color A as it sounds more polite. 

updated patch is further below.

regards
sasha

8<---8<---8<--8<

Index: sbin/pfctl/parse.y
===
RCS file: /cvs/src/sbin/pfctl/parse.y,v
retrieving revision 1.648
diff -u -p -r1.648 parse.y
--- sbin/pfctl/parse.y  21 Apr 2015 16:34:59 -  1.648
+++ sbin/pfctl/parse.y  1 Sep 2015 12:29:41 -
@@ -3997,8 +3997,9 @@ rule_consistent(struct pf_rule *r, int a
problems++;
}
 
-   /* match rules rules */
-   if (r->action == PF_MATCH) {
+   /* Basic rule sanity check. */
+   switch (r->action) {
+   case PF_MATCH:
if (r->divert.port) {
yyerror("divert is not supported on match rules");
problems++;
@@ -4009,13 +4010,22 @@ rule_consistent(struct pf_rule *r, int a
}
if (r->rt) {
yyerror("route-to, reply-to and dup-to "
-  "must not be used on match rules");
+  "are not supported on match rules");
problems++;
}
if (r->rule_flag & PFRULE_AFTO) {
yyerror("af-to is not supported on match rules");
problems++;
}
+   break;
+   case PF_DROP:
+   if (r->rt) {
+   yyerror("route-to, reply-to and dup-to "
+  "are not supported on block rules");
+   problems++;
+   }
+   break;
+   default:;
}
return (-problems);
 }
Index: sys/net/pf.c
===
RCS file:

Re: the very first step towards MULTIPROCESSOR friendly PF

2015-09-04 Thread Alexandr Nedvedicky

Hello,

after reading emails from Philip Guenther and Mark Kettenis, doing some RTFM on
locking in OpenBSD kernel I have a new patch. I call it as a smp-step-0.

Patch introduces a KERNEL_LOCK() to PF. Many dances with KERNEL_LOCK() happens
in pf_test(). From future work point of view there are distinct parts as
follows in pf_test():

- packet sanitization with reassembly. We grab and drop KERNEL_LOCK() in the
  pf_normalize_ip*() functions around pf_reassemble*(). Fragment cache
  will get its own rw-lock later.

- as soon as packet is sanitized (or should say normalized), we do
  state check. State check will get its own rw-lock (I call it
  smp-step-1)

- if no state is found PF proceeds to rule check, which will get
  its rw-lock too (smp-step-2).

Basically KERNEL_LOCK() is placed everywhere, where individual rw-locks will be
introduced later.

As Philip Guenther has pointed out earlier in email, the user-threads must
grab KERNEL_LOCK() before they will jump to rw-locks. The game is easy for PF
in ioctl() subsystem, since everything there will be user thread.

For packets (pf_test()) function the game is not that clear, since not all
packets will be running in user-thread context. My assumption is the packets
bound to loopback and local outbound packets are running in user-threads.  All
other packets (inbound and forwarded) are running in interrupt-threads. For
this reason I'd like to introduce the extra code to pf_test(), which decides
whether packet requires KERNEL_LOCK(), because it is running as a user thread.

I'd like to get the patch in, before proceeding to next SMP step.

thanks and
regards
sasha

8<---8<---8<--8<

Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.937
diff -u -p -r1.937 pf.c
--- net/pf.c1 Sep 2015 19:12:25 -   1.937
+++ net/pf.c4 Sep 2015 12:52:55 -
@@ -6316,6 +6316,7 @@ pf_test(sa_family_t af, int fwdir, struc
union pf_headers pdhdrs;
int  dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
u_int32_tqid, pqid = 0;
+   int  have_lock = 0;
 
if (!pf_status.running)
return (PF_PASS);
@@ -6349,6 +6350,26 @@ pf_test(sa_family_t af, int fwdir, struc
return (PF_PASS);
}
 
+   /*
+* Unlike kernel-threads, the user-threads require to have
+* KERNEL_LOCK() before they will touch rw-locks.
+*
+* The assumption here is fairly simple:
+*  - all inbound packets run as interrupt (kernel-thread),
+*unless they are bound to loopback
+*
+*  - outbound packets run as kernel-thread, if they
+*are marked by PF_TAG_KERNEL flag.
+*
+* in all other cases we must assume packet runs in user-thread
+* context and we should get KERNEL_LOCK().
+*/
+   if ((dir == PF_IN) && (ifp->if_type != IFT_LOOP)) {
+   (*m0)->m_pkthdr.pf.flags |= PF_TAG_KERNEL;
+   } else if (((*m0)->m_pkthdr.pf.flags & PF_TAG_KERNEL) == 0) {
+   PF_LOCK(have_lock);
+   }
+
action = pf_setup_pdesc(, , af, dir, kif, *m0, );
if (action != PF_PASS) {
 #if NPFLOG > 0
@@ -6399,6 +6420,7 @@ pf_test(sa_family_t af, int fwdir, struc
 * handle fragments that aren't reassembled by
 * normalization
 */
+   PF_LOCK(have_lock);
action = pf_test_rule(, , , , );
if (action != PF_PASS)
REASON_SET(, PFRES_FRAG);
@@ -6413,6 +6435,7 @@ pf_test(sa_family_t af, int fwdir, struc
"dropping IPv6 packet with ICMPv4 payload");
goto done;
}
+   PF_LOCK(have_lock);
action = pf_test_state_icmp(, , );
if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
@@ -6437,6 +6460,7 @@ pf_test(sa_family_t af, int fwdir, struc
"dropping IPv4 packet with ICMPv6 payload");
goto done;
}
+   PF_LOCK(have_lock);
action = pf_test_state_icmp(, , );
if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
@@ -6461,6 +6485,7 @@ pf_test(sa_family_t af, int fwdir, struc
if (action == PF_DROP)
goto done;
}
+   PF_LOCK(have_lock);
action = pf_test_state(, , );
if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
@@ -6658,6 +6683,8 @@ done:
else if (!s->if_index_out && dir == PF_OUT)
s->if_index_out = ifp->if_index;
}
+
+

Re: PF SMP: making anchor stack multithreaded

2015-09-12 Thread Alexandr Nedvedicky

Hello,

re-sending with updated patch version. I'd like to get it in, so
I can start moving forward with other things.

any O.Ks?

thanks and
regards
sasha

On Sat, Aug 08, 2015 at 12:16:26PM +0200, Alexandr Nedvedicky wrote:
> Hello,
> 
> I've reworked the anchor handling so the traversal uses true recursion now.
> Using recursion here will allow us to implement ruleset locking in nicer
> fashion.  The idea is to split current pf_test_rule() into two functions:
> pf_test_rule() and pf_match_rule().
> 
> pf_step_into_anchor() is changed to drive recursive anchor traversal. It calls
> pf_match_rule() to match rules in nested rulesets.  pf_step_out_of_anchor() 
> has
> been merged into new pf_step_into_anchor()
> 
> To minimize stack frame size a pf_test_ctx is introduced. Its members are
> various variables, which used to be local at former pf_test_rule().  The
> pf_test_ctx instance is a local variable of new pf_test_rule().
> pf_match_rule() receives pointer to pf_test_ctx as its argument so it can 
> reach
> all variables it needs. The goal is to move out as many local variables from
> pf_match_rule() and pf_step_into_anchor() as possible to save memory.
> 
> To minimize amount of differences macros to access members in pf_test_ctx
> are introduced. Once consensus on proposed approach will be reached, we
> can polish the patch a bit.
> 
> I did some basic testing with rules as follows:
> 
> pass all
> anchor "ap" self to 10.0.0.0/8 {
>   block proto tcp from self to 10.0.0.138 port 23
>   pass proto tcp from self to 10.0.0.138 port 23 once
> }
> 
> and wildcard variant. It seems to me it works, but I'll be glad for any
> further testing tips.
> 
> regards
> sasha
> 

8<---8<---8<--8<
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.941
diff -u -p -r1.941 pf.c
--- pf.c11 Sep 2015 15:21:31 -  1.941
+++ pf.c12 Sep 2015 16:31:24 -
@@ -114,13 +114,37 @@ u_char pf_tcp_secret[16];
 int pf_tcp_secret_init;
 int pf_tcp_iss_off;
 
-struct pf_anchor_stackframe {
-   struct pf_ruleset   *rs;
-   struct pf_rule  *r;
-   struct pf_anchor_node   *parent;
-   struct pf_anchor*child;
-} pf_anchor_stack[64];
+struct pf_test_ctx {
+   int test_status;
+   struct pf_pdesc *pd;
+   struct pf_rule_actions  act;
+   u_int8_ticmpcode;
+   u_int8_ticmptype;
+   int icmp_dir;
+   int state_icmp;
+   int tag;
+   u_short reason;
+   struct pf_rule_item *ri;
+   struct pf_src_node  *sns[PF_SN_MAX];
+   struct pf_rule_slistrules;
+   struct pf_rule  *nr;
+   struct pf_rule  **rm;
+   struct pf_rule  *a;
+   struct pf_rule  **am;
+   struct pf_ruleset   **rsm;
+   struct pf_ruleset   *arsm;
+   struct pf_ruleset   *aruleset;
+   struct tcphdr   *th;
+   int  depth;
+};
+
+#definePF_ANCHOR_STACK_MAX 64
 
+enum {
+   PF_TEST_FAIL = -1,
+   PF_TEST_OK,
+   PF_TEST_QUICK
+};
 /*
  * Cannot fold into pf_pdesc directly, unknown storage size outside pf.c.
  * Keep in sync with union pf_headers in pflog_bpfcopy() in if_pflog.c.
@@ -225,11 +249,8 @@ struct pf_state*pf_find_state(struct p
struct pf_state_key_cmp *, u_int, struct mbuf *);
 int pf_src_connlimit(struct pf_state **);
 int pf_match_rcvif(struct mbuf *, struct pf_rule *);
-voidpf_step_into_anchor(int *, struct pf_ruleset **,
-   struct pf_rule **, struct pf_rule **);
-int pf_step_out_of_anchor(int *, struct pf_ruleset **,
-struct pf_rule **, struct pf_rule **,
-int *);
+int pf_step_into_anchor(struct pf_test_ctx *, struct 
pf_rule *);
+int pf_match_rule(struct pf_test_ctx *, struct pf_ruleset 
*);
 voidpf_counters_inc(int, struct pf_pdesc *,
struct pf_state *, struct pf_rule *,
struct pf_rule *);
@@ -2628,74 +2649,37 @@ pf_tag_packet(struct mbuf *m, int tag, i
m->m_pkthdr.ph_rtableid = (u_int)rtableid;
 }
 
-void
-pf_step_into_anchor(int *depth, struct pf_ruleset **rs,
-struct pf_rule **r, struct pf_rule **a)
+int
+pf_step_into_anchor(s

PF SMP: mutex for fragcache

2015-09-12 Thread Alexandr Nedvedicky

Hello,

very small first step towards MP(i) friendly PF. Patch adds mutex around
fragment cache.

Patch adds a lock around fragment cache. Unlike other parts of PF the fragment
cache is self-contained subsystem. In that sense we can easily guard its entry
points (pf_reassemble(), pf_reassemble6()) by mutex. The cache is shared
by both protocols (AF_INET, AF_INET6), hence we have just one lock.

The locks (technically speaking mutexes) for other PF subsystems will follow as
soon as the remove operations for PF data objects will get untangled.
What essentially needs to be done is to split remove and destroy operations for
PF objects into separate functions. This is something, what's being worked on
currently.

As you can see the mutex, when acquired, raises  interrupt level to softnet.
Same interrupt level is used by ioctl() and purge threads. IMO it should be
fine, but I'd like to hear some confirmation...


any OKs?

thanks and
regards
sasha

8<---8<---8<--8<
Index: pf_norm.c
===
RCS file: /cvs/src/sys/net/pf_norm.c,v
retrieving revision 1.182
diff -u -p -r1.182 pf_norm.c
--- pf_norm.c   10 Sep 2015 08:28:31 -  1.182
+++ pf_norm.c   12 Sep 2015 17:18:43 -
@@ -134,6 +134,7 @@ int  pf_reassemble6(struct mbuf **, st
 struct pool pf_frent_pl, pf_frag_pl;
 struct pool pf_state_scrub_pl;
 int pf_nfrents;
+struct mutexpf_frag_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
 
 void
 pf_normalize_init(void)
@@ -771,6 +772,7 @@ pf_normalize_ip(struct pf_pdesc *pd, u_s
struct ip   *h = mtod(pd->m, struct ip *);
u_int16_tfragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
u_int16_tmff = (ntohs(h->ip_off) & IP_MF);
+   int  rv;
 
if (!fragoff && !mff)
goto no_fragment;
@@ -792,8 +794,11 @@ pf_normalize_ip(struct pf_pdesc *pd, u_s
if (!pf_status.reass)
return (PF_PASS);   /* no reassembly */
 
+   PF_FRAG_LOCK();
/* Returns PF_DROP or m is NULL or completely reassembled mbuf */
-   if (pf_reassemble(>m, pd->dir, reason) != PF_PASS)
+   rv = pf_reassemble(>m, pd->dir, reason);
+   PF_FRAG_UNLOCK();
+   if (rv != PF_PASS)
return (PF_DROP);
if (pd->m == NULL)
return (PF_PASS);  /* packet has been reassembled, no error */
@@ -813,6 +818,7 @@ int
 pf_normalize_ip6(struct pf_pdesc *pd, u_short *reason)
 {
struct ip6_frag  frag;
+   int  rv;
 
if (pd->fragoff == 0)
goto no_fragment;
@@ -824,9 +830,12 @@ pf_normalize_ip6(struct pf_pdesc *pd, u_
if (!pf_status.reass)
return (PF_PASS);   /* no reassembly */
 
+   PF_FRAG_LOCK();
/* Returns PF_DROP or m is NULL or completely reassembled mbuf */
-   if (pf_reassemble6(>m, , pd->fragoff + sizeof(frag),
-   pd->extoff, pd->dir, reason) != PF_PASS)
+   rv = pf_reassemble6(>m, , pd->fragoff + sizeof(frag),
+   pd->extoff, pd->dir, reason);
+   PF_FRAG_UNLOCK();
+   if (rv != PF_PASS)
return (PF_DROP);
if (pd->m == NULL)
return (PF_PASS);  /* packet has been reassembled, no error */
Index: pfvar.h
===
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.420
diff -u -p -r1.420 pfvar.h
--- pfvar.h 19 Aug 2015 21:22:41 -  1.420
+++ pfvar.h 12 Sep 2015 17:18:43 -
@@ -1907,7 +1907,10 @@ int   pf_postprocess_addr(struct 
pf_sta
 
 voidpf_cksum(struct pf_pdesc *, struct mbuf *);
 
-#endif /* _KERNEL */
+extern struct mutex pf_frag_mtx;
+#definePF_FRAG_LOCK()  mtx_enter(_frag_mtx)
+#definePF_FRAG_UNLOCK()mtx_leave(_frag_mtx)
 
+#endif /* _KERNEL */
 
 #endif /* _NET_PFVAR_H_ */

Re: [patch] cleaner checksum modification for pf

2015-09-29 Thread Alexandr Nedvedicky

Hello,

I've tried Richard's patch on sparc. I took a brief look at its source code.
It's essentially what PF is doing on Solaris.

The checksum handling in PF on systems with HW assisted checksums is getting
tricky for local (out)bound packets. The approach we take on Solaris is as
follows:

- for inbound packets PF always trusts HW, if HW says chksum is
  correct, then checksum is correct. if HW is not able to verify
  checksum (HW checksum verification is off), PF falls back to SW
  verification (1)

- PF does not check (verify) checksum for outbound packets, outbound
  packet is either

- forwarded, so checksum has been verified in inbound side (2a)

- local outbound, then checksum is either valid or to be
  calculated by HW (2b)

The things are getting pretty wild in 2b, when PF is doing PBR (policy based
routing) on outbound packets. Consider situation when IP stack routes packet
via NIC, which is able to calculate chksum in HW.  IP stack sets flags and
fields and passes packet to PF. PF changes interface, where packet is bound to,
to NIC, which is not able to calculate checksum, so the HW-cksum flags set by
IP stack are no longer valid. In this case we always revert to calculation
in SW.

I have not looked at current checksum handling at PF on OpenBSD, so can't tell
exactly what's going on there. I feel PF does not bother too much with updating
the checksum, when it changes the packet. It seems to me the
in_proto_cksum_out() gets called as soon as outbound packet gets inspected by
pf_test() to calculate/fix checksums. It looks like in_proto_cksum_out() has to
recalculate checksum in SW for entire packet, when underlying HW does not offer
checksum offload. Is that right? Or am I missing some piece?

On the other hand Richard's patch adjusts checksums by delta caused by update.
The adjustment is of few operations (add/and/not) on very small chunk of
memory. The price should be same we pay for extra logic to decide if
HW will compute chksum for us or we have to do it on our own. However we will
save plenty of cycles, when we would have to revert to SW.


I currently have a small suggestion to improve Richard's patch. The macro
PF_ALGNMNT() in pfvar.h uses modulo:

#define PF_HI (true)
#define PF_LO (!PF_HI)
#define PF_ALGNMNT(off) (((off) % 2) == 0 ? PF_HI : PF_LO)

I think we can get away with a simple AND operation (& 1), which will be faster
than % on many platforms.

#define PF_ALGNMNT(off) (((off) & 1) == 0 ? PF_HI : PF_LO)

regards
sasha

Re: pf statekey inp chaining

2015-12-03 Thread Alexandr Nedvedicky

Hello,

OK

sasha

On Thu, Dec 03, 2015 at 12:29:15PM +0100, Alexander Bluhm wrote:
> On Wed, Dec 02, 2015 at 07:45:09PM +0100, Alexander Bluhm wrote:
> > Here is a new version of the diff.  This is new:
> 
> Now with feedback from sashan@
> 
> - merge
> - no SS_ISCONNECTED check in tcp as it was before
> - fix the disabled call to pf_inp_lookup() in udp input
> 
> ok?
> 
> bluhm
>

Re: free sizes for most free calls in pf_ioctl

2015-12-03 Thread Alexandr Nedvedicky

Hello,

OK

regards
sasha

On Thu, Dec 03, 2015 at 01:21:32PM +0100, Claudio Jeker wrote:
> This should cover the simple free calls in pf_ioctl.
> 
> -- 
> :wq Claudio
> 
> Index: pf_ioctl.c
> ===
> RCS file: /cvs/src/sys/net/pf_ioctl.c,v
> retrieving revision 1.296
> diff -u -p -r1.296 pf_ioctl.c
> --- pf_ioctl.c3 Dec 2015 10:34:11 -   1.296
> +++ pf_ioctl.c3 Dec 2015 12:08:00 -
> @@ -397,7 +397,7 @@ tag_unref(struct pf_tags *head, u_int16_
>   if (tag == p->tag) {
>   if (--p->ref == 0) {
>   TAILQ_REMOVE(head, p, entries);
> - free(p, M_RTABLE, 0);
> + free(p, M_RTABLE, sizeof(*p));
>   }
>   break;
>   }
> @@ -1564,7 +1564,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>   pf_state_export(pstore, state);
>   error = copyout(pstore, p, sizeof(*p));
>   if (error) {
> - free(pstore, M_TEMP, 0);
> + free(pstore, M_TEMP, sizeof(*pstore));
>   goto fail;
>   }
>   p++;
> @@ -1575,7 +1575,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>  
>   ps->ps_len = sizeof(struct pfsync_state) * nr;
>  
> - free(pstore, M_TEMP, 0);
> + free(pstore, M_TEMP, sizeof(*pstore));
>   break;
>   }
>  
> @@ -2030,8 +2030,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>   bzero(_trans_set, sizeof(pf_trans_set));
>   for (i = 0; i < io->size; i++) {
>   if (copyin(io->array+i, ioe, sizeof(*ioe))) {
> - free(table, M_TEMP, 0);
> - free(ioe, M_TEMP, 0);
> + free(table, M_TEMP, sizeof(*table));
> + free(ioe, M_TEMP, sizeof(*ioe));
>   error = EFAULT;
>   goto fail;
>   }
> @@ -2042,29 +2042,29 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>   sizeof(table->pfrt_anchor));
>   if ((error = pfr_ina_begin(table,
>   >ticket, NULL, 0))) {
> - free(table, M_TEMP, 0);
> - free(ioe, M_TEMP, 0);
> + free(table, M_TEMP, sizeof(*table));
> + free(ioe, M_TEMP, sizeof(*ioe));
>   goto fail;
>   }
>   break;
>   default:
>   if ((error = pf_begin_rules(>ticket,
>   ioe->anchor))) {
> - free(table, M_TEMP, 0);
> - free(ioe, M_TEMP, 0);
> + free(table, M_TEMP, sizeof(*table));
> + free(ioe, M_TEMP, sizeof(*ioe));
>   goto fail;
>   }
>   break;
>   }
>   if (copyout(ioe, io->array+i, sizeof(io->array[i]))) {
> - free(table, M_TEMP, 0);
> - free(ioe, M_TEMP, 0);
> + free(table, M_TEMP, sizeof(*table));
> + free(ioe, M_TEMP, sizeof(*ioe));
>   error = EFAULT;
>   goto fail;
>   }
>   }
> - free(table, M_TEMP, 0);
> - free(ioe, M_TEMP, 0);
> + free(table, M_TEMP, sizeof(*table));
> + free(ioe, M_TEMP, sizeof(*ioe));
>   break;
>   }
>  
> @@ -2082,8 +2082,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>   table = malloc(sizeof(*table), M_TEMP, M_WAITOK);
>   for (i = 0; i < io->size; i++) {
>   if (copyin(io->array+i, ioe, sizeof(*ioe))) {
> - free(table, M_TEMP, 0);
> - free(ioe, M_TEMP, 0);
> + free(table, M_TEMP, sizeof(*table));
> + free(ioe, M_TEMP, sizeof(*ioe));
>   error = EFAULT;
>   goto fail;
>   }
> @@ -2094,23 +2094,23 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
>   sizeof(table->pfrt_anchor));
>   if ((error = pfr_ina_rollback(table,
>

Re: introducing ip_send()/ip6_send() to OpenBSD kernel

2015-12-04 Thread Alexandr Nedvedicky

On Thu, Dec 03, 2015 at 11:00:09PM +0100, Thierry Deval wrote:
> Hello Sasha,
> 
> You kept the static prototypes for ip_send_dispatch and 
> ip6_send_dispatch.
> 
> You'd better avoid mixing static and non-static declarations for the same 
> functions. ;-)
> 
bluhm, sitting next to me, spotted it too. I've fixed it in a follow-up
commit and forgot to send an email.

regards
sasha

introducing ip_send()/ip6_send() to OpenBSD kernel

2015-12-03 Thread Alexandr Nedvedicky

Hello,

patch below introduces ip_send() function to OpenBSD kernel. ip_send()
function takes an mbuf with packet and passes to ip_output(), which
will be running in softnet task.

the patch also changes icmp_error()/icmp6_error() to dispatch the ICMP error
responses via ip_send(), so both functions are safe for MP friendly PF, because
pf_test() recursion is avoided.

the overall idea comes from Markus Friedl; some fine-tuning touches, such as
using an mbuf queue, come from mikeb.

the ipsend_mq queue size is 64 currently. I need some guidance if it is
too much/too little. I've just opted for nice rounded number.

OK?

regards
sasha

8<---8<---8<--8<
Index: net/if.h
===
RCS file: /cvs/src/sys/net/if.h,v
retrieving revision 1.174
diff -u -p -r1.174 if.h
--- net/if.h3 Dec 2015 12:22:51 -   1.174
+++ net/if.h3 Dec 2015 14:00:16 -
@@ -484,4 +484,7 @@ int if_setlladdr(struct ifnet *, const u
 
 #endif /* __BSD_VISIBLE */
 
+#ifdef _KERNEL
+extern struct taskq *softnettq;
+#endif /* _KERNEL */
 #endif /* _NET_IF_H_ */
Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.955
diff -u -p -r1.955 pf.c
--- net/pf.c3 Dec 2015 09:49:15 -   1.955
+++ net/pf.c3 Dec 2015 14:00:16 -
@@ -2424,11 +2424,11 @@ pf_send_tcp(const struct pf_rule *r, sa_
 
switch (af) {
case AF_INET:
-   ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+   ip_send(m);
break;
 #ifdef INET6
case AF_INET6:
-   ip6_output(m, NULL, NULL, 0, NULL, NULL);
+   ip6_send(m);
break;
 #endif /* INET6 */
}
Index: netinet/ip_icmp.c
===
RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.149
diff -u -p -r1.149 ip_icmp.c
--- netinet/ip_icmp.c   2 Dec 2015 16:35:53 -   1.149
+++ netinet/ip_icmp.c   3 Dec 2015 14:00:16 -
@@ -854,7 +854,10 @@ icmp_send(struct mbuf *m, struct mbuf *o
printf("icmp_send dst %s src %s\n", dst, src);
}
 #endif
-   ip_output(m, opts, NULL, 0, NULL, NULL, 0);
+   if (opts != NULL)
+   m = ip_insertoptions(m, opts, );
+
+   ip_send(m);
 }
 
 u_int32_t
Index: netinet/ip_input.c
===
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.263
diff -u -p -r1.263 ip_input.c
--- netinet/ip_input.c  2 Dec 2015 13:29:26 -   1.263
+++ netinet/ip_input.c  3 Dec 2015 14:00:16 -
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -121,11 +122,15 @@ struct pool ipq_pool;
 
 struct ipstat ipstat;
 
+static struct mbuf_queue   ipsend_mq;
+
 void   ip_ours(struct mbuf *);
 intip_dooptions(struct mbuf *, struct ifnet *);
 intin_ouraddr(struct mbuf *, struct ifnet *, struct in_addr);
 void   ip_forward(struct mbuf *, struct ifnet *, int);
 
+static void ip_send_dispatch(void *);
+static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, 
_mq);
 /*
  * Used to save the IP options in case a protocol wants to respond
  * to an incoming packet over the same route if the packet got here
@@ -184,6 +189,8 @@ ip_init(void)
strlcpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc));
strlcpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth));
strlcpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
+
+   mq_init(_mq, 64, IPL_SOFTNET);
 }
 
 struct route ipforward_rt;
@@ -1742,3 +1749,18 @@ ip_savecontrol(struct inpcb *inp, struct
}
 }
 
+static void
+ip_send_dispatch(void *cx)
+{
+   struct mbuf *m;
+
+   while ((m = mq_dequeue((struct mbuf_queue *)cx)) != NULL)
+   ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+}
+
+void
+ip_send(struct mbuf *m)
+{
+   mq_enqueue(_mq, m);
+   task_add(softnettq, _task);
+}
Index: netinet/ip_var.h
===
RCS file: /cvs/src/sys/netinet/ip_var.h,v
retrieving revision 1.60
diff -u -p -r1.60 ip_var.h
--- netinet/ip_var.h16 Jul 2015 21:14:21 -  1.60
+++ netinet/ip_var.h3 Dec 2015 14:00:16 -
@@ -180,6 +180,8 @@ void ip_freef(struct ipq *);
 voidip_freemoptions(struct ip_moptions *);
 int ip_getmoptions(int, struct ip_moptions *, struct mbuf **);
 voidip_init(void);
+struct mbuf*
+ip_insertoptions(struct mbuf *, struct mbuf *, int *);
 int ip_mforward(struct mbuf *, struct ifnet *);
 int ip_optcopy(struct ip *, struct ip *);
 int ip_output(struct mbuf *, struct mbuf *, struct route *, int,
@@ -191,6 +193,7 @@ struct in_ifaddr *
 ip_rtaddr(struct in_addr, u_int);
 u_int16_t

removing expired once rules in pf_purge_thread()

2015-12-05 Thread Alexandr Nedvedicky

Hello,

henning@ and mikeb@ showed some interest to change handling of once rules to
the same way as PF has it on Solaris. Just to refresh the audience on once
option offered by PF:

 onceCreates a one shot rule that will remove itself from an active
 ruleset after the first match.  In case this is the only rule in
 the anchor, the anchor will be destroyed automatically after the
 rule is matched.
   -- pf.conf(5)

Currently the once rules are removed by the matching packet. The patch makes life
a bit easier for packets which match once rules. Instead of removing the rule
from the ruleset, packets just mark the rule as expired and put it on the garbage
collector list. The list is processed by pf_purge_thread(), which just removes and
deletes those expired rules. To get there we need to simplify the pf_purge_rule()
signature, which currently looks as follows:

void
pf_purge_rule(struct pf_ruleset *ruleset, struct pf_rule *rule,
struct pf_ruleset *aruleset, struct pf_rule *arule)

- ruleset is the ruleset, where once rule is being removed from

- rule is a once rule to remove

- aruleset holds an anchor rule with once-rule we remove

- arule an anchor which holds a once rule

To make pf_purge_rule() suitable for pf_purge_thread() it has to be changed to:

void
pf_purge_rule(struct pf_rule *once_rule)

To get there the ruleset and arule have to be carried by once_rule itself.
Therefore the patch adds those members to pf_rule:
struct pf_ruleset   *myruleset
struct pf_rule  *myarule
SLIST_ENTRY(pf_rule) gcle
(the gcle is the garbage collector list link).

Patch sets myruleset as soon as rule gets inserted to ruleset in SIOCADDRULE
ioctl. The myarule is set in pf_test_rule(), when once rule is marked as
expired.

Don't forget to recompile all user-land bits (pfctl, proxies et. al.) when
you'll be testing the patch, since pf_rule structure gets changed.

regards
sasha

8<---8<---8<--8<

Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.958
diff -u -p -r1.958 pf.c
--- net/pf.c5 Dec 2015 14:58:06 -   1.958
+++ net/pf.c5 Dec 2015 22:09:42 -
@@ -298,6 +298,9 @@ RB_GENERATE(pf_state_tree, pf_state_key,
 RB_GENERATE(pf_state_tree_id, pf_state,
 entry_id, pf_state_compare_id);
 
+SLIST_HEAD(pf_rule_gcl, pf_rule)   pf_rule_gcl =
+   SLIST_HEAD_INITIALIZER(pf_rule_gcl);
+
 __inline int
 pf_addr_compare(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 {
@@ -1140,6 +1143,24 @@ pf_state_export(struct pfsync_state *sp,
 /* END state table stuff */
 
 void
+pf_purge_expired_rules(void)
+{
+   struct pf_rule  *r;
+
+   if (SLIST_EMPTY(_rule_gcl)) {
+   return;
+   }
+
+   rw_enter_write(_consistency_lock);
+   while ((r = SLIST_FIRST(_rule_gcl)) != NULL) {
+   SLIST_REMOVE(_rule_gcl, r, pf_rule, gcle);
+   KASSERT(r->rule_flag & PFRULE_EXPIRED);
+   pf_purge_rule(r);
+   }
+   rw_exit_write(_consistency_lock);
+}
+
+void
 pf_purge_thread(void *v)
 {
int nloops = 0, s;
@@ -1157,6 +1178,7 @@ pf_purge_thread(void *v)
if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
pf_purge_expired_fragments();
pf_purge_expired_src_nodes(0);
+   pf_purge_expired_rules();
nloops = 0;
}
 
@@ -3149,6 +3171,10 @@ pf_test_rule(struct pf_pdesc *pd, struct
ruleset = _main_ruleset;
r = TAILQ_FIRST(pf_main_ruleset.rules.active.ptr);
while (r != NULL) {
+   if (r->rule_flag & PFRULE_EXPIRED) {
+   r = TAILQ_NEXT(r, entries);
+   goto nextrule;
+   }
r->evaluations++;
PF_TEST_ATTRIB((pfi_kif_match(r->kif, pd->kif) == r->ifnot),
r->skip[PF_SKIP_IFP].ptr);
@@ -3447,8 +3473,11 @@ pf_test_rule(struct pf_pdesc *pd, struct
}
 #endif /* NPFSYNC > 0 */
 
-   if (r->rule_flag & PFRULE_ONCE)
-   pf_purge_rule(ruleset, r, aruleset, a);
+   if (r->rule_flag & PFRULE_ONCE) {
+   r->rule_flag |= PFRULE_EXPIRED;
+   r->myarule = a;
+   SLIST_INSERT_HEAD(_rule_gcl, r, gcle);
+   }
 
 #ifdef INET6
if (rewrite && skw->af != sks->af)
Index: net/pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.297
diff -u -p -r1.297 pf_ioctl.c
--- net/pf_ioctl.c  3 Dec 2015 13:30:18 -   1.297
+++ net/pf_ioctl.c  5 Dec 2015 22:09:42 -
@@ -301,12 +301,14 @@ pf_rm_rule(struct pf_rulequeue *rulequeu
 }
 
 void

Re: introducing ip_send()/ip6_send() to OpenBSD kernel

2015-12-03 Thread Alexandr Nedvedicky

Hello,

so after a feedback in a hackroom here is the third version of patch. The
summary of changes is as follows:
- ip*_send() function use softnettq to dispatch packet
- ip*_output() functions running in ip*_send_dispatch() are protected
  by KERNEL_LOCK() and run at SOFTNET spl level.
- mq_delist() is used to move packets from global queue to local queue
  for processing.

regards
sasha


8<---8<---8<--8<

Index: net/if.h
===
RCS file: /cvs/src/sys/net/if.h,v
retrieving revision 1.174
diff -u -p -r1.174 if.h
--- net/if.h3 Dec 2015 12:22:51 -   1.174
+++ net/if.h3 Dec 2015 17:06:32 -
@@ -484,4 +484,7 @@ int if_setlladdr(struct ifnet *, const u
 
 #endif /* __BSD_VISIBLE */
 
+#ifdef _KERNEL
+extern struct taskq *softnettq;
+#endif /* _KERNEL */
 #endif /* _NET_IF_H_ */
Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.956
diff -u -p -r1.956 pf.c
--- net/pf.c3 Dec 2015 14:05:28 -   1.956
+++ net/pf.c3 Dec 2015 17:06:32 -
@@ -2424,11 +2424,11 @@ pf_send_tcp(const struct pf_rule *r, sa_
 
switch (af) {
case AF_INET:
-   ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+   ip_send(m);
break;
 #ifdef INET6
case AF_INET6:
-   ip6_output(m, NULL, NULL, 0, NULL, NULL);
+   ip6_send(m);
break;
 #endif /* INET6 */
}
Index: netinet/ip_icmp.c
===
RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.149
diff -u -p -r1.149 ip_icmp.c
--- netinet/ip_icmp.c   2 Dec 2015 16:35:53 -   1.149
+++ netinet/ip_icmp.c   3 Dec 2015 17:06:33 -
@@ -854,7 +854,10 @@ icmp_send(struct mbuf *m, struct mbuf *o
printf("icmp_send dst %s src %s\n", dst, src);
}
 #endif
-   ip_output(m, opts, NULL, 0, NULL, NULL, 0);
+   if (opts != NULL)
+   m = ip_insertoptions(m, opts, );
+
+   ip_send(m);
 }
 
 u_int32_t
Index: netinet/ip_input.c
===
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.264
diff -u -p -r1.264 ip_input.c
--- netinet/ip_input.c  3 Dec 2015 15:12:59 -   1.264
+++ netinet/ip_input.c  3 Dec 2015 17:06:33 -
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -121,6 +122,8 @@ struct pool ipq_pool;
 
 struct ipstat ipstat;
 
+static struct mbuf_queue   ipsend_mq;
+
 void   ip_ours(struct mbuf *);
 intip_dooptions(struct mbuf *, struct ifnet *);
 intin_ouraddr(struct mbuf *, struct ifnet *, struct in_addr);
@@ -130,6 +133,8 @@ int ip_input_ipsec_fwd_check(struct mbuf
 intip_input_ipsec_ours_check(struct mbuf *, int);
 #endif /* IPSEC */
 
+static void ip_send_dispatch(void *);
+static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, 
_mq);
 /*
  * Used to save the IP options in case a protocol wants to respond
  * to an incoming packet over the same route if the packet got here
@@ -188,6 +193,8 @@ ip_init(void)
strlcpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc));
strlcpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth));
strlcpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
+
+   mq_init(_mq, 64, IPL_SOFTNET);
 }
 
 struct route ipforward_rt;
@@ -1752,3 +1759,26 @@ ip_savecontrol(struct inpcb *inp, struct
}
 }
 
+static void
+ip_send_dispatch(void *cx)
+{
+   struct mbuf *m;
+   struct mbuf_list ml;
+   int s;
+
+   mq_delist((struct mbuf_queue *)cx, );
+   s = splsoftnet();
+   KERNEL_LOCK();
+   while ((m = ml_dequeue()) != NULL) {
+   ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+   }
+   KERNEL_UNLOCK();
+   splx(s);
+}
+
+void
+ip_send(struct mbuf *m)
+{
+   mq_enqueue(_mq, m);
+   task_add(softnettq, _task);
+}
Index: netinet/ip_var.h
===
RCS file: /cvs/src/sys/netinet/ip_var.h,v
retrieving revision 1.60
diff -u -p -r1.60 ip_var.h
--- netinet/ip_var.h16 Jul 2015 21:14:21 -  1.60
+++ netinet/ip_var.h3 Dec 2015 17:06:33 -
@@ -180,6 +180,8 @@ void ip_freef(struct ipq *);
 voidip_freemoptions(struct ip_moptions *);
 int ip_getmoptions(int, struct ip_moptions *, struct mbuf **);
 voidip_init(void);
+struct mbuf*
+ip_insertoptions(struct mbuf *, struct mbuf *, int *);
 int ip_mforward(struct mbuf *, struct ifnet *);
 int ip_optcopy(struct ip *, struct ip *);
 int ip_output(struct mbuf *, struct mbuf *, struct route *, int,
@@ -191,6 +193,7 @@ struct in_ifaddr *
 ip_rtaddr(struct

Re: introducing ip_send()/ip6_send() to OpenBSD kernel

2015-12-03 Thread Alexandr Nedvedicky

Hello,

mikeb@ found a fundamental problem in my earlier patch. The ip_send() function
was using `softnettq` (softnet task queue) to dispatch packet via ip*_output(). 
Doing so is risky business as ip*_output() is not unlocked yet.

So new patch version introduces a new task: ipsendtq. The ipsendtq will be
running at IPL_SOFTNET and will be servicing both protocols (IPv4, IPv6).

The calls to ip*_output() functions will be guarded by KERNEL_LOCK().
diff below illustrates the change against earlier patch version. New
version of complete patch is appended to email.

diff -r 259e5a3a782a src/sys/netinet/ip_input.c
--- a/src/sys/netinet/ip_input.cThu Dec 03 09:40:12 2015 +0100
+++ b/src/sys/netinet/ip_input.cThu Dec 03 10:52:47 2015 +0100
@@ -122,6 +122,7 @@
 
 struct ipstat ipstat;
 
+struct taskq *ipsendtq;
 static struct mbuf_queue   ipsend_mq;
 
 void   ip_ours(struct mbuf *);
@@ -191,6 +192,7 @@
strlcpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
 
mq_init(_mq, 64, IPL_SOFTNET);
+   ipsendtq = taskq_create("ipsend", 1, IPL_SOFTNET, TASKQ_CANTSLEEP);
 }
 
 struct route ipforward_rt;
@@ -1754,13 +1756,16 @@
 {
struct mbuf *m;
 
-   while ((m = mq_dequeue((struct mbuf_queue *)cx)) != NULL)
+   while ((m = mq_dequeue((struct mbuf_queue *)cx)) != NULL) {
+   KERNEL_LOCK();
ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+   KERNEL_UNLOCK();
+   }
 }
 
regards
sasha

8<---8<---8<--8<

Index: net/if.h
===
RCS file: /cvs/src/sys/net/if.h,v
retrieving revision 1.174
diff -u -p -r1.174 if.h
--- net/if.h3 Dec 2015 12:22:51 -   1.174
+++ net/if.h3 Dec 2015 15:16:00 -
@@ -484,4 +484,7 @@ int if_setlladdr(struct ifnet *, const u
 
 #endif /* __BSD_VISIBLE */
 
+#ifdef _KERNEL
+extern struct taskq *softnettq;
+#endif /* _KERNEL */
 #endif /* _NET_IF_H_ */
Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.955
diff -u -p -r1.955 pf.c
--- net/pf.c3 Dec 2015 09:49:15 -   1.955
+++ net/pf.c3 Dec 2015 15:16:00 -
@@ -2424,11 +2424,11 @@ pf_send_tcp(const struct pf_rule *r, sa_
 
switch (af) {
case AF_INET:
-   ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+   ip_send(m);
break;
 #ifdef INET6
case AF_INET6:
-   ip6_output(m, NULL, NULL, 0, NULL, NULL);
+   ip6_send(m);
break;
 #endif /* INET6 */
}
Index: netinet/ip_icmp.c
===
RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.149
diff -u -p -r1.149 ip_icmp.c
--- netinet/ip_icmp.c   2 Dec 2015 16:35:53 -   1.149
+++ netinet/ip_icmp.c   3 Dec 2015 15:16:01 -
@@ -854,7 +854,10 @@ icmp_send(struct mbuf *m, struct mbuf *o
printf("icmp_send dst %s src %s\n", dst, src);
}
 #endif
-   ip_output(m, opts, NULL, 0, NULL, NULL, 0);
+   if (opts != NULL)
+   m = ip_insertoptions(m, opts, );
+
+   ip_send(m);
 }
 
 u_int32_t
Index: netinet/ip_input.c
===
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.263
diff -u -p -r1.263 ip_input.c
--- netinet/ip_input.c  2 Dec 2015 13:29:26 -   1.263
+++ netinet/ip_input.c  3 Dec 2015 15:16:01 -
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -121,11 +122,16 @@ struct pool ipq_pool;
 
 struct ipstat ipstat;
 
+struct taskq *ipsendtq;
+static struct mbuf_queue   ipsend_mq;
+
 void   ip_ours(struct mbuf *);
 intip_dooptions(struct mbuf *, struct ifnet *);
 intin_ouraddr(struct mbuf *, struct ifnet *, struct in_addr);
 void   ip_forward(struct mbuf *, struct ifnet *, int);
 
+static void ip_send_dispatch(void *);
+static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, 
_mq);
 /*
  * Used to save the IP options in case a protocol wants to respond
  * to an incoming packet over the same route if the packet got here
@@ -184,6 +190,9 @@ ip_init(void)
strlcpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc));
strlcpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth));
strlcpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
+
+   mq_init(_mq, 64, IPL_SOFTNET);
+   ipsendtq = taskq_create("ipsend", 1, IPL_SOFTNET, TASKQ_CANTSLEEP);
 }
 
 struct route ipforward_rt;
@@ -1742,3 +1751,21 @@ ip_savecontrol(struct inpcb *inp, struct
}
 }
 
+static void
+ip_send_dispatch(void *cx)
+{
+   struct mbuf *m;
+
+   while ((m = mq_dequeue((struct mbuf_queue *)cx)) != NULL) {
+   KERNEL_LOCK();
+

Re: introducing ip_send()/ip6_send() to OpenBSD kernel

2015-12-03 Thread Alexandr Nedvedicky

Hello,

below is final patch I'm going to commit. Summary of changes:
- softnettq declaration moved to net/if_var.h (by bluhm@)
- lock order swapped: KERNEL_LOCK() goes first followed
  by spl (by bluhm@)
- long line got fixed (by bluhm@)
- ip_insertoptions() prototype deleted in ip_output.c (by bluhm@)
- avoiding mix of tab/space at netinet6/ip6_var.h (by bluhm@)
- static at ip*_send_dispatch() got removed (by mpi@)
- ip*_send_dispatch() functions look more like if_input_process()
  now (by mpi@)

thanks and
regards
sasha

8<---8<---8<--8<

Index: net/if_var.h
===
RCS file: /cvs/src/sys/net/if_var.h,v
retrieving revision 1.62
diff -u -p -r1.62 if_var.h
--- net/if_var.h3 Dec 2015 16:27:32 -   1.62
+++ net/if_var.h3 Dec 2015 20:43:46 -
@@ -361,6 +361,7 @@ int niq_enlist(struct niqueue *, struct
 
 extern struct ifnet_head ifnet;
 extern unsigned int lo0ifidx;
+extern struct taskq *softnettq;
 
 void   if_start(struct ifnet *);
 void   if_start_barrier(struct ifnet *);
Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.956
diff -u -p -r1.956 pf.c
--- net/pf.c3 Dec 2015 14:05:28 -   1.956
+++ net/pf.c3 Dec 2015 20:43:47 -
@@ -2424,11 +2424,11 @@ pf_send_tcp(const struct pf_rule *r, sa_
 
switch (af) {
case AF_INET:
-   ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+   ip_send(m);
break;
 #ifdef INET6
case AF_INET6:
-   ip6_output(m, NULL, NULL, 0, NULL, NULL);
+   ip6_send(m);
break;
 #endif /* INET6 */
}
Index: netinet/ip_icmp.c
===
RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.149
diff -u -p -r1.149 ip_icmp.c
--- netinet/ip_icmp.c   2 Dec 2015 16:35:53 -   1.149
+++ netinet/ip_icmp.c   3 Dec 2015 20:43:47 -
@@ -854,7 +854,10 @@ icmp_send(struct mbuf *m, struct mbuf *o
printf("icmp_send dst %s src %s\n", dst, src);
}
 #endif
-   ip_output(m, opts, NULL, 0, NULL, NULL, 0);
+   if (opts != NULL)
+   m = ip_insertoptions(m, opts, );
+
+   ip_send(m);
 }
 
 u_int32_t
Index: netinet/ip_input.c
===
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.264
diff -u -p -r1.264 ip_input.c
--- netinet/ip_input.c  3 Dec 2015 15:12:59 -   1.264
+++ netinet/ip_input.c  3 Dec 2015 20:43:47 -
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -121,6 +122,8 @@ struct pool ipq_pool;
 
 struct ipstat ipstat;
 
+static struct mbuf_queue   ipsend_mq;
+
 void   ip_ours(struct mbuf *);
 intip_dooptions(struct mbuf *, struct ifnet *);
 intin_ouraddr(struct mbuf *, struct ifnet *, struct in_addr);
@@ -130,6 +133,8 @@ int ip_input_ipsec_fwd_check(struct mbuf
 intip_input_ipsec_ours_check(struct mbuf *, int);
 #endif /* IPSEC */
 
+static void ip_send_dispatch(void *);
+static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, 
_mq);
 /*
  * Used to save the IP options in case a protocol wants to respond
  * to an incoming packet over the same route if the packet got here
@@ -188,6 +193,8 @@ ip_init(void)
strlcpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc));
strlcpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth));
strlcpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
+
+   mq_init(_mq, 64, IPL_SOFTNET);
 }
 
 struct route ipforward_rt;
@@ -1752,3 +1759,27 @@ ip_savecontrol(struct inpcb *inp, struct
}
 }
 
+void
+ip_send_dispatch(void *xmq)
+{
+   struct mbuf_queue *mq = xmq;
+   struct mbuf *m;
+   struct mbuf_list ml;
+   int s;
+
+   mq_delist(mq, );
+   KERNEL_LOCK();
+   s = splsoftnet();
+   while ((m = ml_dequeue()) != NULL) {
+   ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
+   }
+   splx(s);
+   KERNEL_UNLOCK();
+}
+
+void
+ip_send(struct mbuf *m)
+{
+   mq_enqueue(_mq, m);
+   task_add(softnettq, _task);
+}
Index: netinet/ip_var.h
===
RCS file: /cvs/src/sys/netinet/ip_var.h,v
retrieving revision 1.60
diff -u -p -r1.60 ip_var.h
--- netinet/ip_var.h16 Jul 2015 21:14:21 -  1.60
+++ netinet/ip_var.h3 Dec 2015 20:43:47 -
@@ -180,6 +180,8 @@ void ip_freef(struct ipq *);
 voidip_freemoptions(struct ip_moptions *);
 int ip_getmoptions(int, struct ip_moptions *, struct mbuf **);
 voidip_init(void);
+struct mbuf*
+ip_insertoptions(struct mbuf *,

Re: pf unlink remove

2015-12-02 Thread Alexandr Nedvedicky

Hello,

OK

sasha

Re: removing expired once rules in pf_purge_thread()

2015-12-18 Thread Alexandr Nedvedicky

Hello Richard,

> > What has to be granted is there is one 'winner' only, which puts the 
> > rule to garbage collector list. The pragmatic approach wins here.
> 
> Right. I'll just note though that the patch as it stands allows multiple 
> winners: consider the window between testing PFRULE_EXPIRED and setting it 
> (quoted below). Multiple threads may execute in that window before one of 
> them closes it by setting PFRULE_EXPIRED. Each of them will delete the 
> same rule in turn, causing a probable crash. At least,
> 
> SLIST_INSERT_HEAD(_rule_gcl, r, gcle);
> SLIST_INSERT_HEAD(_rule_gcl, r, gcle);
> 
> crashes my machine. 
> 
> Whether that's a realistic issue, I don't know. I have though been bitten 
> by enough edge cases like this to be very wary of them.

I think it's not realistic with current PF at OpenBSD. The pf_test() function
does not run concurrently, so there can be no such race.

FYI: PF@solaris uses mutex there to protect the insertion to gcl. Code goes as
follows at Solaris:

if (!(r->rule_flags & PFRULE_EXPIRED)) {
mutex_enter(_mutex);
/* retest under exclusive protection */
if (!(r->rule_flags & PFRULE_EXPIRED)) {
r->rule_flag |= PFRULE_EXPIRED;
SLIST_INSERT_HEAD(_rule_gcl, r, gcle);
}
}

regards
sasha

Re: removing expired once rules in pf_purge_thread()

2015-12-15 Thread Alexandr Nedvedicky

Hello,

> It just occurred to me that another possibility would be a match-only
> rule that matches one but doesn't involve any purging machinery.  Right
> now we install ftp-proxy rules as having maximum number of states equal
> to 1, however there's a time window between the moment the state is no
> longer there, but anchor is not gone yet and it can potentially create
> more states which is not a problem for an eavesdropper who can sniff
> the plaintext protocol.

I'm not sure I'm following you. IMO the expire flag should solve that problem.
As soon as rule is marked as expired it can not be matched by packet any more.

I'm attaching yet another version of patch. The list of changes to
patch sent by Richard is as follows:

- atomic operations are gone as we don't need them
  right now. those will be introduced as soon as we will get
  there. pf_test()/pf_test_rule() is still processing a single
  packet at a time.

- it's better to use TAILQ_EMPTY() instead of checking link
  members to determine if last rule is being removed from   
  anchor `a`

- I've also realized we have to call pf_purge_expired_rules()
  from pf_commit_rules(). We have to make sure expired rules are gone
  from active lists. I think this is the best approach for pf@Puffy
  currently. It will be changed as soon as ruleset locks and
  reference counting for rules will be merged in. I had to also
  reshuffle a consistency lock a bit in pf_purge_thread().

- last but not least: silly name got renamed (s/myruleset/ruleset)

so what do you think? OK?

regards
sasha

-8<8<8<---8<
Index: pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.960
diff -u -p -r1.960 pf.c
--- pf.c6 Dec 2015 10:03:23 -   1.960
+++ pf.c16 Dec 2015 00:52:22 -
@@ -295,6 +295,9 @@ RB_GENERATE(pf_state_tree, pf_state_key,
 RB_GENERATE(pf_state_tree_id, pf_state,
 entry_id, pf_state_compare_id);
 
+SLIST_HEAD(pf_rule_gcl, pf_rule)   pf_rule_gcl =
+   SLIST_HEAD_INITIALIZER(pf_rule_gcl);
+
 __inline int
 pf_addr_compare(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 {
@@ -1137,6 +1140,18 @@ pf_state_export(struct pfsync_state *sp,
 /* END state table stuff */
 
 void
+pf_purge_expired_rules(void)
+{
+   struct pf_rule  *r;
+
+   while ((r = SLIST_FIRST(_rule_gcl)) != NULL) {
+   SLIST_REMOVE(_rule_gcl, r, pf_rule, gcle);
+   KASSERT(r->rule_flag & PFRULE_EXPIRED);
+   pf_purge_rule(r);
+   }
+}
+
+void
 pf_purge_thread(void *v)
 {
int nloops = 0, s;
@@ -1154,6 +1169,11 @@ pf_purge_thread(void *v)
if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
pf_purge_expired_fragments();
pf_purge_expired_src_nodes(0);
+   if (!SLIST_EMPTY(_rule_gcl)) {
+   rw_enter_write(_consistency_lock);
+   pf_purge_expired_rules();
+   rw_exit_write(_consistency_lock);
+   }
nloops = 0;
}
 
@@ -3135,6 +3155,10 @@ pf_test_rule(struct pf_pdesc *pd, struct
ruleset = _main_ruleset;
r = TAILQ_FIRST(pf_main_ruleset.rules.active.ptr);
while (r != NULL) {
+   if (r->rule_flag & PFRULE_EXPIRED) {
+   r = TAILQ_NEXT(r, entries);
+   goto nextrule;
+   }
r->evaluations++;
PF_TEST_ATTRIB((pfi_kif_match(r->kif, pd->kif) == r->ifnot),
r->skip[PF_SKIP_IFP].ptr);
@@ -3433,8 +3457,15 @@ pf_test_rule(struct pf_pdesc *pd, struct
}
 #endif /* NPFSYNC > 0 */
 
-   if (r->rule_flag & PFRULE_ONCE)
-   pf_purge_rule(ruleset, r, aruleset, a);
+   if (r->rule_flag & PFRULE_ONCE) {
+   if ((a != NULL) && TAILQ_EMPTY(a->ruleset->rules.active.ptr)) {
+   a->rule_flag |= PFRULE_EXPIRED;
+   SLIST_INSERT_HEAD(_rule_gcl, a, gcle);
+   }
+
+   r->rule_flag |= PFRULE_EXPIRED;
+   SLIST_INSERT_HEAD(_rule_gcl, r, gcle);
+   }
 
 #ifdef INET6
if (rewrite && skw->af != sks->af)
Index: pf_ioctl.c
===
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.297
diff -u -p -r1.297 pf_ioctl.c
--- pf_ioctl.c  3 Dec 2015 13:30:18 -   1.297
+++ pf_ioctl.c  16 Dec 2015 00:52:25 -
@@ -301,12 +301,13 @@ pf_rm_rule(struct pf_rulequeue *rulequeu
 }
 
 void
-pf_purge_rule(struct pf_ruleset *ruleset, struct pf_rule *rule,
-struct pf_ruleset *aruleset, struct pf_rule *arule)
+pf_purge_rule(struct pf_rule *rule)

Re: removing expired once rules in pf_purge_thread()

2015-12-16 Thread Alexandr Nedvedicky

hello,


> > > It just occurred to me that another possibility would be a match-only
> > > rule that matches one but doesn't involve any purging machinery.  Right
> > > now we install ftp-proxy rules as having maximum number of states equal
> > > to 1, however there's a time window between the moment the state is no
> > > longer there, but anchor is not gone yet and it can potentially create
> > > more states which is not a problem for an eavesdropper who can sniff
> > > the plaintext protocol.
> > 
> > I'm not sure I'm following you. IMO the expire flag should solve
> > that problem. As soon as rule is marked as expired it can not be
> > matched by packet any more.
> >
> 
> Sure.  I'm just saying that we could change the logic and remove
> the purging part, while keeping only the "match once".
> 
that might be other possibility. However I would give a try to
garbage collection.

> 
> Please follow the example of pf_purge_expired_src_nodes and fold
> all of this including the rwlocks inside pf_purge_expired_rules.
> 

fixed

> 
> I like the generalised approach to a/r purging.  I have a few
> questions however:
> 
> Since you're not removing the rule from the ruleset at this point
> what does "pfctl -sr" display in this case? 

new version of the patch makes pfctl -sr aware of PFRULE_EXPIRED
flag. The expired rule will be shown in verbose mode only.

> What happens if you
> list the anchor?  And what should the user see?
> 
> Should we omit exporting those rules into userland via ioctl?

I think we should export those rules to userland and let userland
to interpret what it gets from kernel. Changing DIOCGETRULE might
not be that easy.

> 
> What happens with pfsync in this case?

I keep forgetting about pfsync...

After looking at current pf_purge_rule() implementation in CVS it
seems to me it just does not care about pfsync at all.

The patch I'd like to commit keeps things same with respect to pfsync.

from my very limited understanding of pfsync, only main rulesets are synced up
between nodes. When pf_commit_rules() loads new rules to active list it calls
pf_setup_pfsync_matching(). the function calculates a checksum on ruleset.
It's still not clear to me how/when (if ever) are rules synced up.

thanks and
regards
sasha
 
8<---8<---8<--8<

Index: sys/net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.960
diff -u -p -r1.960 pf.c
--- sys/net/pf.c6 Dec 2015 10:03:23 -   1.960
+++ sys/net/pf.c16 Dec 2015 16:28:28 -
@@ -295,6 +295,9 @@ RB_GENERATE(pf_state_tree, pf_state_key,
 RB_GENERATE(pf_state_tree_id, pf_state,
 entry_id, pf_state_compare_id);
 
+SLIST_HEAD(pf_rule_gcl, pf_rule)   pf_rule_gcl =
+   SLIST_HEAD_INITIALIZER(pf_rule_gcl);
+
 __inline int
 pf_addr_compare(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 {
@@ -1137,6 +1140,27 @@ pf_state_export(struct pfsync_state *sp,
 /* END state table stuff */
 
 void
+pf_purge_expired_rules(int locked)
+{
+   struct pf_rule  *r;
+
+   if (SLIST_EMPTY(_rule_gcl))
+   return;
+
+   if (!locked)
+   rw_enter_write(_consistency_lock);
+
+   while ((r = SLIST_FIRST(_rule_gcl)) != NULL) {
+   SLIST_REMOVE(_rule_gcl, r, pf_rule, gcle);
+   KASSERT(r->rule_flag & PFRULE_EXPIRED);
+   pf_purge_rule(r);
+   }
+
+   if (!locked)
+   rw_exit_write(_consistency_lock);
+}
+
+void
 pf_purge_thread(void *v)
 {
int nloops = 0, s;
@@ -1154,6 +1178,7 @@ pf_purge_thread(void *v)
if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
pf_purge_expired_fragments();
pf_purge_expired_src_nodes(0);
+   pf_purge_expired_rules(0);
nloops = 0;
}
 
@@ -3135,6 +3160,10 @@ pf_test_rule(struct pf_pdesc *pd, struct
ruleset = _main_ruleset;
r = TAILQ_FIRST(pf_main_ruleset.rules.active.ptr);
while (r != NULL) {
+   if (r->rule_flag & PFRULE_EXPIRED) {
+   r = TAILQ_NEXT(r, entries);
+   goto nextrule;
+   }
r->evaluations++;
PF_TEST_ATTRIB((pfi_kif_match(r->kif, pd->kif) == r->ifnot),
r->skip[PF_SKIP_IFP].ptr);
@@ -3433,8 +3462,15 @@ pf_test_rule(struct pf_pdesc *pd, struct
}
 #endif /* NPFSYNC > 0 */
 
-   if (r->rule_flag & PFRULE_ONCE)
-   pf_purge_rule(ruleset, r, aruleset, a);
+   if (r->rule_flag & PFRULE_ONCE) {
+   if ((a != NULL) && TAILQ_EMPTY(a->ruleset->rules.active.ptr)) {
+   a->rule_flag |= PFRULE_EXPIRED;
+   SLIST_INSERT_HEAD(_rule_gcl, a, gcle);
+   }
+
+   r->rule_flag |= PFRULE_EXPIRED;
+

Re: removing expired once rules in pf_purge_thread()

2015-12-15 Thread Alexandr Nedvedicky

Hello,

> >
> >Another possibility would be to require 'once' rules to be 'quick'.
> >This closes the candidacy window and makes its serialisation, to
> >preclude multiple matches, more feasible.
> >
> >Yet another possibility is to drop 'once' rules as too complex to
> >implement for multiprocessor but I have no idea if this is viable.
> >
> 
> It is.  And I have said that before with an authority of the implementor
> of "once" rules: since we don't have any userland applications that
> would use this yet, we can ditch them for now and possibly devise a
> better approach later.
> 
> Don't make your lives harder than they have to be!
> 

I'm just trying out patch improved by Richard. I like his idea to put anchor
rule to garbage collector list. I'd like to give it a try on Solaris. There
might be small changes in details.

IMO once rules are fine, and offloading hard task to service/garbage collector
thread works just fine.

regards
sasha

Re: removing expired once rules in pf_purge_thread()

2015-12-16 Thread Alexandr Nedvedicky

Hello,


On Wed, Dec 16, 2015 at 02:48:49PM +1300, Richard Procter wrote:
> 
> On Tue, 15 Dec 2015, Mike Belopuhov wrote:
> 
> > >Yet another possibility is to drop 'once' rules as too complex to
> > >implement for multiprocessor but I have no idea if this is viable.
> > 
> > It is.  And I have said that before with an authority of the implementor
> > of "once" rules: since we don't have any userland applications that
> > would use this yet, we can ditch them for now and possibly devise a
> > better approach later.
>  
> > Don't make your lives harder than they have to be!
> 
> I tend to agree! And I can't see a way to reimplement it for a 
> multithreaded pf without introducing downsides.
> 
> Quoting pf.conf(5) "once - Creates a one shot rule that will remove itself 
> from an active ruleset after the first match."
> 
> A 'first' match presupposes a total ordering. This comes for free when pf 
> is single-threaded but concurrent rule matching must take the trouble to 
> re-impose it somewhere. (I assume that pf aims to do concurrent matching 
> of packets against the ruleset.)

pf@solaris allows the race here. The price for correct behavior, which is to
allow one and only one packet to hit the rule, is too high (given the once rules
are kind of a special case). What has to be guaranteed is that there is only one
'winner', which puts the rule on the garbage collector list. The pragmatic approach
wins here.

regards
sasha

PF: reference counting for statekey

2016-01-03 Thread Alexandr Nedvedicky

Hello,

there is a sad story behind the patch below. I committed the change on Dec 22nd.
Unfortunately many people who run snapshots experienced panic on assert. Markus
Lude has been the first who reported the issue:

!pf_state_key_isvalid(sk)" failed: file "../../../../net/pf.c", line 6830

Obviously the change delivered just before Christmas was not complete.
Unfortunately I did not follow up promptly, and when I tried to commit the fix
crafted by Stefan Kempf I realized the offending commit had been backed out on
Dec 23rd (thanks jasper).

Now let me share some technical details. If a packet matches/creates state in PF,
PF puts a pointer to the state key into the mbuf (->m_pkthdr.pf.statekey). In the MP
world sharing a pointer like that is a bad idea. The statekey might expire/get
purged while the packet (mbuf) is routed via the IP stack, and the mbuf is left with
a pointer to dead memory.

My commit from Dec 22nd solved the issue by introducing reference counting;
the ->m_pkthdr.pf.statekey is no longer a pointer but a reference. Instead of
simply assigning a pointer to statekey one has to always do:

pf_pkt_state_key_ref(sk);
->m_pkthdr.pf.statekey = sk;

Various functions in kern/uipc_mbuf.c must take care of statekey reference
handling. My commit from the 22nd forgot about m_dup_pkthdr(). Anytime the
mbuf header got copied, the function just stole the reference instead of
obtaining a new reference for the copy. This has been fixed by Stefan Kempf,
who kindly sent the patch to me in a private email.

I failed to discover bug in my patch during testing. This time I tried better,
but still can't be sure. My test environment is rather virtual. I'll be happy
if there will be few more testers.

thanks and
regards
sasha

8<---8<---8<--8<
Index: kern/uipc_mbuf.c
===
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.216
diff -u -p -r1.216 uipc_mbuf.c
--- kern/uipc_mbuf.c23 Dec 2015 21:04:55 -  1.216
+++ kern/uipc_mbuf.c3 Jan 2016 14:40:29 -
@@ -72,6 +72,8 @@
  * Research Laboratory (NRL).
  */
 
+#include "pf.h"
+
 #include 
 #include 
 #include 
@@ -85,6 +87,9 @@
 #include 
 #include 
 #include 
+#if NPF > 0
+#include 
+#endif /* NPF > 0 */
 
 
 #include 
@@ -261,6 +266,10 @@ m_resethdr(struct mbuf *m)
/* delete all mbuf tags to reset the state */
m_tag_delete_chain(m);
 
+#if NPF > 0
+   pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+
/* like m_inithdr(), but keep any associated data and mbufs */
memset(>m_pkthdr, 0, sizeof(m->m_pkthdr));
m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
@@ -350,8 +359,12 @@ m_free(struct mbuf *m)
if (n)
n->m_flags |= M_ZEROIZE;
}
-   if (m->m_flags & M_PKTHDR)
+   if (m->m_flags & M_PKTHDR) {
m_tag_delete_chain(m);
+#if NPF > 0
+   pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+   }
if (m->m_flags & M_EXT)
m_extfree(m);
 
@@ -1201,6 +1214,10 @@ m_dup_pkthdr(struct mbuf *to, struct mbu
to->m_flags = (to->m_flags & (M_EXT | M_EXTWR));
to->m_flags |= (from->m_flags & M_COPYFLAGS);
to->m_pkthdr = from->m_pkthdr;
+
+#if NPF > 0
+   pf_pkt_state_key_ref(to);
+#endif /* NPF > 0 */
 
SLIST_INIT(>m_pkthdr.ph_tags);
 
Index: sys/mbuf.h
===
RCS file: /cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.205
diff -u -p -r1.205 mbuf.h
--- sys/mbuf.h  21 Nov 2015 11:46:25 -  1.205
+++ sys/mbuf.h  3 Jan 2016 14:40:30 -
@@ -316,6 +316,7 @@ struct mbuf {
(to)->m_pkthdr = (from)->m_pkthdr;  \
(from)->m_flags &= ~M_PKTHDR;   \
SLIST_INIT(&(from)->m_pkthdr.ph_tags);  \
+   (from)->m_pkthdr.pf.statekey = NULL;\
 } while (/* CONSTCOND */ 0)
 
 /*
Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.962
diff -u -p -r1.962 pf.c
--- net/pf.c23 Dec 2015 21:04:55 -  1.962
+++ net/pf.c3 Jan 2016 14:40:33 -
@@ -231,6 +231,11 @@ int pf_step_out_of_anchor(int *, 
stru
 voidpf_counters_inc(int, struct pf_pdesc *,
struct pf_state *, struct pf_rule *,
struct pf_rule *);
+voidpf_state_key_link(struct pf_state_key *,
+   struct pf_state_key *);
+voidpf_inpcb_unlink_state_key(struct inpcb *);
+voidpf_state_key_unlink_reverse(struct pf_state_key *);
+
 #if NPFLOG > 0
 voidpf_log_matches(struct pf_pdesc *, struct pf_rule *,
struct pf_rule *, struct pf_ruleset *,
@@

Re: Interesting if_get() case

2015-11-20 Thread Alexandr Nedvedicky

Hello,

thanks for detailed explanation.

> + else {
> + struct ifnet *destifp;
> +
> + destifp = if_get(rt->rt_ifidx);
> + if (destifp != NULL)
> + destmtu = destifp->if_mtu;
> + if_put(destifp);
> + }

your code potentially leaves destmtu set to 0 in case we deal with an invalid
ipforward_rt. I wonder how icmp_error(), which we are going to call further below
(at line 1544 in the old code), is going to deal with it.  Maybe we
should just give up on sending the ICMP_UNREACH message in this case.
Find my small improvement to your patch further below.

regards
sasha

8<---8<---8<--8<

Warning: Permanently added 'anoncvs.spacehopper.org' (ECDSA) to the list of 
known hosts.
Index: ip_input.c
===
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.261
diff -u -p -r1.261 ip_input.c
--- ip_input.c  14 Nov 2015 15:40:40 -  1.261
+++ ip_input.c  20 Nov 2015 15:48:47 -
@@ -1516,11 +1516,24 @@ ip_forward(struct mbuf *m, struct ifnet

if (rt->rt_rmx.rmx_mtu)
destmtu = rt->rt_rmx.rmx_mtu;
-   else
-   destmtu = ipforward_rt.ro_rt->rt_ifp->if_mtu;
+   else {
+   struct ifnet *destifp;
+
+   destifp = if_get(rt->rt_ifidx);
+   if (destifp != NULL)
+   destmtu = destifp->if_mtu;
+   if_put(destifp);
+   }
}
 #endif /*IPSEC*/
ipstat.ips_cantfrag++;
+
+   /*
+* route to destniation no longer exists, we should revert code
+* back to host unreachable.
+*/
+   if (destmtu == 0)
+   code = ICMP_UNREACH_HOST;
break;

case EACCES:

Re: rt_ifp and icmp

2015-11-20 Thread Alexandr Nedvedicky

On Fri, Nov 20, 2015 at 02:31:23PM +0100, Martin Pieuchot wrote:
> The first rt_ifp of the day, make use of if_get() inside icmp_mtudisc().
> 
> ok?
> 

OK (however, you should probably wait for an OK from someone more experienced)

BTW I like the way rt_ifidx is defined, it's a smart way to move forward safely.

regards
sasha

Re: rt_ifp and icmp6

2015-11-20 Thread Alexandr Nedvedicky

Hello,

OK

regards
sasha

On Fri, Nov 20, 2015 at 02:33:44PM +0100, Martin Pieuchot wrote:
> Some if_get() love in icmp6.
> 
> ok?
>

Re: rt_ifp and pf(4)

2015-11-20 Thread Alexandr Nedvedicky

Hello,

I have just nit comments, feel free to ignore them.

1) would it be possible to use closing #endif guards as follows:
#endif  /* NCARP */

2) another nit here:
> @@ -5607,6 +5616,8 @@ pf_route(struct mbuf **m, struct pf_rule
>  done:
>   if (r->rt != PF_DUPTO)
>   *m = NULL;
> + if (!r->rt)
> + if_put(ifp);
>   rtfree(rt);
>   return;
>  

I would probably use test as follows:
if (rt != NULL)
if_put(ifp);

it's a detail, given that at some point in the future we will probably have to
use if_put()/if_get() for ifp's bound to kif's (right?).  My point here is that
we call if_get() after we successfully obtain rt.

Apart from those two everything is OK; since those two are nits, it's up to you
whether you go for my suggestions.

regards
sasha
On Thu, Nov 19, 2015 at 12:07:38PM +0100, Martin Pieuchot wrote:
> Stop using rt_ifp.  While here put some NCARP... ok?
> 
> Index: net/pf.c
> ===
> RCS file: /cvs/src/sys/net/pf.c,v
> retrieving revision 1.950
> diff -u -p -r1.950 pf.c
> --- net/pf.c  12 Nov 2015 10:07:14 -  1.950
> +++ net/pf.c  19 Nov 2015 11:05:37 -
> @@ -36,6 +36,7 @@
>   */
>  
>  #include "bpfilter.h"
> +#include "carp.h"
>  #include "pflog.h"
>  #include "pfsync.h"
>  #include "pflow.h"
> @@ -2595,9 +2596,11 @@ pf_match_rcvif(struct mbuf *m, struct pf
>   if (ifp == NULL)
>   return (0);
>  
> +#if NCARP > 0
>   if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
>   kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
>   else
> +#endif
>   kif = (struct pfi_kif *)ifp->if_pf_kif;
>  
>   if_put(ifp);
> @@ -5347,7 +5350,6 @@ pf_routable(struct pf_addr *addr, sa_fam
>   struct sockaddr_in6 *dst6;
>  #endif   /* INET6 */
>   struct rtentry  *rt, *rt0 = NULL;
> - struct ifnet*ifp;
>  
>   check_mpath = 0;
>   memset(, 0, sizeof(ss));
> @@ -5397,13 +5399,20 @@ pf_routable(struct pf_addr *addr, sa_fam
>   ret = 0;
>   rt = rt0;
>   do {
> - if (rt->rt_ifp->if_type == IFT_CARP)
> - ifp = rt->rt_ifp->if_carpdev;
> - else
> - ifp = rt->rt_ifp;
> -
> - if (kif->pfik_ifp == ifp)
> + if (rt->rt_ifidx == kif->pfik_ifp->if_index) {
>   ret = 1;
> +#if NCARP > 0
> + } else {
> + struct ifnet*ifp;
> +
> + ifp = if_get(rt->rt_ifidx);
> + if (ifp != NULL && ifp->if_type == IFT_CARP &&
> + ifp->if_carpdev == kif->pfik_ifp)
> + ret = 1;
> + if_put(ifp);
> +#endif
> + }
> +
>  #ifndef SMALL_KERNEL
>   rt = rtable_mpath_next(rt);
>  #else
> @@ -5512,7 +5521,7 @@ pf_route(struct mbuf **m, struct pf_rule
>   goto bad;
>   }
>  
> - ifp = rt->rt_ifp;
> + ifp = if_get(rt->rt_ifidx);
>  
>   if (rt->rt_flags & RTF_GATEWAY)
>   dst = satosin(rt->rt_gateway);
> @@ -5607,6 +5616,8 @@ pf_route(struct mbuf **m, struct pf_rule
>  done:
>   if (r->rt != PF_DUPTO)
>   *m = NULL;
> + if (!r->rt)
> + if_put(ifp);
>   rtfree(rt);
>   return;
>  
> @@ -6312,9 +6323,11 @@ pf_test(sa_family_t af, int fwdir, struc
>   if (!pf_status.running)
>   return (PF_PASS);
>  
> +#if NCARP > 0
>   if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
>   kif = (struct pfi_kif *)ifp->if_carpdev->if_pf_kif;
>   else
> +#endif
>   kif = (struct pfi_kif *)ifp->if_pf_kif;
>  
>   if (kif == NULL) {
>

Re: Interesting if_get() case

2015-11-22 Thread Alexandr Nedvedicky

Hello,

> > 
> > > + else {
> > > + struct ifnet *destifp;
> > > +
> > > + destifp = if_get(rt->rt_ifidx);
> > > + if (destifp != NULL)
> > > + destmtu = destifp->if_mtu;
> > > + if_put(destifp);
> > > + }
> > 
> > your code potentially leaves destmtu set to 0 in case we deal with invalid
> > ipforward_rt. I wonder how icmp_error() we are going to call further below
> > (at line 1544 in the old code) is going to deal with it.  May be we
> > should just give up on sending ICMP_UNREACH message in this case.
> > find my small improvement to your patch further below.
> 
> Note that my change should only matter for IPSEC.  This whole MTU mess is
> scary. I can't even tell if it makes sense to send an ICMP if the ifp
> disappeared.  The use of if_get(9) makes it clear that the MTU handling
> is insane.
> 
> So I'm not opposed to your change but I think you should either commit
> it separately or keep it inside the "if (ipforward_rt.ro_rt) {" block,
> because it's a behavior change.
> 

IMO we should not let IPSEC send an ICMP with a 0 MTU proposal. It is probably
better to send nothing in this case and let the sender retransmit.

Also, if I understand the code well enough, the proposed MTU could not be zero
before your change. However, given that the likelihood someone triggers the case
of an invalidated route is very low (more an academic than a practical
problem...), I agree with a follow-up commit.

We should probably check with mikeb here.

regards
sasha

Re: rt_ifp and pf(4)

2015-11-22 Thread Alexandr Nedvedicky

On Sat, Nov 21, 2015 at 12:37:58PM +0100, Martin Pieuchot wrote:
> On 20/11/15(Fri) 18:05, Alexandr Nedvedicky wrote:
> > Hello,
> > 
> > I have just nit comments, feel free to ignore them.
> > 
> > 1) would it be possible to use closing #endif guards as follows:
> > #endif  /* NCARP */
> 
> Done.
> 

thanks a lot. I'm fine with your patch.

OK

regards
sasha

> > it's detail, given at some point in future we will probably have to use
> > if_put()/if_get() for ifp's bound to kif's (right?).
> 
> Well not necessarily.  kif have the same lifetime as an ifp and are easy
> to garbage collect so there's no need for the moment to use index for
> them.  Indexes are useful when two objects having a different lifetime
> need to be linked at some point.  By using indexes/id/cookie we do not 
> increase the lifetime of an object like with references but this has the
> cost of a layer of indirection.

and thanks for clarification here.

small glitch in pfctl_show_rules()

2016-05-30 Thread Alexandr Nedvedicky

Hello,

Petr Hoffmann discovered a glitch in the 'pfctl -a "*" -sr' command when it is
recursively dumping rulesets from the PF kernel module. The ruleset in the kernel
was created by the screw-it.sh shell script:

#!/bin/sh

pfctl -d
echo 'anchor "../bar/*"\nanchor "bar/*"' |pfctl -a foo -f -
echo pass | pfctl -f - -a foo/bar
echo 'anchor "foo"\nanchor "bar" ' | pfctl -f -

Now if we use 'pfctl -a "*" -sr' we get output as follows:

# pfctl -a "*" -sr
anchor "foo" all {
  anchor "../bar/*" all {
pfctl: DIOCGETRULES: Invalid argument
  }
  anchor "bar/*" all {
pass all flags S/SA
  }
}
anchor "bar" all {
}

The problem comes from pfctl_show_rules(), which is going to descend
to "../bar/" anchor at line 845:

 828 case PFCTL_SHOW_RULES:
 829 if (pr.rule.label[0] && (opts & PF_OPT_SHOWALL))
 830 labels = 1;
 839 if (pr.anchor_call[0] &&
 ...
 840 (((p = strrchr(pr.anchor_call, '/')) ?
 841 p[1] == '_' : pr.anchor_call[0] == '_') ||
 842 opts & PF_OPT_RECURSE)) {
 843 printf(" {\n");
 844 pfctl_print_rule_counters(, opts);
 845 pfctl_show_rules(dev, npath, opts, format,
 846 pr.anchor_call, depth + 1,
 847 pr.rule.anchor_wildcard, -1);
 848 INDENT(depth, !(opts & PF_OPT_VERBOSE));
 849 printf("}\n");

The simplest fix here is not to allow pfctl_show_rules() to descend
into an anchor specified by a relative path.

patched pfctl gives output as follows:

# ./pfctl -a "*" -sr 
anchor "foo" all {
  anchor "../bar/*" all
  anchor "bar/*" all {
pass all flags S/SA
  }
}
anchor "bar" all {
}

OK?

thanks a lot
regards
sasha

8<---8<---8<--8<
diff -r d0c6296079db src/sbin/pfctl/pfctl.c
--- a/src/sbin/pfctl/pfctl.cMon May 30 20:12:24 2016 +0200
+++ b/src/sbin/pfctl/pfctl.cMon May 30 20:41:11 2016 +0200
@@ -835,11 +835,14 @@
 * If this is an 'unnamed' brace notation anchor OR
 * the user has explicitly requested recursion,
 * print it recursively.
+*
+* Note: we don't descend to relative anchors.
 */
if (pr.anchor_call[0] &&
(((p = strrchr(pr.anchor_call, '/')) ?
p[1] == '_' : pr.anchor_call[0] == '_') ||
-   opts & PF_OPT_RECURSE)) {
+   ((opts & PF_OPT_RECURSE) &&
+   (strncmp(pr.anchor_call, "../", 3) != 0 {
printf(" {\n");
pfctl_print_rule_counters(, opts);
pfctl_show_rules(dev, npath, opts, format,

KASSERT() @ pf_test() is back

2016-02-07 Thread Alexandr Nedvedicky

Hello,

I don't expect to see an OK for the patch below.

The patch is part of the change which was backed out last weekend.
Too many things were wrong, so I'm trying to untangle the code a bit.

The patch below is for brave hearts who don't mind seeing a KASSERT() fire:

Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.965
diff -u -p -r1.965 pf.c
--- net/pf.c31 Jan 2016 00:18:07 -  1.965
+++ net/pf.c7 Feb 2016 23:38:16 -
@@ -6534,6 +6534,12 @@ done:
if (action == PF_PASS && qid)
pd.m->m_pkthdr.pf.qid = qid;
if (pd.dir == PF_IN && s && s->key[PF_SK_STACK]) {
+   /*
+* ASSERT() below fires whenever caller forgets to call
+* pf_pkt_addr_changed(). This might happen when we deal with
+* IP tunnels.
+*/
+   KASSERT(pd.m->m_pkthdr.pf.statekey == NULL);
pd.m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
}
if (pd.dir == PF_OUT &&

before we get any further with unlocking PF, we need to be sure what packet
handling looks like at the 'global level'. Besides the change which re-introduces
the KASSERT(), I'm adding a few more changes in places where .pf.statekey should
be removed from the mbuf, so the KASSERT() won't fire.

My plan is to commit the patch as soon as CVS is unlocked. The change won't
appear in 5.9. I hope to see some KASSERT() panic stories now.

thanks a lot
regards
sasha

8<---8<---8<--8<

Index: net/if_etherip.c
===
RCS file: /cvs/src/sys/net/if_etherip.c,v
retrieving revision 1.5
diff -u -p -r1.5 if_etherip.c
--- net/if_etherip.c25 Jan 2016 05:12:34 -  1.5
+++ net/if_etherip.c7 Feb 2016 23:38:09 -
@@ -499,6 +499,10 @@ ip_etherip_input(struct mbuf *m, ...)
}
m->m_flags &= ~(M_BCAST|M_MCAST);
 
+#if NPF > 0
+   pf_pkt_addr_changed(m);
+#endif
+
ml_enqueue(, m);
if_input(ifp, );
 }
@@ -641,6 +645,10 @@ ip6_etherip_input(struct mbuf **mp, int 
}
 
m->m_flags &= ~(M_BCAST|M_MCAST);
+
+#if NPF > 0
+   pf_pkt_addr_changed(m);
+#endif
 
ml_enqueue(, m);
if_input(ifp, );
Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.965
diff -u -p -r1.965 pf.c
--- net/pf.c31 Jan 2016 00:18:07 -  1.965
+++ net/pf.c7 Feb 2016 23:38:16 -
@@ -6534,6 +6534,12 @@ done:
if (action == PF_PASS && qid)
pd.m->m_pkthdr.pf.qid = qid;
if (pd.dir == PF_IN && s && s->key[PF_SK_STACK]) {
+   /*
+* ASSERT() below fires whenever caller forgets to call
+* pf_pkt_addr_changed(). This might happen when we deal with
+* IP tunnels.
+*/
+   KASSERT(pd.m->m_pkthdr.pf.statekey == NULL);
pd.m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
}
if (pd.dir == PF_OUT &&
Index: net/pipex.c
===
RCS file: /cvs/src/sys/net/pipex.c,v
retrieving revision 1.84
diff -u -p -r1.84 pipex.c
--- net/pipex.c 3 Nov 2015 21:33:56 -   1.84
+++ net/pipex.c 7 Feb 2016 23:38:18 -
@@ -1139,6 +1139,10 @@ pipex_ip_input(struct mbuf *m0, struct p
goto drop;
}
 
+#if NPF > 0
+   pf_pkt_addr_changed(m0);
+#endif
+
len = m0->m_pkthdr.len;
 
 #if NBPFILTER > 0
Index: netinet/ip_gre.c
===
RCS file: /cvs/src/sys/netinet/ip_gre.c,v
retrieving revision 1.58
diff -u -p -r1.58 ip_gre.c
--- netinet/ip_gre.c2 Dec 2015 08:47:00 -   1.58
+++ netinet/ip_gre.c7 Feb 2016 23:38:20 -
@@ -337,6 +337,10 @@ gre_mobile_input(struct mbuf *m, ...)
bpf_mtap_af(sc->sc_if.if_bpf, AF_INET, m, BPF_DIRECTION_IN);
 #endif
 
+#if NPF > 0
+   pf_pkt_addr_changed(m);
+#endif
+
niq_enqueue(, m);
 }

Re: snapshot from 26-Jan-2016 - pfsync panic

2016-01-28 Thread Alexandr Nedvedicky

Hello,

> would this commit
> http://permalink.gmane.org/gmane.os.openbsd.cvs/152882
> resolve this problem?

This commit should fix this issue. Can you test it for me?

FYI: if_pfsync.c must initialize the reference counter of the imported
statekey to 1. If it is left at 0, PF trips the assert.

thanks a lot
regards
sasha



On Thu, Jan 28, 2016 at 04:02:27PM +0100, Hrvoje Popovski wrote:
> Hi all,
> 
> i have pf,carp,pfsync and dhcpd setup with 2 Dell R610. today i updated
> my secondary firewall with latest snapshot from
> http://ftp2.eu.openbsd.org/   
> install59.iso 26-Jan-2016 04:00
> 
> and after update secondary firewall panic
> http://kosjenka.srce.hr/~hrvoje/crash2.jpg
> 
> i couldn't type anything in ddb console ..
> 
> i disabled pfsync0 and syncdev bnx3 on primary firewall and then
> secondary boots up..
> 
> would this commit
> http://permalink.gmane.org/gmane.os.openbsd.cvs/152882
> resolve this problem?
> 
> 
> pf.conf
> set limit states 20
> set skip on { lo bnx3 enc0 }
> table  { XXX }
> 
> match out on bnx0 from 10.111/16 nat-to XXX/29 port 1024:65535
> 
> block
> pass quick proto carp keep state (no-sync)
> pass in quick on bnx0 from 
> pass out log on bnx0
> pass in log on bnx1 from 10.111/16
> pass inet proto icmp icmp-type { echoreq, unreach code needfrag }
> 
> 
> 
> dmesg
> 
> OpenBSD 5.9-beta (GENERIC.MP) #1864: Mon Jan 25 19:11:29 MST 2016
> dera...@amd64.openbsd.org:/usr/src/sys/arch/amd64/compile/GENERIC.MP
> real mem = 12854988800 (12259MB)
> avail mem = 12461199360 (11883MB)
> mpath0 at root
> scsibus0 at mpath0: 256 targets
> mainbus0 at root
> bios0 at mainbus0: SMBIOS rev. 2.6 @ 0xcf49c000 (84 entries)
> bios0: vendor Dell Inc. version "6.4.0" date 07/23/2013
> bios0: Dell Inc. PowerEdge R610
> acpi0 at bios0: rev 2
> acpi0: sleep states S0 S4 S5
> acpi0: tables DSDT FACP APIC SPCR HPET DM__ MCFG WD__ SLIC ERST HEST
> BERT EINJ SRAT TCPA
> acpi0: wakeup devices PCI0(S5)
> acpitimer0 at acpi0: 3579545 Hz, 24 bits
> acpimadt0 at acpi0 addr 0xfee0: PC-AT compat
> cpu0 at mainbus0: apid 32 (boot processor)
> cpu0: Intel(R) Xeon(R) CPU E5630 @ 2.53GHz, 2660.41 MHz
> cpu0:
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,DCA,SSE4.1,SSE4.2,POPCNT,AES,NXE,PAGE1GB,LONG,LAHF,PERF,ITSC,SENSOR,ARAT
> cpu0: 256KB 64b/line 8-way L2 cache
> cpu0: smt 0, core 0, package 1
> mtrr: Pentium Pro MTRR support, 10 var ranges, 88 fixed ranges
> cpu0: apic clock running at 133MHz
> cpu0: mwait min=64, max=64, C-substates=0.2.1.1, IBE
> cpu1 at mainbus0: apid 0 (application processor)
> cpu1: Intel(R) Xeon(R) CPU E5630 @ 2.53GHz, 2660.00 MHz
> cpu1:
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,DCA,SSE4.1,SSE4.2,POPCNT,AES,NXE,PAGE1GB,LONG,LAHF,PERF,ITSC,SENSOR,ARAT
> cpu1: 256KB 64b/line 8-way L2 cache
> cpu1: smt 0, core 0, package 0
> cpu2 at mainbus0: apid 34 (application processor)
> cpu2: Intel(R) Xeon(R) CPU E5630 @ 2.53GHz, 2660.00 MHz
> cpu2:
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,DCA,SSE4.1,SSE4.2,POPCNT,AES,NXE,PAGE1GB,LONG,LAHF,PERF,ITSC,SENSOR,ARAT
> cpu2: 256KB 64b/line 8-way L2 cache
> cpu2: smt 0, core 1, package 1
> cpu3 at mainbus0: apid 2 (application processor)
> cpu3: Intel(R) Xeon(R) CPU E5630 @ 2.53GHz, 2660.00 MHz
> cpu3:
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,DCA,SSE4.1,SSE4.2,POPCNT,AES,NXE,PAGE1GB,LONG,LAHF,PERF,ITSC,SENSOR,ARAT
> cpu3: 256KB 64b/line 8-way L2 cache
> cpu3: smt 0, core 1, package 0
> cpu4 at mainbus0: apid 50 (application processor)
> cpu4: Intel(R) Xeon(R) CPU E5630 @ 2.53GHz, 2660.00 MHz
> cpu4:
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,DCA,SSE4.1,SSE4.2,POPCNT,AES,NXE,PAGE1GB,LONG,LAHF,PERF,ITSC,SENSOR,ARAT
> cpu4: 256KB 64b/line 8-way L2 cache
> cpu4: smt 0, core 9, package 1
> cpu5 at mainbus0: apid 18 (application processor)
> cpu5: Intel(R) Xeon(R) CPU E5630 @ 2.53GHz, 2660.00 MHz
> cpu5:
> FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,CX16,xTPR,PDCM,PCID,DCA,SSE4.1,SSE4.2,POPCNT,AES,NXE,PAGE1GB,LONG,LAHF,PERF,ITSC,SENSOR,ARAT
> cpu5: 256KB 64b/line 8-way L2 cache
> cpu5: smt 0, core 9, package 0
> cpu6 at mainbus0: apid 52 (application processor)
> cpu6: Intel(R)

back out ASSERT(...pf.statekey == NULL) @ pf.c:6569

2016-01-30 Thread Alexandr Nedvedicky

Hello,

the time has come to back out the stuff which was baked just before Christmas
2015.  Although there was some progress over the last few days, things are
not improving fast enough to become good enough for the 5.9 release.

I took the patch (~sthen/pf-statekey-backout.diff) prepared by sthen last week
and did some minor tweaks so it applies cleanly. The patch removes the offending
KASSERT() and related changes.

I'll try to post partial patches in the next few days, so brave volunteers will
be able to continue testing. That way it will be possible to re-commit the stuff
I'd like to back out now, in much better shape.

thanks and
regards
sasha
 
8<---8<---8<--8<

Index: kern/uipc_mbuf.c
===
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.217
diff -u -p -r1.217 uipc_mbuf.c
--- kern/uipc_mbuf.c7 Jan 2016 22:23:13 -   1.217
+++ kern/uipc_mbuf.c30 Jan 2016 22:23:31 -
@@ -1,4 +1,4 @@
-/* $OpenBSD: uipc_mbuf.c,v 1.217 2016/01/07 22:23:13 sashan Exp $  */
+/* $OpenBSD: uipc_mbuf.c,v 1.216 2015/12/23 21:04:55 jasper Exp $  */
 /* $NetBSD: uipc_mbuf.c,v 1.15.4.1 1996/06/13 17:11:44 cgd Exp $   */
 
 /*
@@ -72,8 +72,6 @@
  * Research Laboratory (NRL).
  */
 
-#include "pf.h"
-
 #include 
 #include 
 #include 
@@ -87,9 +85,6 @@
 #include 
 #include 
 #include 
-#if NPF > 0
-#include 
-#endif /* NPF > 0 */
 
 
 #include 
@@ -266,10 +261,6 @@ m_resethdr(struct mbuf *m)
/* delete all mbuf tags to reset the state */
m_tag_delete_chain(m);
 
-#if NPF > 0
-   pf_pkt_unlink_state_key(m);
-#endif /* NPF > 0 */
-
/* like m_inithdr(), but keep any associated data and mbufs */
memset(>m_pkthdr, 0, sizeof(m->m_pkthdr));
m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
@@ -359,12 +350,8 @@ m_free(struct mbuf *m)
if (n)
n->m_flags |= M_ZEROIZE;
}
-   if (m->m_flags & M_PKTHDR) {
+   if (m->m_flags & M_PKTHDR)
m_tag_delete_chain(m);
-#if NPF > 0
-   pf_pkt_unlink_state_key(m);
-#endif /* NPF > 0 */
-   }
if (m->m_flags & M_EXT)
m_extfree(m);
 
@@ -1214,10 +1201,6 @@ m_dup_pkthdr(struct mbuf *to, struct mbu
to->m_flags = (to->m_flags & (M_EXT | M_EXTWR));
to->m_flags |= (from->m_flags & M_COPYFLAGS);
to->m_pkthdr = from->m_pkthdr;
-
-#if NPF > 0
-   pf_pkt_state_key_ref(to);
-#endif /* NPF > 0 */
 
SLIST_INIT(>m_pkthdr.ph_tags);
 
Index: net/if_pfsync.c
===
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.226
diff -u -p -r1.226 if_pfsync.c
--- net/if_pfsync.c 27 Jan 2016 04:35:56 -  1.226
+++ net/if_pfsync.c 30 Jan 2016 22:23:33 -
@@ -523,7 +523,6 @@ pfsync_state_import(struct pfsync_state 
skw->port[0] = sp->key[PF_SK_WIRE].port[0];
skw->port[1] = sp->key[PF_SK_WIRE].port[1];
skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain);
-   PF_REF_INIT(skw->refcnt);
skw->proto = sp->proto;
if (!(skw->af = sp->key[PF_SK_WIRE].af))
skw->af = sp->af;
@@ -533,7 +532,6 @@ pfsync_state_import(struct pfsync_state 
sks->port[0] = sp->key[PF_SK_STACK].port[0];
sks->port[1] = sp->key[PF_SK_STACK].port[1];
sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain);
-   PF_REF_INIT(sks->refcnt);
if (!(sks->af = sp->key[PF_SK_STACK].af))
sks->af = sp->af;
if (sks->af != skw->af) {
Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.964
diff -u -p -r1.964 pf.c
--- net/pf.c25 Jan 2016 18:49:57 -  1.964
+++ net/pf.c30 Jan 2016 22:23:41 -
@@ -1,4 +1,4 @@
-/* $OpenBSD: pf.c,v 1.964 2016/01/25 18:49:57 sashan Exp $ */
+/* $OpenBSD: pf.c,v 1.962 2015/12/23 21:04:55 jasper Exp $ */
 
 /*
  * Copyright (c) 2001 Daniel Hartmeier
@@ -231,11 +231,6 @@ int pf_step_out_of_anchor(int *, 
stru
 voidpf_counters_inc(int, struct pf_pdesc *,
struct pf_state *, struct pf_rule *,
struct pf_rule *);
-voidpf_state_key_link(struct pf_state_key *,
-   struct pf_state_key *);
-voidpf_inpcb_unlink_state_key(struct inpcb *);
-voidpf_state_key_unlink_reverse(struct pf_state_key *);
-
 #if NPFLOG > 0
 voidpf_log_matches(struct pf_pdesc *, struct pf_rule *,
struct pf_rule *, struct pf_ruleset *,
@@ -699,9 +694,8 @@ pf_state_key_attach(struct pf_state_key 
}
pool_put(_state_key_pl, sk);
s->key[idx] = cur;
-

Re: let's get reference to state key back

2016-03-28 Thread Alexandr Nedvedicky

Hello,

> i can't see any problem with this patch. i'm sending 14Mpps ip4 and ip6
> over ix intefaces and creating around 3M to 5M states and all this more
> than 24 hours. box is unusable but it's alive :)

thank you very much for testing. Let's wait a day or two to give other folks
a chance to trip panics/leaks.

the patch you've kindly tried is a tiny step, which prepares PF for a
gentle/gradual unlock.

regards
sashan

let's get reference to state key back

2016-03-25 Thread Alexandr Nedvedicky

Hello,

this is yet another patch which got backed out in Jan. This time I'd like to
commit the change which makes sure the IP stack keeps a reference to the state
key along with the packet (mbuf).

The change should not cause panics, but I'd like to ask testers to watch
for memory leaks in the state key pool (pfstkey):

while true ; do vmstat -m |egrep '^pf|^Name' ; sleep 5 ; done 

I could spot no memory leaks, but my test scenario is currently limited
to local traffic and simple router, which forwards packets.

thanks a lot
regards
sasha

8<---8<---8<--8<
Index: src/sys/kern/uipc_mbuf.c
===
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.220
diff -u -p -r1.220 uipc_mbuf.c
--- src/sys/kern/uipc_mbuf.c22 Mar 2016 06:17:00 -  1.220
+++ src/sys/kern/uipc_mbuf.c25 Mar 2016 12:49:38 -
@@ -72,6 +72,8 @@
  * Research Laboratory (NRL).
  */
 
+#include "pf.h"
+
 #include 
 #include 
 #include 
@@ -93,6 +95,10 @@
 #include 
 #endif
 
+#if NPF > 0
+#include 
+#endif /* NPF > 0 */
+
 struct mbstat mbstat;  /* mbuf stats */
 struct mutex mbstatmtx = MUTEX_INITIALIZER(IPL_NET);
 struct pool mbpool;/* mbuf pool */
@@ -261,6 +267,10 @@ m_resethdr(struct mbuf *m)
/* delete all mbuf tags to reset the state */
m_tag_delete_chain(m);
 
+#if NPF > 0
+   pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+
/* like m_inithdr(), but keep any associated data and mbufs */
memset(>m_pkthdr, 0, sizeof(m->m_pkthdr));
m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
@@ -350,8 +360,12 @@ m_free(struct mbuf *m)
if (n)
n->m_flags |= M_ZEROIZE;
}
-   if (m->m_flags & M_PKTHDR)
+   if (m->m_flags & M_PKTHDR) {
m_tag_delete_chain(m);
+#if NPF > 0
+   pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+   }
if (m->m_flags & M_EXT)
m_extfree(m);
 
@@ -1201,6 +1215,10 @@ m_dup_pkthdr(struct mbuf *to, struct mbu
to->m_flags = (to->m_flags & (M_EXT | M_EXTWR));
to->m_flags |= (from->m_flags & M_COPYFLAGS);
to->m_pkthdr = from->m_pkthdr;
+
+#if NPF > 0
+   pf_pkt_state_key_ref(to);
+#endif /* NPF > 0 */
 
SLIST_INIT(>m_pkthdr.ph_tags);
 
Index: src/sys/net/if_pfsync.c
===
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.227
diff -u -p -r1.227 if_pfsync.c
--- src/sys/net/if_pfsync.c 31 Jan 2016 00:18:07 -  1.227
+++ src/sys/net/if_pfsync.c 25 Mar 2016 12:49:41 -
@@ -523,6 +523,7 @@ pfsync_state_import(struct pfsync_state 
skw->port[0] = sp->key[PF_SK_WIRE].port[0];
skw->port[1] = sp->key[PF_SK_WIRE].port[1];
skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain);
+   PF_REF_INIT(skw->refcnt);
skw->proto = sp->proto;
if (!(skw->af = sp->key[PF_SK_WIRE].af))
skw->af = sp->af;
@@ -532,6 +533,7 @@ pfsync_state_import(struct pfsync_state 
sks->port[0] = sp->key[PF_SK_STACK].port[0];
sks->port[1] = sp->key[PF_SK_STACK].port[1];
sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain);
+   PF_REF_INIT(sks->refcnt);
if (!(sks->af = sp->key[PF_SK_STACK].af))
sks->af = sp->af;
if (sks->af != skw->af) {
Index: src/sys/net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.966
diff -u -p -r1.966 pf.c
--- src/sys/net/pf.c4 Mar 2016 22:38:23 -   1.966
+++ src/sys/net/pf.c25 Mar 2016 12:49:49 -
@@ -231,6 +231,11 @@ int pf_step_out_of_anchor(int *, 
stru
 voidpf_counters_inc(int, struct pf_pdesc *,
struct pf_state *, struct pf_rule *,
struct pf_rule *);
+voidpf_state_key_link(struct pf_state_key *,
+   struct pf_state_key *);
+voidpf_inpcb_unlink_state_key(struct inpcb *);
+voidpf_state_key_unlink_reverse(struct pf_state_key *);
+
 #if NPFLOG > 0
 voidpf_log_matches(struct pf_pdesc *, struct pf_rule *,
struct pf_rule *, struct pf_ruleset *,
@@ -732,6 +737,7 @@ void
 pf_state_key_detach(struct pf_state *s, int idx)
 {
struct pf_state_item*si;
+   struct pf_state_key *sk;
 
if (s->key[idx] == NULL)
return;
@@ -745,15 +751,15 @@ pf_state_key_detach(struct pf_state *s, 
pool_put(_state_item_pl, si);
}
 
-   if (TAILQ_EMPTY(>key[idx]->states)) {
-   RB_REMOVE(pf_state_tree, _statetbl, s->key[idx]);
-   if (s->key[idx]->reverse)
-

Re: let's get reference to state key back

2016-03-25 Thread Alexandr Nedvedicky

Hello,

sorry, I did not test the patch which I sent earlier.
I booted an unpatched kernel...

this is the fix one needs. The updated patch is further below.

8<---8<-8<

diff -r 00f90fca186c src/sys/net/pf.c
--- a/src/sys/net/pf.c  Fri Mar 25 09:29:43 2016 +0100
+++ b/src/sys/net/pf.c  Fri Mar 25 21:06:11 2016 +0100
@@ -6566,7 +6566,8 @@
 * IP tunnels.
 */
KASSERT(pd.m->m_pkthdr.pf.statekey == NULL);
-   pd.m->m_pkthdr.pf.statekey = s->key[PF_SK_STACK];
+   pd.m->m_pkthdr.pf.statekey =
+   pf_state_key_ref(s->key[PF_SK_STACK]);
}
if (pd.dir == PF_OUT &&
pd.m->m_pkthdr.pf.inp && !pd.m->m_pkthdr.pf.inp->inp_pf_sk &&

8<---8<-8<


sorry for the inconvenience.

regards
sasha


8<---8<-8<

Index: src/sys/kern/uipc_mbuf.c
===
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.220
diff -u -p -r1.220 uipc_mbuf.c
--- src/sys/kern/uipc_mbuf.c22 Mar 2016 06:17:00 -  1.220
+++ src/sys/kern/uipc_mbuf.c26 Mar 2016 00:12:07 -
@@ -72,6 +72,8 @@
  * Research Laboratory (NRL).
  */
 
+#include "pf.h"
+
 #include 
 #include 
 #include 
@@ -93,6 +95,10 @@
 #include 
 #endif
 
+#if NPF > 0
+#include 
+#endif /* NPF > 0 */
+
 struct mbstat mbstat;  /* mbuf stats */
 struct mutex mbstatmtx = MUTEX_INITIALIZER(IPL_NET);
 struct pool mbpool;/* mbuf pool */
@@ -261,6 +267,10 @@ m_resethdr(struct mbuf *m)
/* delete all mbuf tags to reset the state */
m_tag_delete_chain(m);
 
+#if NPF > 0
+   pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+
/* like m_inithdr(), but keep any associated data and mbufs */
memset(>m_pkthdr, 0, sizeof(m->m_pkthdr));
m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
@@ -350,8 +360,12 @@ m_free(struct mbuf *m)
if (n)
n->m_flags |= M_ZEROIZE;
}
-   if (m->m_flags & M_PKTHDR)
+   if (m->m_flags & M_PKTHDR) {
m_tag_delete_chain(m);
+#if NPF > 0
+   pf_pkt_unlink_state_key(m);
+#endif /* NPF > 0 */
+   }
if (m->m_flags & M_EXT)
m_extfree(m);
 
@@ -1201,6 +1215,10 @@ m_dup_pkthdr(struct mbuf *to, struct mbu
to->m_flags = (to->m_flags & (M_EXT | M_EXTWR));
to->m_flags |= (from->m_flags & M_COPYFLAGS);
to->m_pkthdr = from->m_pkthdr;
+
+#if NPF > 0
+   pf_pkt_state_key_ref(to);
+#endif /* NPF > 0 */
 
SLIST_INIT(>m_pkthdr.ph_tags);
 
Index: src/sys/net/if_pfsync.c
===
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.227
diff -u -p -r1.227 if_pfsync.c
--- src/sys/net/if_pfsync.c 31 Jan 2016 00:18:07 -  1.227
+++ src/sys/net/if_pfsync.c 26 Mar 2016 00:12:08 -
@@ -523,6 +523,7 @@ pfsync_state_import(struct pfsync_state 
skw->port[0] = sp->key[PF_SK_WIRE].port[0];
skw->port[1] = sp->key[PF_SK_WIRE].port[1];
skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain);
+   PF_REF_INIT(skw->refcnt);
skw->proto = sp->proto;
if (!(skw->af = sp->key[PF_SK_WIRE].af))
skw->af = sp->af;
@@ -532,6 +533,7 @@ pfsync_state_import(struct pfsync_state 
sks->port[0] = sp->key[PF_SK_STACK].port[0];
sks->port[1] = sp->key[PF_SK_STACK].port[1];
sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain);
+   PF_REF_INIT(sks->refcnt);
if (!(sks->af = sp->key[PF_SK_STACK].af))
sks->af = sp->af;
if (sks->af != skw->af) {
Index: src/sys/net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.966
diff -u -p -r1.966 pf.c
--- src/sys/net/pf.c4 Mar 2016 22:38:23 -   1.966
+++ src/sys/net/pf.c26 Mar 2016 00:12:09 -
@@ -231,6 +231,11 @@ int pf_step_out_of_anchor(int *, 
stru
 voidpf_counters_inc(int, struct pf_pdesc *,
struct pf_state *, struct pf_rule *,
struct pf_rule *);
+voidpf_state_key_link(struct pf_state_key *,
+   struct pf_state_key *);
+voidpf_inpcb_unlink_state_key(struct inpcb *);
+voidpf_state_key_unlink_reverse(struct pf_state_key *);
+
 #if NPFLOG > 0
 voidpf_log_matches(struct pf_pdesc *, struct pf_rule *,
struct pf_rule *, struct pf_ruleset *,
@@ -732,6 +737,7 @@ void
 pf_state_key_detach(struct pf_state *s, int idx)
 {
struct pf_state_item*si;
+   struct pf_state_key *sk;
 
if (s->key[idx] == NULL)

Re: KASSERT() @ pf_test() is back

2016-03-07 Thread Alexandr Nedvedicky

On Mon, Mar 07, 2016 at 10:54:26AM +0100, Mattieu Baptiste wrote:
> On Mon, Mar 7, 2016 at 10:03 AM, Alexandr Nedvedicky
> <alexandr.nedvedi...@oracle.com> wrote:
> > Hello Mattieu,
> >
> > Mark Patruck reported a panic on KASSERT() in pf_test() yesterday. I've crafted
> > the patch below. Can you try it out?
> >
> > I think we need to apply pf_pkt_addr_changed() to broadcast packets seen by the
> > bridge as well.
> >
> > thanks a lot
> > and sorry for inconveniences
> 
> Hi Alexandr,
> I don't know if both are necessary but, with your patch and the one
> from Martin, the box survives after reboot. Thanks!
> Do you want I test just with your patch?

Hello Mattieu,

I think there is no point in testing my patch alone; Mark has done that already.
Do it only if you really have the time.

I still need to look at Martin's diff more closely. It's very likely it
fixes yet another edge case, which we are not aware of.

thanks for quick testing.

regards
sasha

Re: KASSERT() @ pf_test() is back

2016-03-04 Thread Alexandr Nedvedicky

Hello Stuart,

thanks for testing it. I'll commit it today.

regards
sasha

On Fri, Mar 04, 2016 at 01:08:42AM +, Stuart Henderson wrote:
> On 2016/02/28 13:01, Martin Pieuchot wrote:
> > On 08/02/16(Mon) 01:55, Alexandr Nedvedicky wrote:
> > > Hello,
> > > 
> > > I don't expect to see O.K. to patch below.
> > > 
> > > The patch is the part of the change, which has been backed out last 
> > > weekend.
> > > Too many things were wrong so I'm trying to untangle the code a bit.
> > > 
> > > Patch below is for brave hearts, who don't mind to see KASSERT() to fire:
> > 
> > Any progress with this diff?  Now would be the good time to get it in.
> 
> I'm running it on a pppoe router (v4/v6 plus ipsec), it hasn't exploded
> yet. I'm in favour of putting it in to see what happens, it's a good time
> in the cycle.
>

Re: KASSERT() @ pf_test() is back

2016-03-07 Thread Alexandr Nedvedicky

Hello Mattieu,

Mark Patruck reported a panic on KASSERT() in pf_test() yesterday. I've crafted
the patch below. Can you try it out?

I think we need to apply pf_pkt_addr_changed() to broadcast packets seen by the
bridge as well.

thanks a lot
and sorry for inconveniences

regards
sasha

-8<8<8<---8<

Index: if_bridge.c
===
RCS file: /cvs/src/sys/net/if_bridge.c,v
retrieving revision 1.275
diff -u -p -r1.275 if_bridge.c
--- if_bridge.c 5 Dec 2015 10:07:55 -   1.275
+++ if_bridge.c 7 Mar 2016 09:00:06 -
@@ -1283,6 +1283,10 @@ bridge_localbroadcast(struct bridge_soft
return;
}
 
+#if NPF > 0
+   pf_pkt_addr_changed(m1);
+#endif /* NPF */
+
bridge_ifinput(ifp, m1);
 }

Re: pf ouraddr

2016-07-18 Thread Alexandr Nedvedicky

Hello,

it looks good to me.

OK sasha

On Mon, Jul 18, 2016 at 10:51:44AM +0200, Alexander Bluhm wrote:
> Hi,
> 
> To hide pf internals move code from in_ouraddr() to pf_ouraddr().
> This will also make it possible to implement the same shortcut for
> IPv6.
> 
> ok?
> 
> bluhm
>

Re: refactor PF option parsing loops

2017-01-24 Thread Alexandr Nedvedicky

Hello Richard,

> PF implements six distinct TCP option parsing loops. This patch converts 
> these to one inline function in pfvar_priv.h, normalises their semantics, 
> and strips ~100 lines. 

What is the reason for keeping the function definition in pfvar_priv.h?
I would expect the function prototype to go into pfvar_priv.h and the
definition into a .c file.

The only reason that comes to my mind is that you want to avoid the extra
stack frame at any price, and the only way to give the compiler a chance
to inline pf_find_tcpopt() is to keep its definition in the .h file.
Is my assumption correct?

My preference is to put the function to .c.

thanks and
regards
sasha

Re: removing expired once rules in pf_purge_thread()

2016-09-03 Thread Alexandr Nedvedicky

Hello,

there was still one more glitch, caught by mikeb:

I have to sanitize pointer when copying rule to userland.

The other thing mike pointed out is that the Expired time should be printed
only for expired rules in the debug mode output (pfctl -sr -g).

Incremental patch is as follows:
8<---8<---8<--8<
diff -r 7a93d568ede3 -r b366ba821dfb src/sbin/pfctl/pfctl.c
--- a/src/sbin/pfctl/pfctl.cSat Sep 03 15:06:16 2016 +0200
+++ b/src/sbin/pfctl/pfctl.cSat Sep 03 16:20:24 2016 +0200
@@ -702,14 +702,9 @@ pfctl_print_rule_counters(struct pf_rule
printf("  [ queue: qname=%s qid=%u pqname=%s pqid=%u ]\n",
rule->qname, rule->qid, rule->pqname, rule->pqid);
 
-   if (rule->rule_flag & PFRULE_ONCE)
-   if (rule->rule_flag & PFRULE_EXPIRED)
-   printf("  [ Expired: %lld secs ago ]\n",
-   (long long)(time(NULL) - rule->exptime));
-   else
-   printf("  [ Expired: not yet ]\n");
-   else
-   printf("  [ Expired: never ]\n");
+   if (rule->rule_flag & PFRULE_EXPIRED)
+   printf("  [ Expired: %lld secs ago ]\n",
+   (long long)(time(NULL) - rule->exptime));
}
if (opts & PF_OPT_VERBOSE) {
printf("  [ Evaluations: %-8llu  Packets: %-8llu  "
diff -r 7a93d568ede3 -r b366ba821dfb src/sys/net/pf_ioctl.c
--- a/src/sys/net/pf_ioctl.cSat Sep 03 15:06:16 2016 +0200
+++ b/src/sys/net/pf_ioctl.cSat Sep 03 16:20:24 2016 +0200
@@ -1268,6 +1268,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
pr->rule.rcv_kif = NULL;
pr->rule.anchor = NULL;
pr->rule.overload_tbl = NULL;
+   bzero(>rule.gcle, sizeof(pr->rule.gcle));
+   pr->rule.ruleset = NULL;
if (pf_anchor_copyout(ruleset, rule, pr)) {
error = EBUSY;
break;
8<---8<---8<--8<

complete patch I'd like to commit is further below.

regards
sasha

8<---8<---8<--8<
diff -r 8006a1eca673 src/sbin/pfctl/pfctl.c
--- a/src/sbin/pfctl/pfctl.cSat Sep 03 14:19:17 2016 +0200
+++ b/src/sbin/pfctl/pfctl.cSat Sep 03 16:26:08 2016 +0200
@@ -701,6 +701,10 @@ pfctl_print_rule_counters(struct pf_rule
 
printf("  [ queue: qname=%s qid=%u pqname=%s pqid=%u ]\n",
rule->qname, rule->qid, rule->pqname, rule->pqid);
+
+   if (rule->rule_flag & PFRULE_EXPIRED)
+   printf("  [ Expired: %lld secs ago ]\n",
+   (long long)(time(NULL) - rule->exptime));
}
if (opts & PF_OPT_VERBOSE) {
printf("  [ Evaluations: %-8llu  Packets: %-8llu  "
@@ -848,7 +852,13 @@ pfctl_show_rules(int dev, char *path, in
INDENT(depth, !(opts & PF_OPT_VERBOSE));
printf("}\n");
} else {
-   printf("\n");
+   /*
+* Do not print newline, when we have not
+* printed expired rule.
+*/
+   if (!(pr.rule.rule_flag & PFRULE_EXPIRED) ||
+   (opts & (PF_OPT_VERBOSE2|PF_OPT_DEBUG)))
+   printf("\n");
pfctl_print_rule_counters(, opts);
}
break;
diff -r 8006a1eca673 src/sbin/pfctl/pfctl_parser.c
--- a/src/sbin/pfctl/pfctl_parser.c Sat Sep 03 14:19:17 2016 +0200
+++ b/src/sbin/pfctl/pfctl_parser.c Sat Sep 03 16:26:08 2016 +0200
@@ -701,8 +701,12 @@ print_rule(struct pf_rule *r, const char
int verbose = opts & (PF_OPT_VERBOSE2 | PF_OPT_DEBUG);
char*p;
 
+   if ((r->rule_flag & PFRULE_EXPIRED) && (!verbose))
+   return;
+
if (verbose)
printf("@%d ", r->nr);
+
if (r->action > PF_MATCH)
printf("action(%d)", r->action);
else if (anchor_call[0]) {
diff -r 8006a1eca673 src/sys/net/pf.c
--- a/src/sys/net/pf.c  Sat Sep 03 14:19:17 2016 +0200
+++ b/src/sys/net/pf.c  Sat Sep 03 16:26:08 2016 +0200
@@ -311,6 +311,9 @@ RB_GENERATE(pf_state_tree, pf_state_key,
 RB_GENERATE(pf_state_tree_id, pf_state,
 entry_id, pf_state_compare_id);
 
+SLIST_HEAD(pf_rule_gcl, pf_rule)   pf_rule_gcl =
+   SLIST_HEAD_INITIALIZER(pf_rule_gcl);
+
 __inline int
 pf_addr_compare(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 {
@@ -1174,6 +1177,29 @@ pf_state_export(struct pfsync_state *sp,
 /* END state table stuff */
 
 void

Re: removing expired once rules in pf_purge_thread()

2016-09-03 Thread Alexandr Nedvedicky

Hello,


> >updated version is below.
> >
> >comments? O.K.?
> 
> One comment, otherwise ok.
> 

> Could you assert the lock is held otherwise, this might save
> effort if/when this code is refactored:
> 
>   else
>   rw_assert_wrlock(_consistency_lock);

sure.

also mikeb came to me with one more suggestion. mikeb does not
like the change to print_rule():

@@ -1120,6 +1124,9 @@
printf(" ");
print_pool(>route, 0, 0, r->af, PF_POOL_ROUTE, verbose);
}
+
+   if (r->rule_flag & PFRULE_EXPIRED)
+   printf("[ rule expired ]");
 }
 

He suggests moving this bit to pfctl_print_rule_counters().
Mike also proposed recording the time a rule expired in the rule itself
and printing it along with the rule counters (pfctl -sr -g).

I've also noticed pfctl -sr might print empty lines for expired
rules, hence there is a new check at pfctl_show_rules():

@@ -848,7 +857,13 @@ pfctl_show_rules(int dev, char *path, in
INDENT(depth, !(opts & PF_OPT_VERBOSE));
printf("}\n");
} else {
-   printf("\n");
+   /*
+* Do not print newline, when we have not
+* printed expired rule.
+*/
+   if (!(pr.rule.rule_flag & PFRULE_EXPIRED) ||
+   (opts & (PF_OPT_VERBOSE2|PF_OPT_DEBUG)))
+   printf("\n");
pfctl_print_rule_counters(, opts);
}


is that OK?

thanks a lot
regards
sasha

8<---8<---8<--8<
diff -r 8006a1eca673 src/sbin/pfctl/pfctl.c
--- a/src/sbin/pfctl/pfctl.cSat Sep 03 14:19:17 2016 +0200
+++ b/src/sbin/pfctl/pfctl.cSat Sep 03 15:07:21 2016 +0200
@@ -701,6 +701,15 @@ pfctl_print_rule_counters(struct pf_rule
 
printf("  [ queue: qname=%s qid=%u pqname=%s pqid=%u ]\n",
rule->qname, rule->qid, rule->pqname, rule->pqid);
+
+   if (rule->rule_flag & PFRULE_ONCE)
+   if (rule->rule_flag & PFRULE_EXPIRED)
+   printf("  [ Expired: %lld secs ago ]\n",
+   (long long)(time(NULL) - rule->exptime));
+   else
+   printf("  [ Expired: not yet ]\n");
+   else
+   printf("  [ Expired: never ]\n");
}
if (opts & PF_OPT_VERBOSE) {
printf("  [ Evaluations: %-8llu  Packets: %-8llu  "
@@ -848,7 +857,13 @@ pfctl_show_rules(int dev, char *path, in
INDENT(depth, !(opts & PF_OPT_VERBOSE));
printf("}\n");
} else {
-   printf("\n");
+   /*
+* Do not print newline, when we have not
+* printed expired rule.
+*/
+   if (!(pr.rule.rule_flag & PFRULE_EXPIRED) ||
+   (opts & (PF_OPT_VERBOSE2|PF_OPT_DEBUG)))
+   printf("\n");
pfctl_print_rule_counters(, opts);
}
break;
diff -r 8006a1eca673 src/sbin/pfctl/pfctl_parser.c
--- a/src/sbin/pfctl/pfctl_parser.c Sat Sep 03 14:19:17 2016 +0200
+++ b/src/sbin/pfctl/pfctl_parser.c Sat Sep 03 15:07:21 2016 +0200
@@ -701,8 +701,12 @@ print_rule(struct pf_rule *r, const char
int verbose = opts & (PF_OPT_VERBOSE2 | PF_OPT_DEBUG);
char*p;
 
+   if ((r->rule_flag & PFRULE_EXPIRED) && (!verbose))
+   return;
+
if (verbose)
printf("@%d ", r->nr);
+
if (r->action > PF_MATCH)
printf("action(%d)", r->action);
else if (anchor_call[0]) {
diff -r 8006a1eca673 src/sys/net/pf.c
--- a/src/sys/net/pf.c  Sat Sep 03 14:19:17 2016 +0200
+++ b/src/sys/net/pf.c  Sat Sep 03 15:07:21 2016 +0200
@@ -311,6 +311,9 @@ RB_GENERATE(pf_state_tree, pf_state_key,
 RB_GENERATE(pf_state_tree_id, pf_state,
 entry_id, pf_state_compare_id);
 
+SLIST_HEAD(pf_rule_gcl, pf_rule)   pf_rule_gcl =
+   SLIST_HEAD_INITIALIZER(pf_rule_gcl);
+
 __inline int
 pf_addr_compare(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 {
@@ -1174,6 +1177,29 @@ pf_state_export(struct pfsync_state *sp,
 /* END state table stuff */
 
 void
+pf_purge_expired_rules(int locked)
+{
+   struct pf_rule  *r;
+
+   if (SLIST_EMPTY(_rule_gcl))
+   return;
+
+   if (!locked)
+   rw_enter_write(_consistency_lock);
+

Re: pfctl mixes up anchorname with anchorpath

2016-09-03 Thread Alexandr Nedvedicky

Hello,

mikeb pointed out I should not be copy'n'pasting buggy code.
I'm just re-sending updated patch with change below:

8<---8<---8<--8<

diff -r e2383ec80feb src/sbin/pfctl/pfctl.c
--- a/src/sbin/pfctl/pfctl.cSat Sep 03 15:39:17 2016 +0200
+++ b/src/sbin/pfctl/pfctl.cSat Sep 03 16:56:52 2016 +0200
@@ -1076,7 +1076,7 @@
sizeof(rule->anchor->path)) >= sizeof(rule->anchor->path))
 errx(1, "pfctl_add_rule: strlcpy");
if ((p = strrchr(anchor_call, '/')) != NULL) {
-   if (!strlen(p))
+   if (strlen(p) == 1)
err(1, "pfctl_add_rule: bad anchor name %s",
anchor_call);
} else
@@ -1486,7 +1486,7 @@
errx(1, "pfctl_add_rule: strlcpy");
 
if ((p = strrchr(anchorname, '/')) != NULL) {
-   if (!strlen(p))
+   if (strlen(p) == 1)
err(1, "pfctl_add_rule: bad anchor name %s",
anchorname);
} else

8<---8<---8<--8<

The problem with the strrchr() & strlen() construct is that strrchr() returns
a pointer to the last occurrence of '/' (if one is found). Consider an anchorname
that contains something like this:

anchorname = "slash/"

'p' then gets set to "/", thus !strlen(p) is never satisfied.
The invalid name must travel all the way down to the kernel just to have
EINVAL reported by pf_begin_rules()/pf_find_or_create_ruleset().
After this change people should start seeing a more helpful message.

updated patch is further below.

thanks and
regards
sasha

8<---8<---8<--8<
diff -r 8006a1eca673 src/sbin/pfctl/pfctl.c
--- a/src/sbin/pfctl/pfctl.cSat Sep 03 14:19:17 2016 +0200
+++ b/src/sbin/pfctl/pfctl.cSat Sep 03 17:12:50 2016 +0200
@@ -1076,7 +1076,7 @@
sizeof(rule->anchor->path)) >= sizeof(rule->anchor->path))
 errx(1, "pfctl_add_rule: strlcpy");
if ((p = strrchr(anchor_call, '/')) != NULL) {
-   if (!strlen(p))
+   if (strlen(p) == 1)
err(1, "pfctl_add_rule: bad anchor name %s",
anchor_call);
} else
@@ -1342,7 +1342,7 @@
if (path[0])
snprintf([len], PATH_MAX - len, "/%s", pf->anchor->name);
else
-   snprintf([len], PATH_MAX - len, "%s", pf->anchor->name);
+   snprintf([len], PATH_MAX - len, "%s", pf->anchor->path);
 
if (depth) {
if (TAILQ_FIRST(rs->rules.active.ptr) != NULL) {
@@ -1447,6 +1447,7 @@
struct pfr_table trs;
char*path = NULL;
int  osize;
+   char*p;
 
bzero(, sizeof(pf));
RB_INIT(_anchors);
@@ -1483,7 +1484,15 @@
if (strlcpy(pf.anchor->path, anchorname,
sizeof(pf.anchor->path)) >= sizeof(pf.anchor->path))
errx(1, "pfctl_add_rule: strlcpy");
-   if (strlcpy(pf.anchor->name, anchorname,
+
+   if ((p = strrchr(anchorname, '/')) != NULL) {
+   if (strlen(p) == 1)
+   err(1, "pfctl_add_rule: bad anchor name %s",
+   anchorname);
+   } else
+   p = anchorname;
+
+   if (strlcpy(pf.anchor->name, p,
sizeof(pf.anchor->name)) >= sizeof(pf.anchor->name))
errx(1, "pfctl_add_rule: strlcpy");

pfctl mixes up anchorname with anchorpath

2016-09-03 Thread Alexandr Nedvedicky

Hello,

One of the teams in Oracle Solaris uses sophisticated naming scheme for PF
rulesets. The anchor (ruleset) is identified by something like that:

root/whatever:component:name/some-virtual-instance-long-name/inbound

That particular team hit a bug in pfctl, when they were trying to load rule to
ruleset specified by anchor above. pfctl(8) on OpenBSD suffers from same
problem:

echo 'pass'|pfctl -a root/whatever:component:name/some-virtual-instance-long-name/inbound -f -
pfctl: pfctl_add_rule: strlcpy

the command above bails out in pfctl_rules() function at line 1488:

1481 pf_init_ruleset(rs);
1482 rs->anchor = pf.anchor;
1483 if (strlcpy(pf.anchor->path, anchorname,
1484 sizeof(pf.anchor->path)) >= sizeof(pf.anchor->path))
1485 errx(1, "pfctl_add_rule: strlcpy");
1486 if (strlcpy(pf.anchor->name, anchorname,
1487 sizeof(pf.anchor->name)) >= sizeof(pf.anchor->name))
1488 errx(1, "pfctl_add_rule: strlcpy");
1489 

Looks like pfctl confuses anchorname with anchorpath. The anchorname uses 64B
buffer. Anchorname is a leaf-path component. If we stick to example above, then
anchorname should be '/inbound'.  The snippet above has been fixed by change
below:

+
+   if ((p = strrchr(anchorname, '/')) != NULL) {
+   if (!strlen(p))
+   err(1, "pfctl_add_rule: bad anchor name %s",
+   anchorname);
+   } else
+   p = anchorname;
+
+   if (strlcpy(pf.anchor->name, p,

The same code already exists in pfctl_add_rule(). After giving it a try I hit
a different error:

pfctl: pfctl_get_ticket: assertion failed

This time game over happened at line 1505:

1496if ((opts & PF_OPT_NOACTION) == 0) {
1497 /*
1498  * XXX For the time being we need to open transactions 
for
1499  * the main ruleset before parsing, because tables are 
still
1500  * loaded at parse time.
1501  */
1502 if (pfctl_ruleset_trans(, anchorname, pf.anchor))
1503 ERRX("pfctl_rules");
1504 pf.astack[0]->ruleset.tticket =
1505 pfctl_get_ticket(t, PF_TRANS_TABLE, anchorname);
1506 }

After some more debugging I've arrived to pfctl_load_ruleset():

1340 pf->anchor = rs->anchor;
1341 
1342 if (path[0])
1343 snprintf([len], PATH_MAX - len, "/%s", 
pf->anchor->name);
1344 else
1345 snprintf([len], PATH_MAX - len, "%s", 
pf->anchor->name);

I think the else branch should be using ->path instead of ->name.
Complete patch is further below. The patch works fine for me (Solaris),
still I'm not quite sure it's 100% correct.

thanks and
regards
sasha

8<---8<---8<--8<
diff -r 2f5f0295677c src/sbin/pfctl/pfctl.c
--- a/src/sbin/pfctl/pfctl.cFri Sep 02 15:53:45 2016 +0200
+++ b/src/sbin/pfctl/pfctl.cFri Sep 02 17:31:05 2016 +0200
@@ -1342,7 +1342,7 @@ pfctl_load_ruleset(struct pfctl *pf, cha
if (path[0])
snprintf([len], PATH_MAX - len, "/%s", pf->anchor->name);
else
-   snprintf([len], PATH_MAX - len, "%s", pf->anchor->name);
+   snprintf([len], PATH_MAX - len, "%s", pf->anchor->path);
 
if (depth) {
if (TAILQ_FIRST(rs->rules.active.ptr) != NULL) {
@@ -1447,6 +1447,7 @@ pfctl_rules(int dev, char *filename, int
struct pfr_table trs;
char*path = NULL;
int  osize;
+   char*p;
 
bzero(, sizeof(pf));
RB_INIT(_anchors);
@@ -1483,7 +1484,15 @@ pfctl_rules(int dev, char *filename, int
if (strlcpy(pf.anchor->path, anchorname,
sizeof(pf.anchor->path)) >= sizeof(pf.anchor->path))
errx(1, "pfctl_add_rule: strlcpy");
-   if (strlcpy(pf.anchor->name, anchorname,
+
+   if ((p = strrchr(anchorname, '/')) != NULL) {
+   if (!strlen(p))
+   err(1, "pfctl_add_rule: bad anchor name %s",
+   anchorname);
+   } else
+   p = anchorname;
+
+   if (strlcpy(pf.anchor->name, p,
sizeof(pf.anchor->name)) >= sizeof(pf.anchor->name))
errx(1, "pfctl_add_rule: strlcpy");

Re: removing expired once rules in pf_purge_thread()

2016-08-29 Thread Alexandr Nedvedicky

Hello,

mikeb has just pointed out the patch fell under the desk asking me to resend
it.

> henning@ and mikeb@ showed some interest to change handling of once rules to
> the same way as PF has it on Solaris. Just to refresh the audience on once
> option offered by PF:
> 
>  onceCreates a one shot rule that will remove itself from an active
>  ruleset after the first match.  In case this is the only rule in
>  the anchor, the anchor will be destroyed automatically after the
>  rule is matched.
>-- pf.conf(5)
> 
> Currently the once rules are removed by the matching packet. The patch makes life
> a bit easier for packets which match once rules. Instead of removing the rule
> from the ruleset, packets just mark the rule as expired and put it on a garbage
> collector list. The list is processed by pf_purge_thread(), which just removes and
> deletes those expired rules. To get there we need to simplify the pf_purge_rule()
> signature, which currently looks as follows:
> 
> void
> pf_purge_rule(struct pf_ruleset *ruleset, struct pf_rule *rule,
>   struct pf_ruleset *aruleset, struct pf_rule *arule)
> 
>   - ruleset is the ruleset, where once rule is being removed from
> 
>   - rule is a once rule to remove
> 
>   - aruleset holds an anchor rule with once-rule we remove
> 
>   - arule an anchor which holds a once rule
> 
> To make pf_purge_rule() suitable for pf_purge_thread() it has to be changed 
> to:
> 
> void
> pf_purge_rule(struct pf_rule *once_rule)
> 
> To get there the ruleset and arule has to be carried by once_rule itself.
> Therefore patch adds those members to pf_rule:
>   struct pf_ruleset   *myruleset
>   struct pf_rule  *myarule
>   SLIST_ENTRY(pf_rule) gcle
> (the gcle is the garbage collector list link).
> 
> Patch sets myruleset as soon as rule gets inserted to ruleset in SIOCADDRULE
> ioctl. The myarule is set in pf_test_rule(), when once rule is marked as
> expired.
> 
> Don't forget to recompile all user-land bits (pfctl, proxies et. al.) when
> you'll be testing the patch, since pf_rule structure gets changed.
> 
> regards
> sasha
> 

updated version is below.

comments? O.K.?

8<---8<---8<--8<
diff -r ca96e396772c src/sbin/pfctl/pfctl_parser.c
--- a/src/sbin/pfctl/pfctl_parser.c Mon Aug 29 23:23:38 2016 +0200
+++ b/src/sbin/pfctl/pfctl_parser.c Tue Aug 30 00:41:03 2016 +0200
@@ -701,8 +701,12 @@
int verbose = opts & (PF_OPT_VERBOSE2 | PF_OPT_DEBUG);
char*p;
 
+   if ((r->rule_flag & PFRULE_EXPIRED) && (!verbose))
+   return;
+
if (verbose)
printf("@%d ", r->nr);
+
if (r->action > PF_MATCH)
printf("action(%d)", r->action);
else if (anchor_call[0]) {
@@ -1120,6 +1124,9 @@
printf(" ");
print_pool(>route, 0, 0, r->af, PF_POOL_ROUTE, verbose);
}
+
+   if (r->rule_flag & PFRULE_EXPIRED)
+   printf("[ rule expired ]");
 }
 
 void
diff -r ca96e396772c src/sys/net/pf.c
--- a/src/sys/net/pf.c  Mon Aug 29 23:23:38 2016 +0200
+++ b/src/sys/net/pf.c  Tue Aug 30 00:41:03 2016 +0200
@@ -311,6 +311,9 @@
 RB_GENERATE(pf_state_tree_id, pf_state,
 entry_id, pf_state_compare_id);
 
+SLIST_HEAD(pf_rule_gcl, pf_rule)   pf_rule_gcl =
+   SLIST_HEAD_INITIALIZER(pf_rule_gcl);
+
 __inline int
 pf_addr_compare(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 {
@@ -1174,6 +1177,27 @@
 /* END state table stuff */
 
 void
+pf_purge_expired_rules(int locked)
+{
+   struct pf_rule  *r;
+
+   if (SLIST_EMPTY(_rule_gcl))
+   return;
+
+   if (!locked)
+   rw_enter_write(_consistency_lock);
+
+   while ((r = SLIST_FIRST(_rule_gcl)) != NULL) {
+   SLIST_REMOVE(_rule_gcl, r, pf_rule, gcle);
+   KASSERT(r->rule_flag & PFRULE_EXPIRED);
+   pf_purge_rule(r);
+   }
+
+   if (!locked)
+   rw_exit_write(_consistency_lock);
+}
+
+void
 pf_purge_thread(void *v)
 {
int nloops = 0, s;
@@ -1191,6 +1215,7 @@
if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
pf_purge_expired_fragments();
pf_purge_expired_src_nodes(0);
+   pf_purge_expired_rules(0);
nloops = 0;
}
 
@@ -3491,6 +3516,10 @@
ruleset = _main_ruleset;
r = TAILQ_FIRST(pf_main_ruleset.rules.active.ptr);
while (r != NULL) {
+   if (r->rule_flag & PFRULE_EXPIRED) {
+   r = TAILQ_NEXT(r, entries);
+   goto nextrule;
+   }
r->evaluations++;
PF_TEST_ATTRIB((pfi_kif_match(r->kif, pd->kif) == r->ifnot),
r->skip[PF_SKIP_IFP].ptr);
@@ -3796,8 +3825,15

Re: removing expired once rules in pf_purge_thread()

2016-08-30 Thread Alexandr Nedvedicky

Hello,

On Tue, Aug 30, 2016 at 10:53:56AM +1000, David Gwynne wrote:
> 
> > On 17 Dec 2015, at 13:30, Richard Procter <richard.n.proc...@gmail.com> 
> > wrote:
> > 
> > 
> > Hi Sasha, 
> > 
> > On Fri, 18 Dec 2015, Alexandr Nedvedicky wrote:
> > 
> >>> Right. I'll just note though that the patch as it stands allows 
> >>> multiple winners [...] Whether that's a realistic issue, I don't know. 
> >>> I have though been bitten by enough edge cases like this to be very 
> >>> wary of them.
> >> 
> >> I think it's not realistic with current PF at OpenBSD. The pf_test() 
> >> function
> >> does not run concurrently, so there can be no such race.
> > 
> > Fair enough :) I presumed that an upcoming concurrent pf_test() would rely 
> > on this patch.
> > 
> >> FYI: PF@solaris uses mutex there to protect the insertion to gcl. Code 
> >> goes as
> >> follows at Solaris:
> >> 
> >>if (!(r->rule_flags & PFRULE_EXPIRED)) {
> >>mutex_enter(_mutex);
> >>/* retest under exclusive protection */
> >>if (!(r->rule_flags & PFRULE_EXPIRED)) {
> >>r->rule_flag |= PFRULE_EXPIRED;
> >>SLIST_INSERT_HEAD(_rule_gcl, r, gcle);
> >>}
> >>}
> > 
> > I was wondering about that. For this patch presumably it's thought 
> > unnecessary to mutex the SLIST ops (or for that matter to preserve rule 
> > lifetime) for one-shot rules as the purge thread runs so infrequently.
> 
> right now pf doesnt run concurrently so extra locking is unnecessary. when it
> does run concurrently then it will need a mutex, despite how infrequently the
> gc runs.

I see. I'll remove the mutex and resend the patch.

> 
> > 
> > BTW, the purge queue exists to pass a result between threads; an 
> > alternative is to recompute it in the purge thread by searching for 
> > expired rules: no need for the queue, its mutexes, etc. Easier to preserve 
> > rule lifetime, too. May need another anchor stack though. I might have a 
> > go next year when the code is more settled.
> 
> i believe the purge thread exists because once upon a time pool_free from
> interrupt context wasnt the best. i might be misremembering that though.

PF on Solaris still keeps the purge thread around, although it's entirely
possible to let packets free memory. PF on Solaris does not have to deal with
such constraints.

> 
> currently that thread only looks at states. traversing the ruleset is
> possible i guess.

yes, ruleset traversal is possible, however gcl (garbage collector list) is
meant as a shortcut. If packet hits ONCE rule it sets the flag to indicate
rule has matched and inserts the rule to gcl.

The purge thread then does not need to traverse all rulesets to collect
'dead' once rules. Instead it just walks the gcl to find out which ONCE
rules are dead and pulls them out.

There is also yet another option:

remove the gcl completely and just mark that the ONCE rule got hit. We can
leave the dead ONCE rule in the ruleset until new rules are
loaded or the system is rebooted.

> if the semantic of the once rules is that they only match
> once, and if pf will run on multiple cpus, then coordinating between the cpus
> will require atomicity or serialisation of some sort.

PF on Solaris is willing to pay price of not having ONCE rules 100% atomic,
there is currently no atomic op, which would set the flag to indicate rule
matched packet.


regards
sasha

let PF to send challenge ack

2016-09-30 Thread Alexandr Nedvedicky

Hello,

patch below makes life easier for clients, which always use same source port,
when talking to server (e.g. think of NFS). The scenario we are dealing with
is as follows:

- client mounts remote NFS share

- there is a PF sitting between client and NFS server. the mount
  operation in earlier step has created a state in PF

- client panics/loses power or whatever catastrophe happens preventing
  client to properly cease NFS sessions

- client boots up and attempts to mount same NFS share

- the SYN packet sent by client gets blocked by PF as it hits
  existing session, which is in established state.

The patch makes PF send a 'challenge ACK' for a SYN packet which matches a
session in established state. The challenge ACK will carry the last sequence numbers
the firewall remembers for the existing session. The client should respond with RST to
the challenge ACK. The RST will make the firewall and the remote server kill the old
session. Then the client will retransmit the SYN packet. The retransmitted SYN will
create a new state in PF and the world will be nice and shiny again.

The challenge ACK sent by the firewall essentially emulates the behavior of the NFS
server's TCP stack. The NFS server would also respond with a challenge ACK upon
reception of a SYN which matches an existing connection.

The patch also comes with test case. I've just learned it's bit tricky
to use scapy for sending TCP packets. There are basically two pitfalls:

- the session established by scapy interferes with host's TCP stack,
where scapy is running. Host's TCP answers with RST to SYN-ACK,
which is sent as a response to scapy's SYN. Adding PF rule, which
blocks outbound RST solves the problem.

- scapy's sr() function (send and receive) is too smart to accept
the challenge ACK sent by the remote PF as an answer. The test case
therefore uses a sniffer to verify that the remote PF answers with a
challenge ACK. If you are not interested in further details, stop reading here.

All packets (I should rather say protocol data units) are objects in scapy.
There is a method .answers(). Every protocol payload has its specific
answers() method. There is answers() method for IP, for UDP, for TCP, ...
Looking at answers() method for TCP we see something disappointing for
challenge ACK:

475def answers(self, other):
476if not isinstance(other, TCP):
477return 0
478if conf.checkIPsrc:
479if not ((self.sport == other.dport) and
480(self.dport == other.sport)):
481return 0
482if (abs(other.seq-self.ack) > 2+len(other.payload)):
483return 0
484return 1

There is a simple check of the sequence number at line 482. That check prevents
the sr() function from working for the test case: the test sends a SYN packet with
SeqNo 10, however PF answers with ACK 1000. I've tried to derive a
TCP_OOW class (TCP out of window) with an answers() method without the check for
sequence numbers. Unfortunately I could not figure out a way to tell sr() to
use my TCP_OOW instead of the default TCP class. I gave up and opted for the poor man's
solution of using a sniffer instead.

thanks and
regards
sasha

8<---8<---8<--8<
diff -r a4cc143dcba1 src/sys/net/pf.c
--- a/src/sys/net/pf.c  Fri Sep 30 22:21:48 2016 +0200
+++ b/src/sys/net/pf.c  Fri Sep 30 22:22:49 2016 +0200
@@ -2815,6 +2815,25 @@ pf_send_tcp(const struct pf_rule *r, sa_
}
 }
 
+static void
+pf_send_challenge_ack(struct pf_pdesc *pd, struct pf_state *s,
+struct pf_state_peer *src, struct pf_state_peer *dst)
+{
+   /*
+* We are sending challenge ACK as a response to SYN packet, which
+* matches existing state (modulo TCP window check). Therefore packet
+* must be sent on behalf of destination.
+*
+* We expect sender to remain either silent, or send RST packet
+* so both, firewall and remote peer, can purge dead state from
+* memory.
+*/
+   pf_send_tcp(s->rule.ptr, pd->af, pd->dst, pd->src,
+   pd->hdr.tcp->th_dport, pd->hdr.tcp->th_sport, dst->seqlo,
+   src->seqlo, TH_ACK, 0, 0, s->rule.ptr->return_ttl, 1, 0,
+   pd->rdomain);
+}
+
 void
 pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
 struct pf_rule *r, u_int rdomain)
@@ -4637,20 +4656,34 @@ pf_test_state(struct pf_pdesc *pd, struc
case IPPROTO_TCP:
if ((action = pf_synproxy(pd, state, reason)) != PF_PASS)
return (action); 
-   if (((pd->hdr.tcp->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
-   dst->state >= TCPS_FIN_WAIT_2 &&
-   src->state >= TCPS_FIN_WAIT_2) {
-   if (pf_status.debug >= LOG_NOTICE) {
-   log(LOG_NOTICE, "pf: state reuse ");
-

Re: pf_route pf_pdesc

2016-10-27 Thread Alexandr Nedvedicky

Hello,


On Wed, Oct 26, 2016 at 11:48:34PM +0200, Alexander Bluhm wrote:
> On Wed, Oct 19, 2016 at 11:49:56PM +0200, Alexander Bluhm wrote:
> > I would like to pass a struct pf_pdesc to pf_route() like it is
> > done in the other pf functions.  That means less parameters, more
> > consistency and later I can call functions that need an pd from
> > pf_route().
> 
> After splitting pf_pdesc and pf_headers into a separate header file,
> the diff looks like this.
> 
> While there, sort the includes like in the other pf files.
> 
> ok?
> 

OK sashan

1 2 3 4 5 6 7 >

1 - 100 of 609 matches

Mail list logo