David Gwynne writes:
> On Tue, Mar 05, 2019 at 12:03:05PM +1000, David Gwynne wrote:
>> this extends the fildrop mechanism so you can drop the packets with bpf
>> using the existing fildrop method, but with an extra tweak so you can
>> avoid the cost of copying packets to userland.
>>
>> i wanted to quickly drop some packets in the rx interrupt path to try
>> and prioritise some traffic getting processed by the system. the initial
>> version was going to use weird custom DLTs and extra bpf interface
>> pointers and stuff, but most of the glue is already in place with
>> the fildrop functionality.
>>
>> this also adds a bit to tcpdump so you can set a fildrop action. it
>> means tcpdump can be used as a quick and dirty firewall.
>
> there's a bit more discussion about this that i should have included in
> my original email.
>
> firstly, the functionality it offers. this effectively offers a firewall
> with the ability to filter arbitrary packets. this has significant
> overlap with the functionality that pf offers, but there are a couple of
> important differences. pf only handles IP traffic, but we don't
> really have a good story when it comes to filtering non-ip. we could
> implement something like pf for the next protocol that people need to
> manage, but what is that next protocol? pf-like implies a highly
> optimised but constrained set of filters that deeply understands the
> protocol it is handling. is that next protocol ieee1905p? cdp? ipx?
> macsec? where should that protocol be filtered in the stack?
>
> im arguing that bpf with fildrop has the benefit of already existing,
> it's in place, and it already has the ability to be configured with
> arbitrary policy. considering we've got this far without handling
> non-ip, spending more time on it seems unjustified.
>
> secondly, the performance aspects of this diff.
>
> bpf allows for arbitrarily complicated filters, so it is entirely
> possible to slow your box down a lot by writing really complicated
> filters. this is in comparison to pf where each rule has a limit
> on how much work it will do, which is also mitigated by the ruleset
> optimiser and skip steps. i don't have a good answer to that except to
> say you can already add such filters to bpf, they just don't do anything
> except copy packets at the moment.
>
> another interesting performance consideration is that bpf runs a lot
> earlier than pf, so filtering packets with bpf can avoid a lot of work
> in the stack. if you want to pass IP statefully, pf is a much better
> hammer, but to drop packets up front bpf is interesting.
>
> for example, thanks to hrvoje popovski i now have a setup where im
> pushing ~7 million packets per second through a box to do performance
> measurements. those packets are udp from random ips to port 7 on
> another set of random ips. if i have the following rule in pf.conf:
>
> block in quick proto udp to port 7
>
> i can rx and drop about 550kpps. if im sshed in using another
> interface, the system is super sluggish over that shell.
>
> if i use this diff and run the following:
>
> # tcpdump -B drop -i ix1 udp and port 7
>
> i'm dropping about 1.2 million pps, and the box is responsive when sshed
> in using another interface.
>
> so, to summarise, bpf can already be used to drop packets, this is just
> a tweak to make it faster, and a tweak so tcpdump can be used to set up
> that filtering.
>
I think this is a great development. Diff looks good as well.
>> Index: sys/net/bpf.c
>> ===================================================================
>> RCS file: /cvs/src/sys/net/bpf.c,v
>> retrieving revision 1.170
>> diff -u -p -r1.170 bpf.c
>> --- sys/net/bpf.c 13 Jul 2018 08:51:15 -0000 1.170
>> +++ sys/net/bpf.c 4 Mar 2019 22:30:32 -0000
>> @@ -926,9 +926,20 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t
>> *(u_int *)addr = d->bd_fildrop;
>> break;
>>
>> - case BIOCSFILDROP: /* set "filter-drop" flag */
>> - d->bd_fildrop = *(u_int *)addr ? 1 : 0;
>> + case BIOCSFILDROP: { /* set "filter-drop" flag */
>> + unsigned int fildrop = *(u_int *)addr;
>> + switch (fildrop) {
>> + case BPF_FILDROP_PASS:
>> + case BPF_FILDROP_CAPTURE:
>> + case BPF_FILDROP_DROP:
>> + d->bd_fildrop = fildrop;
>> + break;
>> + default:
>> + error = EINVAL;
>> + break;
>> + }
>> break;
>> + }
>>
>> case BIOCGDIRFILT: /* get direction filter */
>> *(u_int *)addr = d->bd_dirfilt;
>> @@ -1261,23 +1272,26 @@ _bpf_mtap(caddr_t arg, const struct mbuf
>> pktlen += m0->m_len;
>>
>> SRPL_FOREACH(d, &sr, &bp->bif_dlist, bd_next) {
>> + struct srp_ref bsr;
>> + struct bpf_program *bf;
>> + struct bpf_insn *fcode = NULL;
>> +
>> atomic_inc_long(&d->bd_rcount);
>>
>> - if ((direction & d->bd_dirfilt) != 0)
>> - slen = 0;
>> - else {
>> - struct srp_ref bsr;
>> - struct bpf_program *bf;
>> - struct bpf_insn *fcode = NULL;
>> -
>> - bf = srp_enter(&bsr, &d->bd_rfilter);
>> - if (bf != NULL)
>> - fcode = bf->bf_insns;
>> - slen = bpf_mfilter(fcode, m, pktlen);
>> - srp_leave(&bsr);
>> - }
>> + if (ISSET(d->bd_dirfilt, direction))
>> + continue;
>> +
>> + bf = srp_enter(&bsr, &d->bd_rfilter);
>> + if (bf != NULL)
>> + fcode = bf->bf_insns;
>> + slen = bpf_mfilter(fcode, m, pktlen);
>> + srp_leave(&bsr);
>>
>> - if (slen > 0) {
>> + if (slen == 0)
>> + continue;
>> + if (d->bd_fildrop != BPF_FILDROP_PASS)
>> + drop = 1;
>> + if (d->bd_fildrop != BPF_FILDROP_DROP) {
>> if (!gottime++)
>> microtime(&tv);
>>
>> @@ -1285,9 +1299,6 @@ _bpf_mtap(caddr_t arg, const struct mbuf
>> bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
>> &tv);
>> mtx_leave(&d->bd_mtx);
>> -
>> - if (d->bd_fildrop)
>> - drop = 1;
>> }
>> }
>> SRPL_LEAVE(&sr);
>> Index: sys/net/bpf.h
>> ===================================================================
>> RCS file: /cvs/src/sys/net/bpf.h,v
>> retrieving revision 1.65
>> diff -u -p -r1.65 bpf.h
>> --- sys/net/bpf.h 3 Feb 2018 13:37:37 -0000 1.65
>> +++ sys/net/bpf.h 4 Mar 2019 22:30:32 -0000
>> @@ -126,6 +126,13 @@ struct bpf_version {
>> #define BPF_DIRECTION_IN 1
>> #define BPF_DIRECTION_OUT (1<<1)
>>
>> +/*
>> + * Values for BIOCGFILDROP/BIOCSFILDROP
>> + */
>> +#define BPF_FILDROP_PASS 0 /* capture, pass */
>> +#define BPF_FILDROP_CAPTURE 1 /* capture, drop */
>> +#define BPF_FILDROP_DROP 2 /* no capture, drop */
>> +
>> struct bpf_timeval {
>> u_int32_t tv_sec;
>> u_int32_t tv_usec;
>> Index: share/man/man4/bpf.4
>> ===================================================================
>> RCS file: /cvs/src/share/man/man4/bpf.4,v
>> retrieving revision 1.38
>> diff -u -p -r1.38 bpf.4
>> --- share/man/man4/bpf.4 28 Apr 2016 19:07:19 -0000 1.38
>> +++ share/man/man4/bpf.4 4 Mar 2019 22:30:32 -0000
>> @@ -391,11 +391,24 @@ This flag is initialized to zero by defa
>> .Pp
>> .It Dv BIOCSFILDROP Fa "u_int *"
>> .It Dv BIOCGFILDROP Fa "u_int *"
>> -Sets or gets the status of the
>> +Sets or gets the
>> .Dq filter drop
>> -flag.
>> -If non-zero, packets matching any filters will be reported to the
>> -associated interface so that they can be dropped.
>> +action.
>> +The supported actions for packets matching the filter are:
>> +.Pp
>> +.Bl -tag -width "BPF_FILDROP_CAPTURE" -compact
>> +.It Dv BPF_FILDROP_PASS
>> +Accept and capture
>> +.It Dv BPF_FILDROP_CAPTURE
>> +Drop and capture
>> +.It Dv BPF_FILDROP_DROP
>> +Drop and do not capture
>> +.El
>> +.Pp
>> +Packets matching any filter configured to drop packets will be
>> +reported to the associated interface so that they can be dropped.
>> +The default action is
>> +.Dv BPF_FILDROP_PASS .
>> .Pp
>> .It Dv BIOCSDIRFILT Fa "u_int *"
>> .It Dv BIOCGDIRFILT Fa "u_int *"
>> Index: usr.sbin/tcpdump/privsep.c
>> ===================================================================
>> RCS file: /cvs/src/usr.sbin/tcpdump/privsep.c,v
>> retrieving revision 1.52
>> diff -u -p -r1.52 privsep.c
>> --- usr.sbin/tcpdump/privsep.c 17 Nov 2018 16:52:02 -0000 1.52
>> +++ usr.sbin/tcpdump/privsep.c 4 Mar 2019 22:30:32 -0000
>> @@ -224,7 +224,7 @@ priv_exec(int argc, char *argv[])
>> /* parse the arguments for required options */
>> opterr = 0;
>> while ((i = getopt(argc, argv,
>> - "ac:D:deE:fF:i:lLnNOopPqr:s:StT:vw:xXy:Y")) != -1) {
>> + "aB:c:D:deE:fF:i:lLnNOopPqr:s:StT:vw:xXy:Y")) != -1) {
>> switch (i) {
>> case 'n':
>> nflag++;
>> @@ -366,7 +366,7 @@ static void
>> impl_open_bpf(int fd, int *bpfd)
>> {
>> int snaplen, promisc, err;
>> - u_int dlt, dirfilt;
>> + u_int dlt, dirfilt, fildrop;
>> char device[IFNAMSIZ];
>> size_t iflen;
>>
>> @@ -376,10 +376,11 @@ impl_open_bpf(int fd, int *bpfd)
>> must_read(fd, &promisc, sizeof(int));
>> must_read(fd, &dlt, sizeof(u_int));
>> must_read(fd, &dirfilt, sizeof(u_int));
>> + must_read(fd, &fildrop, sizeof(fildrop));
>> iflen = read_string(fd, device, sizeof(device), __func__);
>> if (iflen == 0)
>> errx(1, "Invalid interface size specified");
>> - *bpfd = pcap_live(device, snaplen, promisc, dlt, dirfilt);
>> + *bpfd = pcap_live(device, snaplen, promisc, dlt, dirfilt, fildrop);
>> err = errno;
>> if (*bpfd < 0)
>> logmsg(LOG_DEBUG,
>> Index: usr.sbin/tcpdump/privsep.h
>> ===================================================================
>> RCS file: /cvs/src/usr.sbin/tcpdump/privsep.h,v
>> retrieving revision 1.11
>> diff -u -p -r1.11 privsep.h
>> --- usr.sbin/tcpdump/privsep.h 8 Nov 2018 14:06:09 -0000 1.11
>> +++ usr.sbin/tcpdump/privsep.h 4 Mar 2019 22:30:32 -0000
>> @@ -45,11 +45,11 @@ __dead void priv_exec(int, char **);
>> void priv_init_done(void);
>>
>> int setfilter(int, int, char *);
>> -int pcap_live(const char *, int, int, u_int, u_int);
>> +int pcap_live(const char *, int, int, u_int, u_int, u_int);
>>
>> struct bpf_program *priv_pcap_setfilter(pcap_t *, int, u_int32_t);
>> pcap_t *priv_pcap_live(const char *, int, int, int, char *, u_int,
>> - u_int);
>> + u_int, u_int);
>> pcap_t *priv_pcap_offline(const char *, char *);
>>
>> size_t priv_gethostbyaddr(char *, size_t, int, char *, size_t);
>> Index: usr.sbin/tcpdump/privsep_pcap.c
>> ===================================================================
>> RCS file: /cvs/src/usr.sbin/tcpdump/privsep_pcap.c,v
>> retrieving revision 1.23
>> diff -u -p -r1.23 privsep_pcap.c
>> --- usr.sbin/tcpdump/privsep_pcap.c 17 Nov 2018 16:52:02 -0000 1.23
>> +++ usr.sbin/tcpdump/privsep_pcap.c 4 Mar 2019 22:30:32 -0000
>> @@ -173,7 +173,7 @@ priv_pcap_setfilter(pcap_t *hpcap, int o
>> /* privileged part of priv_pcap_live */
>> int
>> pcap_live(const char *device, int snaplen, int promisc, u_int dlt,
>> - u_int dirfilt)
>> + u_int dirfilt, u_int fildrop)
>> {
>> int fd;
>> struct ifreq ifr;
>> @@ -201,6 +201,9 @@ pcap_live(const char *device, int snaple
>> if (ioctl(fd, BIOCSDIRFILT, &dirfilt) < 0)
>> goto error;
>>
>> + if (ioctl(fd, BIOCSFILDROP, &fildrop) < 0)
>> + goto error;
>> +
>> /* lock the descriptor */
>> if (ioctl(fd, BIOCLOCK, NULL) < 0)
>> goto error;
>> @@ -218,7 +221,7 @@ pcap_live(const char *device, int snaple
>> */
>> pcap_t *
>> priv_pcap_live(const char *dev, int slen, int prom, int to_ms,
>> - char *ebuf, u_int dlt, u_int dirfilt)
>> + char *ebuf, u_int dlt, u_int dirfilt, u_int fildrop)
>> {
>> int fd, err;
>> struct bpf_version bv;
>> @@ -247,6 +250,7 @@ priv_pcap_live(const char *dev, int slen
>> must_write(priv_fd, &prom, sizeof(int));
>> must_write(priv_fd, &dlt, sizeof(u_int));
>> must_write(priv_fd, &dirfilt, sizeof(u_int));
>> + must_write(priv_fd, &fildrop, sizeof(fildrop));
>> write_string(priv_fd, dev);
>>
>> fd = receive_fd(priv_fd);
>> Index: usr.sbin/tcpdump/tcpdump.8
>> ===================================================================
>> RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.8,v
>> retrieving revision 1.99
>> diff -u -p -r1.99 tcpdump.8
>> --- usr.sbin/tcpdump/tcpdump.8 6 Jul 2018 09:59:12 -0000 1.99
>> +++ usr.sbin/tcpdump/tcpdump.8 4 Mar 2019 22:30:32 -0000
>> @@ -29,6 +29,7 @@
>> .Nm tcpdump
>> .Op Fl AadefILlNnOopqStvXx
>> .Op Fl c Ar count
>> +.Op Fl B Ar fildrop
>> .Op Fl D Ar direction
>> .Op Fl E Oo Ar espalg : Oc Ns Ar espkey
>> .Op Fl F Ar file
>> @@ -58,6 +59,23 @@ The smaller of the entire packet or
>> bytes will be printed.
>> .It Fl a
>> Attempt to convert network and broadcast addresses to names.
>> +.It Fl B Ar fildrop
>> +Configure the drop action specified by
>> +.Ar fildrop
>> +to be used when the filter expression matches a packet.
>> +The actions are:
>> +.Pp
>> +.Bl -tag -width "capture" -offset indent -compact
>> +.It Cm pass
>> +Matching packets are accepted and captured.
>> +.It Cm capture
>> +Matching packets are dropped and captured.
>> +.It Cm drop
>> +Matching packets are dropped and not captured.
>> +.El
>> +.Pp
>> +The default action is
>> +.Cm pass .
>> .It Fl c Ar count
>> Exit after receiving
>> .Ar count
>> Index: usr.sbin/tcpdump/tcpdump.c
>> ===================================================================
>> RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.c,v
>> retrieving revision 1.88
>> diff -u -p -r1.88 tcpdump.c
>> --- usr.sbin/tcpdump/tcpdump.c 8 Nov 2018 14:06:09 -0000 1.88
>> +++ usr.sbin/tcpdump/tcpdump.c 4 Mar 2019 22:30:32 -0000
>> @@ -61,6 +61,7 @@
>>
>> int Aflag; /* dump ascii */
>> int aflag; /* translate network and broadcast addresses */
>> +int Bflag; /* BPF fildrop setting */
>> int dflag; /* print filter code */
>> int eflag; /* print ethernet header */
>> int fflag; /* don't translate "foreign" IP address */
>> @@ -231,7 +232,7 @@ main(int argc, char **argv)
>>
>> opterr = 0;
>> while ((op = getopt(argc, argv,
>> - "Aac:D:deE:fF:i:IlLnNOopqr:s:StT:vw:xXy:Y")) != -1)
>> + "AaB:c:D:deE:fF:i:IlLnNOopqr:s:StT:vw:xXy:Y")) != -1)
>> switch (op) {
>>
>> case 'A':
>> @@ -243,6 +244,19 @@ main(int argc, char **argv)
>> aflag = 1;
>> break;
>>
>> + case 'B':
>> + if (strcasecmp(optarg, "pass") == 0)
>> + Bflag = BPF_FILDROP_PASS;
>> + else if (strcasecmp(optarg, "capture") == 0)
>> + Bflag = BPF_FILDROP_CAPTURE;
>> + else if (strcasecmp(optarg, "drop") == 0)
>> + Bflag = BPF_FILDROP_DROP;
>> + else {
>> + error("invalid BPF fildrop option: %s",
>> + optarg);
>> + }
>> + break;
>> +
>> case 'c':
>> cnt = strtonum(optarg, 1, INT_MAX, &errstr);
>> if (errstr)
>> @@ -440,7 +454,7 @@ main(int argc, char **argv)
>> error("%s", ebuf);
>> }
>> pd = priv_pcap_live(device, snaplen, !pflag, 1000, ebuf,
>> - dlt, dirfilt);
>> + dlt, dirfilt, Bflag);
>> if (pd == NULL)
>> error("%s", ebuf);
>>
>> @@ -700,7 +714,7 @@ __dead void
>> usage(void)
>> {
>> (void)fprintf(stderr,
>> -"Usage: %s [-AadefILlNnOopqStvXx] [-c count] [-D direction]\n",
>> +"Usage: %s [-AadefILlNnOopqStvXx] [-B fildrop] [-c count] [-D direction]\n",
>> program_name);
>> (void)fprintf(stderr,
>> "\t [-E [espalg:]espkey] [-F file] [-i interface] [-r file]\n");
>>