On Mon, Feb 13, 2023 at 08:30:03AM +0100, Alexandr Nedvedicky wrote: > Hello, > > This diff looks good to me. Though I still have some > doubts about accuracy of comment here: > > </snip> > > return (kn->kn_data > 0); > > @@ -1510,6 +1599,15 @@ bpf_catchpacket(struct bpf_d *d, u_char > > ++d->bd_dcount; > > return; > > } > > + > > + /* > > + * there's a small chance bpf_wait_cb is running > > + * concurrently with this and two wakeups will be > > + * generated. > > + */ > > + if (timeout_del(&d->bd_wait_tmo)) > > + bpf_put(d); > > + > > ROTATE_BUFFERS(d); > > do_wakeup = 1; > > curlen = 0; > > @@ -1530,12 +1628,27 @@ bpf_catchpacket(struct bpf_d *d, u_char > > I'm still failing to spot the race the comment is talking > about. code in bpf_wait_cb() grabs `d->bd_mtx` mutex, the > same mutex which is held by bpf_catchpacket() caller.
oh yeah. i forgot to remove the comment. > However my doubts about comment should not prevent you > committing code which solves the issue and looks good. > > OK sashan thank you. i'll put this in sometime in the next few days unless someone objects. Index: sys/net/bpf.c =================================================================== RCS file: /cvs/src/sys/net/bpf.c,v retrieving revision 1.220 diff -u -p -r1.220 bpf.c --- sys/net/bpf.c 10 Feb 2023 14:34:17 -0000 1.220 +++ sys/net/bpf.c 8 Mar 2023 03:58:08 -0000 @@ -77,6 +77,10 @@ #define BPF_BUFSIZE 32768 +#define BPF_S_IDLE 0 +#define BPF_S_WAIT 1 +#define BPF_S_DONE 2 + #define PRINET 26 /* interruptible */ /* @@ -101,6 +105,7 @@ int bpf_setif(struct bpf_d *, struct ifr int bpfkqfilter(dev_t, struct knote *); void bpf_wakeup(struct bpf_d *); void bpf_wakeup_cb(void *); +void bpf_wait_cb(void *); int _bpf_mtap(caddr_t, const struct mbuf *, const struct mbuf *, u_int); void bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t, const struct bpf_hdr *); @@ -392,11 +397,13 @@ bpfopen(dev_t dev, int flag, int mode, s bd->bd_sig = SIGIO; mtx_init(&bd->bd_mtx, IPL_NET); task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd); + timeout_set(&bd->bd_wait_tmo, bpf_wait_cb, bd); smr_init(&bd->bd_smr); sigio_init(&bd->bd_sigio); klist_init_mutex(&bd->bd_klist, &bd->bd_mtx); bd->bd_rtout = 0; /* no timeout by default */ + bd->bd_wtout = INFSLP; /* wait for the buffer to fill by default */ refcnt_init(&bd->bd_refcnt); LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list); @@ -435,6 +442,7 @@ bpfclose(dev_t dev, int flag, int mode, (d)->bd_hbuf = (d)->bd_sbuf; \ (d)->bd_hlen = (d)->bd_slen; \ (d)->bd_sbuf = (d)->bd_fbuf; \ + (d)->bd_state = BPF_S_IDLE; \ (d)->bd_slen = 0; \ (d)->bd_fbuf = NULL; @@ -492,7 +500,7 @@ bpfread(dev_t dev, struct uio *uio, int ROTATE_BUFFERS(d); break; } - if (d->bd_immediate && d->bd_slen != 0) { + if (d->bd_state == BPF_S_DONE) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. @@ -611,6 +619,21 @@ bpf_wakeup_cb(void *xd) bpf_put(d); } +void +bpf_wait_cb(void *xd) +{ + struct bpf_d *d = xd; + + mtx_enter(&d->bd_mtx); + if (d->bd_state == BPF_S_WAIT) { + d->bd_state = BPF_S_DONE; + bpf_wakeup(d); + } + mtx_leave(&d->bd_mtx); + + bpf_put(d); +} + int bpfwrite(dev_t dev, struct uio *uio, int ioflag) { @@ -674,17 +697,64 @@ bpf_resetd(struct bpf_d *d) MUTEX_ASSERT_LOCKED(&d->bd_mtx); KASSERT(d->bd_in_uiomove == 0); + if (timeout_del(&d->bd_wait_tmo)) + bpf_put(d); + if (d->bd_hbuf != NULL) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; } + d->bd_state = BPF_S_IDLE; d->bd_slen = 0; d->bd_hlen = 0; d->bd_rcount = 0; d->bd_dcount = 0; } +static int +bpf_set_wtout(struct bpf_d *d, uint64_t wtout) +{ + mtx_enter(&d->bd_mtx); + d->bd_wtout = wtout; + mtx_leave(&d->bd_mtx); + + return (0); +} + +static int +bpf_set_wtimeout(struct bpf_d *d, const struct timeval *tv) +{ + uint64_t nsec; + + if (tv->tv_sec < 0 || !timerisvalid(tv)) + return (EINVAL); + + nsec = TIMEVAL_TO_NSEC(tv); + if (nsec > MAXTSLP) + return (EOVERFLOW); + + return (bpf_set_wtout(d, nsec)); +} + +static int +bpf_get_wtimeout(struct bpf_d *d, struct timeval *tv) +{ + uint64_t nsec; + + mtx_enter(&d->bd_mtx); + nsec = d->bd_wtout; + mtx_leave(&d->bd_mtx); + + if (nsec == INFSLP) + return (ENXIO); + + memset(tv, 0, sizeof(*tv)); + NSEC_TO_TIMEVAL(nsec, tv); + + return (0); +} + /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. @@ -698,6 +768,9 @@ bpf_resetd(struct bpf_d *d) * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. + * BIOCSWTIMEOUT Set wait timeout. + * BIOCGWTIMEOUT Get wait timeout. + * BIOCDWTIMEOUT Del wait timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. @@ -720,6 +793,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t case BIOCGDLTLIST: case BIOCGETIF: case BIOCGRTIMEOUT: + case BIOCGWTIMEOUT: case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: @@ -727,6 +801,8 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: + case BIOCSWTIMEOUT: + case BIOCDWTIMEOUT: case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCGDIRFILT: @@ -933,7 +1009,20 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t * Set immediate mode. */ case BIOCIMMEDIATE: - d->bd_immediate = *(u_int *)addr; + error = bpf_set_wtout(d, *(int *)addr ? 0 : INFSLP); + break; + + /* + * Wait timeout. + */ + case BIOCSWTIMEOUT: + error = bpf_set_wtimeout(d, (const struct timeval *)addr); + break; + case BIOCGWTIMEOUT: + error = bpf_get_wtimeout(d, (struct timeval *)addr); + break; + case BIOCDWTIMEOUT: + error = bpf_set_wtout(d, INFSLP); break; case BIOCVERSION: @@ -1193,7 +1282,7 @@ filt_bpfread(struct knote *kn, long hint MUTEX_ASSERT_LOCKED(&d->bd_mtx); kn->kn_data = d->bd_hlen; - if (d->bd_immediate) + if (d->bd_wtout == 0) kn->kn_data += d->bd_slen; return (kn->kn_data > 0); @@ -1510,6 +1599,11 @@ bpf_catchpacket(struct bpf_d *d, u_char ++d->bd_dcount; return; } + + /* cancel pending wtime */ + if (timeout_del(&d->bd_wait_tmo)) + bpf_put(d); + ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; @@ -1530,12 +1624,27 @@ bpf_catchpacket(struct bpf_d *d, u_char bpf_mcopy(pkt, (u_char *)bh + hdrlen, bh->bh_caplen); d->bd_slen = curlen + totlen; - if (d->bd_immediate) { + switch (d->bd_wtout) { + case 0: /* * Immediate mode is set. A packet arrived so any * reads should be woken up. */ + if (d->bd_state == BPF_S_IDLE) + d->bd_state = BPF_S_DONE; do_wakeup = 1; + break; + case INFSLP: + break; + default: + if (d->bd_state == BPF_S_IDLE) { + d->bd_state = BPF_S_WAIT; + + bpf_get(d); + if (!timeout_add_nsec(&d->bd_wait_tmo, d->bd_wtout)) + bpf_put(d); + } + break; } if (do_wakeup) Index: sys/net/bpf.h =================================================================== RCS file: /cvs/src/sys/net/bpf.h,v retrieving revision 1.70 diff -u -p -r1.70 bpf.h --- sys/net/bpf.h 3 Aug 2020 03:21:24 -0000 1.70 +++ sys/net/bpf.h 8 Mar 2023 03:58:08 -0000 @@ -119,6 +119,9 @@ struct bpf_version { #define BIOCGDLTLIST _IOWR('B',123, struct bpf_dltlist) #define BIOCGDIRFILT _IOR('B',124, u_int) #define BIOCSDIRFILT _IOW('B',125, u_int) +#define BIOCSWTIMEOUT _IOW('B',126, struct timeval) +#define BIOCGWTIMEOUT _IOR('B',126, struct timeval) +#define BIOCDWTIMEOUT _IO('B',126) /* * Direction filters for BIOCSDIRFILT/BIOCGDIRFILT Index: sys/net/bpfdesc.h =================================================================== RCS file: /cvs/src/sys/net/bpfdesc.h,v retrieving revision 1.47 diff -u -p -r1.47 bpfdesc.h --- sys/net/bpfdesc.h 9 Jul 2022 12:48:21 -0000 1.47 +++ sys/net/bpfdesc.h 8 Mar 2023 03:58:08 -0000 @@ -79,6 +79,7 @@ struct bpf_d { struct bpf_if *bd_bif; /* interface descriptor */ uint64_t bd_rtout; /* [m] Read timeout in nanoseconds */ + uint64_t bd_wtout; /* [m] Wait time in nanoseconds */ u_long bd_nreaders; /* [m] # threads asleep in bpfread() */ struct bpf_program_smr *bd_rfilter; /* read filter code */ @@ -88,8 +89,7 @@ struct bpf_d { u_long bd_dcount; /* number of packets dropped */ u_char bd_promisc; /* true if listening promiscuously */ - u_char bd_state; /* idle, waiting, or timed out */ - u_char bd_immediate; /* true to return on packet arrival */ + u_char bd_state; /* [m] idle, waiting, or timed out */ u_char bd_locked; /* true if descriptor is locked */ u_char bd_fildrop; /* true if filtered packets will be dropped */ u_char bd_dirfilt; /* direction filter */ @@ -103,7 +103,8 @@ struct bpf_d { int bd_unit; /* logical unit number */ LIST_ENTRY(bpf_d) bd_list; /* descriptor list */ - struct task bd_wake_task; /* delay pgsigio() and selwakeup() */ + struct task bd_wake_task; /* defer pgsigio() and selwakeup() */ + struct timeout bd_wait_tmo; /* delay wakeup after catching pkt */ struct smr_entry bd_smr; Index: sbin/pflogd/pflogd.c =================================================================== RCS file: /cvs/src/sbin/pflogd/pflogd.c,v retrieving revision 1.62 diff -u -p -r1.62 pflogd.c --- sbin/pflogd/pflogd.c 25 Jul 2019 17:32:33 -0000 1.62 +++ sbin/pflogd/pflogd.c 8 Mar 2023 03:58:09 -0000 @@ -251,8 +251,8 @@ pflog_read_live(const char *source, int struct timeval to; to.tv_sec = to_ms / 1000; to.tv_usec = (to_ms * 1000) % 1000000; - if (ioctl(p->fd, BIOCSRTIMEOUT, &to) == -1) { - snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSRTIMEOUT: %s", + if (ioctl(p->fd, BIOCSWTIMEOUT, &to) == -1) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSWTIMEOUT: %s", pcap_strerror(errno)); goto bad; } Index: share/man/man4/bpf.4 =================================================================== RCS file: /cvs/src/share/man/man4/bpf.4,v retrieving revision 1.44 diff -u -p -r1.44 bpf.4 --- share/man/man4/bpf.4 11 Sep 2022 06:38:11 -0000 1.44 +++ share/man/man4/bpf.4 8 Mar 2023 03:58:09 -0000 @@ -220,6 +220,8 @@ The allowable ioctls are .Dv BIOCIMMEDIATE , .Dv BIOCLOCK , .Dv BIOCSRTIMEOUT , +.Dv BIOCSWTIMEOUT , +.Dv BIOCDWTIMEOUT , .Dv BIOCVERSION , .Dv TIOCGPGRP , and @@ -301,6 +303,18 @@ This is useful for programs like .Xr rarpd 8 , which must respond to messages in real time. The default for a new file is off. +.Pp +.It Dv BIOCSWTIMEOUT Fa "struct timeval *" +.It Dv BIOCGWTIMEOUT Fa "struct timeval *" +.It Dv BIOCDWTIMEOUT +Sets, gets, or deletes (resets) the wait timeout parameter. +The +.Ar timeval +specifies the length of time to wait between receiving a packet and +the kernel buffer becoming readable. +By default, or when reset, the wait timeout is infinite, meaning +the age of packets in the kernel buffer does not make the buffer +readable. .Pp .It Dv BIOCSETF Fa "struct bpf_program *" Sets the filter program used by the kernel to discard uninteresting packets.