i tested the diff, and just committed it. it should be in a snapshot soon.

as a mitigation, you could remove link1 for now and just let the pppoe link 
stay up.

dlg

> On 31 Oct 2025, at 02:31, [email protected] wrote:
> 
> Shall I test something and if yes, what exactly?
> 
> Is there anything, I can do to help you? Or is a official patch already on 
> the way?
> 
> Sorry for asking, I don't want to hurry. The thing is, I can't update my 
> router from 7.6 to 7.7 (or 7.8), because the same bug is as well in 7.7 :-(
> 
> Thank you and kind regards,
> Illya
> 
> Am 2025-10-28 09:14, schrieb Vitaliy Makkoveev:
>> On Tue, Oct 28, 2025 at 04:27:48PM +1000, David Gwynne wrote:
>>> On Tue, Oct 28, 2025 at 08:56:00AM +0300, Vitaliy Makkoveev wrote:
>>> > On Tue, Oct 28, 2025 at 02:26:57PM +1000, David Gwynne wrote:
>>> > > On Mon, Oct 27, 2025 at 09:43:04PM +0100, Alexander Bluhm wrote:
>>> > > > On Mon, Oct 27, 2025 at 08:54:02PM +0300, Vitaliy Makkoveev wrote:
>>> > > > > Seems the whole sppp_output() should take kernel lock. I have no 
>>> > > > > ability
>>> > > > > to test this diff.
>>> > > >
>>> > > > I also came to the conclusion that some big kernel lock is missing.
>>> > > > Just wondering why this bug appears now.  We have removed kernel
>>> > > > lock from network stack a long time ago.
>>> > >
>>> > > this is because of src/sys/net/if_pppoe.c r1.87, which moved pppoe away
>>> > > from queueing outgoing packets on the ifq to direct dispatch. because
>>> > > pppoe wasn't marked mpsafe, sppp_output was called by the ifq machinery
>>> > > with the kernel lock held.
>>> > >
>>> > > the testing i (and semarie) did was without the link1 flag, but that's
>>> > > being used here. IFF_LINK1 is aliased to IFF_AUTO flag in the code and
>>> > > is used to implement autodial functionality in the sppp code. the
>>> > > kernel lock is only really needed to start moving the interface
>>> > > through the connection state machine, which can be done in parallel
>>> > > to the data path.
>>> > >
>>> > > also, fiddling with ifp->if_flags like this should be done while
>>> > > holding NET_LOCK, which isn't guaranteed to be held here. the diff
>>> > > below moves the autodial stuff into a task run from systq, which
>>> > > holds the kernel lock like the ppp state machine requires.
>>> > >
>>> > > this diff compiles. i have no idea if it works.
>>> > >
>>> >
>>> > According the trace, sppp_output() running with shared netlock. Is it
>>> > still enough? I especially concern about if_enqueue() and the following
>>> > bridge(4) codepaths.
>>> hrm. i was confusing pppoe output and enqueue. if_pppoe.c r1.87 only
>>> changed when enqueue is called, the pppoe output calls (which is
>>> sppp_output) is called the same way now as it has been for a while. it
>>> must have been a change to how the stack is called that caused this?
>> This is the trace from panic report:
>> panic: kernel diagnostic assertion "_kernel_lock_held()" failed: file 
>> "/usr/src
>> panic(ffffffff825f1d7f) at panic+0xd5
>> __assert(ffffffff8262d946,ffffffff8267c399,de,ffffffff8267bf7e) at
>> __assert+0x2
>> 9
>> pppoe_tls(ffff8000004a7000) at pppoe_tls+0x13c
>> sppp_output(ffff8000004a7000,fffffd803e90f500,ffff800001393960,fffffd82709bb5d8
>> ) at sppp_output+0x187
>> if_output_tso(ffff8000004a7000,ffff8000360e6590,ffff800001393960,fffffd82709bb5
>> d8,5d4) at if_output_tso+0xf6
>> ip_output(fffffd803e90f500,0,fffffd8271bf7820,800,0,fffffd8271bf78b8,98eb194f7a
>> d82df2) at ip_output+0x7ee
>> tcp_output(ffff800001e5d9e0) at tcp_output+0x1a53
>> tcp_connect(ffff8000015668a8,fffffd803e90f100) at tcp_connect+0x28c
>> sys_connect(ffff800035fbd250,ffff8000360e6930,ffff8000360e68b0) at
>> sys_connect+
>> 0x1c0
>> sys_connect() and following tcp_connect() only helds the per-socket
>> socket lock and the shared netlock. The first one is not essential for
>> the sppp_output() path.
>>> anyway, to answer your questions (i think): interface output and
>>> enqueue should not need the caller to be holding any locks for them.
>>> the bridge stuff is pretty gross, but the worst i can see is that it
>>> can lose counter updates, otherwise it doesnt appear to rely on the
>>> kernel or net locks.
>> I see. Thanks for explanation.
>>> >
>>> > > Index: if_sppp.h
>>> > > ===================================================================
>>> > > RCS file: /cvs/src/sys/net/if_sppp.h,v
>>> > > diff -u -p -r1.31 if_sppp.h
>>> > > --- if_sppp.h 15 Jan 2025 06:15:44 -0000 1.31
>>> > > +++ if_sppp.h 28 Oct 2025 04:25:22 -0000
>>> > > @@ -174,6 +174,7 @@ struct sppp {
>>> > >   time_t pp_last_receive; /* peer's last "sign of life" */
>>> > >   time_t pp_last_activity; /* second of last payload data s/r */
>>> > >   enum ppp_phase pp_phase; /* phase we're currently in */
>>> > > + struct task pp_autodial;
>>> > >   int state[IDX_COUNT]; /* state machine */
>>> > >   u_char  confid[IDX_COUNT]; /* id of last configuration request */
>>> > >   int rst_counter[IDX_COUNT]; /* restart counter */
>>> > > Index: if_spppsubr.c
>>> > > ===================================================================
>>> > > RCS file: /cvs/src/sys/net/if_spppsubr.c,v
>>> > > diff -u -p -r1.198 if_spppsubr.c
>>> > > --- if_spppsubr.c 19 Aug 2025 12:34:15 -0000 1.198
>>> > > +++ if_spppsubr.c 28 Oct 2025 04:25:22 -0000
>>> > > @@ -227,6 +227,7 @@ static struct timeout keepalive_ch;
>>> > >   struct ifnet *ifp = &sp->pp_if; \
>>> > >   int debug = ifp->if_flags & IFF_DEBUG
>>> > >
>>> > > +void sppp_autodial(void *);
>>> > >  int sppp_output(struct ifnet *ifp, struct mbuf *m,
>>> > >         struct sockaddr *dst, struct rtentry *rt);
>>> > >
>>> > > @@ -570,6 +571,20 @@ sppp_input(struct ifnet *ifp, struct mbu
>>> > >   goto drop;
>>> > >  }
>>> > >
>>> > > +void
>>> > > +sppp_autodial(void *arg)
>>> > > +{
>>> > > + struct sppp *sp = arg;
>>> > > + struct ifnet *ifp = &sp->pp_if;
>>> > > +
>>> > > + NET_LOCK();
>>> > > + if ((ifp->if_flags & (IFF_RUNNING | IFF_AUTO)) == IFF_AUTO) {
>>> > > + ifp->if_flags |= IFF_RUNNING;
>>> > > + lcp.Open(sp);
>>> > > + }
>>> > > + NET_UNLOCK();
>>> > > +}
>>> > > +
>>> > >  /*
>>> > >   * Enqueue transmit packet.
>>> > >   */
>>> > > @@ -581,6 +596,7 @@ sppp_output(struct ifnet *ifp, struct mb
>>> > >   struct timeval tv;
>>> > >   int s, rv = 0;
>>> > >   u_int16_t protocol;
>>> > > + int if_flags;
>>> > >
>>> > >  #ifdef DIAGNOSTIC
>>> > >   if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid)) {
>>> > > @@ -596,22 +612,20 @@ sppp_output(struct ifnet *ifp, struct mb
>>> > >   getmicrouptime(&tv);
>>> > >   sp->pp_last_activity = tv.tv_sec;
>>> > >
>>> > > - if ((ifp->if_flags & IFF_UP) == 0 ||
>>> > > -    (ifp->if_flags & (IFF_RUNNING | IFF_AUTO)) == 0) {
>>> > > + if_flags = ifp->if_flags;
>>> > > + if ((if_flags & IFF_UP) == 0 ||
>>> > > +    (if_flags & (IFF_RUNNING | IFF_AUTO)) == 0) {
>>> > >   m_freem (m);
>>> > >   splx (s);
>>> > >   return (ENETDOWN);
>>> > >   }
>>> > >
>>> > > - if ((ifp->if_flags & (IFF_RUNNING | IFF_AUTO)) == IFF_AUTO) {
>>> > > + if ((if_flags & (IFF_RUNNING | IFF_AUTO)) == IFF_AUTO) {
>>> > >   /*
>>> > >   * Interface is not yet running, but auto-dial.  Need
>>> > >   * to start LCP for it.
>>> > >   */
>>> > > - ifp->if_flags |= IFF_RUNNING;
>>> > > - splx(s);
>>> > > - lcp.Open(sp);
>>> > > - s = splnet();
>>> > > + task_add(systq, &sp->pp_autodial);
>>> > >   }
>>> > >
>>> > >   if (dst->sa_family == AF_INET) {
>>> > > @@ -746,6 +760,8 @@ sppp_attach(struct ifnet *ifp)
>>> > >   sp->pp_up = lcp.Up;
>>> > >   sp->pp_down = lcp.Down;
>>> > >
>>> > > + task_set(&sp->pp_autodial, sppp_autodial, sp);
>>> > > +
>>> > >   for (i = 0; i < IDX_COUNT; i++)
>>> > >   timeout_set(&sp->ch[i], (cps[i])->TO, (void *)sp);
>>> > >   timeout_set(&sp->pap_my_to_ch, sppp_pap_my_TO, (void *)sp);
>>> > > @@ -762,6 +778,8 @@ sppp_detach(struct ifnet *ifp)
>>> > >  {
>>> > >   struct sppp **q, *p, *sp = (struct sppp*) ifp;
>>> > >   int i;
>>> > > +
>>> > > + taskq_del_barrier(systq, &sp->pp_autodial);
>>> > >
>>> > >   sppp_ipcp_destroy(sp);
>>> > >   sppp_ipv6cp_destroy(sp);

Reply via email to