On 11/24/25 5:33 AM, Hangbin Liu wrote:
> The current ad_churn_machine implementation only transitions the
> actor/partner churn state to churned or none after the churn timer expires.
> However, IEEE 802.1AX-2014 specifies that a port should enter the none
> state immediately once the actor’s port state enters synchronization.
> 
> Another issue is that if the churn timer expires while the churn machine is
> not in the monitor state (e.g. already in churn), the state may remain
> stuck indefinitely with no further transitions. This becomes visible in
> multi-aggregator scenarios. For example:
> 
> Ports 1 and 2 are in aggregator 1 (active)
> Ports 3 and 4 are in aggregator 2 (backup)
> 
> Ports 1 and 2 should be in none
> Ports 3 and 4 should be in churned
> 
> If a failover occurs due to port 2 link down/up, aggregator 2 becomes active.
> Under the current implementation, the resulting states may look like:
> 
> agg 1 (backup): port 1 -> none, port 2 -> churned
> agg 2 (active): ports 3,4 keep in churned.
> 
> The root cause is that ad_churn_machine() only clears the
> AD_PORT_CHURNED flag and starts a timer. When a churned port becomes active,
> its RX state becomes AD_RX_CURRENT, preventing the churn flag from being set
> again, leaving no way to retrigger the timer. Fixing this solely in
> ad_rx_machine() is insufficient.
> 
> This patch rewrites ad_churn_machine according to IEEE 802.1AX-2014
> (Figures 6-23 and 6-24), ensuring correct churn detection, state transitions,
> and timer behavior. With new implementation, there is no need to set
> AD_PORT_CHURNED in ad_rx_machine().

I think this change is too invasive at this point of the cycle. I think
it should be moved to the next one or even to net-next.

> Fixes: 14c9551a32eb ("bonding: Implement port churn-machine (AD standard 
> 43.4.17).")
> Reported-by: Liang Li <[email protected]>
> Tested-by: Liang Li <[email protected]>
> Signed-off-by: Hangbin Liu <[email protected]>
> ---
>  drivers/net/bonding/bond_3ad.c | 104 ++++++++++++++++++++++++++-------
>  1 file changed, 84 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
> index d6bd3615d129..98b8d5040148 100644
> --- a/drivers/net/bonding/bond_3ad.c
> +++ b/drivers/net/bonding/bond_3ad.c
> @@ -1240,7 +1240,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
> port *port)
>       /* first, check if port was reinitialized */
>       if (port->sm_vars & AD_PORT_BEGIN) {
>               port->sm_rx_state = AD_RX_INITIALIZE;
> -             port->sm_vars |= AD_PORT_CHURNED;
>       /* check if port is not enabled */
>       } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled)
>               port->sm_rx_state = AD_RX_PORT_DISABLED;
> @@ -1248,8 +1247,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
> port *port)
>       else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) ||
>                (port->sm_rx_state == AD_RX_DEFAULTED) ||
>                (port->sm_rx_state == AD_RX_CURRENT))) {
> -             if (port->sm_rx_state != AD_RX_CURRENT)
> -                     port->sm_vars |= AD_PORT_CHURNED;
>               port->sm_rx_timer_counter = 0;
>               port->sm_rx_state = AD_RX_CURRENT;
>       } else {
> @@ -1333,7 +1330,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
> port *port)
>                       port->partner_oper.port_state |= 
> LACP_STATE_LACP_TIMEOUT;
>                       port->sm_rx_timer_counter = 
> __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT));
>                       port->actor_oper_port_state |= LACP_STATE_EXPIRED;
> -                     port->sm_vars |= AD_PORT_CHURNED;
>                       break;
>               case AD_RX_DEFAULTED:
>                       __update_default_selected(port);
> @@ -1365,39 +1361,107 @@ static void ad_rx_machine(struct lacpdu *lacpdu, 
> struct port *port)
>   * ad_churn_machine - handle port churn's state machine
>   * @port: the port we're looking at
>   *
> + * IEEE 802.1AX-2014 Figure 6-23 - Actor Churn Detection machine state 
> diagram
> + *
> + *                                                     BEGIN || (! 
> port_enabled)
> + *                                                               |
> + *                                      (3)                (1)   v
> + *   +----------------------+     ActorPort.Sync     
> +-------------------------+
> + *   |    NO_ACTOR_CHURN    | <--------------------- |   ACTOR_CHURN_MONITOR 
>   |
> + *   |======================|                        
> |=========================|
> + *   | actor_churn = FALSE; |    ! ActorPort.Sync    | actor_churn = FALSE;  
>   |
> + *   |                      | ---------------------> | Start 
> actor_churn_timer |
> + *   +----------------------+           (4)          
> +-------------------------+
> + *             ^                                                 |
> + *             |                                                 |
> + *             |                                      actor_churn_timer 
> expired
> + *             |                                                 |
> + *       ActorPort.Sync                                          |  (2)
> + *             |              +--------------------+             |
> + *        (3)  |              |   ACTOR_CHURN      |             |
> + *             |              |====================|             |
> + *             +------------- | actor_churn = True | <-----------+
> + *                            |                    |
> + *                            +--------------------+
> + *
> + * Similar for the Figure 6-24 - Partner Churn Detection machine state 
> diagram
>   */
>  static void ad_churn_machine(struct port *port)
>  {
> -     if (port->sm_vars & AD_PORT_CHURNED) {
> +     bool partner_synced = port->partner_oper.port_state & 
> LACP_STATE_SYNCHRONIZATION;
> +     bool actor_synced = port->actor_oper_port_state & 
> LACP_STATE_SYNCHRONIZATION;
> +     bool partner_churned = port->sm_vars & AD_PORT_PARTNER_CHURN;
> +     bool actor_churned = port->sm_vars & AD_PORT_ACTOR_CHURN;
> +
> +     /* ---- 1. begin or port not enabled ---- */
> +     if ((port->sm_vars & AD_PORT_BEGIN) || !port->is_enabled) {
>               port->sm_vars &= ~AD_PORT_CHURNED;
> +
>               port->sm_churn_actor_state = AD_CHURN_MONITOR;
>               port->sm_churn_partner_state = AD_CHURN_MONITOR;
> +
>               port->sm_churn_actor_timer_counter =
>                       __ad_timer_to_ticks(AD_ACTOR_CHURN_TIMER, 0);
>               port->sm_churn_partner_timer_counter =
> -                      __ad_timer_to_ticks(AD_PARTNER_CHURN_TIMER, 0);
> +                     __ad_timer_to_ticks(AD_PARTNER_CHURN_TIMER, 0);
> +

Please avoid white-space changes only, or if you are going to target
net-next, move them to a pre-req patch.

>               return;
>       }
> -     if (port->sm_churn_actor_timer_counter &&
> -         !(--port->sm_churn_actor_timer_counter) &&
> -         port->sm_churn_actor_state == AD_CHURN_MONITOR) {
> -             if (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION) {
> +
> +     if (port->sm_churn_actor_timer_counter)
> +             port->sm_churn_actor_timer_counter--;
> +
> +     if (port->sm_churn_partner_timer_counter)
> +             port->sm_churn_partner_timer_counter--;
> +
> +     /* ---- 2. timer expired, enter CHURN ---- */
> +     if (port->sm_churn_actor_state == AD_CHURN_MONITOR &&
> +         !actor_churned && !port->sm_churn_actor_timer_counter) {
> +             port->sm_vars |= AD_PORT_ACTOR_CHURN;
> +             port->sm_churn_actor_state = AD_CHURN;
> +             port->churn_actor_count++;
> +             actor_churned = true;
> +     }
> +
> +     if (port->sm_churn_partner_state == AD_CHURN_MONITOR &&
> +         !partner_churned && !port->sm_churn_partner_timer_counter) {
> +             port->sm_vars |= AD_PORT_PARTNER_CHURN;
> +             port->sm_churn_partner_state = AD_CHURN;
> +             port->churn_partner_count++;
> +             partner_churned = true;
> +     }
> +
> +     /* ---- 3. CHURN_MONITOR/CHURN + sync -> NO_CHURN ---- */
> +     if ((port->sm_churn_actor_state == AD_CHURN_MONITOR && !actor_churned) 
> ||
> +         (port->sm_churn_actor_state == AD_CHURN && actor_churned)) {

Is this                                             ^^^^^^^^^^^^^^^^

test needed ? I *think* the state machine `actor_churned == true` when
`sm_churn_actor_state == AD_CHURN`

> +             if (actor_synced) {
> +                     port->sm_vars &= ~AD_PORT_ACTOR_CHURN;
>                       port->sm_churn_actor_state = AD_NO_CHURN;
> -             } else {
> -                     port->churn_actor_count++;
> -                     port->sm_churn_actor_state = AD_CHURN;
> +                     actor_churned = false;
>               }

I think this part is not described by the state diagram above?!?

>       }
> -     if (port->sm_churn_partner_timer_counter &&
> -         !(--port->sm_churn_partner_timer_counter) &&
> -         port->sm_churn_partner_state == AD_CHURN_MONITOR) {
> -             if (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) 
> {
> +
> +     if ((port->sm_churn_partner_state == AD_CHURN_MONITOR && 
> !partner_churned) ||
> +         (port->sm_churn_partner_state == AD_CHURN && partner_churned)) {
> +             if (partner_synced) {
> +                     port->sm_vars &= ~AD_PORT_PARTNER_CHURN;
>                       port->sm_churn_partner_state = AD_NO_CHURN;
> -             } else {
> -                     port->churn_partner_count++;
> -                     port->sm_churn_partner_state = AD_CHURN;
> +                     partner_churned = false;
>               }

Possibly move this `if` block in a separate helper and reuse for both
partner and actor.

>       }
> +
> +     /* ---- 4. NO_CHURN + !sync -> MONITOR ---- */
> +     if (port->sm_churn_actor_state == AD_NO_CHURN && !actor_churned && 
> !actor_synced) {
> +             port->sm_churn_actor_state = AD_CHURN_MONITOR;
> +             port->sm_churn_actor_timer_counter =
> +                     __ad_timer_to_ticks(AD_ACTOR_CHURN_TIMER, 0);

Should this clear sm_vars & AD_PORT_ACTOR_CHURN, too?

> +     }
> +
> +     if (port->sm_churn_partner_state == AD_NO_CHURN && !partner_churned && 
> !partner_synced) {
> +             port->sm_churn_partner_state = AD_CHURN_MONITOR;
> +             port->sm_churn_partner_timer_counter =
> +                     __ad_timer_to_ticks(AD_PARTNER_CHURN_TIMER, 0);

Same question here.

/P


Reply via email to