Hello,

We now get the following error with the current master branch (5bb3e81e):

vlib_worker_thread_barrier_sync_int: worker thread deadlock

Testing previous commits indicates the problem started with the recent
commit 9121c415 "bonding: adjust link state based on active slaves"
(AuthorDate May 18, CommitDate May 27).

We can reproduce the problem using the following config:

unix {
  nodaemon
  exec /etc/vpp/commands.txt
}
cpu {
  workers 10
}

where commands.txt looks like this:

create bond mode lacp load-balance l23
create int rdma host-if enp101s0f1 name Interface101
create int rdma host-if enp179s0f1 name Interface179
bond add BondEthernet0 Interface101
bond add BondEthernet0 Interface179
create sub-interfaces BondEthernet0 1012
create sub-interfaces BondEthernet0 1013
set int ip address BondEthernet0.1012 10.1.1.1/30
set int ip address BondEthernet0.1013 10.1.2.1/30
set int state BondEthernet0 up
set int state Interface101 up
set int state Interface179 up
set int state BondEthernet0.1012 up
set int state BondEthernet0.1013 up

Then we get the "worker thread deadlock" every time at startup, after
just a few seconds.
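
In case it helps: our understanding of what this message means, written
as a simplified standalone sketch with made-up names (not the actual
VPP code, which lives in src/vlib/threads.c), is that barrier sync
spins until every worker has parked, so when the requesting thread is
itself a worker the parked count can never converge and the deadline
check panics:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BARRIER_SYNC_TIMEOUT 1.0   /* seconds; value is illustrative */

typedef struct
{
  volatile int wait_flag;          /* set to ask all workers to park */
  volatile int workers_parked;     /* incremented by each parked worker */
  int n_workers;
} barrier_t;

static double
now_sec (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + 1e-9 * ts.tv_nsec;
}

/* Called by a thread that wants all workers stopped. If the caller is
   itself a worker, it can never park, so workers_parked never reaches
   n_workers and we fall into the timeout branch below. */
void
barrier_sync (barrier_t *b)
{
  double deadline = now_sec () + BARRIER_SYNC_TIMEOUT;

  b->wait_flag = 1;
  while (b->workers_parked < b->n_workers)
    if (now_sec () > deadline)
      {
        fprintf (stderr, "%s: worker thread deadlock\n", __func__);
        abort ();
      }
}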

We get the following gdb backtrace (for a release build). Note that the crash is on worker thread "vpp_wk_0": lacp_node_fn (frame #31) reaches bond_enable_collecting_distributing (frame #25) and from there, via a FIB back-walk, the failing barrier sync:

vlib_worker_thread_barrier_sync_int: worker thread deadlock
Thread 3 "vpp_wk_0" received signal SIGABRT, Aborted.
[Switching to Thread 0x7ffe027fe700 (LWP 12171)]
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
51      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1  0x00007ffff42ff801 in __GI_abort () at abort.c:79
#2  0x000055555555c700 in os_panic () at vpp/src/vpp/vnet/main.c:371
#3  0x00007ffff5dd03ab in vlib_worker_thread_barrier_sync_int (vm=0x7fffb87c0300, func_name=<optimized out>) at vpp/src/vlib/threads.c:1517
#4  0x00007ffff77bfa9c in dpo_get_next_node (child_type=<optimized out>, child_proto=<optimized out>, parent_dpo=0x7fffb9cebda0) at vpp/src/vnet/dpo/dpo.c:430
#5  dpo_stack (child_type=<optimized out>, child_proto=<optimized out>, dpo=<optimized out>, parent=0x7fffb9cebda0) at vpp/src/vnet/dpo/dpo.c:521
#6  0x00007ffff77c50ac in load_balance_set_bucket_i (lb=0x7fffb8e784c0, bucket=<optimized out>, buckets=0x7fffb8e784e0, next=<optimized out>) at vpp/src/vnet/dpo/load_balance.c:252
#7  load_balance_fill_buckets_norm (lb=0x7fffb8e784c0, nhs=0x7fffb9cebda0, buckets=0x7fffb8e784e0, n_buckets=<optimized out>) at vpp/src/vnet/dpo/load_balance.c:525
#8  load_balance_fill_buckets (lb=0x7fffb8e784c0, nhs=0x7fffb9cebda0, buckets=0x7fffb8e784e0, n_buckets=<optimized out>, flags=<optimized out>) at vpp/src/vnet/dpo/load_balance.c:589
#9  0x00007ffff77c4d5f in load_balance_multipath_update (dpo=<optimized out>, raw_nhs=<optimized out>, flags=<optimized out>) at vpp/src/vnet/dpo/load_balance.c:88
#10 0x00007ffff778e0fc in fib_entry_src_mk_lb (fib_entry=0x7fffb90dd770, esrc=0x7fffb8c60150, fct=FIB_FORW_CHAIN_TYPE_UNICAST_IP4, dpo_lb=0x7fffb90dd798) at vpp/src/vnet/fib/fib_entry_src.c:645
#11 0x00007ffff778e4b7 in fib_entry_src_action_install (fib_entry=0x7fffb90dd770, source=FIB_SOURCE_INTERFACE) at vpp/src/vnet/fib/fib_entry_src.c:705
#12 0x00007ffff778f0b0 in fib_entry_src_action_reactivate (fib_entry=0x7fffb90dd770, source=FIB_SOURCE_INTERFACE) at vpp/src/vnet/fib/fib_entry_src.c:1221
#13 0x00007ffff778d873 in fib_entry_back_walk_notify (node=0x7fffb90dd770, ctx=0x7fffb89c21d0) at vpp/src/vnet/fib/fib_entry.c:316
#14 0x00007ffff778343b in fib_walk_advance (fwi=<optimized out>) at vpp/src/vnet/fib/fib_walk.c:368
#15 0x00007ffff7784107 in fib_walk_sync (parent_type=<optimized out>, parent_index=<optimized out>, ctx=0x7fffb89c22a0) at vpp/src/vnet/fib/fib_walk.c:792
#16 0x00007ffff779a43b in fib_path_back_walk_notify (node=<optimized out>, ctx=0x7fffb89c22a0) at vpp/src/vnet/fib/fib_path.c:1226
#17 0x00007ffff778343b in fib_walk_advance (fwi=<optimized out>) at vpp/src/vnet/fib/fib_walk.c:368
#18 0x00007ffff7784107 in fib_walk_sync (parent_type=<optimized out>, parent_index=<optimized out>, ctx=0x7fffb89c2330) at vpp/src/vnet/fib/fib_walk.c:792
#19 0x00007ffff77a6dec in adj_glean_interface_state_change (vnm=<optimized out>, sw_if_index=5, flags=<optimized out>) at vpp/src/vnet/adj/adj_glean.c:166
#20 adj_nbr_hw_sw_interface_state_change (vnm=<optimized out>, sw_if_index=5, arg=<optimized out>) at vpp/src/vnet/adj/adj_glean.c:183
#21 0x00007ffff70e06cc in vnet_hw_interface_walk_sw (vnm=0x7ffff7b570f0 <vnet_main>, hw_if_index=<optimized out>, fn=0x7ffff77a6da0 <adj_nbr_hw_sw_interface_state_change>, ctx=0x1) at vpp/src/vnet/interface.c:1062
#22 0x00007ffff77a6b72 in adj_glean_hw_interface_state_change (vnm=0x2, hw_if_index=3097238656, flags=<optimized out>) at vpp/src/vnet/adj/adj_glean.c:205
#23 0x00007ffff70df60c in call_elf_section_interface_callbacks (vnm=0x7ffff7b570f0 <vnet_main>, if_index=1, flags=<optimized out>, elts=0x7ffff7b571a0 <vnet_main+176>) at vpp/src/vnet/interface.c:251
#24 vnet_hw_interface_set_flags_helper (vnm=0x7ffff7b570f0 <vnet_main>, hw_if_index=1, flags=VNET_HW_INTERFACE_FLAG_LINK_UP, helper_flags=<optimized out>) at vpp/src/vnet/interface.c:331
#25 0x00007ffff71b300f in bond_enable_collecting_distributing (vm=<optimized out>, sif=0x7fffb95de168) at vpp/src/vnet/bonding/cli.c:178
#26 0x00007fffad765636 in lacp_mux_action_collecting_distributing (p1=0x7fffb87c0300, p2=0x7fffb95de168) at vpp/src/plugins/lacp/mux_machine.c:173
#27 0x00007fffad7654ff in lacp_mux_action_attached (p1=0x7fffb87c0300, p2=0x7fffb95de168) at vpp/src/plugins/lacp/mux_machine.c:140
#28 0x00007fffad764d41 in lacp_rx_action_current (p1=<optimized out>, p2=0x7fffb95de168) at vpp/src/plugins/lacp/rx_machine.c:374
#29 0x00007fffad76a0b0 in lacp_packet_scan (vm=0x7fffb89c1c80, sif=<optimized out>) at vpp/src/plugins/lacp/input.c:39
#30 lacp_input (vm=<optimized out>, b0=<optimized out>, bi0=<optimized out>) at vpp/src/plugins/lacp/input.c:216
#31 0x00007fffad76d57f in lacp_node_fn (vm=0x7fffb87c0300, node=0x7fffb9ca8680, frame=0x7fffb95bb1c0) at vpp/src/plugins/lacp/node.c:87
#32 0x00007ffff5d84faa in dispatch_node (vm=0x7fffb87c0300, node=0x7fffb9ca8680, type=VLIB_NODE_TYPE_INTERNAL, dispatch_state=VLIB_NODE_STATE_POLLING, frame=<optimized out>, last_time_stamp=<optimized out>) at vpp/src/vlib/main.c:1235
#33 dispatch_pending_node (vm=0x7fffb87c0300, pending_frame_index=<optimized out>, last_time_stamp=<optimized out>) at vpp/src/vlib/main.c:1403
#34 vlib_main_or_worker_loop (vm=<optimized out>, is_main=0) at vpp/src/vlib/main.c:1862
#35 vlib_worker_loop (vm=<optimized out>) at vpp/src/vlib/main.c:1996
#36 0x00007ffff51d0e94 in clib_calljmp () from vpp/build-root/install-vpp-native/vpp/lib/libvppinfra.so.20.09
#37 0x00007ffe027fdd10 in ?? ()
#38 0x00007fffafa6297a in eal_thread_loop () from vpp/build-root/install-vpp-native/vpp/lib/vpp_plugins/dpdk_plugin.so
#39 0x0310050131027ef8 in ?? ()
#40 0xd6080e05f2080188 in ?? ()
#41 0x0306102b02060305 in ?? ()

For a debug build something else happens: we don't get the "worker
thread deadlock" error message, but instead an assertion failure
(raised from vlib_log during the same FIB walk):

Thread 3 "vpp_wk_0" received signal SIGABRT, Aborted.
[Switching to Thread 0x7ffe027fe700 (LWP 20881)]
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
51      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) bt
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1  0x00007ffff4629801 in __GI_abort () at abort.c:79
#2  0x00000000004071e3 in os_panic () at vpp/src/vpp/vnet/main.c:371
#3  0x00007ffff55085b9 in debugger () at vpp/src/vppinfra/error.c:84
#4  0x00007ffff5508337 in _clib_error (how_to_die=2, function_name=0x0, line_number=0, fmt=0x7ffff6171250 "%s:%d (%s) assertion `%s' fails") at vpp/src/vppinfra/error.c:143
#5  0x00007ffff60cfabd in vlib_log (level=VLIB_LOG_LEVEL_DEBUG, class=393219, fmt=0x7ffff770fda1 "[%U]:sync-start: %U") at vpp/src/vlib/log.c:102
#6  0x00007ffff74c57db in fib_walk_sync (parent_type=FIB_NODE_TYPE_ADJ, parent_index=1, ctx=0x7fffb90f61d0) at vpp/src/vnet/fib/fib_walk.c:780
#7  0x00007ffff753a7e1 in adj_glean_interface_state_change (vnm=0x7ffff7b48fb8 <vnet_main>, sw_if_index=5, flags=1) at vpp/src/vnet/adj/adj_glean.c:166
#8  0x00007ffff753b285 in adj_nbr_hw_sw_interface_state_change (vnm=0x7ffff7b48fb8 <vnet_main>, sw_if_index=5, arg=0x1) at vpp/src/vnet/adj/adj_glean.c:183
#9  0x00007ffff6b3f222 in vnet_hw_interface_walk_sw (vnm=0x7ffff7b48fb8 <vnet_main>, hw_if_index=1, fn=0x7ffff753b260 <adj_nbr_hw_sw_interface_state_change>, ctx=0x1) at vpp/src/vnet/interface.c:1062
#10 0x00007ffff753a943 in adj_glean_hw_interface_state_change (vnm=0x7ffff7b48fb8 <vnet_main>, hw_if_index=1, flags=1) at vpp/src/vnet/adj/adj_glean.c:205
#11 0x00007ffff6b45aff in call_elf_section_interface_callbacks (vnm=0x7ffff7b48fb8 <vnet_main>, if_index=1, flags=1, elts=0x7ffff7b49068 <vnet_main+176>) at vpp/src/vnet/interface.c:251
#12 0x00007ffff6b3d12e in vnet_hw_interface_set_flags_helper (vnm=0x7ffff7b48fb8 <vnet_main>, hw_if_index=1, flags=VNET_HW_INTERFACE_FLAG_LINK_UP, helper_flags=VNET_INTERFACE_SET_FLAGS_HELPER_WANT_REDISTRIBUTE) at vpp/src/vnet/interface.c:331
#13 0x00007ffff6b3d1c6 in vnet_hw_interface_set_flags (vnm=0x7ffff7b48fb8 <vnet_main>, hw_if_index=1, flags=VNET_HW_INTERFACE_FLAG_LINK_UP) at vpp/src/vnet/interface.c:501
#14 0x00007ffff6d53ecc in bond_enable_collecting_distributing (vm=0x7fffb8ef28c0, sif=0x7fffb9f6a6a8) at vpp/src/vnet/bonding/cli.c:178
#15 0x00007fffad945d4b in lacp_mux_action_collecting_distributing (p1=0x7fffb8ef28c0, p2=0x7fffb9f6a6a8) at vpp/src/plugins/lacp/mux_machine.c:173
#16 0x00007fffad9326bc in lacp_machine_dispatch (machine=0x7fffadb696e0 <lacp_mux_machine>, vm=0x7fffb8ef28c0, sif=0x7fffb9f6a6a8, event=5, state=0x7fffb9f6a780) at vpp/src/plugins/lacp/lacp.c:327
#17 0x00007fffad945b9a in lacp_mux_action_attached (p1=0x7fffb8ef28c0, p2=0x7fffb9f6a6a8) at vpp/src/plugins/lacp/mux_machine.c:140
#18 0x00007fffad9326bc in lacp_machine_dispatch (machine=0x7fffadb696e0 <lacp_mux_machine>, vm=0x7fffb8ef28c0, sif=0x7fffb9f6a6a8, event=1, state=0x7fffb9f6a780) at vpp/src/plugins/lacp/lacp.c:327
#19 0x00007fffad943933 in lacp_set_port_selected (vm=0x7fffb8ef28c0, sif=0x7fffb9f6a6a8) at vpp/src/plugins/lacp/selection.c:51
#20 0x00007fffad94350a in lacp_selection_logic (vm=0x7fffb8ef28c0, sif=0x7fffb9f6a6a8) at vpp/src/plugins/lacp/selection.c:83
#21 0x00007fffad9443f5 in lacp_rx_action_current (p1=0x7fffb8ef28c0, p2=0x7fffb9f6a6a8) at vpp/src/plugins/lacp/rx_machine.c:374
#22 0x00007fffad9326bc in lacp_machine_dispatch (machine=0x7fffadb692c0 <lacp_rx_machine>, vm=0x7fffb8ef28c0, sif=0x7fffb9f6a6a8, event=5, state=0x7fffb9f6a778) at vpp/src/plugins/lacp/lacp.c:327
#23 0x00007fffad951ba7 in lacp_packet_scan (vm=0x7fffb8ef28c0, sif=0x7fffb9f6a6a8) at vpp/src/plugins/lacp/input.c:39
#24 0x00007fffad951293 in lacp_input (vm=0x7fffb8ef28c0, b0=0x103ff65400, bi0=16767312) at vpp/src/plugins/lacp/input.c:216
#25 0x00007fffad959946 in lacp_node_fn (vm=0x7fffb8ef28c0, node=0x7fffba651140, frame=0x7fffb93a8cc0) at vpp/src/plugins/lacp/node.c:87
#26 0x00007ffff60de2d5 in dispatch_node (vm=0x7fffb8ef28c0, node=0x7fffba651140, type=VLIB_NODE_TYPE_INTERNAL, dispatch_state=VLIB_NODE_STATE_POLLING, frame=0x7fffb93a8cc0, last_time_stamp=56709265274273836) at vpp/src/vlib/main.c:1235
#27 0x00007ffff60deba7 in dispatch_pending_node (vm=0x7fffb8ef28c0, pending_frame_index=2, last_time_stamp=56709265274273836) at vpp/src/vlib/main.c:1403
#28 0x00007ffff60d9001 in vlib_main_or_worker_loop (vm=0x7fffb8ef28c0, is_main=0) at vpp/src/vlib/main.c:1862
#29 0x00007ffff60d83a7 in vlib_worker_loop (vm=0x7fffb8ef28c0) at vpp/src/vlib/main.c:1996
#30 0x00007ffff612f7c1 in vlib_worker_thread_fn (arg=0x7fffb4ca0440) at vpp/src/vlib/threads.c:1795
#31 0x00007ffff552c744 in clib_calljmp () at vpp/src/vppinfra/longjmp.S:123
#32 0x00007ffe027fdce0 in ?? ()
#33 0x00007ffff6127f73 in vlib_worker_thread_bootstrap_fn (arg=0x7fffb4ca0440) at vpp/src/vlib/threads.c:584
Backtrace stopped: previous frame inner to this frame (corrupt stack?)

Perhaps someone with better insight into the code than I have can see
why commit 9121c415 causes this problem, and how it could be fixed?
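
One direction we wondered about, purely as an untested sketch with
made-up names (not a patch): since vnet_hw_interface_set_flags ends up
in a barrier sync, maybe bond_enable_collecting_distributing should
hand the link-state change over to the main thread instead of doing it
directly from the LACP input path, e.g. via vl_api_rpc_call_main_thread
(from vlibmemory/api.h, if I remember right):

/* Hypothetical fragment for src/vnet/bonding/cli.c -- names and
   context are assumptions, shown only to illustrate the idea. */
typedef struct
{
  u32 hw_if_index;
  u32 flags;
} bond_set_hw_flags_args_t;      /* made-up type */

static void
bond_set_hw_flags_on_main (void *arg)   /* runs on the main thread */
{
  bond_set_hw_flags_args_t *a = arg;
  vnet_hw_interface_set_flags (vnet_get_main (), a->hw_if_index,
                               a->flags);
}

/* ...and in bond_enable_collecting_distributing (), instead of the
   direct vnet_hw_interface_set_flags call: */
bond_set_hw_flags_args_t args = {
  .hw_if_index = hw->hw_if_index,    /* 'hw' as in the existing code */
  .flags = VNET_HW_INTERFACE_FLAG_LINK_UP,
};
vl_api_rpc_call_main_thread (bond_set_hw_flags_on_main,
                             (u8 *) &args, sizeof (args));

But we have not tried this, so take it with a grain of salt.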

Best regards
Elias