Hi OVS experts, Our ovs-vswitchd aborts and dumps core at the call to ovs_mutex_trylock(&ukey->mutex) in the function revalidator_sweep__.
I sent a mail about this before but received no response: https://mail.openvswitch.org/pipermail/ovs-discuss/2023-August/052604.html So I am sending this mail again. I apologize in advance for the length: I would like to post as much useful information as possible to help identify potential issues, so this mail contains a really long text. Compared to the mail 2023-August/052604.html, we have upgraded OVS to 2.17.8 and DPDK to 22.11, hoping that the community had already fixed this issue. Unfortunately, ovs-vswitchd still aborts and dumps core. Here is some local debug information: (gdb) bt #0 0x00007f8751bbf337 in raise () from /lib64/libc.so.6 #1 0x00007f8751bc0a28 in abort () from /lib64/libc.so.6 #2 0x000055c52ed06c7e in ovs_abort_valist (err_no=<optimized out>, format=<optimized out>, args=args@entry=0x7f8744249370) at lib/util.c:499 #3 0x000055c52ed06d14 in ovs_abort (err_no=err_no@entry=0, format=format@entry=0x55c52f01b1e8 "%s: %s() passed uninitialized ovs_mutex") at lib/util.c:491 #4 0x000055c52ecd17e1 in ovs_mutex_trylock_at (l_=l_@entry=0x7f8718dcc098, where=where@entry=0x55c52eff5c60 "ofproto/ofproto-dpif-upcall.c:3044") at lib/ovs-thread.c:106 #5 0x000055c52ebf25f9 in revalidator_sweep__ (revalidator=revalidator@entry=0x55c533082c70, purge=purge@entry=false) at ofproto/ofproto-dpif-upcall.c:3044 #6 0x000055c52ebf640f in revalidator_sweep (revalidator=0x55c533082c70) at ofproto/ofproto-dpif-upcall.c:3102 #7 udpif_revalidator (arg=0x55c533082c70) at ofproto/ofproto-dpif-upcall.c:1101 #8 0x000055c52ecd239f in ovsthread_wrapper (aux_=<optimized out>) at lib/ovs-thread.c:422 #9 0x00007f8753d16e65 in start_thread () from /lib64/libpthread.so.0 #10 0x00007f8751c8788d in clone () from /lib64/libc.so.6 The same backtrace with pretty print enabled: (gdb) bt full #0 0x00007f8751bbf337 in raise () from /lib64/libc.so.6 No symbol table info available. #1 0x00007f8751bc0a28 in abort () from /lib64/libc.so.6 No symbol table info available. 
#2 0x000055c52ed06c7e in ovs_abort_valist (err_no=<optimized out>, format=<optimized out>, args=args@entry=0x7f8744249370) at lib/util.c:499 No locals. #3 0x000055c52ed06d14 in ovs_abort (err_no=err_no@entry=0, format=format@entry=0x55c52f01b1e8 "%s: %s() passed uninitialized ovs_mutex") at lib/util.c:491 args = {{ gp_offset = 32, fp_offset = 48, overflow_arg_area = 0x7f8744249450, reg_save_area = 0x7f8744249390 }} #4 0x000055c52ecd17e1 in ovs_mutex_trylock_at (l_=l_@entry=0x7f8718dcc098, where=where@entry=0x55c52eff5c60 "ofproto/ofproto-dpif-upcall.c:3044") at lib/ovs-thread.c:106 l = 0x7f8718dcc098 error = <optimized out> __func__ = "ovs_mutex_trylock_at" #5 0x000055c52ebf25f9 in revalidator_sweep__ (revalidator=revalidator@entry=0x55c533082c70, purge=purge@entry=false) at ofproto/ofproto-dpif-upcall.c:3044 ukey_state = <optimized out> cursor_52 = { impl = 0x7f86f826c8c0, bucket_idx = 2, entry_idx = 3, node = 0x7f86f8dc8020 } odp_actions_stub = {140218217949376, 9951266880679575560, 55834640392, 10323741882, 0, 2, 140218217949592, 168, 140218217949416, 12885426177, 140218940560728, 281510948569217, 1125934266581005, 281509421383809, 1099511627785, 0, 140218226622112, 4, 819, 10323556447, 0, 2, 140218226622328, 148, 140218226622152, 12885753857, 140218940560856, 2984229694, 18446744069414584320, 18446462603027808255, 4294967295, 1407392063422464, 140218257906000, 3, 248, 10322875848, 0, 2, 140218257906216, 168, 140218257906040, 12884901889, 140218940560984, 0, 0, 0, 0, 0, 140218228630736, 5, 1088, 10292016731, 24, 2, 140218228630952, 168, 140218228630776, 12884901889, 140218940561112, 0, 0, 0, 0, 0, 140218255617104, 2, 120, 10124105851, 0, 2, 140218255617320, 148, 140218255617144, 12884901889, 140218940561240, 0, 0, 0, 0, 0, 140218243823504, 0, 0, 0, 0, 2, 140218243823720, 140, 140218243823544, 12884901889, 140218940561368, 0 <repeats 37 times>} odp_actions = { base = 0x7f8744249550, data = 0x7f8744249550, size = 0, allocated = 1024, header = 0x0, msg = 0x0, 
list_node = { prev = 0xcccccccccccccccc, next = 0xcccccccccccccccc }, source = OFPBUF_STUB } ukey = 0x7f8718dcc050 n_ops = 0 ops = { ... a really long list .... { ukey = 0x0, stats = { n_packets = 0, n_bytes = 0, used = 0, tcp_flags = 0 }, dop = { type = 0, error = 0, { flow_put = { flags = (DPIF_FP_CREATE | unknown: 905450480), key = 0x50178a28944953ad, key_len = 12884901889, mask = 0x0, mask_len = 0, actions = 0x0, actions_len = 0, ufid = 0x1, pmd_id = 788417076, stats = 0x55c52ecd15f8 <ovs_mutex_lock_at+24> }, flow_del = { key = 0x2694413835f813f1, key_len = 5771233354389738413, ufid = 0x300000001, terse = false, pmd_id = 0, stats = 0x0 }, execute = { actions = 0x2694413835f813f1, actions_len = 5771233354389738413, needs_help = true, probe = false, mtu = 3, hash = 0, flow = 0x0, packet = 0x0 }, flow_get = { key = 0x2694413835f813f1, key_len = 5771233354389738413, ufid = 0x300000001, pmd_id = 0, buffer = 0x0, flow = 0x0 } } } } ... a really long list .... } umap = 0x55c53301f998 cur = <optimized out> i = 39 udpif = 0x55c53301ee20 dump_seq = 3090869337 reval_seq = 3090869356 slice = <optimized out> __func__ = "revalidator_sweep__" #6 0x000055c52ebf640f in revalidator_sweep (revalidator=0x55c533082c70) at ofproto/ofproto-dpif-upcall.c:3102 No locals. #7 udpif_revalidator (arg=0x55c533082c70) at ofproto/ofproto-dpif-upcall.c:1101 revalidator = 0x55c533082c70 udpif = 0x55c53301ee20 leader = true start_time = 10324370685 last_reval_seq = 3090867551 n_flows = 14393 #8 0x000055c52ecd239f in ovsthread_wrapper (aux_=<optimized out>) at lib/ovs-thread.c:422 auxp = <optimized out> aux = { start = 0x55c52ebf6350 <udpif_revalidator>, arg = 0x55c533082c70, name = "revalidator\000\000\000\000" } id = 7 subprogram_name = 0x7f87280008c0 "pN\314(\207\177" #9 0x00007f8753d16e65 in start_thread () from /lib64/libpthread.so.0 No symbol table info available. #10 0x00007f8751c8788d in clone () from /lib64/libc.so.6 No symbol table info available. 
The umap of loop iteration udpif->ukeys[39] has ukeys (output with ovs_dump_udpif_keys): (struct umap *) 0x55c53301f998: (struct udpif_key *) 0x7f86f8df3930: key_len = 148, mask_len = 152 ufid = a18d9eac-9718-21db-7f97-4a287638e2ef hash = 0x7b0c4227, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 1, n_bytes = 115 used = 10324368860, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871a6db4e0: key_len = 140, mask_len = 152 ufid = 4e04d989-8729-49fc-1d0a-46ef2449ef75 hash = 0x2fe84627, pmd_id = 3 state = UKEY_VISIBLE state_where = 0x55c52eff5da8 "ofproto/ofproto-dpif-upcall.c:2036" n_packets = 0, n_bytes = 0 used = 0, tcp_flags = 0x0000 (struct udpif_key *) 0x7f8719909d50: key_len = 160, mask_len = 172 ufid = aa5170a7-818e-c902-44bf-4f60ff18f0f7 hash = 0x f037027, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 1, n_bytes = 66 used = 10324370390, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871a5af4c0: key_len = 168, mask_len = 172 ufid = f9773957-96f7-8c4e-7ba3-46cf73f7b3e0 hash = 0x32da0a27, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 5, n_bytes = 1243 used = 10324368587, tcp_flags = 0x0018 (struct udpif_key *) 0x7f8718834ff0: key_len = 148, mask_len = 152 ufid = 0d733731-a642-a60f-4f4f-4c6425dd398e hash = 0x61e7a027, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 2, n_bytes = 120 used = 10324367306, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871aafb0c0: key_len = 168, mask_len = 172 ufid = c9a3ab02-8f78-44ee-6583-45e778fa98a5 hash = 0x3e3fbc27, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 1, n_bytes = 60 used = 10324370056, tcp_flags = 0x0000 (struct udpif_key *) 0x7f8718c5d660: key_len = 160, mask_len = 172 ufid = 
01a3b747-909a-8077-c2d1-4e3542235fd6 hash = 0x7cff9c27, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 8, n_bytes = 492 used = 10324368705, tcp_flags = 0x0000 (struct udpif_key *) 0x7f8718d04340: key_len = 168, mask_len = 172 ufid = d3965df7-bb19-f997-e736-41e7c701b5f7 hash = 0x8db4d027, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 7, n_bytes = 3781 used = 10324367542, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871aa85860: key_len = 160, mask_len = 172 ufid = 44c1e2ec-b297-fa54-851c-41bf0c255865 hash = 0xec913027, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 1, n_bytes = 66 used = 10324367277, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871a260390: key_len = 168, mask_len = 172 ufid = b6c15cd7-907d-8695-a503-4253a3cb0b7a hash = 0xd35a2c27, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 7, n_bytes = 462 used = 10324367339, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871bfae340: key_len = 140, mask_len = 152 ufid = 84068184-aed0-3d90-5923-4d6f53fc309c hash = 0xaa917227, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 5, n_bytes = 322 used = 10324369800, tcp_flags = 0x0000 (struct udpif_key *) 0x7f8718d91730: key_len = 148, mask_len = 152 ufid = 70833d4f-afe7-4c55-6267-4458ebddf3fa hash = 0x7ad61a27, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 1, n_bytes = 140 used = 10324365702, tcp_flags = 0x0018 (struct udpif_key *) 0x7f871879ad50: key_len = 168, mask_len = 172 ufid = d6b2900e-adbe-02b5-e7d5-48f216c1710c hash = 0xde1a0e27, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 0, n_bytes = 0 used = 10324369841, tcp_flags = 
0x0000 (struct udpif_key *) 0x7f871a061ec0: key_len = 160, mask_len = 172 ufid = 47dea4f7-9248-e065-2764-438e3d727cd0 hash = 0x55b18227, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 0, n_bytes = 0 used = 10324369527, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871ab964b0: key_len = 168, mask_len = 172 ufid = 286cf642-9ca0-8b63-f6ad-443081df74e2 hash = 0x8d81d627, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 187, n_bytes = 41849 used = 10324366880, tcp_flags = 0x0000 (struct udpif_key *) 0x7f86f93190e0: key_len = 168, mask_len = 172 ufid = 9de8567b-b3f2-f4e9-cc00-4279bc9be36a hash = 0xfb47e827, pmd_id = 3 state = UKEY_VISIBLE state_where = 0x55c52eff5e18 "ofproto/ofproto-dpif-upcall.c:2089" n_packets = 0, n_bytes = 0 used = 0, tcp_flags = 0x0000 (struct udpif_key *) 0x7f86f8feceb0: key_len = 168, mask_len = 172 ufid = 3f87d43a-8835-1833-1d24-4dbe0213ace2 hash = 0x 7274627, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 0, n_bytes = 0 used = 10324370446, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871a74abc0: key_len = 148, mask_len = 152 ufid = d33fdfff-bed2-5328-2f23-4d76ec8b406e hash = 0xad0aba27, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 1149, n_bytes = 73127 used = 10324370702, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871966d0d0: key_len = 168, mask_len = 172 ufid = b410e4d7-86cb-f77e-e4ff-4f7e42c2c28a hash = 0xc5ac4227, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 0, n_bytes = 0 used = 10324369984, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871b49ddb0: key_len = 168, mask_len = 172 ufid = c9fd9fa3-b621-d37b-507c-4e22b318ce81 hash = 0xa0cdd627, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 
"ofproto/ofproto-dpif-upcall.c:2957" n_packets = 9, n_bytes = 2957 used = 10324370136, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871a9a7940: key_len = 160, mask_len = 172 ufid = 4c0b3bce-8bde-4621-6393-4935fe209d85 hash = 0x d0d6e27, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 3, n_bytes = 206 used = 10324370325, tcp_flags = 0x0000 (struct udpif_key *) 0x7f8719fae470: key_len = 148, mask_len = 152 ufid = a6a3c8f2-8de5-14eb-110b-4ed2bf989a23 hash = 0xeda41827, pmd_id = 3 state = UKEY_OPERATIONAL state_where = 0x55c52eff6358 "ofproto/ofproto-dpif-upcall.c:2957" n_packets = 43, n_bytes = 31311 used = 10324368839, tcp_flags = 0x0018 (struct udpif_key *) 0x7f86f8d1adb0: key_len = 148, mask_len = 152 ufid = 80b9fe38-8aba-d4b3-c119-410418fe1092 hash = 0x137d4027, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 2, n_bytes = 124 used = 10324367255, tcp_flags = 0x0000 (struct udpif_key *) 0x7f871ba370d0: key_len = 168, mask_len = 172 ufid = 7e5c26fd-af10-ba15-653c-454a828c068d hash = 0x9306ba27, pmd_id = 3 state = UKEY_EVICTED state_where = 0x55c52eff5b48 "ofproto/ofproto-dpif-upcall.c:2608" n_packets = 5, n_bytes = 820 used = 10324368550, tcp_flags = 0x0000 The length is 24. The umap details: (gdb) p *(struct umap *) 0x55c53301f998 $12 = { mutex = { lock = { __data = { __lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 2, __spins = 0, __elision = 0, __list = { __prev = 0x0, __next = 0x0 } }, __size = '\000' <repeats 16 times>, "\002", '\000' <repeats 22 times>, __align = 0 }, where = 0x55c52efef4be "<unlocked>" }, cmap = { impl = { p = 0x7f86f826c8c0 } } } As we can see the umap 0x55c53301f998 does not have a ukey 0x7f8718dcc050 (but bt full output has ukey = 0x7f8718dcc050). And this ukey = 0x7f8718dcc050 indeed has a mutex with an uninitialized 'where' pointer. Maybe this pointer is just invalid. 
(gdb) p *(struct udpif_key *)0x7f8718dcc050 $11 = { ... mutex = { lock = { __data = { __lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = -1, __spins = 0, __elision = 0, __list = { __prev = 0x0, __next = 0x0 } }, __size = '\000' <repeats 16 times>, "\377\377\377\377", '\000' <repeats 19 times>, __align = 0 }, where = 0x0 }, ... } This looks like an out-of-bounds access while traversing the linked list of ukeys. I would greatly appreciate your help, as this issue is critical for running OVS in our production environment. I can provide further debug output at any time. I look forward to your response. Thank you very much in advance. Best regards, LIU Yulong _______________________________________________ discuss mailing list disc...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-discuss