Re: [PATCH bpf-next] bpf: add support to read cpu_entry in bpf program
On 4/27/24 8:18 AM, Florian Lehner wrote: Add new field "cpu_entry" to bpf_perf_event_data which could be read by bpf programs attached to perf events. The value contains the CPU value recorded by specifying sample_type with PERF_SAMPLE_CPU when calling perf_event_open(). You can use bpf_cast_to_kern_ctx kfunc which can cast 'struct bpf_perf_event_data' ctx to 'struct bpf_perf_event_data_kern'. struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; You can access bpf_perf_event_data_kern->data and then to access 'cpu_entry' field. Signed-off-by: Florian Lehner --- include/uapi/linux/bpf_perf_event.h | 4 kernel/trace/bpf_trace.c | 13 + tools/include/uapi/linux/bpf_perf_event.h | 4 3 files changed, 21 insertions(+) diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h index eb1b9d21250c..4856b4396ece 100644 --- a/include/uapi/linux/bpf_perf_event.h +++ b/include/uapi/linux/bpf_perf_event.h @@ -14,6 +14,10 @@ struct bpf_perf_event_data { bpf_user_pt_regs_t regs; __u64 sample_period; __u64 addr; + struct { + u32 cpu; + u32 reserved; + } cpu_entry; }; #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index afb232b1d7c2..2b303221af5c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2176,6 +2176,11 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; + case bpf_ctx_range(struct bpf_perf_event_data, cpu_entry): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; default: if (size != sizeof(long)) return false; @@ -2208,6 +2213,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct perf_sample_data, addr, 8, target_size)); break; + case offsetof(struct bpf_perf_event_data, cpu_entry): 
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, cpu_entry, 8, +target_size)); + break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, diff --git a/tools/include/uapi/linux/bpf_perf_event.h b/tools/include/uapi/linux/bpf_perf_event.h index eb1b9d21250c..4856b4396ece 100644 --- a/tools/include/uapi/linux/bpf_perf_event.h +++ b/tools/include/uapi/linux/bpf_perf_event.h @@ -14,6 +14,10 @@ struct bpf_perf_event_data { bpf_user_pt_regs_t regs; __u64 sample_period; __u64 addr; + struct { + u32 cpu; + u32 reserved; + } cpu_entry; }; #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */
Re: [syzbot] [bpf?] [trace?] WARNING in group_send_sig_info
On 4/27/24 9:34 AM, syzbot wrote: Hello, syzbot found the following issue on: HEAD commit:443574b03387 riscv, bpf: Fix kfunc parameters incompatibil.. git tree: bpf console output: https://syzkaller.appspot.com/x/log.txt?x=11ca8fe718 kernel config: https://syzkaller.appspot.com/x/.config?x=6fb1be60a193d440 dashboard link: https://syzkaller.appspot.com/bug?extid=1902c6d326478ce2dfb0 compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40 Unfortunately, I don't have any reproducer for this issue yet. Downloadable assets: disk image: https://storage.googleapis.com/syzbot-assets/3f355021a085/disk-443574b0.raw.xz vmlinux: https://storage.googleapis.com/syzbot-assets/44cf4de7472a/vmlinux-443574b0.xz kernel image: https://storage.googleapis.com/syzbot-assets/a99a36c7ad65/bzImage-443574b0.xz IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+1902c6d326478ce2d...@syzkaller.appspotmail.com [ cut here ] raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 7785 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x29/0x40 kernel/locking/irqflag-debug.c:10 Modules linked in: CPU: 1 PID: 7785 Comm: syz-executor.3 Not tainted 6.8.0-syzkaller-05236-g443574b03387 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 RIP: 0010:warn_bogus_irq_restore+0x29/0x40 kernel/locking/irqflag-debug.c:10 Code: 90 f3 0f 1e fa 90 80 3d de 59 01 04 00 74 06 90 c3 cc cc cc cc c6 05 cf 59 01 04 01 90 48 c7 c7 20 ba aa 8b e8 f8 d5 e7 f5 90 <0f> 0b 90 90 90 c3 cc cc cc cc 66 2e 0f 1f 84 00 00 00 00 00 0f 1f RSP: 0018:c9000399fbb8 EFLAGS: 00010246 RAX: 4aede97b00455d00 RBX: 192000733f7c RCX: 88802a129e00 RDX: RSI: RDI: RBP: c9000399fc50 R08: 8157cc12 R09: 1110172a51a2 R10: dc00 R11: ed10172a51a3 R12: dc00 R13: 192000733f78 R14: c9000399fbe0 R15: 0246 FS: 7ae76480() GS:8880b950() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7ffc27e190f8 CR3: 6cb5 CR4: 003506f0 DR0: DR1: DR2: 
DR3: DR6: fffe0ff0 DR7: 0400 Call Trace: __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:151 [inline] _raw_spin_unlock_irqrestore+0x120/0x140 kernel/locking/spinlock.c:194 spin_unlock_irqrestore include/linux/spinlock.h:406 [inline] unlock_task_sighand include/linux/sched/signal.h:754 [inline] do_send_sig_info kernel/signal.c:1302 [inline] group_send_sig_info+0x2e0/0x310 kernel/signal.c:1453 bpf_send_signal_common+0x2dd/0x430 kernel/trace/bpf_trace.c:881 bpf_send_signal kernel/trace/bpf_trace.c:886 [inline] bpf_send_signal+0x19/0x30 kernel/trace/bpf_trace.c:884 bpf_prog_8cc4ff36b5985b6a+0x1d/0x1f bpf_dispatcher_nop_func include/linux/bpf.h:1234 [inline] __bpf_prog_run include/linux/filter.h:650 [inline] bpf_prog_run include/linux/filter.h:664 [inline] __bpf_trace_run kernel/trace/bpf_trace.c:2381 [inline] bpf_trace_run2+0x375/0x420 kernel/trace/bpf_trace.c:2420 trace_sys_exit include/trace/events/syscalls.h:44 [inline] syscall_exit_work+0x153/0x170 kernel/entry/common.c:163 syscall_exit_to_user_mode_prepare kernel/entry/common.c:194 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:199 [inline] syscall_exit_to_user_mode+0x273/0x360 kernel/entry/common.c:212 do_syscall_64+0x10a/0x240 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x6d/0x75 The following are related functions. struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; rcu_read_lock(); for (;;) { sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) break; /* * This sighand can be already freed and even reused, but * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * * We need to ensure that tsk->sighand is still the same * after we take the lock, we can race with de_thread() or * __exit_signal(). 
In the latter case the next iteration * must see ->sighand == NULL. */ spin_lock_irqsave(&sighand->siglock, *flags); if (likely(sighand == rcu_access_pointer(tsk->sighand))) break; spin_unlock_irqrestore(&sighand->siglock, *flags); } rcu_read_unlock();
Re: BUG: unable to handle kernel paging request in bpf_probe_read_compat_str
On 12/20/23 1:19 AM, Hou Tao wrote: Hi, On 12/14/2023 11:40 AM, xingwei lee wrote: Hello I found a bug in net/bpf in the latest upstream linux and confirmed in the latest net tree and latest net bpf titled BUG: unable to handle kernel paging request in bpf_probe_read_compat_str If you fix this issue, please add the following tag to the commit: Reported-by: xingwei Lee kernel: net 9702817384aa4a3700643d0b26e71deac0172cfd / bpf 2f2fee2bf74a7e31d06fc6cb7ba2bd4dd7753c99 Kernel config: https://syzkaller.appspot.com/text?tag=KernelConfig=b50bd31249191be8 in the latest bpf tree, the crash like: TITLE: BUG: unable to handle kernel paging request in bpf_probe_read_compat_str CORRUPTED: false () MAINTAINERS (TO): [a...@linux-foundation.org linux...@kvack.org] MAINTAINERS (CC): [linux-kernel@vger.kernel.org] BUG: unable to handle page fault for address: ff0 Thanks for the report and reproducer. The output is incomplete. It should be: "BUG: unable to handle page fault for address: ff60". The address is a vsyscall address, so handle_page_fault() considers that the fault address is in userspace instead of kernel space, and there will be no fix-up for the exception and oops happened. Will post a fix and a selftest for it. There is a proposed fix here: https://lore.kernel.org/bpf/87r0jwquhv.ffs@tglx/ Not sure the fix in the above link is merged to some upstream branch or not. 
#PF: supervisor read access in kernel mode #PF: error_code(0x) - not-present page PGD cf7a067 P4D cf7a067 PUD cf7c067 PMD cf9f067 0 Oops: [#1] PREEMPT SMP KASAN CPU: 1 PID: 8219 Comm: 9de Not tainted 6.7.0-rc41 Hardware name: QEMU Standard PC (i440FX + PIIX, 4 RIP: 0010:strncpy_from_kernel_nofault+0xc4/0x270 mm/maccess.c:91 Code: 83 85 6c 17 00 00 01 48 8b 2c 24 eb 18 e8 0 RSP: 0018:c900114e7ac0 EFLAGS: 00010293 RAX: RBX: c900114e7b30 RCX:2 RDX: 8880183abcc0 RSI: 81b8c9c4 RDI:c RBP: ff60 R08: 0001 R09:0 R10: 0001 R11: 0001 R12:8 R13: ff60 R14: 0008 R15:0 FS: () GS:88823bc0(0 CS: 0010 DS: ES: CR0: 80050033 CR2: ff60 CR3: 0cf77000 CR4:0 PKRU: 5554 Call Trace: bpf_probe_read_kernel_str_common kernel/trace/bpf_trace.c:262 [inline] bpf_probe_read_compat_str kernel/trace/bpf_trace.c:310 [inline] bpf_probe_read_compat_str+0x12f/0x170 kernel/trace/bpf_trace.c:303 bpf_prog_f17ebaf3f5f7baf8+0x42/0x44 bpf_dispatcher_nop_func include/linux/bpf.h:1196 [inline] __bpf_prog_run include/linux/filter.h:651 [inline] bpf_prog_run include/linux/filter.h:658 [inline] __bpf_trace_run kernel/trace/bpf_trace.c:2307 [inline] bpf_trace_run2+0x14e/0x410 kernel/trace/bpf_trace.c:2346 trace_kfree include/trace/events/kmem.h:94 [inline] kfree+0xec/0x150 mm/slab_common.c:1043 vma_numab_state_free include/linux/mm.h:638 [inline] __vm_area_free+0x3e/0x140 kernel/fork.c:525 remove_vma+0x128/0x170 mm/mmap.c:146 exit_mmap+0x453/0xa70 mm/mmap.c:3332 __mmput+0x12a/0x4d0 kernel/fork.c:1349 mmput+0x62/0x70 kernel/fork.c:1371 exit_mm kernel/exit.c:567 [inline] do_exit+0x9aa/0x2ac0 kernel/exit.c:858 do_group_exit+0xd4/0x2a0 kernel/exit.c:1021 __do_sys_exit_group kernel/exit.c:1032 [inline] __se_sys_exit_group kernel/exit.c:1030 [inline] __x64_sys_exit_group+0x3e/0x50 kernel/exit.c:1030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x41/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b =* repro.c =* // autogenerated by syzkaller (https://github.com/google/syzkaller) 
#define _GNU_SOURCE #include #include #include #include #include #include #include #include #ifndef __NR_bpf #define __NR_bpf 321 #endif #define BITMASK(bf_off, bf_len) (((1ull << (bf_len)) - 1) << (bf_off)) #define STORE_BY_BITMASK(type, htobe, addr, val, bf_off, bf_len) \ *(type*)(addr) = \ htobe((htobe(*(type*)(addr)) & ~BITMASK((bf_off), (bf_len))) | \ (((type)(val) << (bf_off)) & BITMASK((bf_off), (bf_len uint64_t r[1] = {0x}; int main(void) { syscall(__NR_mmap, /*addr=*/0x1000ul, /*len=*/0x1000ul, /*prot=*/0ul, /*flags=*/0x32ul, /*fd=*/-1, /*offset=*/0ul); syscall(__NR_mmap, /*addr=*/0x2000ul, /*len=*/0x100ul, /*prot=*/7ul, /*flags=*/0x32ul, /*fd=*/-1, /*offset=*/0ul); syscall(__NR_mmap, /*addr=*/0x2100ul, /*len=*/0x1000ul, /*prot=*/0ul, /*flags=*/0x32ul, /*fd=*/-1, /*offset=*/0ul); intptr_t res = 0; *(uint32_t*)0x20c0 = 0x11; *(uint32_t*)0x20c4 = 0xb; *(uint64_t*)0x20c8 = 0x2180; *(uint8_t*)0x2180 = 0x18; STORE_BY_BITMASK(uint8_t, , 0x2181, 0, 0, 4); STORE_BY_BITMASK(uint8_t, , 0x2181, 0, 4, 4); *(uint16_t*)0x2182 = 0; *(uint32_t*)0x2184 = 0; *(uint8_t*)0x2188 = 0; *(uint8_t*)0x2189 = 0; *(uint16_t*)0x218a = 0;
Re: [PATCH net] bpf: test_run: fix WARNING in format_decode
On 11/21/23 7:50 PM, Edward Adam Davis wrote: Confirm that skb->len is not 0 to ensure that skb length is valid. Fixes: 114039b34201 ("bpf: Move skb->len == 0 checks into __bpf_redirect") Reported-by: syzbot+e2c932aec5c8a6e1d...@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Stan, Could you take a look at this patch? --- net/bpf/test_run.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c9fdcc5cdce1..78258a822a5c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -845,6 +845,9 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + if (!skb->len) + return -EINVAL; + if (!__skb) return 0;
Re: [PATCH] bpf: Fix backport of "bpf: restrict unknown scalars of mixed signed bounds for unprivileged"
On 4/19/21 4:56 PM, Samuel Mendoza-Jonas wrote: The 4.14 backport of 9d7eceede ("bpf: restrict unknown scalars of mixed signed bounds for unprivileged") adds the PTR_TO_MAP_VALUE check to the wrong location in adjust_ptr_min_max_vals(), most likely because 4.14 doesn't include the commit that updates the if-statement to a switch-statement (aad2eeaf4 "bpf: Simplify ptr_min_max_vals adjustment"). Move the check to the proper location in adjust_ptr_min_max_vals(). Fixes: 17efa65350c5a ("bpf: restrict unknown scalars of mixed signed bounds for unprivileged") Signed-off-by: Samuel Mendoza-Jonas Reviewed-by: Frank van der Linden Reviewed-by: Ethan Chen Just to be clear, the patch is for 4.14 stable branch. Acked-by: Yonghong Song
Re: [PATCH v2] tools: do not include scripts/Kbuild.include
On 4/16/21 6:00 AM, Masahiro Yamada wrote: Since commit d9f4ff50d2aa ("kbuild: spilt cc-option and friends to scripts/Makefile.compiler"), some kselftests fail to build. The tools/ directory opted out Kbuild, and went in a different direction. They copy any kind of files to the tools/ directory in order to do whatever they want in their world. tools/build/Build.include mimics scripts/Kbuild.include, but some tool Makefiles included the Kbuild one to import a feature that is missing in tools/build/Build.include: - Commit ec04aa3ae87b ("tools/thermal: tmon: use "-fstack-protector" only if supported") included scripts/Kbuild.include from tools/thermal/tmon/Makefile to import the cc-option macro. - Commit c2390f16fc5b ("selftests: kvm: fix for compilers that do not support -no-pie") included scripts/Kbuild.include from tools/testing/selftests/kvm/Makefile to import the try-run macro. - Commit 9cae4ace80ef ("selftests/bpf: do not ignore clang failures") included scripts/Kbuild.include from tools/testing/selftests/bpf/Makefile to import the .DELETE_ON_ERROR target. - Commit 0695f8bca93e ("selftests/powerpc: Handle Makefile for unrecognized option") included scripts/Kbuild.include from tools/testing/selftests/powerpc/pmu/ebb/Makefile to import the try-run macro. Copy what they need into tools/build/Build.include, and make them include it instead of scripts/Kbuild.include. Link: https://lore.kernel.org/lkml/86dadf33-70f7-a5ac-cb8c-64966d2f4...@linux.ibm.com/ Fixes: d9f4ff50d2aa ("kbuild: spilt cc-option and friends to scripts/Makefile.compiler") Reported-by: Janosch Frank Reported-by: Christian Borntraeger Signed-off-by: Masahiro Yamada LGTM although I see some tools Makefile directly added ".DELETE_ON_ERROR:" in their Makefile. Acked-by: Yonghong Song
Re: 5.?? regression: strace testsuite OOpses kernel on ia64
On 4/9/21 2:20 PM, Sergei Trofimovich wrote: On Tue, 23 Feb 2021 18:53:21 + Sergei Trofimovich wrote: The crash seems to be related to sock_filter-v test from strace: https://github.com/strace/strace/blob/master/tests/seccomp-filter-v.c Here is an OOps: [ 818.089904] BUG: Bad page map in process sock_filter-v pte:0001 pmd:118580001 [ 818.089904] page:e6a429c8 refcount:1 mapcount:-1 mapping: index:0x0 pfn:0x0 [ 818.089904] flags: 0x1000(reserved) [ 818.089904] raw: 1000 a0004008 a0004008 [ 818.089904] raw: 0001fffe [ 818.089904] page dumped because: bad pte [ 818.089904] addr: vm_flags:04044011 anon_vma: mapping: index:0 [ 818.095483] file:(null) fault:0x0 mmap:0x0 readpage:0x0 [ 818.095483] CPU: 0 PID: 5990 Comm: sock_filter-v Not tainted 5.11.0-3-gbfa5a4929c90 #57 [ 818.095483] Hardware name: hp server rx3600 , BIOS 04.03 04/08/2008 [ 818.095483] [ 818.095483] Call Trace: [ 818.095483] [] show_stack+0x90/0xc0 [ 818.095483] sp=e00118707bb0 bsp=e001187013c0 [ 818.095483] [] dump_stack+0x120/0x160 [ 818.095483] sp=e00118707d80 bsp=e00118701348 [ 818.095483] [] print_bad_pte+0x300/0x3a0 [ 818.095483] sp=e00118707d80 bsp=e001187012e0 [ 818.099483] [] unmap_page_range+0xa90/0x11a0 [ 818.099483] sp=e00118707d80 bsp=e00118701140 [ 818.099483] [] unmap_vmas+0xc0/0x100 [ 818.099483] sp=e00118707da0 bsp=e00118701108 [ 818.099483] [] exit_mmap+0x150/0x320 [ 818.099483] sp=e00118707da0 bsp=e001187010d8 [ 818.099483] [] mmput+0x60/0x200 [ 818.099483] sp=e00118707e20 bsp=e001187010b0 [ 818.103482] [] do_exit+0x6f0/0x18a0 [ 818.103482] sp=e00118707e20 bsp=e00118701038 [ 818.103482] [] do_group_exit+0x90/0x2a0 [ 818.103482] sp=e00118707e30 bsp=e00118700ff0 [ 818.103482] [] sys_exit_group+0x20/0x40 [ 818.103482] sp=e00118707e30 bsp=e00118700f98 [ 818.107482] [] ia64_trace_syscall+0xf0/0x130 [ 818.107482] sp=e00118707e30 bsp=e00118700f98 [ 818.107482] [] ia64_ivt+0x00040720/0x400 [ 818.107482] sp=e00118708000 bsp=e00118700f98 [ 818.115482] Disabling lock debugging due to kernel 
taint [ 818.115482] BUG: Bad rss-counter state mm:2eec6412 type:MM_FILEPAGES val:-1 [ 818.132256] Unable to handle kernel NULL pointer dereference (address 0068) [ 818.133904] sock_filter-v-X[5999]: Oops 11012296146944 [1] [ 818.133904] Modules linked in: acpi_ipmi ipmi_si usb_storage e1000 ipmi_devintf ipmi_msghandler rtc_efi [ 818.133904] [ 818.133904] CPU: 0 PID: 5999 Comm: sock_filter-v-X Tainted: GB 5.11.0-3-gbfa5a4929c90 #57 [ 818.133904] Hardware name: hp server rx3600 , BIOS 04.03 04/08/2008 [ 818.133904] psr : 121008026010 ifs : 8288 ip : []Tainted: GB (5.11.0-3-gbfa5a4929c90) [ 818.133904] ip is at bpf_prog_free+0x21/0xe0 [ 818.133904] unat: pfs : 0307 rsc : 0003 [ 818.133904] rnat: bsps: pr : 00106a5a51665965 [ 818.133904] ldrs: ccv : 12088904 fpsr: 0009804c8a70033f [ 818.133904] csd : ssd : [ 818.133904] b0 : a00100d54080 b6 : a00100d53fe0 b7 : a001cef0 [ 818.133904] f6 : 0ffefb0c50daa1b67f89a f7 : 0ffed8b3e4fdb0800 [ 818.133904] f8 : 10017fbd1bc00 f9 : 1000eb95f [ 818.133904] f10 : 10008ade20716a6c83cc1 f11 : 1003e02b7 [ 818.133904] r1 : a0010176b300 r2 : a0028004 r3 : [ 818.133904] r8 : 0008 r9 : e0011873f800 r10 : e00102c18600 [ 818.133904] r11 : e00102c19600 r12 : e0011873f7f0 r13 : e00118738000 [ 818.133904] r14 : 0068 r15 : a0028028 r16 : e5606a70 [ 818.133904] r17 : e00102c18600 r18 : e00104370748 r19 : e00102c18600 [ 818.133904] r20 : e00102c18600 r21 : e5606a78 r22 : a0010156bd28 [ 818.133904] r23 : a0010147fdf4 r24 : 4000 r25 : e00104370750 [ 818.133904] r26 : a001012f7088 r27 : a00100d53fe0 r28 : 0001 [ 818.133904] r29 : e0011873f800 r30 : e0011873f810 r31 :
Re: [syzbot] WARNING in bpf_test_run
On 4/1/21 3:05 PM, Yonghong Song wrote: On 4/1/21 4:29 AM, syzbot wrote: Hello, syzbot found the following issue on: HEAD commit: 36e79851 libbpf: Preserve empty DATASEC BTFs during static.. git tree: bpf-next console output: https://syzkaller.appspot.com/x/log.txt?x=1569bb06d0 kernel config: https://syzkaller.appspot.com/x/.config?x=7eff0f22b8563a5f dashboard link: https://syzkaller.appspot.com/bug?extid=774c590240616eaa3423 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=17556b7cd0 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1772be26d0 The issue was bisected to: commit 997acaf6b4b59c6a9c259740312a69ea549cc684 Author: Mark Rutland Date: Mon Jan 11 15:37:07 2021 + lockdep: report broken irq restoration bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=10197016d0 final oops: https://syzkaller.appspot.com/x/report.txt?x=12197016d0 console output: https://syzkaller.appspot.com/x/log.txt?x=14197016d0 IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+774c590240616eaa3...@syzkaller.appspotmail.com Fixes: 997acaf6b4b5 ("lockdep: report broken irq restoration") [ cut here ] WARNING: CPU: 0 PID: 8725 at include/linux/bpf-cgroup.h:193 bpf_cgroup_storage_set include/linux/bpf-cgroup.h:193 [inline] WARNING: CPU: 0 PID: 8725 at include/linux/bpf-cgroup.h:193 bpf_test_run+0x65e/0xaa0 net/bpf/test_run.c:109 I will look at this issue. Thanks! 
Modules linked in: CPU: 0 PID: 8725 Comm: syz-executor927 Not tainted 5.12.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:bpf_cgroup_storage_set include/linux/bpf-cgroup.h:193 [inline] RIP: 0010:bpf_test_run+0x65e/0xaa0 net/bpf/test_run.c:109 Code: e9 29 fe ff ff e8 b2 9d 3a fa 41 83 c6 01 bf 08 00 00 00 44 89 f6 e8 51 a5 3a fa 41 83 fe 08 0f 85 74 fc ff ff e8 92 9d 3a fa <0f> 0b bd f0 ff e9 5c fd ff ff e8 81 9d 3a fa 83 c5 01 bf 08 RSP: 0018:c900017bfaf0 EFLAGS: 00010293 RAX: RBX: c9f29000 RCX: RDX: 88801bc68000 RSI: 8739543e RDI: 0003 RBP: 0007 R08: 0008 R09: 0001 R10: 8739542f R11: R12: dc00 R13: 888021dd54c0 R14: 0008 R15: FS: 7f00157d7700() GS:8880b9c0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7f0015795718 CR3: 157ae000 CR4: 001506f0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Call Trace: bpf_prog_test_run_skb+0xabc/0x1c70 net/bpf/test_run.c:628 bpf_prog_test_run kernel/bpf/syscall.c:3132 [inline] __do_sys_bpf+0x218b/0x4f40 kernel/bpf/syscall.c:4411 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 Run on my qemu (4 cpus) with C reproducer and I cannot reproduce the result. It already ran 30 minutes and still running. Checked the code, it is just doing a lot of parallel bpf_prog_test_run's. 
The failure is in the below WARN_ON_ONCE code: 175 static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage 176 *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) 177 { 178 enum bpf_cgroup_storage_type stype; 179 int i, err = 0; 180 181 preempt_disable(); 182 for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { 183 if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) 184 continue; 185 186 this_cpu_write(bpf_cgroup_storage_info[i].task, current); 187 for_each_cgroup_storage_type(stype) 188 this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], 189storage[stype]); 190 goto out; 191 } 192 err = -EBUSY; 193 WARN_ON_ONCE(1); 194 195 out: 196 preempt_enable(); 197 return err; 198 } Basically it shows the stress test triggered a warning due to limited kernel resource. entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x446199 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:7f00157d72f8 EFLAGS: 0246 ORIG_RAX: 0141 RAX: ffda RBX: 004cb440 RCX: 00446199 RDX: 0028 RSI: 2080 RDI: 000a RBP: 0049b074 R08: R09: R10: R11: 0246 R12: f9abde7200f522cd R13: 3952ddf3af240c07 R14: 1631e0d82d3fa99d R15: 004cb448 --- This report is generated by a bot. It may contain errors. See https://goo.gl/tpsmEJ f
Re: [syzbot] WARNING in bpf_test_run
On 4/1/21 4:29 AM, syzbot wrote: Hello, syzbot found the following issue on: HEAD commit:36e79851 libbpf: Preserve empty DATASEC BTFs during static.. git tree: bpf-next console output: https://syzkaller.appspot.com/x/log.txt?x=1569bb06d0 kernel config: https://syzkaller.appspot.com/x/.config?x=7eff0f22b8563a5f dashboard link: https://syzkaller.appspot.com/bug?extid=774c590240616eaa3423 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=17556b7cd0 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1772be26d0 The issue was bisected to: commit 997acaf6b4b59c6a9c259740312a69ea549cc684 Author: Mark Rutland Date: Mon Jan 11 15:37:07 2021 + lockdep: report broken irq restoration bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=10197016d0 final oops: https://syzkaller.appspot.com/x/report.txt?x=12197016d0 console output: https://syzkaller.appspot.com/x/log.txt?x=14197016d0 IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+774c590240616eaa3...@syzkaller.appspotmail.com Fixes: 997acaf6b4b5 ("lockdep: report broken irq restoration") [ cut here ] WARNING: CPU: 0 PID: 8725 at include/linux/bpf-cgroup.h:193 bpf_cgroup_storage_set include/linux/bpf-cgroup.h:193 [inline] WARNING: CPU: 0 PID: 8725 at include/linux/bpf-cgroup.h:193 bpf_test_run+0x65e/0xaa0 net/bpf/test_run.c:109 I will look at this issue. Thanks! 
Modules linked in: CPU: 0 PID: 8725 Comm: syz-executor927 Not tainted 5.12.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:bpf_cgroup_storage_set include/linux/bpf-cgroup.h:193 [inline] RIP: 0010:bpf_test_run+0x65e/0xaa0 net/bpf/test_run.c:109 Code: e9 29 fe ff ff e8 b2 9d 3a fa 41 83 c6 01 bf 08 00 00 00 44 89 f6 e8 51 a5 3a fa 41 83 fe 08 0f 85 74 fc ff ff e8 92 9d 3a fa <0f> 0b bd f0 ff ff ff e9 5c fd ff ff e8 81 9d 3a fa 83 c5 01 bf 08 RSP: 0018:c900017bfaf0 EFLAGS: 00010293 RAX: RBX: c9f29000 RCX: RDX: 88801bc68000 RSI: 8739543e RDI: 0003 RBP: 0007 R08: 0008 R09: 0001 R10: 8739542f R11: R12: dc00 R13: 888021dd54c0 R14: 0008 R15: FS: 7f00157d7700() GS:8880b9c0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7f0015795718 CR3: 157ae000 CR4: 001506f0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Call Trace: bpf_prog_test_run_skb+0xabc/0x1c70 net/bpf/test_run.c:628 bpf_prog_test_run kernel/bpf/syscall.c:3132 [inline] __do_sys_bpf+0x218b/0x4f40 kernel/bpf/syscall.c:4411 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x446199 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:7f00157d72f8 EFLAGS: 0246 ORIG_RAX: 0141 RAX: ffda RBX: 004cb440 RCX: 00446199 RDX: 0028 RSI: 2080 RDI: 000a RBP: 0049b074 R08: R09: R10: R11: 0246 R12: f9abde7200f522cd R13: 3952ddf3af240c07 R14: 1631e0d82d3fa99d R15: 004cb448 --- This report is generated by a bot. It may contain errors. See https://goo.gl/tpsmEJ for more information about syzbot. syzbot engineers can be reached at syzkal...@googlegroups.com. syzbot will keep track of this issue. See: https://goo.gl/tpsmEJ#status for how to communicate with syzbot. 
For information about bisection process see: https://goo.gl/tpsmEJ#bisection syzbot can test patches for this issue, for details see: https://goo.gl/tpsmEJ#testing-patches
Re: linux-next: manual merge of the net-next tree with the net tree
On 3/19/21 12:21 AM, Daniel Borkmann wrote: On 3/19/21 3:11 AM, Piotr Krysiuk wrote: Hi Daniel, On Fri, Mar 19, 2021 at 12:16 AM Stephen Rothwell wrote: diff --cc kernel/bpf/verifier.c index 44e4ec1640f1,f9096b049cd6.. --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@@ -5876,10 -6056,22 +6060,23 @@@ static int retrieve_ptr_limit(const str if (mask_to_left) *ptr_limit = MAX_BPF_STACK + off; else - *ptr_limit = -off; - return 0; + *ptr_limit = -off - 1; + return *ptr_limit >= max ? -ERANGE : 0; + case PTR_TO_MAP_KEY: + /* Currently, this code is not exercised as the only use + * is bpf_for_each_map_elem() helper which requires + * bpf_capble. The code has been tested manually for + * future use. + */ + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->key_size - off; + } + return 0; PTR_TO_MAP_VALUE logic above looks like copy-paste of old PTR_TO_MAP_VALUE code from before "bpf: Fix off-by-one for area size in creating mask to left" and is apparently affected by the same off-by-one, except this time on "key_size" area and not "value_size". This needs to be fixed in the same way as we did with PTR_TO_MAP_VALUE. What is the best way to proceed? Hm, not sure why PTR_TO_MAP_KEY was added by 69c087ba6225 in the first place, I presume noone expects this to be used from unprivileged as the comment says. Resolution should be to remove the PTR_TO_MAP_KEY case entirely from that switch until we have an actual user. Alexei suggested so that we don't forget it in the future if bpf_capable() requirement is removed. https://lore.kernel.org/bpf/c837ae55-2487-2f39-47f6-a18781dc6...@fb.com/ I am okay with either way, fix it or remove it. Thanks, Daniel
Re: CLANG LTO compatibility issue with DEBUG_INFO_BTF
On 3/18/21 8:45 PM, Jisheng Zhang wrote: Hi, When trying the latest 5.12-rc3 with both LTO_CLANG_THIN and DEBUG_INFO_BTF enabled, I met lots of warnings such as: ... tag__recode_dwarf_type: couldn't find 0x4a7ade5 type for 0x4ab9f88 (subroutine_type)! ftype__recode_dwarf_types: couldn't find 0x4a7ade5 type for 0x4ab9fa4 (formal_parameter)! ... namespace__recode_dwarf_types: couldn't find 0x4a8ff4a type for 0x4aba05c (member)! namespace__recode_dwarf_types: couldn't find 0x4a7ae9b type for 0x4aba084 (member)! ... WARN: multiple IDs found for 'path': 281, 729994 - using 281 WARN: multiple IDs found for 'task_struct': 421, 730101 - using 421 ... then finally get build error: FAILED unresolved symbol vfs_truncate Is this a known issue? Do we need to make DEBUG_INFO_BTF depend on !LTO? This is a known issue for pahole. pahole does not handle dwarf well generated with LTO. Bill Wendling from google is looking at the issue and I will help look at the issue as well. Since bpf heavily depends on BTF, at this point, I suggest if you are using bpf, please do not turn on LTO. Or if you build with LTO, just turn off DEBUG_INFO_BTF in your config. Thanks! pahole version: v1.20 clang version: 11.0 Thanks
Re: [PATCH] bpf: selftests: remove unused 'nospace_err' in tests for batched ops in array maps
On 3/15/21 6:29 AM, Pedro Tammela wrote: This seems to be a remnant from the hashmap tests. Signed-off-by: Pedro Tammela Acked-by: Yonghong Song
Re: [BUG] One-liner array initialization with two pointers in BPF results in NULLs
On 3/10/21 3:48 AM, Florent Revest wrote: On Wed, Mar 10, 2021 at 6:16 AM Yonghong Song wrote: On 3/9/21 7:43 PM, Yonghong Song wrote: On 3/9/21 5:54 PM, Florent Revest wrote: I noticed that initializing an array of pointers using this syntax: __u64 array[] = { (__u64), (__u64) }; (which is a fairly common operation with macros such as BPF_SEQ_PRINTF) always results in array[0] and array[1] being NULL. Interestingly, if the array is only initialized with one pointer, ex: __u64 array[] = { (__u64) }; Then array[0] will not be NULL. Or if the array is initialized field by field, ex: __u64 array[2]; array[0] = (__u64) array[1] = (__u64) Then array[0] and array[1] will not be NULL either. I'm assuming that this should have something to do with relocations and might be a bug in clang or in libbpf but because I don't know much about these, I thought that reporting could be a good first step. :) Thanks for reporting. What you guess is correct, this is due to relocations :-( The compiler notoriously tend to put complex initial values into rodata section. For example, for __u64 array[] = { (__u64), (__u64) }; the compiler will put { (__u64), (__u64) } into rodata section. But and themselves need relocation since they are address of static variables which will sit inside .data section. So in the elf file, you will see the following relocations: RELOCATION RECORDS FOR [.rodata]: OFFSET TYPE VALUE 0018 R_BPF_64_64 .data 0020 R_BPF_64_64 .data Right :) Thank you for the explanations Yonghong! Currently, libbpf does not handle relocation inside .rodata section, so they content remains 0. Just for my own edification, why is .rodata relocation not yet handled in libbpf ? Is it because of a read-only mapping that makes it more difficult ? We don't have this use case before. In general, people do not put string pointers in init code in the declaration. I think bpf_seq_printf() is special about this and hence triggering the issue. 
To support relocation of rodata section, kernel needs to be involved and this is actually more complicated as the relocation is against .data section. Two issues the kernel needs to deal with: - .data section will be another map in kernel, so i.e., relocation of .rodata map value against another map. - .data section may be modified, some protection might be needed to prevent this. We may ignore this requirement since user space may have similar issue. This is a corner case, if we can workaround in the libbpf, in this particular case, bpf_tracing.h. I think it will be good enough, not adding further complexity in kernel for such a corner case. That is why you see the issue with pointer as NULL. With array size of 1, compiler does not bother to put it into rodata section. I *guess* that it works in the macro due to some kind of heuristics, e.g., nested blocks, etc, and llvm did not promote the array init value to rodata. I will double check whether llvm can complete prevent such transformation. Maybe in the future libbpf is able to handle relocations for rodata section too. But for the time being, please just consider to use either macro, or the explicit array assignment. Digging into the compiler, the compiler tries to make *const* initial value into rodata section if the initial value size > 64, so in this case, macro does not work either. I think this is how you discovered the issue. Indeed, I was using a macro similar to BPF_SEQ_PRINTF and this is how I found the bug. The llvm does not provide target hooks to influence this transformation. Oh, that is unfortunate :) Thanks for looking into it! I feel that the real fix would be in libbpf anyway and the rest is just workarounds. The real fix will need libbpf and kernel. So, there are two workarounds, (1).__u64 param_working[2]; param_working[0] = (__u64)str1; param_working[1] = (__u64)str2; (2). BPF_SEQ_PRINTF(seq, "%s ", str1); BPF_SEQ_PRINTF(seq, "%s", str2); (2) is a bit impractical for my actual usecase. 
I am implementing a bpf_snprintf helper (patch series Coming Soon TM) and I wanted to keep the selftest short with a few BPF_SNPRINTF() calls that exercise most format specifiers. In practice, if you have at least one non-const format argument, you should be fine. But if all format arguments are constant, then none of them should be strings. Just for context, this does not only happen for strings but also for all sorts of pointers, for example, when I try to do address lookup of global __ksym variables, which is important for my selftest. Currently, in bpf_seq_printf(), we do memory copy for string and certain ipv4/ipv6 addresses. ipv4 is not an issue as the compiler less likely put it into rodata. for ipv6, if it is a constant, we can just directly put it into the format string. For many other sort of pointers, we ju
Re: [BUG] One-liner array initialization with two pointers in BPF results in NULLs
On 3/9/21 7:43 PM, Yonghong Song wrote: On 3/9/21 5:54 PM, Florent Revest wrote: I noticed that initializing an array of pointers using this syntax: __u64 array[] = { (__u64), (__u64) }; (which is a fairly common operation with macros such as BPF_SEQ_PRINTF) always results in array[0] and array[1] being NULL. Interestingly, if the array is only initialized with one pointer, ex: __u64 array[] = { (__u64) }; Then array[0] will not be NULL. Or if the array is initialized field by field, ex: __u64 array[2]; array[0] = (__u64) array[1] = (__u64) Then array[0] and array[1] will not be NULL either. I'm assuming that this should have something to do with relocations and might be a bug in clang or in libbpf but because I don't know much about these, I thought that reporting could be a good first step. :) Thanks for reporting. What you guess is correct, this is due to relocations :-( The compiler notoriously tends to put complex initial values into the rodata section. For example, for __u64 array[] = { (__u64), (__u64) }; the compiler will put { (__u64), (__u64) } into the rodata section. But and themselves need relocation since they are addresses of static variables which will sit inside the .data section. So in the elf file, you will see the following relocations: RELOCATION RECORDS FOR [.rodata]: OFFSET TYPE VALUE 0018 R_BPF_64_64 .data 0020 R_BPF_64_64 .data Currently, libbpf does not handle relocation inside the .rodata section, so the content remains 0. That is why you see the issue with the pointer as NULL. With an array size of 1, the compiler does not bother to put it into the rodata section. I *guess* that it works in the macro due to some kind of heuristics, e.g., nested blocks, etc., and llvm did not promote the array init value to rodata. I will double check whether llvm can completely prevent such a transformation. Maybe in the future libbpf will be able to handle relocations for the rodata section too. But for the time being, please just consider using either the macro, or the explicit array assignment. 
Digging into the compiler, the compiler tries to make *const* initial value into rodata section if the initial value size > 64, so in this case, macro does not work either. I think this is how you discovered the issue. The llvm does not provide target hooks to influence this transformation. So, there are two workarounds, (1).__u64 param_working[2]; param_working[0] = (__u64)str1; param_working[1] = (__u64)str2; (2). BPF_SEQ_PRINTF(seq, "%s ", str1); BPF_SEQ_PRINTF(seq, "%s", str2); In practice, if you have at least one non-const format argument, you should be fine. But if all format arguments are constant, then none of them should be strings. Maybe we could change marco unsigned long long ___param[] = { args }; to declare an array explicitly and then have a loop to assign each array element? Thanks for the reproducer! I attached below a repro with a dummy selftest that I expect should pass but fails to pass with the latest clang and bpf-next. Hopefully, the logic should be simple: I try to print two strings from pointers in an array using bpf_seq_printf but depending on how the array is initialized the helper either receives the string pointers or NULL pointers: test_bug:FAIL:read unexpected read: actual 'str1= str2= str1=STR1 str2=STR2 ' != expected 'str1=STR1 str2=STR2 str1=STR1 str2=STR2 ' Signed-off-by: Florent Revest --- tools/testing/selftests/bpf/prog_tests/bug.c | 41 +++ tools/testing/selftests/bpf/progs/test_bug.c | 43 2 files changed, 84 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bug.c create mode 100644 tools/testing/selftests/bpf/progs/test_bug.c diff --git a/tools/testing/selftests/bpf/prog_tests/bug.c b/tools/testing/selftests/bpf/prog_tests/bug.c new file mode 100644 index ..4b0fafd936b7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bug.c @@ -0,0 +1,41 @@ +#include +#include "test_bug.skel.h" + +static int duration; + +void test_bug(void) +{ + struct test_bug *skel; + struct bpf_link *link; + char buf[64] = 
{}; + int iter_fd, len; + + skel = test_bug__open_and_load(); + if (CHECK(!skel, "test_bug__open_and_load", + "skeleton open_and_load failed\n")) + goto destroy; + + link = bpf_program__attach_iter(skel->progs.bug, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto destroy; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + len = read(iter_fd, buf, sizeof(buf)); + CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)); + // BUG: We expect the strings to be printed in
Re: [BUG] One-liner array initialization with two pointers in BPF results in NULLs
On 3/9/21 5:54 PM, Florent Revest wrote: I noticed that initializing an array of pointers using this syntax: __u64 array[] = { (__u64), (__u64) }; (which is a fairly common operation with macros such as BPF_SEQ_PRINTF) always results in array[0] and array[1] being NULL. Interestingly, if the array is only initialized with one pointer, ex: __u64 array[] = { (__u64) }; Then array[0] will not be NULL. Or if the array is initialized field by field, ex: __u64 array[2]; array[0] = (__u64) array[1] = (__u64) Then array[0] and array[1] will not be NULL either. I'm assuming that this should have something to do with relocations and might be a bug in clang or in libbpf but because I don't know much about these, I thought that reporting could be a good first step. :) Thanks for reporting. What you guess is correct, this is due to relocations :-( The compiler notoriously tend to put complex initial values into rodata section. For example, for __u64 array[] = { (__u64), (__u64) }; the compiler will put { (__u64), (__u64) } into rodata section. But and themselves need relocation since they are address of static variables which will sit inside .data section. So in the elf file, you will see the following relocations: RELOCATION RECORDS FOR [.rodata]: OFFSET TYPE VALUE 0018 R_BPF_64_64 .data 0020 R_BPF_64_64 .data Currently, libbpf does not handle relocation inside .rodata section, so they content remains 0. That is why you see the issue with pointer as NULL. With array size of 1, compiler does not bother to put it into rodata section. I *guess* that it works in the macro due to some kind of heuristics, e.g., nested blocks, etc, and llvm did not promote the array init value to rodata. I will double check whether llvm can complete prevent such transformation. Maybe in the future libbpf is able to handle relocations for rodata section too. But for the time being, please just consider to use either macro, or the explicit array assignment. Thanks for the reproducer! 
I attached below a repro with a dummy selftest that I expect should pass but fails to pass with the latest clang and bpf-next. Hopefully, the logic should be simple: I try to print two strings from pointers in an array using bpf_seq_printf but depending on how the array is initialized the helper either receives the string pointers or NULL pointers: test_bug:FAIL:read unexpected read: actual 'str1= str2= str1=STR1 str2=STR2 ' != expected 'str1=STR1 str2=STR2 str1=STR1 str2=STR2 ' Signed-off-by: Florent Revest --- tools/testing/selftests/bpf/prog_tests/bug.c | 41 +++ tools/testing/selftests/bpf/progs/test_bug.c | 43 2 files changed, 84 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bug.c create mode 100644 tools/testing/selftests/bpf/progs/test_bug.c diff --git a/tools/testing/selftests/bpf/prog_tests/bug.c b/tools/testing/selftests/bpf/prog_tests/bug.c new file mode 100644 index ..4b0fafd936b7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bug.c @@ -0,0 +1,41 @@ +#include +#include "test_bug.skel.h" + +static int duration; + +void test_bug(void) +{ + struct test_bug *skel; + struct bpf_link *link; + char buf[64] = {}; + int iter_fd, len; + + skel = test_bug__open_and_load(); + if (CHECK(!skel, "test_bug__open_and_load", + "skeleton open_and_load failed\n")) + goto destroy; + + link = bpf_program__attach_iter(skel->progs.bug, NULL); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto destroy; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + len = read(iter_fd, buf, sizeof(buf)); + CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)); + // BUG: We expect the strings to be printed in both cases but only the + // second case works. 
+ // actual 'str1= str2= str1=STR1 str2=STR2 ' + // != expected 'str1=STR1 str2=STR2 str1=STR1 str2=STR2 ' + ASSERT_STREQ(buf, "str1=STR1 str2=STR2 str1=STR1 str2=STR2 ", "read"); + + close(iter_fd); + +free_link: + bpf_link__destroy(link); +destroy: + test_bug__destroy(skel); +} + diff --git a/tools/testing/selftests/bpf/progs/test_bug.c b/tools/testing/selftests/bpf/progs/test_bug.c new file mode 100644 index ..c41e69483785 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_bug.c @@ -0,0 +1,43 @@ +#include "bpf_iter.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +SEC("iter/task") +int bug(struct bpf_iter__task *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + + /* We want to print two strings */ + static const char fmt[] = "str1=%s str2=%s "; + static char str1[] =
Re: [PATCH] selftests/bpf: Simplify the calculation of variables
On 3/2/21 11:52 PM, Jiapeng Chong wrote: Fix the following coccicheck warnings: ./tools/testing/selftests/bpf/test_sockmap.c:735:35-37: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Yonghong Song
Re: [PATCH] bpf: Simplify the calculation of variables
On 3/2/21 11:20 PM, Jiapeng Chong wrote: Fix the following coccicheck warnings: ./tools/bpf/bpf_dbg.c:1201:55-57: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Yonghong Song
Re: [PATCH v5 bpf-next 1/6] bpf: enable task local storage for tracing programs
On 2/23/21 2:28 PM, Song Liu wrote: To access per-task data, BPF programs usually creates a hash table with pid as the key. This is not ideal because: 1. The user need to estimate the proper size of the hash table, which may be inaccurate; 2. Big hash tables are slow; 3. To clean up the data properly during task terminations, the user need to write extra logic. Task local storage overcomes these issues and offers a better option for these per-task data. Task local storage is only available to BPF_LSM. Now enable it for tracing programs. Unlike LSM programs, tracing programs can be called in IRQ contexts. Helpers that access task local storage are updated to use raw_spin_lock_irqsave() instead of raw_spin_lock_bh(). Tracing programs can attach to functions on the task free path, e.g. exit_creds(). To avoid allocating task local storage after bpf_task_storage_free(). bpf_task_storage_get() is updated to not allocate new storage when the task is not refcounted (task->usage == 0). Reported-by: kernel test robot For a patch like this, typically we do not put the above Reported-by here as it is not really reported by the kernel test robot. If no revision is required, maybe maintainer can remove it before applying. Acked-by: KP Singh Signed-off-by: Song Liu --- include/linux/bpf.h| 7 ++ include/linux/bpf_lsm.h| 22 - include/linux/bpf_types.h | 2 +- include/linux/sched.h | 5 kernel/bpf/Makefile| 3 +-- kernel/bpf/bpf_local_storage.c | 28 +- kernel/bpf/bpf_lsm.c | 4 kernel/bpf/bpf_task_storage.c | 43 +- kernel/fork.c | 5 kernel/trace/bpf_trace.c | 4 10 files changed, 51 insertions(+), 72 deletions(-) [...]
Re: KMSAN: uninit-value in bpf_iter_prog_supported
On 2/8/21 11:35 PM, Dmitry Vyukov wrote: On Sun, Feb 7, 2021 at 1:20 PM syzbot wrote: Hello, syzbot found the following issue on: HEAD commit:73d62e81 kmsan: random: prevent boot-time reports in _mix_.. git tree: https://github.com/google/kmsan.git master console output: https://syzkaller.appspot.com/x/log.txt?x=17ac5f64d0 kernel config: https://syzkaller.appspot.com/x/.config?x=df698232b2ac45c9 dashboard link: https://syzkaller.appspot.com/bug?extid=580f4f2a272e452d55cb compiler: Debian clang version 11.0.1-2 userspace arch: i386 Unfortunately, I don't have any reproducer for this issue yet. IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+580f4f2a272e452d5...@syzkaller.appspotmail.com +BPF maintainers = BUG: KMSAN: uninit-value in bpf_iter_prog_supported+0x3dd/0x6a0 syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/bpf_iter.c:329 I will take a look. Thanks. CPU: 0 PID: 18494 Comm: bpf_preload Not tainted 5.10.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack syzkaller/managers/upstream-kmsan-gce-386/kernel/lib/dump_stack.c:77 [inline] dump_stack+0x21c/0x280 syzkaller/managers/upstream-kmsan-gce-386/kernel/lib/dump_stack.c:118 kmsan_report+0xfb/0x1e0 syzkaller/managers/upstream-kmsan-gce-386/kernel/mm/kmsan/kmsan_report.c:118 __msan_warning+0x5f/0xa0 syzkaller/managers/upstream-kmsan-gce-386/kernel/mm/kmsan/kmsan_instr.c:197 bpf_iter_prog_supported+0x3dd/0x6a0 syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/bpf_iter.c:329 check_attach_btf_id syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/verifier.c:11772 [inline] bpf_check+0x11872/0x1c380 syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/verifier.c:11900 bpf_prog_load syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/syscall.c:2210 [inline] __do_sys_bpf+0x17483/0x1aee0 
syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/syscall.c:4399 __se_sys_bpf+0x8e/0xa0 syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/syscall.c:4357 __x64_sys_bpf+0x4a/0x70 syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/syscall.c:4357 do_syscall_64+0x9f/0x140 syzkaller/managers/upstream-kmsan-gce-386/kernel/arch/x86/entry/common.c:48 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fb70b5ab469 Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ff 49 2b 00 f7 d8 64 89 01 48 RSP: 002b:7ffdbb4cde38 EFLAGS: 0246 ORIG_RAX: 0141 RAX: ffda RBX: 0065b110 RCX: 7fb70b5ab469 RDX: 0078 RSI: 7ffdbb4cdef0 RDI: 0005 RBP: 7ffdbb4cdef0 R08: 00100017 R09: R10: 7ffdbb4ce0e8 R11: 0246 R12: R13: 7ffdbb4cdf20 R14: R15: Uninit was created at: kmsan_save_stack_with_flags syzkaller/managers/upstream-kmsan-gce-386/kernel/mm/kmsan/kmsan.c:121 [inline] kmsan_internal_poison_shadow+0x5c/0xf0 syzkaller/managers/upstream-kmsan-gce-386/kernel/mm/kmsan/kmsan.c:104 kmsan_slab_alloc+0x8d/0xe0 syzkaller/managers/upstream-kmsan-gce-386/kernel/mm/kmsan/kmsan_hooks.c:76 slab_alloc_node syzkaller/managers/upstream-kmsan-gce-386/kernel/mm/slub.c:2906 [inline] slab_alloc syzkaller/managers/upstream-kmsan-gce-386/kernel/mm/slub.c:2915 [inline] kmem_cache_alloc_trace+0x893/0x1000 syzkaller/managers/upstream-kmsan-gce-386/kernel/mm/slub.c:2932 kmalloc syzkaller/managers/upstream-kmsan-gce-386/kernel/./include/linux/slab.h:552 [inline] bpf_iter_reg_target+0x81/0x3f0 syzkaller/managers/upstream-kmsan-gce-386/kernel/kernel/bpf/bpf_iter.c:276 bpf_sk_storage_map_iter_init+0x6a/0x85 syzkaller/managers/upstream-kmsan-gce-386/kernel/net/core/bpf_sk_storage.c:870 do_one_initcall+0x362/0x8d0 syzkaller/managers/upstream-kmsan-gce-386/kernel/init/main.c:1220 do_initcall_level+0x1e7/0x35a syzkaller/managers/upstream-kmsan-gce-386/kernel/init/main.c:1293 
do_initcalls+0x127/0x1cb syzkaller/managers/upstream-kmsan-gce-386/kernel/init/main.c:1309 do_basic_setup+0x33/0x36 syzkaller/managers/upstream-kmsan-gce-386/kernel/init/main.c:1329 kernel_init_freeable+0x238/0x38b syzkaller/managers/upstream-kmsan-gce-386/kernel/init/main.c:1529 kernel_init+0x1f/0x840 syzkaller/managers/upstream-kmsan-gce-386/kernel/init/main.c:1418 ret_from_fork+0x1f/0x30 syzkaller/managers/upstream-kmsan-gce-386/kernel/arch/x86/entry/entry_64.S:296 = --- This report is generated by a bot. It may contain errors. See https://goo.gl/tpsmEJ for more information about syzbot. syzbot engineers can be reached at syzkaller@googlegroups.com.
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
On 2/6/21 11:44 AM, Sedat Dilek wrote: On Sat, Feb 6, 2021 at 8:33 PM Yonghong Song wrote: On 2/6/21 11:28 AM, Sedat Dilek wrote: On Sat, Feb 6, 2021 at 8:22 PM Sedat Dilek wrote: On Sat, Feb 6, 2021 at 8:17 PM Yonghong Song wrote: On 2/6/21 10:10 AM, Sedat Dilek wrote: On Sat, Feb 6, 2021 at 6:53 PM Yonghong Song wrote: On 2/6/21 8:24 AM, Mark Wieelard wrote: Hi, On Sat, Feb 06, 2021 at 12:26:44AM -0800, Yonghong Song wrote: With the above vmlinux, the issue appears to be handling DW_ATE_signed_1, DW_ATE_unsigned_{1,24,40}. The following patch should fix the issue: That doesn't really make sense to me. Why is the compiler emitting a DW_TAG_base_type that needs to be interpreted according to the DW_AT_name attribute? If the issue is that the size of the base type cannot be expressed in bytes then the DWARF spec provides the following option: If the value of an object of the given type does not fully occupy the storage described by a byte size attribute, the base type entry may also have a DW_AT_bit_size and a DW_AT_data_bit_offset attribute, both of whose values are integer constant values (see Section 2.19 on page 55). The bit size attribute describes the actual size in bits used to represent values of the given type. The data bit offset attribute is the offset in bits from the beginning of the containing storage to the beginning of the value. Bits that are part of the offset are padding. If this attribute is omitted a default data bit offset of zero is assumed. Would it be possible to use that encoding of those special types? If I agree with you. I do not like comparing me as well. Unfortunately, there is no enough information in dwarf to find out actual information. The following is the dwarf dump with vmlinux (Sedat provided) for DW_ATE_unsigned_1. 0x000e97e9: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) There is no DW_AT_bit_size and DW_AT_bit_offset for base type. 
AFAIK, these two attributes typically appear in struct/union members together with DW_AT_byte_size. Maybe compilers (clang in this case) can emit DW_AT_bit_size = 1 and DW_AT_bit_offset = 0/7 (depending on big/little endian) and this case, we just test and get DW_AT_bit_size and it should work. But I think BTF does not need this (DW_ATE_unsigned_1) for now. I checked dwarf dump and it is mostly used for some arith operation encoded in dump (in this case, e.g., shift by 1 bit) 0x15cf: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) 0x00010ed9: DW_TAG_formal_parameter DW_AT_location(DW_OP_lit0, DW_OP_not, DW_OP_convert (0x15cf) "DW_ATE_unsigned_1", DW_OP_convert (0x15d4) "DW_ATE_unsigned_8", DW_OP_stack_value) DW_AT_abstract_origin (0x00013984 "branch") Look at clang frontend, only the following types are encoded with unsigned dwarf type. case BuiltinType::UShort: case BuiltinType::UInt: case BuiltinType::UInt128: case BuiltinType::ULong: case BuiltinType::WChar_U: case BuiltinType::ULongLong: Encoding = llvm::dwarf::DW_ATE_unsigned; break; not, can we try to come up with some extension that doesn't require consumers to match magic names? You want me to upload mlx5_core.ko? I just sent out a patch. You are cc'ed. I also attached in this email. Yes, it would be great if you can upload mlx5_core.ko so I can double check with this DW_ATE_unsigned_160 which is really usual. Yupp, just built a new pahole :-). Re-building linux-kernel... Will upload mlx5_core.ko - need zstd-ed it before. Hmm, I guess you want a mlx5_core.ko with your patch applied-to-pahole-1.20 :-)? this should work too. I want to check dwarf data. My patch won't impact dwarf generation. Usual Dropbox-Link: https://www.dropbox.com/sh/kvyh8ps7na0r1h5/AABfyNfDZ2bESse_bo4h05fFa?dl=0 See "for-yhs" directory: 1. mlx5-module_yhs-v1 ("[PATCH dwarves] btf_encoder: sanitize non-regular int base type") 2. 
mlx5-module_yhs-dileks-v4 (with the last diff-v4 I tried successfully) Thanks, with llvm-dwarfdump, I can see 0x00d65616: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_160") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x14) 0x00d88e81: DW_TAG_variable DW_AT_location(indexed (0xad) loclist = 0x0005df42: [0x00088c8e, 0x00088c97): DW_OP_breg9 R9+0, DW_OP_convert (0x00d65616) "DW_ATE_unsigned_160&qu
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
On 2/6/21 11:28 AM, Sedat Dilek wrote: On Sat, Feb 6, 2021 at 8:22 PM Sedat Dilek wrote: On Sat, Feb 6, 2021 at 8:17 PM Yonghong Song wrote: On 2/6/21 10:10 AM, Sedat Dilek wrote: On Sat, Feb 6, 2021 at 6:53 PM Yonghong Song wrote: On 2/6/21 8:24 AM, Mark Wieelard wrote: Hi, On Sat, Feb 06, 2021 at 12:26:44AM -0800, Yonghong Song wrote: With the above vmlinux, the issue appears to be handling DW_ATE_signed_1, DW_ATE_unsigned_{1,24,40}. The following patch should fix the issue: That doesn't really make sense to me. Why is the compiler emitting a DW_TAG_base_type that needs to be interpreted according to the DW_AT_name attribute? If the issue is that the size of the base type cannot be expressed in bytes then the DWARF spec provides the following option: If the value of an object of the given type does not fully occupy the storage described by a byte size attribute, the base type entry may also have a DW_AT_bit_size and a DW_AT_data_bit_offset attribute, both of whose values are integer constant values (see Section 2.19 on page 55). The bit size attribute describes the actual size in bits used to represent values of the given type. The data bit offset attribute is the offset in bits from the beginning of the containing storage to the beginning of the value. Bits that are part of the offset are padding. If this attribute is omitted a default data bit offset of zero is assumed. Would it be possible to use that encoding of those special types? If I agree with you. I do not like comparing me as well. Unfortunately, there is no enough information in dwarf to find out actual information. The following is the dwarf dump with vmlinux (Sedat provided) for DW_ATE_unsigned_1. 0x000e97e9: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) There is no DW_AT_bit_size and DW_AT_bit_offset for base type. AFAIK, these two attributes typically appear in struct/union members together with DW_AT_byte_size. 
Maybe compilers (clang in this case) can emit DW_AT_bit_size = 1 and DW_AT_bit_offset = 0/7 (depending on big/little endian) and this case, we just test and get DW_AT_bit_size and it should work. But I think BTF does not need this (DW_ATE_unsigned_1) for now. I checked dwarf dump and it is mostly used for some arith operation encoded in dump (in this case, e.g., shift by 1 bit) 0x15cf: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) 0x00010ed9: DW_TAG_formal_parameter DW_AT_location(DW_OP_lit0, DW_OP_not, DW_OP_convert (0x15cf) "DW_ATE_unsigned_1", DW_OP_convert (0x15d4) "DW_ATE_unsigned_8", DW_OP_stack_value) DW_AT_abstract_origin (0x00013984 "branch") Look at clang frontend, only the following types are encoded with unsigned dwarf type. case BuiltinType::UShort: case BuiltinType::UInt: case BuiltinType::UInt128: case BuiltinType::ULong: case BuiltinType::WChar_U: case BuiltinType::ULongLong: Encoding = llvm::dwarf::DW_ATE_unsigned; break; not, can we try to come up with some extension that doesn't require consumers to match magic names? You want me to upload mlx5_core.ko? I just sent out a patch. You are cc'ed. I also attached in this email. Yes, it would be great if you can upload mlx5_core.ko so I can double check with this DW_ATE_unsigned_160 which is really usual. Yupp, just built a new pahole :-). Re-building linux-kernel... Will upload mlx5_core.ko - need zstd-ed it before. Hmm, I guess you want a mlx5_core.ko with your patch applied-to-pahole-1.20 :-)? this should work too. I want to check dwarf data. My patch won't impact dwarf generation. - Sedat - When looking with llvm-dwarf for DW_ATE_unsigned_160: 0x00d65616: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_160") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x14) If you need further information, please let me know. Thanks. - Sedat -
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
On 2/6/21 10:10 AM, Sedat Dilek wrote: On Sat, Feb 6, 2021 at 6:53 PM Yonghong Song wrote: On 2/6/21 8:24 AM, Mark Wieelard wrote: Hi, On Sat, Feb 06, 2021 at 12:26:44AM -0800, Yonghong Song wrote: With the above vmlinux, the issue appears to be handling DW_ATE_signed_1, DW_ATE_unsigned_{1,24,40}. The following patch should fix the issue: That doesn't really make sense to me. Why is the compiler emitting a DW_TAG_base_type that needs to be interpreted according to the DW_AT_name attribute? If the issue is that the size of the base type cannot be expressed in bytes then the DWARF spec provides the following option: If the value of an object of the given type does not fully occupy the storage described by a byte size attribute, the base type entry may also have a DW_AT_bit_size and a DW_AT_data_bit_offset attribute, both of whose values are integer constant values (see Section 2.19 on page 55). The bit size attribute describes the actual size in bits used to represent values of the given type. The data bit offset attribute is the offset in bits from the beginning of the containing storage to the beginning of the value. Bits that are part of the offset are padding. If this attribute is omitted a default data bit offset of zero is assumed. Would it be possible to use that encoding of those special types? If I agree with you. I do not like comparing me as well. Unfortunately, there is no enough information in dwarf to find out actual information. The following is the dwarf dump with vmlinux (Sedat provided) for DW_ATE_unsigned_1. 0x000e97e9: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) There is no DW_AT_bit_size and DW_AT_bit_offset for base type. AFAIK, these two attributes typically appear in struct/union members together with DW_AT_byte_size. 
Maybe compilers (clang in this case) can emit DW_AT_bit_size = 1 and DW_AT_bit_offset = 0/7 (depending on big/little endian) and this case, we just test and get DW_AT_bit_size and it should work. But I think BTF does not need this (DW_ATE_unsigned_1) for now. I checked dwarf dump and it is mostly used for some arith operation encoded in dump (in this case, e.g., shift by 1 bit) 0x15cf: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) 0x00010ed9: DW_TAG_formal_parameter DW_AT_location(DW_OP_lit0, DW_OP_not, DW_OP_convert (0x15cf) "DW_ATE_unsigned_1", DW_OP_convert (0x15d4) "DW_ATE_unsigned_8", DW_OP_stack_value) DW_AT_abstract_origin (0x00013984 "branch") Look at clang frontend, only the following types are encoded with unsigned dwarf type. case BuiltinType::UShort: case BuiltinType::UInt: case BuiltinType::UInt128: case BuiltinType::ULong: case BuiltinType::WChar_U: case BuiltinType::ULongLong: Encoding = llvm::dwarf::DW_ATE_unsigned; break; not, can we try to come up with some extension that doesn't require consumers to match magic names? You want me to upload mlx5_core.ko? I just sent out a patch. You are cc'ed. I also attached in this email. Yes, it would be great if you can upload mlx5_core.ko so I can double check with this DW_ATE_unsigned_160 which is really usual. When looking with llvm-dwarf for DW_ATE_unsigned_160: 0x00d65616: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_160") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x14) If you need further information, please let me know. Thanks. - Sedat - From 239c797090abbdc5253d0ff1e9e657c5006fbbee Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 6 Feb 2021 10:21:45 -0800 Subject: [PATCH dwarves] btf_encoder: sanitize non-regular int base type clang with dwarf5 may generate non-regular int base type, i.e., not a signed/unsigned char/short/int/longlong/__int128. 
Such base types are often used to describe how an actual parameter or variable is generated. For example, 0x15cf: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) 0x00010ed9: DW_TAG_formal_parameter DW_AT_location(DW_OP_lit0, DW_OP_not, DW_OP_convert (0x15cf) "DW_ATE_unsigned_1", DW_OP_convert (0x15d4) "DW_ATE_unsigned_8", DW_OP_stack_value) DW_AT_abstract_origin (0x00013984 "branch")
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
On 2/6/21 8:24 AM, Mark Wielaard wrote: Hi, On Sat, Feb 06, 2021 at 12:26:44AM -0800, Yonghong Song wrote: With the above vmlinux, the issue appears to be handling DW_ATE_signed_1, DW_ATE_unsigned_{1,24,40}. The following patch should fix the issue: That doesn't really make sense to me. Why is the compiler emitting a DW_TAG_base_type that needs to be interpreted according to the DW_AT_name attribute? If the issue is that the size of the base type cannot be expressed in bytes then the DWARF spec provides the following option: If the value of an object of the given type does not fully occupy the storage described by a byte size attribute, the base type entry may also have a DW_AT_bit_size and a DW_AT_data_bit_offset attribute, both of whose values are integer constant values (see Section 2.19 on page 55). The bit size attribute describes the actual size in bits used to represent values of the given type. The data bit offset attribute is the offset in bits from the beginning of the containing storage to the beginning of the value. Bits that are part of the offset are padding. If this attribute is omitted a default data bit offset of zero is assumed. Would it be possible to use that encoding of those special types? If I agree with you. I do not like comparing me as well. Unfortunately, there is not enough information in dwarf to find out actual information. The following is the dwarf dump with vmlinux (Sedat provided) for DW_ATE_unsigned_1. 0x000e97e9: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) There is no DW_AT_bit_size and DW_AT_bit_offset for base type. AFAIK, these two attributes typically appear in struct/union members together with DW_AT_byte_size. Maybe compilers (clang in this case) can emit DW_AT_bit_size = 1 and DW_AT_bit_offset = 0/7 (depending on big/little endian) and in this case, we just test and get DW_AT_bit_size and it should work. 
But I think BTF does not need this (DW_ATE_unsigned_1) for now. I checked dwarf dump and it is mostly used for some arith operation encoded in dump (in this case, e.g., shift by 1 bit) 0x15cf: DW_TAG_base_type DW_AT_name ("DW_ATE_unsigned_1") DW_AT_encoding (DW_ATE_unsigned) DW_AT_byte_size (0x00) 0x00010ed9: DW_TAG_formal_parameter DW_AT_location(DW_OP_lit0, DW_OP_not, DW_OP_convert (0x15cf) "DW_ATE_unsigned_1", DW_OP_convert (0x15d4) "DW_ATE_unsigned_8", DW_OP_stack_value) DW_AT_abstract_origin (0x00013984 "branch") Look at clang frontend, only the following types are encoded with unsigned dwarf type. case BuiltinType::UShort: case BuiltinType::UInt: case BuiltinType::UInt128: case BuiltinType::ULong: case BuiltinType::WChar_U: case BuiltinType::ULongLong: Encoding = llvm::dwarf::DW_ATE_unsigned; break; not, can we try to come up with some extension that doesn't require consumers to match magic names? Thanks, Mark
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
Yonghong Song, do you have a patch/diff for me?
I will do one run with some little changes:
+ info BTFIDS vmlinux + [ != silent_ ] + printf %-7s %s\n BTFIDS vmlinux BTFIDS vmlinux + ./tools/bpf/resolve_btfids/resolve_btfids vmlinux FAILED: load BTF from vmlinux: Invalid argument + on_exit + [ 255 -ne 0 ] + cleanup + rm -f .btf.vmlinux.bin.o + rm -f .tmp_System.map + rm -f .tmp_vmlinux.btf .tmp_vmlinux.kallsyms1 .tmp_vmlinux.kallsyms1.S .tmp_vmlinux.kallsyms1.o .tmp_vmlinux.kallsyms2 .tmp_vmlinux.kallsyms2.S .tmp_vmlinux.kallsyms 2.o + rm -f System.map + rm -f vmlinux + rm -f vmlinux.o make[3]: *** [Makefile:1166: vmlinux] Error 255 ^^^ Look here. With this diff: $ git diff scripts/link-vmlinux.sh diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index eef40fa9485d..40f1b6aae553 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -330,7 +330,7 @@ vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o} # fill in BTF IDs if [ -n "${CONFIG_DEBUG_INFO_BTF}" -a -n "${CONFIG_BPF}" ]; then info BTFIDS vmlinux - ${RESOLVE_BTFIDS} vmlinux + ##${RESOLVE_BTFIDS} vmlinux fi if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then This files are kept - not removed: $ LC_ALL=C ll .*btf* vmlinux
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
Yonghong Song, do you have a patch/diff for me?
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
Yonghong Song, do you have a patch/diff for me?
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
On 2/5/21 11:15 AM, Sedat Dilek wrote: On Fri, Feb 5, 2021 at 8:10 PM Yonghong Song wrote: On 2/5/21 11:06 AM, Sedat Dilek wrote: On Fri, Feb 5, 2021 at 7:53 PM Sedat Dilek wrote: On Fri, Feb 5, 2021 at 6:48 PM Sedat Dilek wrote: On Fri, Feb 5, 2021 at 4:28 PM Arnaldo Carvalho de Melo wrote: Em Fri, Feb 05, 2021 at 04:23:59PM +0100, Sedat Dilek escreveu: On Fri, Feb 5, 2021 at 3:41 PM Sedat Dilek wrote: On Fri, Feb 5, 2021 at 3:37 PM Sedat Dilek wrote: Hi, when building with pahole v1.20 and binutils v2.35.2 plus Clang v12.0.0-rc1 and DWARF-v5 I see: ... + info BTF .btf.vmlinux.bin.o + [ != silent_ ] + printf %-7s %s\n BTF .btf.vmlinux.bin.o BTF .btf.vmlinux.bin.o + LLVM_OBJCOPY=/opt/binutils/bin/objcopy /opt/pahole/bin/pahole -J .tmp_vmlinux.btf [115] INT DW_ATE_unsigned_1 Error emitting BTF type Encountered error while encoding BTF. Grepping the pahole sources: $ git grep DW_ATE dwarf_loader.c: bt->is_bool = encoding == DW_ATE_boolean; dwarf_loader.c: bt->is_signed = encoding == DW_ATE_signed; Missing DW_ATE_unsigned encoding? Checked the LLVM sources: clang/lib/CodeGen/CGDebugInfo.cpp:Encoding = llvm::dwarf::DW_ATE_unsigned_char; clang/lib/CodeGen/CGDebugInfo.cpp:Encoding = llvm::dwarf::DW_ATE_unsigned; clang/lib/CodeGen/CGDebugInfo.cpp:Encoding = llvm::dwarf::DW_ATE_unsigned_fixed; clang/lib/CodeGen/CGDebugInfo.cpp: ? llvm::dwarf::DW_ATE_unsigned ... lld/test/wasm/debuginfo.test:CHECK-NEXT:DW_AT_encoding (DW_ATE_unsigned) So, I will switch from GNU ld.bfd v2.35.2 to LLD-12. Thanks for the research, probably your conclusion is correct, can you go the next step and add that part and check if the end result is the expected one? Still building... Can you give me a hand on what has to be changed in dwarves/pahole? I guess switching from ld.bfd to ld.lld will show the same ERROR. 
This builds successfully - untested: $ git diff diff --git a/btf_loader.c b/btf_loader.c index ec286f413f36..a39edd3362db 100644 --- a/btf_loader.c +++ b/btf_loader.c @@ -107,6 +107,7 @@ static struct base_type *base_type__new(strings_t name, uint32_t attrs, bt->bit_size = size; bt->is_signed = attrs & BTF_INT_SIGNED; bt->is_bool = attrs & BTF_INT_BOOL; + bt->is_unsigned = attrs & BTF_INT_UNSIGNED; bt->name_has_encoding = false; bt->float_type = float_type; } diff --git a/ctf.h b/ctf.h index 25b79892bde3..9e47c3c74677 100644 --- a/ctf.h +++ b/ctf.h @@ -100,6 +100,7 @@ struct ctf_full_type { #define CTF_TYPE_INT_CHAR 0x2 #define CTF_TYPE_INT_BOOL 0x4 #define CTF_TYPE_INT_VARARGS 0x8 +#define CTF_TYPE_INT_UNSIGNED 0x16 #define CTF_TYPE_FP_ATTRS(VAL) ((VAL) >> 24) #define CTF_TYPE_FP_OFFSET(VAL)(((VAL) >> 16) & 0xff) diff --git a/dwarf_loader.c b/dwarf_loader.c index b73d7867e1e6..79d40f183c24 100644 --- a/dwarf_loader.c +++ b/dwarf_loader.c @@ -473,6 +473,7 @@ static struct base_type *base_type__new(Dwarf_Die *die, struct cu *cu) bt->is_bool = encoding == DW_ATE_boolean; bt->is_signed = encoding == DW_ATE_signed; bt->is_varargs = false; + bt->is_unsigned = encoding == DW_ATE_unsigned; bt->name_has_encoding = true; } diff --git a/dwarves.h b/dwarves.h index 98caf1abc54d..edf32d2e6f80 100644 --- a/dwarves.h +++ b/dwarves.h @@ -1261,6 +1261,7 @@ struct base_type { uint8_t is_signed:1; uint8_t is_bool:1; uint8_t is_varargs:1; + uint8_t is_unsigned:1; uint8_t float_type:4; }; diff --git a/lib/bpf b/lib/bpf --- a/lib/bpf +++ b/lib/bpf @@ -1 +1 @@ -Subproject commit 5af3d86b5a2c5fecdc3ab83822d083edd32b4396 +Subproject commit 5af3d86b5a2c5fecdc3ab83822d083edd32b4396-dirty diff --git a/libbtf.c b/libbtf.c index 9f7628304495..a0661a7bbed9 100644 --- a/libbtf.c +++ b/libbtf.c @@ -247,6 +247,8 @@ static const char * btf_elf__int_encoding_str(uint8_t encoding) return "CHAR"; else if (encoding == BTF_INT_BOOL) return "BOOL"; + else if (encoding == BTF_INT_UNSIGNED) + return 
"UNSIGNED"; else return "UNKN"; } @@ -379,6 +381,8 @@ int32_t btf_elf__add_base_type(struct btf_elf *btfe, const struct base_type *bt, encoding = BTF_INT_SIGNED; } else if (bt->is_bool) { encoding = BTF_INT_BOOL; + } else if (bt->is_unsigned) { + encoding = BTF_INT_UNSIGNED; } else if (bt->float_type) { fprintf(stderr, "float_type is not supported\n"); return -1; Additionally - I cannot see it with `git diff`: [ lib/bpf/include/uapi/linux/
Re: ERROR: INT DW_ATE_unsigned_1 Error emitting BTF type
On 2/5/21 11:06 AM, Sedat Dilek wrote: On Fri, Feb 5, 2021 at 7:53 PM Sedat Dilek wrote: On Fri, Feb 5, 2021 at 6:48 PM Sedat Dilek wrote: On Fri, Feb 5, 2021 at 4:28 PM Arnaldo Carvalho de Melo wrote: Em Fri, Feb 05, 2021 at 04:23:59PM +0100, Sedat Dilek escreveu: On Fri, Feb 5, 2021 at 3:41 PM Sedat Dilek wrote: On Fri, Feb 5, 2021 at 3:37 PM Sedat Dilek wrote: Hi, when building with pahole v1.20 and binutils v2.35.2 plus Clang v12.0.0-rc1 and DWARF-v5 I see: ... + info BTF .btf.vmlinux.bin.o + [ != silent_ ] + printf %-7s %s\n BTF .btf.vmlinux.bin.o BTF .btf.vmlinux.bin.o + LLVM_OBJCOPY=/opt/binutils/bin/objcopy /opt/pahole/bin/pahole -J .tmp_vmlinux.btf [115] INT DW_ATE_unsigned_1 Error emitting BTF type Encountered error while encoding BTF. Grepping the pahole sources: $ git grep DW_ATE dwarf_loader.c: bt->is_bool = encoding == DW_ATE_boolean; dwarf_loader.c: bt->is_signed = encoding == DW_ATE_signed; Missing DW_ATE_unsigned encoding? Checked the LLVM sources: clang/lib/CodeGen/CGDebugInfo.cpp:Encoding = llvm::dwarf::DW_ATE_unsigned_char; clang/lib/CodeGen/CGDebugInfo.cpp:Encoding = llvm::dwarf::DW_ATE_unsigned; clang/lib/CodeGen/CGDebugInfo.cpp:Encoding = llvm::dwarf::DW_ATE_unsigned_fixed; clang/lib/CodeGen/CGDebugInfo.cpp: ? llvm::dwarf::DW_ATE_unsigned ... lld/test/wasm/debuginfo.test:CHECK-NEXT:DW_AT_encoding (DW_ATE_unsigned) So, I will switch from GNU ld.bfd v2.35.2 to LLD-12. Thanks for the research, probably your conclusion is correct, can you go the next step and add that part and check if the end result is the expected one? Still building... Can you give me a hand on what has to be changed in dwarves/pahole? I guess switching from ld.bfd to ld.lld will show the same ERROR. 
This builds successfully - untested: $ git diff diff --git a/btf_loader.c b/btf_loader.c index ec286f413f36..a39edd3362db 100644 --- a/btf_loader.c +++ b/btf_loader.c @@ -107,6 +107,7 @@ static struct base_type *base_type__new(strings_t name, uint32_t attrs, bt->bit_size = size; bt->is_signed = attrs & BTF_INT_SIGNED; bt->is_bool = attrs & BTF_INT_BOOL; + bt->is_unsigned = attrs & BTF_INT_UNSIGNED; bt->name_has_encoding = false; bt->float_type = float_type; } diff --git a/ctf.h b/ctf.h index 25b79892bde3..9e47c3c74677 100644 --- a/ctf.h +++ b/ctf.h @@ -100,6 +100,7 @@ struct ctf_full_type { #define CTF_TYPE_INT_CHAR 0x2 #define CTF_TYPE_INT_BOOL 0x4 #define CTF_TYPE_INT_VARARGS 0x8 +#define CTF_TYPE_INT_UNSIGNED 0x16 #define CTF_TYPE_FP_ATTRS(VAL) ((VAL) >> 24) #define CTF_TYPE_FP_OFFSET(VAL)(((VAL) >> 16) & 0xff) diff --git a/dwarf_loader.c b/dwarf_loader.c index b73d7867e1e6..79d40f183c24 100644 --- a/dwarf_loader.c +++ b/dwarf_loader.c @@ -473,6 +473,7 @@ static struct base_type *base_type__new(Dwarf_Die *die, struct cu *cu) bt->is_bool = encoding == DW_ATE_boolean; bt->is_signed = encoding == DW_ATE_signed; bt->is_varargs = false; + bt->is_unsigned = encoding == DW_ATE_unsigned; bt->name_has_encoding = true; } diff --git a/dwarves.h b/dwarves.h index 98caf1abc54d..edf32d2e6f80 100644 --- a/dwarves.h +++ b/dwarves.h @@ -1261,6 +1261,7 @@ struct base_type { uint8_t is_signed:1; uint8_t is_bool:1; uint8_t is_varargs:1; + uint8_t is_unsigned:1; uint8_t float_type:4; }; diff --git a/lib/bpf b/lib/bpf --- a/lib/bpf +++ b/lib/bpf @@ -1 +1 @@ -Subproject commit 5af3d86b5a2c5fecdc3ab83822d083edd32b4396 +Subproject commit 5af3d86b5a2c5fecdc3ab83822d083edd32b4396-dirty diff --git a/libbtf.c b/libbtf.c index 9f7628304495..a0661a7bbed9 100644 --- a/libbtf.c +++ b/libbtf.c @@ -247,6 +247,8 @@ static const char * btf_elf__int_encoding_str(uint8_t encoding) return "CHAR"; else if (encoding == BTF_INT_BOOL) return "BOOL"; + else if (encoding == BTF_INT_UNSIGNED) + return 
"UNSIGNED"; else return "UNKN"; } @@ -379,6 +381,8 @@ int32_t btf_elf__add_base_type(struct btf_elf *btfe, const struct base_type *bt, encoding = BTF_INT_SIGNED; } else if (bt->is_bool) { encoding = BTF_INT_BOOL; + } else if (bt->is_unsigned) { + encoding = BTF_INT_UNSIGNED; } else if (bt->float_type) { fprintf(stderr, "float_type is not supported\n"); return -1; Additionally - I cannot see it with `git diff`: [ lib/bpf/include/uapi/linux/btf.h ] /* Attributes stored in the BTF_INT_ENCODING */ #define BTF_INT_SIGNED (1 << 0) #define BTF_INT_CHAR (1 << 1) #define BTF_INT_BOOL (1 << 2) #define BTF_INT_UNSIGNED (1 << 3) Comments? Hmmm... + info BTF .btf.vmlinux.bin.o + [ != silent_
Re: [PATCH bpf-next v3] bpf: Propagate stack bounds to registers in atomics w/ BPF_FETCH
For loads, check_reg_arg has the side-effect of marking the register's value as unknown, and check_mem_access has the side effect of propagating bounds from memory to the register.
[1]: https://lore.kernel.org/bpf/ca+i-1c2zwubgxwj8kaxbri9rbboyumavj_bbhg+2zf_su9b...@mail.gmail.com/T/#t kernel/bpf/verifier.c | 32 +++ .../selftests/bpf/prog_tests/atomic_bounds.c | 15 + .../selftests/bpf/progs/atomic_bounds.c | 24 ++ .../selftests/bpf/verifier/atomic_bounds.c| 27 4 files changed, 84 insertions(+), 14 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomic_bounds.c create mode 100644 tools/testing/selftests/bpf/progs/atomic_bounds.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_bounds.c diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 972fc38eb62d..5e09632efddb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3665,9 +3665,26 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } + if (insn->imm & BPF_FETCH) { + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); + if (err) + return err; + } else { + /* This instruction accesses a memory location but doesn't +* actually load it into a register. 
+*/ + load_reg = -1; + } + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, load_reg, true); if (err) return err; @@ -3677,19 +3694,6 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; - if (!(insn->imm & BPF_FETCH)) - return 0; - - if (insn->imm == BPF_CMPXCHG) - load_reg = BPF_REG_0; - else - load_reg = insn->src_reg; - - /* check and record load of old value */ - err = check_reg_arg(env, load_reg, DST_OP); - if (err) - return err; - return 0; } diff --git a/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c b/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c new file mode 100644 index ..addf127068e4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "atomic_bounds.skel.h" + +void test_atomic_bounds(void) +{ + struct atomic_bounds *skel; + __u32 duration = 0; + + skel = atomic_bounds__open_and_load(); + if (CHECK(!skel, "skel_load", "couldn't load program\n")) + return; You are missing atomic_bounds__destroy(skel); here. +} diff --git a/tools/testing/selftests/bpf/progs/atomic_bounds.c b/tools/testing/selftests/bpf/progs/atomic_bounds.c new file mode 100644 index ..e5fff7fc7f8f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/atomic_bounds.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#ifdef ENABLE_ATOMICS_TESTS +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(sub, int x) +{ +#ifdef ENABLE_ATOMICS_TESTS + int a = 0; +
Re: [PATCH bpf-next 2/3] bpf: Add size arg to build_id_parse function
On 1/26/21 12:52 PM, Jiri Olsa wrote: On Thu, Jan 14, 2021 at 07:47:20PM -0800, Alexei Starovoitov wrote: On Thu, Jan 14, 2021 at 3:44 PM Yonghong Song wrote: On 1/14/21 2:02 PM, Jiri Olsa wrote: On Thu, Jan 14, 2021 at 01:05:33PM -0800, Yonghong Song wrote: On 1/14/21 12:01 PM, Jiri Olsa wrote: On Thu, Jan 14, 2021 at 10:56:33AM -0800, Yonghong Song wrote: On 1/14/21 5:40 AM, Jiri Olsa wrote: It's possible to have other build id types (other than default SHA1). Currently there's also ld support for MD5 build id. Currently, bpf build_id based stackmap does not returns the size of the build_id. Did you see an issue here? I guess user space can check the length of non-zero bits of the build id to decide what kind of type it is, right? you can have zero bytes in the build id hash, so you need to get the size I never saw MD5 being used in practise just SHA1, but we added the size to be complete and make sure we'll fit with build id, because there's only limited space in mmap2 event I am asking to check whether we should extend uapi struct bpf_stack_build_id to include build_id_size as well. I guess we can delay this until a real use case. right, we can try make some MD5 build id binaries and check if it explodes with some bcc tools, but I don't expect that.. I'll try to find some time for that Thanks. We may have issues on bcc side. For build_id collected in kernel, bcc always generates a length-20 string. But for user binaries, the build_id string length is equal to actual size of the build_id. They may not match (MD5 length is 16). The fix is probably to append '0's (up to length 20) for user binary build_id's. I guess MD5 is very seldom used. I will wait if you can reproduce the issue and then we might fix it. Indeed. Jiri, please check whether md5 is really an issue. Sounds like we have to do something on the kernel side. Hopefully zero padding will be enough. I would prefer to avoid extending uapi struct to cover rare case. 
build_id_parse is already doing the zero padding, so we are ok I tried several bcc tools over perf bench with md5 buildid and the results looked ok Great. Thanks for confirmation! jirka
Re: [PATCH bpf-next v2] samples/bpf: Set flag __SANE_USERSPACE_TYPES__ for MIPS to fix build warnings
On 1/24/21 9:05 PM, Tiezhu Yang wrote: There exists many build warnings when make M=samples/bpf on the Loongson platform, this issue is MIPS related, x86 compiles just fine. Here are some warnings: CC samples/bpf/ibumad_user.o samples/bpf/ibumad_user.c: In function ‘dump_counts’: samples/bpf/ibumad_user.c:46:24: warning: format ‘%llu’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘__u64’ {aka ‘long unsigned int’} [-Wformat=] printf("0x%02x : %llu\n", key, value); ~~~^ ~ %lu CC samples/bpf/offwaketime_user.o samples/bpf/offwaketime_user.c: In function ‘print_ksym’: samples/bpf/offwaketime_user.c:34:17: warning: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘__u64’ {aka ‘long unsigned int’} [-Wformat=] printf("%s/%llx;", sym->name, addr); ~~~^ %lx samples/bpf/offwaketime_user.c: In function ‘print_stack’: samples/bpf/offwaketime_user.c:68:17: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 3 has type ‘__u64’ {aka ‘long unsigned int’} [-Wformat=] printf(";%s %lld\n", key->waker, count); ~~~^ ~ %ld MIPS needs __SANE_USERSPACE_TYPES__ before to select 'int-ll64.h' in arch/mips/include/uapi/asm/types.h, then it can avoid build warnings when printing __u64 with %llu, %llx or %lld. The header tools/include/linux/types.h defines __SANE_USERSPACE_TYPES__, it seems that we can include in the source files which have build warnings, but it has no effect due to actually it includes usr/include/linux/types.h instead of tools/include/linux/types.h, the problem is that "usr/include" is preferred first than "tools/include" in samples/bpf/Makefile, that sounds like a ugly hack to -Itools/include before -Iusr/include. So define __SANE_USERSPACE_TYPES__ for MIPS in samples/bpf/Makefile is proper, if add "TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__" in samples/bpf/Makefile, it appears the following error: Auto-detecting system features: ...libelf: [ on ] ... zlib: [ on ] ... 
bpf: [ OFF ] BPF API too old make[3]: *** [Makefile:293: bpfdep] Error 1 make[2]: *** [Makefile:156: all] Error 2 With #ifndef __SANE_USERSPACE_TYPES__ in tools/include/linux/types.h, the above error has gone and this ifndef change does not hurt other compilations. Signed-off-by: Tiezhu Yang Acked-by: Yonghong Song
Re: [PATCH bpf-next v5 4/4] selftests/bpf: Add a selftest for the tracing bpf_get_socket_cookie
On 1/22/21 7:34 AM, Florent Revest wrote: On Wed, Jan 20, 2021 at 8:06 PM Florent Revest wrote: On Wed, Jan 20, 2021 at 8:04 PM Alexei Starovoitov wrote: On Wed, Jan 20, 2021 at 9:08 AM KP Singh wrote: On Tue, Jan 19, 2021 at 5:00 PM Florent Revest wrote: This builds up on the existing socket cookie test which checks whether the bpf_get_socket_cookie helpers provide the same value in cgroup/connect6 and sockops programs for a socket created by the userspace part of the test. Adding a tracing program to the existing objects requires a different attachment strategy and different headers. Signed-off-by: Florent Revest Acked-by: KP Singh (one minor note, doesn't really need fixing as a part of this though) --- .../selftests/bpf/prog_tests/socket_cookie.c | 24 +++ .../selftests/bpf/progs/socket_cookie_prog.c | 41 --- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/socket_cookie.c b/tools/testing/selftests/bpf/prog_tests/socket_cookie.c index 53d0c44e7907..e5c5e2ea1deb 100644 --- a/tools/testing/selftests/bpf/prog_tests/socket_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/socket_cookie.c @@ -15,8 +15,8 @@ struct socket_cookie { void test_socket_cookie(void) { + struct bpf_link *set_link, *update_sockops_link, *update_tracing_link; socklen_t addr_len = sizeof(struct sockaddr_in6); - struct bpf_link *set_link, *update_link; int server_fd, client_fd, cgroup_fd; struct socket_cookie_prog *skel; __u32 cookie_expected_value; @@ -39,15 +39,21 @@ void test_socket_cookie(void) PTR_ERR(set_link))) goto close_cgroup_fd; - update_link = bpf_program__attach_cgroup(skel->progs.update_cookie, -cgroup_fd); - if (CHECK(IS_ERR(update_link), "update-link-cg-attach", "err %ld\n", - PTR_ERR(update_link))) + update_sockops_link = bpf_program__attach_cgroup( + skel->progs.update_cookie_sockops, cgroup_fd); + if (CHECK(IS_ERR(update_sockops_link), "update-sockops-link-cg-attach", + "err %ld\n", PTR_ERR(update_sockops_link))) goto 
free_set_link; + update_tracing_link = bpf_program__attach( + skel->progs.update_cookie_tracing); + if (CHECK(IS_ERR(update_tracing_link), "update-tracing-link-attach", + "err %ld\n", PTR_ERR(update_tracing_link))) + goto free_update_sockops_link; + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno)) - goto free_update_link; + goto free_update_tracing_link; client_fd = connect_to_fd(server_fd, 0); if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno)) @@ -71,8 +77,10 @@ void test_socket_cookie(void) close(client_fd); close_server_fd: close(server_fd); -free_update_link: - bpf_link__destroy(update_link); +free_update_tracing_link: + bpf_link__destroy(update_tracing_link); I don't think this need to block submission unless there are other issues but the bpf_link__destroy can just be called in a single cleanup label because it handles null or erroneous inputs: int bpf_link__destroy(struct bpf_link *link) { int err = 0; if (IS_ERR_OR_NULL(link)) return 0; [...] +1 to KP's point. Also Florent, how did you test it? 
This test fails in CI and in my manual run: ./test_progs -t cook libbpf: load bpf program failed: Permission denied libbpf: -- BEGIN DUMP LOG --- libbpf: ; int update_cookie_sockops(struct bpf_sock_ops *ctx) 0: (bf) r6 = r1 ; if (ctx->family != AF_INET6) 1: (61) r1 = *(u32 *)(r6 +20) ; if (ctx->family != AF_INET6) 2: (56) if w1 != 0xa goto pc+21 R1_w=inv10 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 ; if (ctx->op != BPF_SOCK_OPS_TCP_CONNECT_CB) 3: (61) r1 = *(u32 *)(r6 +0) ; if (ctx->op != BPF_SOCK_OPS_TCP_CONNECT_CB) 4: (56) if w1 != 0x3 goto pc+19 R1_w=inv3 R6_w=ctx(id=0,off=0,imm=0) R10=fp0 ; if (!ctx->sk) 5: (79) r1 = *(u64 *)(r6 +184) ; if (!ctx->sk) 6: (15) if r1 == 0x0 goto pc+17 R1_w=sock(id=0,ref_obj_id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) R10=fp0 ; p = bpf_sk_storage_get(_cookies, ctx->sk, 0, 0); 7: (79) r2 = *(u64 *)(r6 +184) ; p = bpf_sk_storage_get(_cookies, ctx->sk, 0, 0); 8: (18) r1 = 0x888106e41400 10: (b7) r3 = 0 11: (b7) r4 = 0 12: (85) call bpf_sk_storage_get#107 R2 type=sock_or_null expected=sock_common, sock, tcp_sock, xdp_sock, ptr_ processed 12 insns (limit 100) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 libbpf: -- END LOG -- libbpf: failed to load program 'update_cookie_sockops' libbpf: failed to load object 'socket_cookie_prog' libbpf: failed to load BPF skeleton 'socket_cookie_prog': -4007 test_socket_cookie:FAIL:socket_cookie_prog__open_and_load
Re: KASAN: vmalloc-out-of-bounds Read in bpf_trace_run7
I can reproduce the issue with C reproducer. This is an old known issue though and the failure is due to memory allocation failure in tracepoint_probe_unregister(). [ 40.807849][ T8287] Call Trace: [ 40.808201][ T8287] dump_stack+0x77/0x97 [ 40.808695][ T8287] should_fail.cold.6+0x32/0x4c [ 40.809238][ T8287] should_failslab+0x5/0x10 [ 40.809709][ T8287] slab_pre_alloc_hook.constprop.97+0xa0/0xd0 [ 40.810365][ T8287] ? tracepoint_probe_unregister+0xc7/0x2b0 [ 40.810998][ T8287] __kmalloc+0x64/0x210 [ 40.811442][ T8287] ? trace_raw_output_percpu_destroy_chunk+0x40/0x40 [ 40.812158][ T8287] tracepoint_probe_unregister+0xc7/0x2b0 [ 40.812766][ T8287] bpf_raw_tp_link_release+0x11/0x20 [ 40.813328][ T8287] bpf_link_free+0x20/0x40 [ 40.813802][ T8287] bpf_link_release+0xc/0x10 [ 40.814242][ T8287] __fput+0xa1/0x250 [ 40.814606][ T8287] task_work_run+0x68/0xb0 [ 40.815030][ T8287] exit_to_user_mode_prepare+0x22c/0x250 Steven Rostedt has the following pending patch https://lore.kernel.org/bpf/20201118093405.7a6d2...@gandalf.local.home/ trying to solve this exact problem. 
On 1/20/21 11:14 PM, syzbot wrote: syzbot has bisected this issue to: commit 8b401f9ed2441ad9e219953927a842d24ed051fc Author: Yonghong Song Date: Thu May 23 21:47:45 2019 + bpf: implement bpf_send_signal() helper bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=123408e750 start commit: 7d68e382 bpf: Permit size-0 datasec git tree: bpf-next final oops: https://syzkaller.appspot.com/x/report.txt?x=113408e750 console output: https://syzkaller.appspot.com/x/log.txt?x=163408e750 kernel config: https://syzkaller.appspot.com/x/.config?x=e0c7843b8af99dff dashboard link: https://syzkaller.appspot.com/bug?extid=fad5d91c7158ce568634 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=1224daa4d0 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=13dfabd0d0 Reported-by: syzbot+fad5d91c7158ce568...@syzkaller.appspotmail.com Fixes: 8b401f9ed244 ("bpf: implement bpf_send_signal() helper") For information about bisection process see: https://goo.gl/tpsmEJ#bisection
Re: [PATCH bpf-next v3] samples/bpf: Update build procedure for manually compiling LLVM and Clang
On 1/21/21 6:23 AM, Nathan Chancellor wrote: On Thu, Jan 21, 2021 at 12:08:31AM -0800, Andrii Nakryiko wrote: On Wed, Jan 20, 2021 at 9:36 PM Nathan Chancellor wrote: On Thu, Jan 21, 2021 at 01:27:35PM +0800, Tiezhu Yang wrote: The current LLVM and Clang build procedure in samples/bpf/README.rst is out of date. See below that the links are not accessible any more. $ git clone http://llvm.org/git/llvm.git Cloning into 'llvm'... fatal: unable to access 'http://llvm.org/git/llvm.git/ ': Maximum (20) redirects followed $ git clone --depth 1 http://llvm.org/git/clang.git Cloning into 'clang'... fatal: unable to access 'http://llvm.org/git/clang.git/ ': Maximum (20) redirects followed The LLVM community has adopted new ways to build the compiler. There are different ways to build LLVM and Clang, the Clang Getting Started page [1] has one way. As Yonghong said, it is better to copy the build procedure in Documentation/bpf/bpf_devel_QA.rst to keep consistent. I verified the procedure and it is proved to be feasible, so we should update README.rst to reflect the reality. At the same time, update the related comment in Makefile. Additionally, as Fangrui said, the dir llvm-project/llvm/build/install is not used, BUILD_SHARED_LIBS=OFF is the default option [2], so also change Documentation/bpf/bpf_devel_QA.rst together. [1] https://clang.llvm.org/get_started.html [2] https://www.llvm.org/docs/CMake.html Signed-off-by: Tiezhu Yang Acked-by: Yonghong Song Reviewed-by: Nathan Chancellor Small comment below. --- v2: Update the commit message suggested by Yonghong, thank you very much. v3: Remove the default option BUILD_SHARED_LIBS=OFF and just mkdir llvm-project/llvm/build suggested by Fangrui. 
Documentation/bpf/bpf_devel_QA.rst | 3 +-- samples/bpf/Makefile | 2 +- samples/bpf/README.rst | 16 +--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index 5b613d2..18788bb 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -506,11 +506,10 @@ that set up, proceed with building the latest LLVM and clang version from the git repositories:: $ git clone https://github.com/llvm/llvm-project.git - $ mkdir -p llvm-project/llvm/build/install + $ mkdir -p llvm-project/llvm/build $ cd llvm-project/llvm/build $ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ -DLLVM_ENABLE_PROJECTS="clang"\ --DBUILD_SHARED_LIBS=OFF \ -DCMAKE_BUILD_TYPE=Release\ -DLLVM_BUILD_RUNTIME=OFF $ ninja diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 26fc96c..d061446 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -208,7 +208,7 @@ TPROGLDLIBS_xdpsock += -pthread -lcap TPROGLDLIBS_xsk_fwd += -pthread # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: -# make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +# make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang LLC ?= llc CLANG ?= clang OPT ?= opt diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index dd34b2d..23006cb 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -65,17 +65,19 @@ To generate a smaller llc binary one can use:: Quick sniplet for manually compiling LLVM and clang (build dependencies are cmake and gcc-c++):: Technically, ninja is now a build dependency as well, it might be worth mentioning that here (usually the package is ninja or ninja-build). it's possible to generate Makefile by passing `-g "Unix Makefiles"`, which would avoid dependency on ninja, no? 
Yes, although I am fairly certain that building with ninja is quicker so I would recommend keeping it. One small extra dependency never killed anyone plus ninja is becoming more common nowadays :) Agree. Let us use 'ninja' here, which is widely used in llvm community for llvm-project build compared to other alternatives. Regardless of whether that is addressed or not (because it is small), feel free to carry forward my tag in any future revisions unless they drastically change. - $ git clone http://llvm.org/git/llvm.git - $ cd llvm/tools - $ git clone --depth 1 http://llvm.org/git/clang.git - $ cd ..; mkdir build; cd build - $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" - $ make -j $(getconf _NPROCESSORS_ONLN) + $ git clone https://github.com/llvm/llvm-project.git + $ mkdir -p llvm-project/llvm/build + $ cd llvm-project/llvm/build + $ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ +-DLLVM_ENABLE_PROJECTS="clang"\ +-DCMAKE_BUILD_TYPE=Release\
Re: [PATCH bpf-next v2] samples/bpf: Update README.rst and Makefile for manually compiling LLVM and clang
On 1/19/21 6:32 PM, Tiezhu Yang wrote: On 01/20/2021 05:58 AM, Fangrui Song wrote: On 2021-01-19, Tiezhu Yang wrote: The current llvm/clang build procedure in samples/bpf/README.rst is out of date. See below that the links are not accessible any more. $ git clone http://llvm.org/git/llvm.git Cloning into 'llvm'... fatal: unable to access 'http://llvm.org/git/llvm.git/ ': Maximum (20) redirects followed $ git clone --depth 1 http://llvm.org/git/clang.git Cloning into 'clang'... fatal: unable to access 'http://llvm.org/git/clang.git/ ': Maximum (20) redirects followed The llvm community has adopted new ways to build the compiler. There are different ways to build llvm/clang, the Clang Getting Started page [1] has one way. As Yonghong said, it is better to just copy the build procedure in Documentation/bpf/bpf_devel_QA.rst to keep consistent. I verified the procedure and it is proved to be feasible, so we should update README.rst to reflect the reality. At the same time, update the related comment in Makefile. [1] https://clang.llvm.org/get_started.html Signed-off-by: Tiezhu Yang Acked-by: Yonghong Song --- v2: Update the commit message suggested by Yonghong, thank you very much. 
samples/bpf/Makefile | 2 +- samples/bpf/README.rst | 17 ++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 26fc96c..d061446 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -208,7 +208,7 @@ TPROGLDLIBS_xdpsock += -pthread -lcap TPROGLDLIBS_xsk_fwd += -pthread # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: -# make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang +# make M=samples/bpf LLC=~/git/llvm-project/llvm/build/bin/llc CLANG=~/git/llvm-project/llvm/build/bin/clang LLC ?= llc CLANG ?= clang OPT ?= opt diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index dd34b2d..d1be438 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -65,17 +65,20 @@ To generate a smaller llc binary one can use:: Quick sniplet for manually compiling LLVM and clang (build dependencies are cmake and gcc-c++):: - $ git clone http://llvm.org/git/llvm.git - $ cd llvm/tools - $ git clone --depth 1 http://llvm.org/git/clang.git - $ cd ..; mkdir build; cd build - $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" - $ make -j $(getconf _NPROCESSORS_ONLN) + $ git clone https://github.com/llvm/llvm-project.git + $ mkdir -p llvm-project/llvm/build/install llvm-project/llvm/build/install is not used. Yes, just mkdir -p llvm-project/llvm/build is OK. + $ cd llvm-project/llvm/build + $ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DBUILD_SHARED_LIBS=OFF \ -DBUILD_SHARED_LIBS=OFF is the default. It can be omitted. I search the related doc [1] [2], remove this option is OK for me. BUILD_SHARED_LIBS:BOOL Flag indicating if each LLVM component (e.g. Support) is built as a shared library (ON) or as a static library (OFF). Its default value is OFF. 
[1] https://www.llvm.org/docs/CMake.html [2] https://cmake.org/cmake/help/latest/variable/BUILD_SHARED_LIBS.html + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_BUILD_RUNTIME=OFF -DLLVM_BUILD_RUNTIME=OFF can be omitted if none of compiler-rt/libc++/libc++abi is built. I am not very sure about it because the default value of LLVM_BUILD_RUNTIME is ON? [3] option(LLVM_BUILD_RUNTIME "Build the LLVM runtime libraries." ON) [3] https://github.com/llvm/llvm-project/blob/main/llvm/CMakeLists.txt If anyone has any more suggestions, please let me know. I will send v3 after waiting for other feedback. By the way, Documentation/bpf/bpf_devel_QA.rst maybe need a separate patch to remove some cmake options? Please submit updated this patch and Documentation/bpf/bpf_devel_QA.rst together. This way, it is easy to cross check. Thanks. Thanks, Tiezhu [...]
Re: [PATCH v3 bpf-next 2/2] selftests: bpf: Add a new test for bare tracepoints
On 1/19/21 4:22 AM, Qais Yousef wrote: Reuse module_attach infrastructure to add a new bare tracepoint to check we can attach to it as a raw tracepoint. Signed-off-by: Qais Yousef Acked-by: Yonghong Song
Re: [PATCH bpf] samples/bpf: Update README.rst and Makefile for manually compiling LLVM and clang
On 1/18/21 7:53 PM, Tiezhu Yang wrote: In the current samples/bpf/README.rst, the url of llvm and clang git may be out of date, they are unable to access: Let us just rephrase the above more clearly, something like below. The current clang/llvm build procedure in samples/bpf/README.rst is out of date. See below that the links are not accessible any more. $ git clone http://llvm.org/git/llvm.git Cloning into 'llvm'... fatal: unable to access 'http://llvm.org/git/llvm.git/ ': Maximum (20) redirects followed $ git clone --depth 1 http://llvm.org/git/clang.git Cloning into 'clang'... fatal: unable to access 'http://llvm.org/git/clang.git/ ': Maximum (20) redirects followed The llvm community has adopted new ways to build the compiler. [followed by your descriptions below] There are different ways to build llvm/clang, I find the Clang Getting Started page [1] has one way, as Yonghong said, it is better to just copy the build procedure in Documentation/bpf/bpf_devel_QA.rst to keep consistent. I verified the procedure and it is proved to be feasible, so we should update README.rst to reflect the reality. At the same time, update the related comment in Makefile. [1] https://clang.llvm.org/get_started.html Signed-off-by: Tiezhu Yang Ack with minor nits in the above. Also, this is a documentation update. I think it is okay to target the patch to bpf-next instead of bpf. Acked-by: Yonghong Song
Re: [PATCH bpf 1/2] samples/bpf: Set flag __SANE_USERSPACE_TYPES__ for MIPS to fix build warnings
On 1/17/21 7:22 PM, Tiezhu Yang wrote: On 01/14/2021 01:12 AM, Yonghong Song wrote: On 1/13/21 2:57 AM, Tiezhu Yang wrote: MIPS needs __SANE_USERSPACE_TYPES__ before to select 'int-ll64.h' in arch/mips/include/uapi/asm/types.h and avoid compile warnings when printing __u64 with %llu, %llx or %lld. could you mention which command produces the following warning? make M=samples/bpf printf("0x%02x : %llu\n", key, value); ~~~^ ~ %lu printf("%s/%llx;", sym->name, addr); ~~~^ %lx printf(";%s %lld\n", key->waker, count); ~~~^ ~ %ld Signed-off-by: Tiezhu Yang --- samples/bpf/Makefile | 4 tools/include/linux/types.h | 3 +++ 2 files changed, 7 insertions(+) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 26fc96c..27de306 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -183,6 +183,10 @@ BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR) TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR) endif +ifeq ($(ARCH), mips) +TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__ +endif + This change looks okay based on description in arch/mips/include/uapi/asm/types.h ''' /* * We don't use int-l64.h for the kernel anymore but still use it for * userspace to avoid code changes. * * However, some user programs (e.g. perf) may not want this. They can * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here. */ ''' TPROGS_CFLAGS += -Wall -O2 TPROGS_CFLAGS += -Wmissing-prototypes TPROGS_CFLAGS += -Wstrict-prototypes diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h index 154eb4e..e9c5a21 100644 --- a/tools/include/linux/types.h +++ b/tools/include/linux/types.h @@ -6,7 +6,10 @@ #include #include +#ifndef __SANE_USERSPACE_TYPES__ #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ +#endif What problem this patch fixed? If add "TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__" in samples/bpf/Makefile, it appears the following error: Auto-detecting system features: ... libelf: [ on ] ... zlib: [ on ] ... 
bpf: [ OFF ] BPF API too old make[3]: *** [Makefile:293: bpfdep] Error 1 make[2]: *** [Makefile:156: all] Error 2 With #ifndef __SANE_USERSPACE_TYPES__ in tools/include/linux/types.h, the above error has gone. If this header is used, you can just change comment from "PPC64" to "PPC64/MIPS", right? If include in the source files which have compile warnings when printing __u64 with %llu, %llx or %lld, it has no effect due to actually it includes usr/include/linux/types.h instead of tools/include/linux/types.h, this is because the include-directories in samples/bpf/Makefile are searched in the order, -I./usr/include is in the front of -I./tools/include. So I think define __SANE_USERSPACE_TYPES__ for MIPS in samples/bpf/Makefile is proper, at the same time, add #ifndef __SANE_USERSPACE_TYPES__ in tools/include/linux/types.h can avoid build error and have no side effect. I will send v2 later with mention in the commit message that this is mips related. It would be good if you can add the above information to the commit message so people will know what the root cause of the issue. If I understand correctly, if we could have include path "tools/include" earlier than "usr/include", we might not have this issue. The problem is that "usr/include" is preferred first (uapi) than "tools/include" (including kernel dev headers). I am wondering whether we could avoid changes in tools/include/linux/types.h, e.g., by undef __SANE_USER_SPACE_TYPES right before include path tools/include. But that sounds like a ugly hack and actually the change in tools/include/linux/types.h does not hurt other compilations. So your current change looks good to me, but please have better explanation of the problem and why for each change in the commit message. Thanks, Tiezhu + #include #include
Re: [PATCH bpf] samples/bpf: Update README.rst for manually compiling LLVM and clang
On 1/18/21 12:53 AM, Tiezhu Yang wrote: In the current samples/bpf/README.rst, the url of llvm and clang git may be out of date, they are unable to access: $ git clone http://llvm.org/git/llvm.git Cloning into 'llvm'... fatal: unable to access 'http://llvm.org/git/llvm.git/ ': Maximum (20) redirects followed $ git clone --depth 1 http://llvm.org/git/clang.git Cloning into 'clang'... fatal: unable to access 'http://llvm.org/git/clang.git/ ': Maximum (20) redirects followed The Clang Getting Started page [1] might have more accurate information, I verified the procedure and it is proved to be feasible, so we should update it to reflect the reality. [1] https://clang.llvm.org/get_started.html Signed-off-by: Tiezhu Yang --- samples/bpf/README.rst | 8 +++- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index dd34b2d..f606c08 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -65,11 +65,9 @@ To generate a smaller llc binary one can use:: Quick sniplet for manually compiling LLVM and clang (build dependencies are cmake and gcc-c++):: - $ git clone http://llvm.org/git/llvm.git - $ cd llvm/tools - $ git clone --depth 1 http://llvm.org/git/clang.git - $ cd ..; mkdir build; cd build - $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" + $ git clone https://github.com/llvm/llvm-project.git + $ cd llvm-project; mkdir build; cd build + $ cmake -DLLVM_ENABLE_PROJECTS=clang -DLLVM_TARGETS_TO_BUILD="BPF;X86" -G "Unix Makefiles" ../llvm $ make -j $(getconf _NPROCESSORS_ONLN) Thanks for the patch. Indeed llvm.org/git/llvm has been deprecated. We have recommended to use llvm-project at kernel/Documentation/bpf/bpf_devel_QA.rst. https://github.com/torvalds/linux/blob/master/Documentation/bpf/bpf_devel_QA.rst#q-got-it-so-how-do-i-build-llvm-manually-anyway Could you use the same scripts in the above link here? 
There are different ways to build llvm/clang, I just want to be consistent between bpf_devel_QA.rst and there. I am also thinking whether we should just provide a link here to bpf_devel_QA.rst. Looking at samples/bpf/README.rst, it all contains direct commands for people to build/test, so copy-pasting the llvm build scripts here should be fine. It is also possible to point make to the newly compiled 'llc' or
Re: [PATCH v2 bpf-next 2/2] selftests: bpf: Add a new test for bare tracepoints
On 1/18/21 4:18 AM, Qais Yousef wrote: On 01/16/21 18:11, Yonghong Song wrote: On 1/16/21 10:21 AM, Qais Yousef wrote: Reuse module_attach infrastructure to add a new bare tracepoint to check we can attach to it as a raw tracepoint. Signed-off-by: Qais Yousef --- .../bpf/bpf_testmod/bpf_testmod-events.h | 6 + .../selftests/bpf/bpf_testmod/bpf_testmod.c | 21 ++- .../selftests/bpf/bpf_testmod/bpf_testmod.h | 6 + .../selftests/bpf/prog_tests/module_attach.c | 27 +++ .../selftests/bpf/progs/test_module_attach.c | 10 +++ 5 files changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h index b83ea448bc79..89c6d58e5dd6 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h @@ -28,6 +28,12 @@ TRACE_EVENT(bpf_testmod_test_read, __entry->pid, __entry->comm, __entry->off, __entry->len) ); +/* A bare tracepoint with no event associated with it */ +DECLARE_TRACE(bpf_testmod_test_write_bare, + TP_PROTO(struct task_struct *task, struct bpf_testmod_test_write_ctx *ctx), + TP_ARGS(task, ctx) +); + #endif /* _BPF_TESTMOD_EVENTS_H */ #undef TRACE_INCLUDE_PATH diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 2df19d73ca49..e900adad2276 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -28,9 +28,28 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, EXPORT_SYMBOL(bpf_testmod_test_read); ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); +noinline ssize_t +bpf_testmod_test_write(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + struct bpf_testmod_test_write_ctx ctx = { + .buf = buf, + .off = off, + .len = len, + }; + + 
trace_bpf_testmod_test_write_bare(current, &ctx); + + return -EIO; /* always fail */ +} +EXPORT_SYMBOL(bpf_testmod_test_write); +ALLOW_ERROR_INJECTION(bpf_testmod_test_write, ERRNO); + static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { Do we need to remove __ro_after_init? I don't think so. The structure should still remain RO AFAIU. okay. - .attr = { .name = "bpf_testmod", .mode = 0444, }, + .attr = { .name = "bpf_testmod", .mode = 0666, }, .read = bpf_testmod_test_read, + .write = bpf_testmod_test_write, }; static int bpf_testmod_init(void) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index b81adfedb4f6..b3892dc40111 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -11,4 +11,10 @@ struct bpf_testmod_test_read_ctx { size_t len; }; +struct bpf_testmod_test_write_ctx { + char *buf; + loff_t off; + size_t len; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 50796b651f72..e4605c0b5af1 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -21,9 +21,34 @@ static int trigger_module_test_read(int read_sz) return 0; } +static int trigger_module_test_write(int write_sz) +{ + int fd, err; Init err = 0? I don't see what difference this makes. + char *buf = malloc(write_sz); + + if (!buf) + return -ENOMEM; Looks like we already non-negative value, so return ENOMEM? We already set err=-errno. So shouldn't we return negative too? Oh, yes, return -ENOMEM sounds right here. 
+ + memset(buf, 'a', write_sz); + buf[write_sz-1] = '\0'; + + fd = open("/sys/kernel/bpf_testmod", O_WRONLY); + err = -errno; + if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err)) + goto out; Change the above to fd = open("/sys/kernel/bpf_testmod", O_WRONLY); if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", errno)) { Here it should be ... "failed: %d\n", -errno. err = -errno; goto out; } I kept the code consistent with the definition of trigger_module_test_read(). The original patch code: +static int trigger_module_test_write(int write_sz) +{ + int fd, err; + char *buf = malloc(write_sz); + + if (!buf) + return -EN
Re: [PATCH bpf-next v2 2/2] docs: bpf: Clarify -mcpu=v3 requirement for atomic ops
On 1/18/21 7:57 AM, Brendan Jackman wrote: Alexei pointed out [1] that this wording is pretty confusing. Here's an attempt to be more explicit and clear. [1] https://lore.kernel.org/bpf/CAADnVQJVvwoZsE1K+6qRxzF7+6CvZNzygnoBW9tZNWJELk5c=q...@mail.gmail.com/T/#m07264fc18fdc43af02fc1320968afefcc73d96f4 Signed-off-by: Brendan Jackman Thanks for better description! Acked-by: Yonghong Song
Re: [PATCH v2 bpf-next 2/2] selftests: bpf: Add a new test for bare tracepoints
On 1/16/21 10:21 AM, Qais Yousef wrote: Reuse module_attach infrastructure to add a new bare tracepoint to check we can attach to it as a raw tracepoint. Signed-off-by: Qais Yousef --- .../bpf/bpf_testmod/bpf_testmod-events.h | 6 + .../selftests/bpf/bpf_testmod/bpf_testmod.c | 21 ++- .../selftests/bpf/bpf_testmod/bpf_testmod.h | 6 + .../selftests/bpf/prog_tests/module_attach.c | 27 +++ .../selftests/bpf/progs/test_module_attach.c | 10 +++ 5 files changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h index b83ea448bc79..89c6d58e5dd6 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod-events.h @@ -28,6 +28,12 @@ TRACE_EVENT(bpf_testmod_test_read, __entry->pid, __entry->comm, __entry->off, __entry->len) ); +/* A bare tracepoint with no event associated with it */ +DECLARE_TRACE(bpf_testmod_test_write_bare, + TP_PROTO(struct task_struct *task, struct bpf_testmod_test_write_ctx *ctx), + TP_ARGS(task, ctx) +); + #endif /* _BPF_TESTMOD_EVENTS_H */ #undef TRACE_INCLUDE_PATH diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 2df19d73ca49..e900adad2276 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -28,9 +28,28 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, EXPORT_SYMBOL(bpf_testmod_test_read); ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); +noinline ssize_t +bpf_testmod_test_write(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + struct bpf_testmod_test_write_ctx ctx = { + .buf = buf, + .off = off, + .len = len, + }; + + trace_bpf_testmod_test_write_bare(current, ); + + return -EIO; /* always fail */ +} 
+EXPORT_SYMBOL(bpf_testmod_test_write); +ALLOW_ERROR_INJECTION(bpf_testmod_test_write, ERRNO); + static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { Do we need to remove __ro_after_init? - .attr = { .name = "bpf_testmod", .mode = 0444, }, + .attr = { .name = "bpf_testmod", .mode = 0666, }, .read = bpf_testmod_test_read, + .write = bpf_testmod_test_write, }; static int bpf_testmod_init(void) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h index b81adfedb4f6..b3892dc40111 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.h @@ -11,4 +11,10 @@ struct bpf_testmod_test_read_ctx { size_t len; }; +struct bpf_testmod_test_write_ctx { + char *buf; + loff_t off; + size_t len; +}; + #endif /* _BPF_TESTMOD_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/module_attach.c b/tools/testing/selftests/bpf/prog_tests/module_attach.c index 50796b651f72..e4605c0b5af1 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/module_attach.c @@ -21,9 +21,34 @@ static int trigger_module_test_read(int read_sz) return 0; } +static int trigger_module_test_write(int write_sz) +{ + int fd, err; Init err = 0? + char *buf = malloc(write_sz); + + if (!buf) + return -ENOMEM; Looks like we already non-negative value, so return ENOMEM? + + memset(buf, 'a', write_sz); + buf[write_sz-1] = '\0'; + + fd = open("/sys/kernel/bpf_testmod", O_WRONLY); + err = -errno; + if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", err)) + goto out; Change the above to fd = open("/sys/kernel/bpf_testmod", O_WRONLY); if (CHECK(fd < 0, "testmod_file_open", "failed: %d\n", errno)) { err = -errno; goto out; } + + write(fd, buf, write_sz); + close(fd); +out: + free(buf); + No need for extra line here. + return 0; return err. 
+} + void test_module_attach(void) { const int READ_SZ = 456; + const int WRITE_SZ = 457; struct test_module_attach* skel; struct test_module_attach__bss *bss; int err; @@ -48,8 +73,10 @@ void test_module_attach(void) /* trigger tracepoint */ ASSERT_OK(trigger_module_test_read(READ_SZ), "trigger_read"); + ASSERT_OK(trigger_module_test_write(WRITE_SZ), "trigger_write"); ASSERT_EQ(bss->raw_tp_read_sz, READ_SZ, "raw_tp"); + ASSERT_EQ(bss->raw_tp_bare_write_sz, WRITE_SZ, "raw_tp_bare");
Re: [PATCH v2 bpf-next 1/2] trace: bpf: Allow bpf to attach to bare tracepoints
On 1/16/21 10:21 AM, Qais Yousef wrote: Some subsystems only have bare tracepoints (a tracepoint with no associated trace event) to avoid the problem of trace events being an ABI that can't be changed. From bpf perspective, bare tracepoints are what it calls RAW_TRACEPOINT(). Since bpf assumed there's 1:1 mapping, it relied on hooking to DEFINE_EVENT() macro to create bpf mapping of the tracepoints. Since bare tracepoints use DECLARE_TRACE() to create the tracepoint, bpf had no knowledge about their existence. By teaching bpf_probe.h to parse DECLARE_TRACE() in a similar fashion to DEFINE_EVENT(), bpf can find and attach to the new raw tracepoints. Enabling that comes with the contract that changes to raw tracepoints don't constitute a regression if they break existing bpf programs. We need the ability to continue to morph and modify these raw tracepoints without worrying about any ABI. Update Documentation/bpf/bpf_design_QA.rst to document this contract. Signed-off-by: Qais Yousef Acked-by: Yonghong Song
Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
On 1/15/21 5:12 PM, Song Liu wrote: On Jan 15, 2021, at 4:55 PM, Yonghong Song wrote: On 1/15/21 3:34 PM, Song Liu wrote: On Jan 12, 2021, at 8:53 AM, KP Singh wrote: On Tue, Jan 12, 2021 at 5:32 PM Yonghong Song wrote: On 1/11/21 3:45 PM, Song Liu wrote: On Jan 11, 2021, at 1:58 PM, Martin Lau wrote: On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote: On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau wrote: On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote: [ ... ] diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dd5aedee99e73..9bd47ad2b26f1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c [...] +#include #include #include @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); + bpf_task_storage_free(tsk); exit_creds(tsk); If exit_creds() is traced by a bpf and this bpf is doing bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE), new task storage will be created after bpf_task_storage_free(). I recalled there was an earlier discussion with KP and KP mentioned BPF_LSM will not be called with a task that is going away. It seems enabling bpf task storage in bpf tracing will break this assumption and needs to be addressed? For tracing programs, I think we will need an allow list where task local storage can be used. Instead of whitelist, can refcount_inc_not_zero(>usage) be used? I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like: diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c index f654b56907b69..93d01b0a010e6 100644 --- i/kernel/bpf/bpf_task_storage.c +++ w/kernel/bpf/bpf_task_storage.c @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, * by an RCU read-side critical section. 
*/ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + if (!refcount_inc_not_zero(>usage)) + return -EBUSY; + sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); But where shall we add the refcount_dec()? IIUC, we cannot add it to __put_task_struct(). Maybe put_task_struct()? Yeah, something like, or if you find a more elegant alternative :) --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -107,13 +107,20 @@ extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { - if (refcount_dec_and_test(>usage)) + + if (rcu_access_pointer(t->bpf_storage)) { + if (refcount_sub_and_test(2, >usage)) + __put_task_struct(t); + } else if (refcount_dec_and_test(>usage)) __put_task_struct(t); } static inline void put_task_struct_many(struct task_struct *t, int nr) { - if (refcount_sub_and_test(nr, >usage)) + if (rcu_access_pointer(t->bpf_storage)) { + if (refcount_sub_and_test(nr + 1, >usage)) + __put_task_struct(t); + } else if (refcount_sub_and_test(nr, >usage)) __put_task_struct(t); } It is not ideal to leak bpf_storage here. How about we only add the following: diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c index f654b56907b69..2811b9fc47233 100644 --- i/kernel/bpf/bpf_task_storage.c +++ w/kernel/bpf/bpf_task_storage.c @@ -216,6 +216,10 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, * by an RCU read-side critical section. */ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + /* the task_struct is being freed, fail over*/ + if (!refcount_read(>usage)) + return -EBUSY; This may not work? Even we check here and task->usage is not 0, it could still become 0 immediately after the above refcount_read, right? We call bpf_task_storage_get() with "task" that has valid BTF, so "task" should not go away during the BPF program? Whatever mechanism that Oh, right. this is true. 
Otherwise, we cannot use task ptr in the helper. triggers the BPF program should either hold a reference to task (usage > 0) or be the only one owning it (usage == 0, in __put_task_struct). Did I miss anything? Sorry. I think you are right. Not sure lsm requirement. There are two more possible ways to check task is exiting which happens before __put_task_struct(): . check task->exit_state . check task->flags & PF_EXITING (used in bpf_trace.c) Not sure which condition is the correct one to check. Thanks, Song + sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map
Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
On 1/15/21 3:34 PM, Song Liu wrote: On Jan 12, 2021, at 8:53 AM, KP Singh wrote: On Tue, Jan 12, 2021 at 5:32 PM Yonghong Song wrote: On 1/11/21 3:45 PM, Song Liu wrote: On Jan 11, 2021, at 1:58 PM, Martin Lau wrote: On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote: On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau wrote: On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote: [ ... ] diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dd5aedee99e73..9bd47ad2b26f1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c [...] +#include #include #include @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); + bpf_task_storage_free(tsk); exit_creds(tsk); If exit_creds() is traced by a bpf and this bpf is doing bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE), new task storage will be created after bpf_task_storage_free(). I recalled there was an earlier discussion with KP and KP mentioned BPF_LSM will not be called with a task that is going away. It seems enabling bpf task storage in bpf tracing will break this assumption and needs to be addressed? For tracing programs, I think we will need an allow list where task local storage can be used. Instead of whitelist, can refcount_inc_not_zero(>usage) be used? I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like: diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c index f654b56907b69..93d01b0a010e6 100644 --- i/kernel/bpf/bpf_task_storage.c +++ w/kernel/bpf/bpf_task_storage.c @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, * by an RCU read-side critical section. 
*/ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + if (!refcount_inc_not_zero(>usage)) + return -EBUSY; + sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); But where shall we add the refcount_dec()? IIUC, we cannot add it to __put_task_struct(). Maybe put_task_struct()? Yeah, something like, or if you find a more elegant alternative :) --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -107,13 +107,20 @@ extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { - if (refcount_dec_and_test(>usage)) + + if (rcu_access_pointer(t->bpf_storage)) { + if (refcount_sub_and_test(2, >usage)) + __put_task_struct(t); + } else if (refcount_dec_and_test(>usage)) __put_task_struct(t); } static inline void put_task_struct_many(struct task_struct *t, int nr) { - if (refcount_sub_and_test(nr, >usage)) + if (rcu_access_pointer(t->bpf_storage)) { + if (refcount_sub_and_test(nr + 1, >usage)) + __put_task_struct(t); + } else if (refcount_sub_and_test(nr, >usage)) __put_task_struct(t); } It is not ideal to leak bpf_storage here. How about we only add the following: diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c index f654b56907b69..2811b9fc47233 100644 --- i/kernel/bpf/bpf_task_storage.c +++ w/kernel/bpf/bpf_task_storage.c @@ -216,6 +216,10 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, * by an RCU read-side critical section. */ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + /* the task_struct is being freed, fail over*/ + if (!refcount_read(>usage)) + return -EBUSY; This may not work? Even we check here and task->usage is not 0, it could still become 0 immediately after the above refcount_read, right? + sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); I may be missing something but shouldn't bpf_storage be an __rcu member like we have for sk_bpf_storage? 
Good catch! I will fix this in v2. Thanks, Song
Re: [PATCH v5 0/3] Kbuild: DWARF v5 support
On 1/15/21 3:34 PM, Nick Desaulniers wrote: On Fri, Jan 15, 2021 at 3:24 PM Yonghong Song wrote: On 1/15/21 1:53 PM, Sedat Dilek wrote: En plus, I encountered breakage with GCC v10.2.1 and LLVM=1 and CONFIG_DEBUG_INFO_DWARF4. So might be good to add a "depends on !DEBUG_INFO_BTF" in this combination. Can you privately send me your configs that repro? Maybe I can isolate it to a set of configs? I suggested not to add !DEBUG_INFO_BTF to CONFIG_DEBUG_INFO_DWARF4. It is not there before and adding this may suddenly break some users. If certain combination of gcc/llvm does not work for CONFIG_DEBUG_INFO_DWARF4 with pahole, this is a bug bpf community should fix. Is there a place I should report bugs? You can send bug report to Arnaldo Carvalho de Melo , dwar...@vger.kernel.org and b...@vger.kernel.org. I had some other small nits commented in the single patches. As requested in your previous patch-series, feel free to add my: Tested-by: Sedat Dilek Yeah, I'll keep it if v6 is just commit message changes.
Re: [PATCH v5 0/3] Kbuild: DWARF v5 support
On 1/15/21 1:53 PM, Sedat Dilek wrote: On Fri, Jan 15, 2021 at 10:06 PM Nick Desaulniers wrote: DWARF v5 is the latest standard of the DWARF debug info format. DWARF5 wins significantly in terms of size when mixed with compression (CONFIG_DEBUG_INFO_COMPRESSED). Link: http://www.dwarfstd.org/doc/DWARF5.pdf Patch 1 is a cleanup from Masahiro and isn't DWARF v5 specific. Patch 2 is a cleanup that lays the ground work and isn't DWARF v5 specific. Patch 3 implements Kconfig and Kbuild support for DWARFv5. Changes from v4: * drop set -e from script as per Nathan. * add dependency on !CONFIG_DEBUG_INFO_BTF for DWARF v5 as per Sedat. * Move LLVM_IAS=1 complexity from patch 2 to patch 3 as per Arvind and Masahiro. Sorry it took me a few tries to understand the point (I might still not), but it looks much cleaner this way. Sorry Nathan, I did not carry forward your previous reviews as a result, but I would appreciate if you could look again. * Add Nathan's reviewed by tag to patch 1. * Reword commit message for patch 3 to mention LLVM_IAS=1 and -gdwarf-5 binutils addition later, and BTF issue. * I still happen to see a pahole related error spew for the combination of: * LLVM=1 * LLVM_IAS=1 * CONFIG_DEBUG_INFO_DWARF4 * CONFIG_DEBUG_INFO_BTF Though they're non-fatal to the build. I'm not sure yet why removing any one of the above prevents the warning spew. Maybe we'll need a v6. En plus, I encountered breakage with GCC v10.2.1 and LLVM=1 and CONFIG_DEBUG_INFO_DWARF4. So might be good to add a "depends on !DEBUG_INFO_BTF" in this combination. I suggested not to add !DEBUG_INFO_BTF to CONFIG_DEBUG_INFO_DWARF4. It is not there before and adding this may suddenly break some users. If certain combination of gcc/llvm does not work for CONFIG_DEBUG_INFO_DWARF4 with pahole, this is a bug bpf community should fix. I had some other small nits commented in the single patches. 
As requested in your previous patch-series, feel free to add my: Tested-by: Sedat Dilek - Sedat - Changes from v3: Changes as per Arvind: * only add -Wa,-gdwarf-5 for (LLVM=1|CC=clang)+LLVM_IAS=0 builds. * add -gdwarf-5 to Kconfig shell script. * only run Kconfig shell script for Clang. Apologies to Sedat and Nathan; I appreciate previous testing/review, but I did no carry forward your Tested-by and Reviewed-by tags, as the patches have changed too much IMO. Changes from v2: * Drop two of the earlier patches that have been accepted already. * Add measurements with GCC 10.2 to commit message. * Update help text as per Arvind with help from Caroline. * Improve case/wording between DWARF Versions as per Masahiro. Changes from the RFC: * split patch in 3 patch series, include Fangrui's patch, too. * prefer `DWARF vX` format, as per Fangrui. * use spaces between assignment in Makefile as per Masahiro. * simplify setting dwarf-version-y as per Masahiro. * indent `prompt` in Kconfig change as per Masahiro. * remove explicit default in Kconfig as per Masahiro. * add comments to test_dwarf5_support.sh. * change echo in test_dwarf5_support.sh as per Masahiro. * remove -u from test_dwarf5_support.sh as per Masahiro. * add a -gdwarf-5 cc-option check to Kconfig as per Jakub. *** BLURB HERE *** Masahiro Yamada (1): Remove $(cc-option,-gdwarf-4) dependency from CONFIG_DEBUG_INFO_DWARF4 Nick Desaulniers (2): Kbuild: make DWARF version a choice Kbuild: implement support for DWARF v5 Makefile | 13 +++--- include/asm-generic/vmlinux.lds.h | 6 - lib/Kconfig.debug | 42 +-- scripts/test_dwarf5_support.sh| 8 ++ 4 files changed, 57 insertions(+), 12 deletions(-) create mode 100755 scripts/test_dwarf5_support.sh -- 2.30.0.284.gd98b1dd5eaa7-goog
Re: [PATCH bpf-next 2/3] bpf: Add size arg to build_id_parse function
On 1/14/21 2:02 PM, Jiri Olsa wrote: On Thu, Jan 14, 2021 at 01:05:33PM -0800, Yonghong Song wrote: On 1/14/21 12:01 PM, Jiri Olsa wrote: On Thu, Jan 14, 2021 at 10:56:33AM -0800, Yonghong Song wrote: On 1/14/21 5:40 AM, Jiri Olsa wrote: It's possible to have other build id types (other than default SHA1). Currently there's also ld support for MD5 build id. Currently, bpf build_id based stackmap does not returns the size of the build_id. Did you see an issue here? I guess user space can check the length of non-zero bits of the build id to decide what kind of type it is, right? you can have zero bytes in the build id hash, so you need to get the size I never saw MD5 being used in practise just SHA1, but we added the size to be complete and make sure we'll fit with build id, because there's only limited space in mmap2 event I am asking to check whether we should extend uapi struct bpf_stack_build_id to include build_id_size as well. I guess we can delay this until a real use case. right, we can try make some MD5 build id binaries and check if it explodes with some bcc tools, but I don't expect that.. I'll try to find some time for that Thanks. We may have issues on bcc side. For build_id collected in kernel, bcc always generates a length-20 string. But for user binaries, the build_id string length is equal to actual size of the build_id. They may not match (MD5 length is 16). The fix is probably to append '0's (up to length 20) for user binary build_id's. I guess MD5 is very seldom used. I will wait if you can reproduce the issue and then we might fix it. perf tool uses build ids in .debug cache as file links, and we had few isues there jirka
Re: [PATCH bpf-next 2/3] bpf: Add size arg to build_id_parse function
On 1/14/21 12:01 PM, Jiri Olsa wrote: On Thu, Jan 14, 2021 at 10:56:33AM -0800, Yonghong Song wrote: On 1/14/21 5:40 AM, Jiri Olsa wrote: It's possible to have other build id types (other than default SHA1). Currently there's also ld support for MD5 build id. Currently, bpf build_id based stackmap does not returns the size of the build_id. Did you see an issue here? I guess user space can check the length of non-zero bits of the build id to decide what kind of type it is, right? you can have zero bytes in the build id hash, so you need to get the size I never saw MD5 being used in practise just SHA1, but we added the size to be complete and make sure we'll fit with build id, because there's only limited space in mmap2 event I am asking to check whether we should extend uapi struct bpf_stack_build_id to include build_id_size as well. I guess we can delay this until a real use case. jirka Adding size argument to build_id_parse function, that returns (if defined) size of the parsed build id, so we can recognize the build id type. 
Cc: Alexei Starovoitov Cc: Song Liu Signed-off-by: Jiri Olsa --- include/linux/buildid.h | 3 ++- kernel/bpf/stackmap.c | 2 +- lib/buildid.c | 29 + 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 08028a212589..40232f90db6e 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -6,6 +6,7 @@ #define BUILD_ID_SIZE_MAX 20 -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size); #endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 55d254a59f07..cabaf7db8efc 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -189,7 +189,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || build_id_parse(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; diff --git a/lib/buildid.c b/lib/buildid.c index 4a4f520c0e29..6156997c3895 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -12,6 +12,7 @@ */ static inline int parse_build_id(void *page_addr, unsigned char *build_id, +__u32 *size, void *note_start, Elf32_Word note_size) { @@ -38,6 +39,8 @@ static inline int parse_build_id(void *page_addr, nhdr->n_descsz); memset(build_id + nhdr->n_descsz, 0, BUILD_ID_SIZE_MAX - nhdr->n_descsz); + if (size) + *size = nhdr->n_descsz; return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + @@ -50,7 +53,8 @@ static inline int parse_build_id(void *page_addr, } [...]
Re: [PATCH bpf-next 2/3] bpf: Add size arg to build_id_parse function
On 1/14/21 5:40 AM, Jiri Olsa wrote: It's possible to have other build id types (other than default SHA1). Currently there's also ld support for MD5 build id. Currently, bpf build_id based stackmap does not returns the size of the build_id. Did you see an issue here? I guess user space can check the length of non-zero bits of the build id to decide what kind of type it is, right? Adding size argument to build_id_parse function, that returns (if defined) size of the parsed build id, so we can recognize the build id type. Cc: Alexei Starovoitov Cc: Song Liu Signed-off-by: Jiri Olsa --- include/linux/buildid.h | 3 ++- kernel/bpf/stackmap.c | 2 +- lib/buildid.c | 29 + 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 08028a212589..40232f90db6e 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -6,6 +6,7 @@ #define BUILD_ID_SIZE_MAX 20 -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size); #endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 55d254a59f07..cabaf7db8efc 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -189,7 +189,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || build_id_parse(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; diff --git a/lib/buildid.c b/lib/buildid.c index 4a4f520c0e29..6156997c3895 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -12,6 +12,7 @@ */ static inline int parse_build_id(void *page_addr, unsigned char *build_id, +__u32 *size, void *note_start, Elf32_Word note_size) { @@ -38,6 +39,8 @@ static inline int parse_build_id(void *page_addr, 
nhdr->n_descsz); memset(build_id + nhdr->n_descsz, 0, BUILD_ID_SIZE_MAX - nhdr->n_descsz); + if (size) + *size = nhdr->n_descsz; return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + @@ -50,7 +53,8 @@ static inline int parse_build_id(void *page_addr, } [...]
Re: [PATCH 2/2] tools/bpftool: Add -Wall when building BPF programs
On 1/13/21 2:36 PM, Ian Rogers wrote: No additional warnings are generated by enabling this, but having it enabled will help avoid regressions. Signed-off-by: Ian Rogers Acked-by: Yonghong Song
Re: [PATCH 1/2] bpf, libbpf: Avoid unused function warning on bpf_tail_call_static
On 1/13/21 2:36 PM, Ian Rogers wrote: Add inline to __always_inline making it match the linux/compiler.h. Adding this avoids an unused function warning on bpf_tail_call_static when compiling with -Wall. Signed-off-by: Ian Rogers Acked-by: Yonghong Song
Re: [PATCH 0/2] Fix build errors and warnings when make M=samples/bpf
On 1/13/21 2:57 AM, Tiezhu Yang wrote: There exist many build errors and warnings when make M=samples/bpf, both fixes in this patch related to mips, please do mention in the commit message that this is mips related. x86 (and arm64 I assume) compiles just fine. this patch series fixes some of them, I will submit some other patches related to MIPS later. Tiezhu Yang (2): samples/bpf: Set flag __SANE_USERSPACE_TYPES__ for MIPS to fix build warnings compiler.h: Include asm/rwonce.h under ARM64 and ALPHA to fix build errors include/linux/compiler.h| 6 ++ samples/bpf/Makefile| 4 tools/include/linux/types.h | 3 +++ 3 files changed, 13 insertions(+)
Re: [PATCH 2/2] compiler.h: Include asm/rwonce.h under ARM64 and ALPHA to fix build errors
On 1/13/21 2:57 AM, Tiezhu Yang wrote: When make M=samples/bpf on the Loongson 3A3000 platform which belongs to MIPS arch, there exists many similar build errors about 'asm/rwonce.h' file not found, so include it only under CONFIG_ARM64 and CONFIG_ALPHA due to it exists only in arm64 and alpha arch. CLANG-bpf samples/bpf/xdpsock_kern.o In file included from samples/bpf/xdpsock_kern.c:2: In file included from ./include/linux/bpf.h:9: In file included from ./include/linux/workqueue.h:9: In file included from ./include/linux/timer.h:5: In file included from ./include/linux/list.h:9: In file included from ./include/linux/kernel.h:10: ./include/linux/compiler.h:246:10: fatal error: 'asm/rwonce.h' file not found ^~ 1 error generated. $ find . -name rwonce.h ./include/asm-generic/rwonce.h ./arch/arm64/include/asm/rwonce.h ./arch/alpha/include/asm/rwonce.h Signed-off-by: Tiezhu Yang --- include/linux/compiler.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b8fe0c2..bdbe759 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -243,6 +243,12 @@ static inline void *offset_to_ptr(const int *off) */ #define prevent_tail_call_optimization() mb() +#ifdef CONFIG_ARM64 #include +#endif + +#ifdef CONFIG_ALPHA +#include +#endif I do not think this fix is correct. x86 does not define its own rwonce.h and still compiles fine. As noted in the above, we have include/asm-generic/rwonce.h. Once you do a proper build, you will have rwonce.h in arch generated directory like -bash-4.4$ find . -name rwonce.h ./include/asm-generic/rwonce.h ./arch/alpha/include/asm/rwonce.h ./arch/arm64/include/asm/rwonce.h ./arch/x86/include/generated/asm/rwonce.h for mips, it should generated in arch/mips/include/generated/asm/rwonce.h. Please double check why this does not happen. #endif /* __LINUX_COMPILER_H */
Re: [PATCH bpf 1/2] samples/bpf: Set flag __SANE_USERSPACE_TYPES__ for MIPS to fix build warnings
On 1/13/21 2:57 AM, Tiezhu Yang wrote: MIPS needs __SANE_USERSPACE_TYPES__ before to select 'int-ll64.h' in arch/mips/include/uapi/asm/types.h and avoid compile warnings when printing __u64 with %llu, %llx or %lld. could you mention which command produces the following warning? printf("0x%02x : %llu\n", key, value); ~~~^ ~ %lu printf("%s/%llx;", sym->name, addr); ~~~^ %lx printf(";%s %lld\n", key->waker, count); ~~~^ ~ %ld Signed-off-by: Tiezhu Yang --- samples/bpf/Makefile| 4 tools/include/linux/types.h | 3 +++ 2 files changed, 7 insertions(+) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 26fc96c..27de306 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -183,6 +183,10 @@ BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR) TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR) endif +ifeq ($(ARCH), mips) +TPROGS_CFLAGS += -D__SANE_USERSPACE_TYPES__ +endif + This change looks okay based on description in arch/mips/include/uapi/asm/types.h ''' /* * We don't use int-l64.h for the kernel anymore but still use it for * userspace to avoid code changes. * * However, some user programs (e.g. perf) may not want this. They can * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here. */ ''' TPROGS_CFLAGS += -Wall -O2 TPROGS_CFLAGS += -Wmissing-prototypes TPROGS_CFLAGS += -Wstrict-prototypes diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h index 154eb4e..e9c5a21 100644 --- a/tools/include/linux/types.h +++ b/tools/include/linux/types.h @@ -6,7 +6,10 @@ #include #include +#ifndef __SANE_USERSPACE_TYPES__ #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ +#endif What problem this patch fixed? If this header is used, you can just change comment from "PPC64" to "PPC64/MIPS", right? + #include #include
Re: [PATCH bpf-next 1/2] trace: bpf: Allow bpf to attach to bare tracepoints
On 1/13/21 2:16 AM, Qais Yousef wrote: On 01/12/21 12:19, Yonghong Song wrote: I applied the patch to my local bpf-next repo, and got the following compilation error: [...] I dumped preprecessor result but after macro expansion, the code becomes really complex and I have not figured out why it failed. Do you know what is the possible reason? Yeah I did a last minute fix to address a checkpatch.pl error and my verification of the change wasn't good enough obviously. If you're keen to try out I can send you a patch with the fix. I should send v2 by the weekend too. Thanks. I can wait and will check v2 once it is available. Thanks for having a look. Cheers -- Qais Yousef
Re: [PATCH bpf v2 2/2] selftests/bpf: add verifier test for PTR_TO_MEM spill
On 1/12/21 9:38 PM, Gilad Reti wrote: Add a test to check that the verifier is able to recognize spilling of PTR_TO_MEM registers, by reserving a ringbuf buffer, forcing the spill of a pointer holding the buffer address to the stack, filling it back in from the stack and writing to the memory area pointed by it. The patch was partially contributed by CyberArk Software, Inc. Signed-off-by: Gilad Reti I didn't verify result_unpriv = ACCEPT part. I think it is correct by checking code. Acked-by: Yonghong Song
Re: [PATCH bpf-next 1/2] trace: bpf: Allow bpf to attach to bare tracepoints
On 1/11/21 10:20 AM, Qais Yousef wrote: Some subsystems only have bare tracepoints (a tracepoint with no associated trace event) to avoid the problem of trace events being an ABI that can't be changed. From bpf presepective, bare tracepoints are what it calls RAW_TRACEPOINT(). Since bpf assumed there's 1:1 mapping, it relied on hooking to DEFINE_EVENT() macro to create bpf mapping of the tracepoints. Since bare tracepoints use DECLARE_TRACE() to create the tracepoint, bpf had no knowledge about their existence. By teaching bpf_probe.h to parse DECLARE_TRACE() in a similar fashion to DEFINE_EVENT(), bpf can find and attach to the new raw tracepoints. Enabling that comes with the contract that changes to raw tracepoints don't constitute a regression if they break existing bpf programs. We need the ability to continue to morph and modify these raw tracepoints without worrying about any ABI. Update Documentation/bpf/bpf_design_QA.rst to document this contract. Signed-off-by: Qais Yousef --- Documentation/bpf/bpf_design_QA.rst | 6 ++ include/trace/bpf_probe.h | 12 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index 2df7b067ab93..0e15f9b05c9d 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -208,6 +208,12 @@ data structures and compile with kernel internal headers. Both of these kernel internals are subject to change and can break with newer kernels such that the program needs to be adapted accordingly. +Q: Are tracepoints part of the stable ABI? +-- +A: NO. Tracepoints are tied to internal implementation details hence they are +subject to change and can break with newer kernels. BPF programs need to change +accordingly when this happens. + Q: How much stack space a BPF program uses? 
--- A: Currently all program types are limited to 512 bytes of stack diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index cd74bffed5c6..cf1496b162b1 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -55,8 +55,7 @@ /* tracepoints with more than 12 arguments will hit build error */ #define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +#define __BPF_DECLARE_TRACE(call, proto, args) \ static notrace void \ __bpf_trace_##call(void *__data, proto) \ { \ @@ -64,6 +63,10 @@ __bpf_trace_##call(void *__data, proto) \ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ } +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ + __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) + /* * This part is compiled out, it is only here as a build time check * to make sure that if the tracepoint handling changes, the @@ -111,6 +114,11 @@ __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) #define DEFINE_EVENT_PRINT(template, name, proto, args, print)\ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) +#undef DECLARE_TRACE +#define DECLARE_TRACE(call, proto, args) \ + (__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ +__DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0)) I applied the patch to my local bpf-next repo, and got the following compilation error: In file included from /data/users/yhs/work/net-next/include/trace/define_trace.h:104, from /data/users/yhs/work/net-next/include/trace/events/sched.h:740, from /data/users/yhs/work/net-next/kernel/sched/core.c:10: /data/users/yhs/work/net-next/include/trace/bpf_probe.h:59:1: error: expected identifier or ‘(’ before ‘static’ static notrace void \ ^~ /data/users/yhs/work/net-next/include/trace/bpf_probe.h:119:3: note: in expansion of macro 
‘__BPF_DECLARE_TRACE’ (__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ ^~~ /data/users/yhs/work/net-next/include/trace/events/sched.h:693:1: note: in expansion of macro ‘DECLARE_TRACE’ DECLARE_TRACE(pelt_cfs_tp, ^ /data/users/yhs/work/net-next/include/trace/bpf_probe.h:59:1: error: expected identifier or ‘(’ before ‘static’ static notrace void \ ^~ /data/users/yhs/work/net-next/include/trace/bpf_probe.h:119:3: note: in expansion of macro ‘__BPF_DECLARE_TRACE’ (__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \ ^~~
Re: [PATCH 2/2] selftests/bpf: add verifier test for PTR_TO_MEM spill
On 1/12/21 7:43 AM, Daniel Borkmann wrote: On 1/12/21 4:35 PM, Gilad Reti wrote: On Tue, Jan 12, 2021 at 4:56 PM KP Singh wrote: On Tue, Jan 12, 2021 at 10:16 AM Gilad Reti wrote: Add test to check that the verifier is able to recognize spilling of PTR_TO_MEM registers. It would be nice to have some explanation of what the test does to recognize the spilling of the PTR_TO_MEM registers in the commit log as well. Would it be possible to augment an existing test_progs program like tools/testing/selftests/bpf/progs/test_ringbuf.c to test this functionality? How would you guarantee that LLVM generates the spill/fill, via inline asm? You can make the following change to force the return value ("sample" here) of bpf_ringbuf_reserve() to spill on the stack. diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c index 8ba9959b036b..011521170856 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c @@ -40,7 +40,7 @@ SEC("tp/syscalls/sys_enter_getpgid") int test_ringbuf(void *ctx) { int cur_pid = bpf_get_current_pid_tgid() >> 32; - struct sample *sample; + struct sample * volatile sample; int zero = 0; if (cur_pid != pid) This change will cause verifier failure without Patch #1. It may be possible, but from what I understood from Daniel's comment here https://lore.kernel.org/bpf/17629073-4fab-a922-ecc3-25b019960...@iogearbox.net/ the test should be a part of the verifier tests (which is reasonable to me since it is a verifier bugfix) Yeah, the test_verifier case as you have is definitely the most straight forward way to add coverage in this case.
Re: [PATCH bpf-next] bpf: Fix a verifier message for alloc size helper arg
On 1/12/21 4:39 AM, Brendan Jackman wrote: The error message here is misleading, the argument will be rejected unless it is a known constant. Signed-off-by: Brendan Jackman Okay, this is for bpf_ringbuf_reserve() helper where the size must be a constant. Acked-by: Yonghong Song
Re: [PATCH bpf-next] bpf: Clarify return value of probe str helpers
On 1/12/21 4:34 AM, Brendan Jackman wrote: When the buffer is too small to contain the input string, these helpers return the length of the buffer, not the length of the original string. This tries to make the docs totally clear about that, since "the length of the [copied ]string" could also refer to the length of the input. Signed-off-by: Brendan Jackman Acked-by: Yonghong Song
Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
On 1/11/21 3:45 PM, Song Liu wrote: On Jan 11, 2021, at 1:58 PM, Martin Lau wrote: On Mon, Jan 11, 2021 at 10:35:43PM +0100, KP Singh wrote: On Mon, Jan 11, 2021 at 7:57 PM Martin KaFai Lau wrote: On Fri, Jan 08, 2021 at 03:19:47PM -0800, Song Liu wrote: [ ... ] diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dd5aedee99e73..9bd47ad2b26f1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; + unsigned long flags; if (unlikely(!selem_linked_to_storage(selem))) /* selem has already been unlinked from sk */ return; local_storage = rcu_dereference(selem->local_storage); - raw_spin_lock_bh(_storage->lock); + raw_spin_lock_irqsave(_storage->lock, flags); It will be useful to have a few words in commit message on this change for future reference purpose. Please also remove the in_irq() check from bpf_sk_storage.c to avoid confusion in the future. It probably should be in a separate patch. [ ... ] diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 4ef1959a78f27..f654b56907b69 100644 diff --git a/kernel/fork.c b/kernel/fork.c index 7425b3224891d..3d65c8ebfd594 100644 [ ... ] --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); + bpf_task_storage_free(tsk); exit_creds(tsk); If exit_creds() is traced by a bpf and this bpf is doing bpf_task_storage_get(..., BPF_LOCAL_STORAGE_GET_F_CREATE), new task storage will be created after bpf_task_storage_free(). I recalled there was an earlier discussion with KP and KP mentioned BPF_LSM will not be called with a task that is going away. 
It seems enabling bpf task storage in bpf tracing will break this assumption and needs to be addressed? For tracing programs, I think we will need an allow list where task local storage can be used. Instead of whitelist, can refcount_inc_not_zero(>usage) be used? I think we can put refcount_inc_not_zero() in bpf_task_storage_get, like: diff --git i/kernel/bpf/bpf_task_storage.c w/kernel/bpf/bpf_task_storage.c index f654b56907b69..93d01b0a010e6 100644 --- i/kernel/bpf/bpf_task_storage.c +++ w/kernel/bpf/bpf_task_storage.c @@ -216,6 +216,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, * by an RCU read-side critical section. */ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + if (!refcount_inc_not_zero(>usage)) + return -EBUSY; + sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); But where shall we add the refcount_dec()? IIUC, we cannot add it to __put_task_struct(). Maybe put_task_struct()? Thanks, Song
Re: [PATCH bpf-next 4/4] bpf: runqslower: use task local storage
On 1/11/21 11:14 PM, Andrii Nakryiko wrote: On Mon, Jan 11, 2021 at 7:24 PM Yonghong Song wrote: On 1/11/21 2:54 PM, Song Liu wrote: On Jan 11, 2021, at 9:49 AM, Yonghong Song wrote: On 1/8/21 3:19 PM, Song Liu wrote: Replace hashtab with task local storage in runqslower. This improves the performance of these BPF programs. The following table summarizes average runtime of these programs, in nanoseconds: task-local hash-prealloc hash-no-prealloc handle__sched_wakeup 125 340 3124 handle__sched_wakeup_new28121510 2998 handle__sched_switch 151 208991 Note that, task local storage gives better performance than hashtab for handle__sched_wakeup and handle__sched_switch. On the other hand, for handle__sched_wakeup_new, task local storage is slower than hashtab with prealloc. This is because handle__sched_wakeup_new accesses the data for the first time, so it has to allocate the data for task local storage. Once the initial allocation is done, subsequent accesses, as those in handle__sched_wakeup, are much faster with task local storage. If we disable hashtab prealloc, task local storage is much faster for all 3 functions. 
Signed-off-by: Song Liu --- tools/bpf/runqslower/runqslower.bpf.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 1f18a409f0443..c4de4179a0a17 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0; const volatile pid_t targ_pid = 0; struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 10240); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); __type(value, u64); } start SEC(".maps"); @@ -25,15 +25,19 @@ struct { /* record enqueue timestamp */ __always_inline -static int trace_enqueue(u32 tgid, u32 pid) +static int trace_enqueue(struct task_struct *t) { - u64 ts; + u32 pid = t->pid; + u64 ts, *ptr; if (!pid || (targ_pid && targ_pid != pid)) return 0; ts = bpf_ktime_get_ns(); - bpf_map_update_elem(, , , 0); + ptr = bpf_task_storage_get(, t, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = ts; return 0; } @@ -43,7 +47,7 @@ int handle__sched_wakeup(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_wakeup_new") @@ -52,7 +56,7 @@ int handle__sched_wakeup_new(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_switch") @@ -70,12 +74,12 @@ int handle__sched_switch(u64 *ctx) /* ivcsw: treat like an enqueue event and store timestamp */ if (prev->state == TASK_RUNNING) - trace_enqueue(prev->tgid, prev->pid); + trace_enqueue(prev); pid = next->pid; /* fetch timestamp and calculate delta */ - tsp = bpf_map_lookup_elem(, ); + tsp = bpf_task_storage_get(, next, 0, 0); if (!tsp) return 0; /* missed enqueue */ Previously, hash table may overflow 
so we may have missed enqueue. Here with task local storage, is it possible to add additional pid filtering in the beginning of handle__sched_switch such that missed enqueue here can be treated as an error? IIUC, hashtab overflow is not the only reason of missed enqueue. If the wakeup (which calls trace_enqueue) happens before runqslower starts, we may still get missed enqueue in sched_switch, no? the wakeup won't happen before runqslower starts since runqslower needs to start to do attachment first and then trace_enqueue() can run. I think Song is right. Given wakeup and sched_switch need to be matched, depending at which exact time we attach BPF programs, we can end up missing wakeup, but not missing sched_switch, no? So it's not an error. The current approach works fine. What I suggested is to tighten sched_switch only for target_pid. wakeup (doing queuing) will be more relaxed than sched_switch to ensure task local storage creation is always there for target_pid regardless of attachment timing. I think it should work, but we have to experiment to see actual results... For the current implementation trace_enqueue() will happen for any non-0 pid before setting test_progs tgid, and will happen for any non-0 and test_progs tgid if it is set, so this should be okay if we do filtering in handle_
Re: [PATCH bpf-next 4/4] bpf: runqslower: use task local storage
On 1/11/21 2:54 PM, Song Liu wrote: On Jan 11, 2021, at 9:49 AM, Yonghong Song wrote: On 1/8/21 3:19 PM, Song Liu wrote: Replace hashtab with task local storage in runqslower. This improves the performance of these BPF programs. The following table summarizes average runtime of these programs, in nanoseconds: task-local hash-prealloc hash-no-prealloc handle__sched_wakeup 125 340 3124 handle__sched_wakeup_new 2812 1510 2998 handle__sched_switch 151 208 991 Note that, task local storage gives better performance than hashtab for handle__sched_wakeup and handle__sched_switch. On the other hand, for handle__sched_wakeup_new, task local storage is slower than hashtab with prealloc. This is because handle__sched_wakeup_new accesses the data for the first time, so it has to allocate the data for task local storage. Once the initial allocation is done, subsequent accesses, as those in handle__sched_wakeup, are much faster with task local storage. If we disable hashtab prealloc, task local storage is much faster for all 3 functions. 
Signed-off-by: Song Liu --- tools/bpf/runqslower/runqslower.bpf.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 1f18a409f0443..c4de4179a0a17 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0; const volatile pid_t targ_pid = 0; struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 10240); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); __type(value, u64); } start SEC(".maps"); @@ -25,15 +25,19 @@ struct { /* record enqueue timestamp */ __always_inline -static int trace_enqueue(u32 tgid, u32 pid) +static int trace_enqueue(struct task_struct *t) { - u64 ts; + u32 pid = t->pid; + u64 ts, *ptr; if (!pid || (targ_pid && targ_pid != pid)) return 0; ts = bpf_ktime_get_ns(); - bpf_map_update_elem(, , , 0); + ptr = bpf_task_storage_get(, t, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = ts; return 0; } @@ -43,7 +47,7 @@ int handle__sched_wakeup(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_wakeup_new") @@ -52,7 +56,7 @@ int handle__sched_wakeup_new(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_switch") @@ -70,12 +74,12 @@ int handle__sched_switch(u64 *ctx) /* ivcsw: treat like an enqueue event and store timestamp */ if (prev->state == TASK_RUNNING) - trace_enqueue(prev->tgid, prev->pid); + trace_enqueue(prev); pid = next->pid; /* fetch timestamp and calculate delta */ - tsp = bpf_map_lookup_elem(, ); + tsp = bpf_task_storage_get(, next, 0, 0); if (!tsp) return 0; /* missed enqueue */ Previously, hash table may overflow 
so we may have missed enqueue. Here with task local storage, is it possible to add additional pid filtering in the beginning of handle__sched_switch such that missed enqueue here can be treated as an error? IIUC, hashtab overflow is not the only reason of missed enqueue. If the wakeup (which calls trace_enqueue) happens before runqslower starts, we may still get missed enqueue in sched_switch, no? the wakeup won't happen before runqslower starts since runqslower needs to start to do attachment first and then trace_enqueue() can run. For the current implementation trace_enqueue() will happen for any non-0 pid before setting test_progs tgid, and will happen for any non-0 and test_progs tgid if it is set, so this should be okay if we do filtering in handle__sched_switch. Maybe you can do an experiment to prove whether my point is correct or not. Thanks, Song
Re: [PATCH bpf-next 4/4] bpf: runqslower: use task local storage
On 1/8/21 3:19 PM, Song Liu wrote: Replace hashtab with task local storage in runqslower. This improves the performance of these BPF programs. The following table summarizes average runtime of these programs, in nanoseconds: task-local hash-prealloc hash-no-prealloc handle__sched_wakeup 125 340 3124 handle__sched_wakeup_new28121510 2998 handle__sched_switch 151 208991 Note that, task local storage gives better performance than hashtab for handle__sched_wakeup and handle__sched_switch. On the other hand, for handle__sched_wakeup_new, task local storage is slower than hashtab with prealloc. This is because handle__sched_wakeup_new accesses the data for the first time, so it has to allocate the data for task local storage. Once the initial allocation is done, subsequent accesses, as those in handle__sched_wakeup, are much faster with task local storage. If we disable hashtab prealloc, task local storage is much faster for all 3 functions. Signed-off-by: Song Liu --- tools/bpf/runqslower/runqslower.bpf.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 1f18a409f0443..c4de4179a0a17 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -11,9 +11,9 @@ const volatile __u64 min_us = 0; const volatile pid_t targ_pid = 0; struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 10240); - __type(key, u32); + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); __type(value, u64); } start SEC(".maps"); @@ -25,15 +25,19 @@ struct { /* record enqueue timestamp */ __always_inline -static int trace_enqueue(u32 tgid, u32 pid) +static int trace_enqueue(struct task_struct *t) { - u64 ts; + u32 pid = t->pid; + u64 ts, *ptr; if (!pid || (targ_pid && targ_pid != pid)) return 0; ts = bpf_ktime_get_ns(); - bpf_map_update_elem(, , , 0); + ptr = bpf_task_storage_get(, t, 0, + 
BPF_LOCAL_STORAGE_GET_F_CREATE); + if (ptr) + *ptr = ts; return 0; } @@ -43,7 +47,7 @@ int handle__sched_wakeup(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_wakeup_new") @@ -52,7 +56,7 @@ int handle__sched_wakeup_new(u64 *ctx) /* TP_PROTO(struct task_struct *p) */ struct task_struct *p = (void *)ctx[0]; - return trace_enqueue(p->tgid, p->pid); + return trace_enqueue(p); } SEC("tp_btf/sched_switch") @@ -70,12 +74,12 @@ int handle__sched_switch(u64 *ctx) /* ivcsw: treat like an enqueue event and store timestamp */ if (prev->state == TASK_RUNNING) - trace_enqueue(prev->tgid, prev->pid); + trace_enqueue(prev); pid = next->pid; /* fetch timestamp and calculate delta */ - tsp = bpf_map_lookup_elem(, ); + tsp = bpf_task_storage_get(, next, 0, 0); if (!tsp) return 0; /* missed enqueue */ Previously, hash table may overflow so we may have missed enqueue. Here with task local storage, is it possible to add additional pid filtering in the beginning of handle__sched_switch such that missed enqueue here can be treated as an error? @@ -91,7 +95,7 @@ int handle__sched_switch(u64 *ctx) bpf_perf_event_output(ctx, , BPF_F_CURRENT_CPU, , sizeof(event)); - bpf_map_delete_elem(, ); + bpf_task_storage_delete(, next); return 0; }
Re: [PATCH bpf-next 3/4] bpf: runqslower: prefer use local vmlinux
On 1/8/21 3:19 PM, Song Liu wrote: Update the Makefile to prefer using ../../../vmlinux, which has latest definitions for vmlinux.h Signed-off-by: Song Liu --- tools/bpf/runqslower/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 4d5ca54fcd4c8..306f1ce5a97b2 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -19,7 +19,8 @@ CFLAGS := -g -Wall # Try to detect best kernel BTF source KERNEL_REL := $(shell uname -r) -VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL) +VMLINUX_BTF_PATHS := ../../../vmlinux /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(KERNEL_REL) selftests/bpf Makefile has: VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux)\ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)\ ../../../../vmlinux\ /sys/kernel/btf/vmlinux\ /boot/vmlinux-$(shell uname -r) If you intend to add ../../../vmlinux, I think we should also add $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux). VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS))))
Re: [PATCH bpf-next 2/4] selftests/bpf: add non-BPF_LSM test for task local storage
On 1/8/21 3:19 PM, Song Liu wrote: Task local storage is enabled for tracing programs. Add a test for it without CONFIG_BPF_LSM. Signed-off-by: Song Liu --- .../bpf/prog_tests/test_task_local_storage.c | 34 + .../selftests/bpf/progs/task_local_storage.c | 37 +++ 2 files changed, 71 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c create mode 100644 tools/testing/selftests/bpf/progs/task_local_storage.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c new file mode 100644 index 0..7de7a154ebbe6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_storage.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ 2020 -> 2021 + +#include +#include +#include +#include "task_local_storage.skel.h" + +static unsigned int duration; + +void test_test_task_local_storage(void) +{ + struct task_local_storage *skel; + const int count = 10; + int i, err; + + skel = task_local_storage__open_and_load(); + Extra line is unnecessary here. + if (CHECK(!skel, "skel_open_and_load", "skeleton open and load failed\n")) + return; + + err = task_local_storage__attach(skel); + ditto. + if (CHECK(err, "skel_attach", "skeleton attach failed\n")) + goto out; + + for (i = 0; i < count; i++) + usleep(1000); Does a smaller usleep value will work? If it is, recommend to have a smaller value here to reduce test_progs running time. 
+ CHECK(skel->bss->value < count, "task_local_storage_value", + "task local value too small\n"); + +out: + task_local_storage__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c new file mode 100644 index 0..807255c5c162d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ 2020 -> 2021 + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct local_data { + __u64 val; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct local_data); +} task_storage_map SEC(".maps"); + +int value = 0; + +SEC("tp_btf/sched_switch") +int BPF_PROG(on_switch, bool preempt, struct task_struct *prev, +struct task_struct *next) +{ + struct local_data *storage; If it possible that we do some filtering based on test_progs pid so below bpf_task_storage_get is only called for test_progs process? This is more targeted and can avoid counter contributions from other unrelated processes and make test_task_local_storage.c result comparison more meaningful. + + storage = bpf_task_storage_get(_storage_map, + next, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (storage) { + storage->val++; + value = storage->val; + } + return 0; +}
Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
On 1/8/21 3:19 PM, Song Liu wrote: To access per-task data, BPF program typically creates a hash table with pid as the key. This is not ideal because: 1. The use need to estimate requires size of the hash table, with may be inaccurate; 2. Big hash tables are slow; 3. To clean up the data properly during task terminations, the user need to write code. Task local storage overcomes these issues and becomes a better option for these per-task data. Task local storage is only available to BPF_LSM. Now enable it for tracing programs. Reported-by: kernel test robot The whole patch is not reported by kernel test robot. I think we should drop this. Signed-off-by: Song Liu --- include/linux/bpf.h| 7 +++ include/linux/bpf_lsm.h| 22 -- include/linux/bpf_types.h | 2 +- include/linux/sched.h | 5 + kernel/bpf/Makefile| 3 +-- kernel/bpf/bpf_local_storage.c | 28 +--- kernel/bpf/bpf_lsm.c | 4 kernel/bpf/bpf_task_storage.c | 26 ++ kernel/fork.c | 5 + kernel/trace/bpf_trace.c | 4 10 files changed, 46 insertions(+), 60 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07cb5d15e7439..cf16548f28f7b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1480,6 +1480,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1665,6 +1666,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } + +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} #endif /* CONFIG_BPF_SYSCALL */ [...]
Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
On 1/11/21 2:17 AM, KP Singh wrote: On Mon, Jan 11, 2021 at 7:27 AM Yonghong Song wrote: On 1/8/21 3:19 PM, Song Liu wrote: To access per-task data, BPF program typically creates a hash table with pid as the key. This is not ideal because: 1. The use need to estimate requires size of the hash table, with may be inaccurate; 2. Big hash tables are slow; 3. To clean up the data properly during task terminations, the user need to write code. Task local storage overcomes these issues and becomes a better option for these per-task data. Task local storage is only available to BPF_LSM. Now enable it for tracing programs. Reported-by: kernel test robot Signed-off-by: Song Liu --- [...] struct cfs_rq; struct fs_struct; @@ -1348,6 +1349,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void*security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used by BPF task local storage */ + struct bpf_local_storage*bpf_storage; +#endif I remembered there is a discussion where KP initially wanted to put bpf_local_storage in task_struct, but later on changed to use in lsm as his use case mostly for lsm. Did anybody remember the details of the discussion? Just want to be sure what is the concern people has with putting bpf_local_storage in task_struct and whether the use case presented by Song will justify it. If I recall correctly, the discussion was about inode local storage and it was decided to use the security blob since the use-case was only LSM programs. Since we now plan to use it in tracing, detangling the dependency from CONFIG_BPF_LSM sounds logical to me. Sounds good. Thanks for explanation. 
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d1249340fd6ba..ca995fdfa45e7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,9 +8,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o -obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o [...]
Re: [PATCH bpf-next 1/4] bpf: enable task local storage for tracing programs
On 1/8/21 3:19 PM, Song Liu wrote: To access per-task data, BPF program typically creates a hash table with pid as the key. This is not ideal because: 1. The use need to estimate requires size of the hash table, with may be inaccurate; 2. Big hash tables are slow; 3. To clean up the data properly during task terminations, the user need to write code. Task local storage overcomes these issues and becomes a better option for these per-task data. Task local storage is only available to BPF_LSM. Now enable it for tracing programs. Reported-by: kernel test robot Signed-off-by: Song Liu --- include/linux/bpf.h| 7 +++ include/linux/bpf_lsm.h| 22 -- include/linux/bpf_types.h | 2 +- include/linux/sched.h | 5 + kernel/bpf/Makefile| 3 +-- kernel/bpf/bpf_local_storage.c | 28 +--- kernel/bpf/bpf_lsm.c | 4 kernel/bpf/bpf_task_storage.c | 26 ++ kernel/fork.c | 5 + kernel/trace/bpf_trace.c | 4 10 files changed, 46 insertions(+), 60 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07cb5d15e7439..cf16548f28f7b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1480,6 +1480,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1665,6 +1666,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } + +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -1860,6 +1865,8 @@ extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; 
+extern const struct bpf_func_proto bpf_task_storage_delete_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 0d1c33ace3987..479c101546ad1 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -38,21 +38,9 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } -static inline struct bpf_storage_blob *bpf_task( - const struct task_struct *task) -{ - if (unlikely(!task->security)) - return NULL; - - return task->security + bpf_lsm_blob_sizes.lbs_task; -} - extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; -extern const struct bpf_func_proto bpf_task_storage_get_proto; -extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); -void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -73,20 +61,10 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } -static inline struct bpf_storage_blob *bpf_task( - const struct task_struct *task) -{ - return NULL; -} - static inline void bpf_inode_storage_free(struct inode *inode) { } -static inline void bpf_task_storage_free(struct task_struct *task) -{ -} - #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 99f7fd657d87a..b9edee336d804 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,8 +109,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if 
defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/linux/sched.h b/include/linux/sched.h index 51d535b69bd6f..4a173defa2010 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,6 +42,7 @@ struct audit_context; struct backing_dev_info; struct bio_list; struct blk_plug; +struct bpf_local_storage; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1348,6 +1349,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void*security; #endif +#ifdef CONFIG_BPF_SYSCALL + /*
Re: [PATCH] tools/bpf: Remove unnecessary parameter in bpf_object__probe_loading
On 1/7/21 6:08 PM, 彭浩(Richard) wrote: struct bpf_object *obj is not used in bpf_object__probe_loading, so we can remove it. Signed-off-by: Peng Hao Acked-by: Yonghong Song
Re: [PATCH v2] btf: support ints larger than 128 bits
On 12/19/20 8:36 AM, Sean Young wrote: clang supports arbitrary length ints using the _ExtInt extension. This can be useful to hold very large values, e.g. 256 bit or 512 bit types. Larger types (e.g. 1024 bits) are possible but I am unaware of a use case for these. This requires the _ExtInt extension enabled in clang, which is under review. Link: https://clang.llvm.org/docs/LanguageExtensions.html#extended-integer-types Link: https://reviews.llvm.org/D93103 Signed-off-by: Sean Young --- changes since v2: - added tests as suggested by Yonghong Song - added kernel pretty-printer Documentation/bpf/btf.rst | 4 +- include/uapi/linux/btf.h | 2 +- kernel/bpf/btf.c | 54 +- tools/bpf/bpftool/btf_dumper.c| 40 ++ tools/include/uapi/linux/btf.h| 2 +- tools/lib/bpf/btf.c | 2 +- tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/prog_tests/btf.c | 3 +- .../selftests/bpf/progs/test_btf_extint.c | 50 ++ tools/testing/selftests/bpf/test_extint.py| 535 ++ For easier review, maybe you can break this patch into a patch series like below? patch 1 (kernel related changes and doc) kernel/bpf/btf.c, include/uapi/linux/btf.h, tools/include/uapi/linux/btf.h Documentation/bpf/btf.rst patch 2 (libbpf support) tools/lib/bpf/btf.c patch 3 (bpftool support) tools/bpf/bpftool/btf_dumper.c patch 4 (testing) rest files 10 files changed, 679 insertions(+), 16 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_btf_extint.c create mode 100755 tools/testing/selftests/bpf/test_extint.py diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 44dc789de2b4..784f1743dbc7 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -132,7 +132,7 @@ The following sections detail encoding of each kind. 
#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f00) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff) >> 16) - #define BTF_INT_BITS(VAL) ((VAL) & 0x00ff) + #define BTF_INT_BITS(VAL) ((VAL) & 0x03ff) The ``BTF_INT_ENCODING`` has the following attributes:: @@ -147,7 +147,7 @@ pretty print. At most one encoding can be specified for the int type. The ``BTF_INT_BITS()`` specifies the number of actual bits held by this int type. For example, a 4-bit bitfield encodes ``BTF_INT_BITS()`` equals to 4. The ``btf_type.size * 8`` must be equal to or greater than ``BTF_INT_BITS()`` -for the type. The maximum value of ``BTF_INT_BITS()`` is 128. +for the type. The maximum value of ``BTF_INT_BITS()`` is 512. The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values for this int. For example, a bitfield struct member has: diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 5a667107ad2c..1696fd02b302 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -84,7 +84,7 @@ struct btf_type { */ #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f00) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff) >> 16) -#define BTF_INT_BITS(VAL) ((VAL) & 0x00ff) +#define BTF_INT_BITS(VAL) ((VAL) & 0x03ff) /* Attributes stored in the BTF_INT_ENCODING */ #define BTF_INT_SIGNED(1 << 0) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d6bdb4f4d61..44bc17207e9b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -166,7 +166,8 @@ * */ -#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) +#define BITS_PER_U128 128 +#define BITS_PER_U512 512 #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) @@ -1907,9 +1908,9 @@ static int btf_int_check_member(struct btf_verifier_env *env, nr_copy_bits = BTF_INT_BITS(int_data) + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U128) { + if (nr_copy_bits > BITS_PER_U512) { 
btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 128"); + "nr_copy_bits exceeds 512"); return -EINVAL; } @@ -1963,9 +1964,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env, bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U128) { + if (nr_copy_bits > BITS_PER_U512) { btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits excee
Re: [PATCH] bpf: fix: address of local auto-variable assigned to a function parameter.
On 12/23/20 11:01 PM, YANG LI wrote: Assigning local variable txq to the outputting parameter xdp->txq is not safe, txq will be released after the end of the function call. Then the result of using xdp is unpredictable. Fix this error by defining the struct xdp_txq_info in function dev_map_run_prog() as a static type. Signed-off-by: YANG LI Reported-by: Abaci --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68..af6f004 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -454,7 +454,7 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct xdp_txq_info txq = { .dev = dev }; + static struct xdp_txq_info txq = { .dev = dev }; u32 act; xdp_set_data_meta_invalid(xdp); exposing txq outside the routine with 'static' definition is not a good practice. maybe just reset xdp->txq = NULl right before function return? diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68afdd4..50f5c20a33a3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -475,6 +475,7 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, } xdp_return_buff(xdp); + xdp->txq = NULL; return NULL; } -bash-4.4$
Re: [PATCH bpf-next 1/2] bpf: Add a bpf_kallsyms_lookup helper
On 12/17/20 7:20 PM, Alexei Starovoitov wrote: On Thu, Dec 17, 2020 at 09:26:09AM -0800, Yonghong Song wrote: On 12/17/20 7:31 AM, Florent Revest wrote: On Mon, Dec 14, 2020 at 7:47 AM Yonghong Song wrote: On 12/11/20 6:40 AM, Florent Revest wrote: On Wed, Dec 2, 2020 at 10:18 PM Alexei Starovoitov wrote: I still think that adopting printk/vsnprintf for this instead of reinventing the wheel is more flexible and easier to maintain long term. Almost the same layout can be done with vsnprintf with exception of \0 char. More meaningful names, etc. See Documentation/core-api/printk-formats.rst I agree this would be nice. I finally got a bit of time to experiment with this and I noticed a few things: First of all, because helpers only have 5 arguments, if we use two for the output buffer and its size and two for the format string and its size, we are only left with one argument for a modifier. This is still enough for our usecase (where we'd only use "%ps" for example) but it does not strictly-speaking allow for the same layout that Andrii proposed. See helper bpf_seq_printf. It packs all arguments for format string and puts them into an array. bpf_seq_printf will unpack them as it parsed through the format string. So it should be doable to have more than "%ps" in format string. This could be a nice trick, thank you for the suggestion Yonghong :) My understanding is that this would also require two extra args (one for the array of arguments and one for the size of this array) so it would still not fit the 5 arguments limit I described in my previous email. eg: this would not be possible: long bpf_snprintf(const char *out, u32 out_size, const char *fmt, u32 fmt_size, const void *data, u32 data_len) Right. bpf allows only up to 5 parameters. 
Would you then suggest that we also put the format string and its length in the first and second cells of this array and have something along the line of: long bpf_snprintf(const char *out, u32 out_size, const void *args, u32 args_len) ? This seems like a fairly opaque signature to me and harder to verify. One way is to define an explicit type for args, something like struct bpf_fmt_str_data { char *fmt; u64 fmt_len; u64 data[]; }; that feels a bit convoluted. The reason I feel unease with the helper as was originally proposed and with Andrii's proposal is all the extra strlen and strcpy that needs to be done. In the helper we have to call kallsyms_lookup() which is ok interface for what it was desinged to do, but it's awkward to use to construct new string ("%s [%s]", sym, modname) or to send two strings into a ring buffer. Andrii's zero separator idea will simplify bpf prog, but user space would need to do strlen anyway if it needs to pretty print. If we take pain on converting addr to sym+modname let's figure out how to make it easy for the bpf prog to do and easy for user space to consume. That's why I proposed snprintf. As far as 6 arg issue: long bpf_snprintf(const char *out, u32 out_size, const char *fmt, u32 fmt_size, const void *data, u32 data_len); Yeah. It won't work as-is, but fmt_size is unnecessary nowadays. The verifier understands read-only data. Hence the helper can be: long bpf_snprintf(const char *out, u32 out_size, const char *fmt, const void *data, u32 data_len); The 3rd arg cannot be ARG_PTR_TO_MEM. Instead we can introduce ARG_PTR_TO_CONST_STR in the verifier. This should work except if fmt string is on the stack. Maybe this is an okay tradeoff. See check_mem_access() where it's doing bpf_map_direct_read(). That 'fmt' string will be accessed through the same bpf_map_direct_read(). The verifier would need to check that it's NUL-terminated valid string. It should probably do % specifier checks at the same time. 
At the end bpf_snprintf() will have 5 args and when wrapped with BPF_SNPRINTF() macro it will accept arbitrary number of arguments to print. It also will be generally useful to do all other kinds of pretty printing.
Re: [PATCH] btf: support ints larger than 128 bits
On 12/17/20 7:01 AM, Sean Young wrote: clang supports arbitrary length ints using the _ExtInt extension. This can be useful to hold very large values, e.g. 256 bit or 512 bit types. Larger types (e.g. 1024 bits) are possible but I am unaware of a use case for these. This requires the _ExtInt extension to enabled for BPF in clang, which is under review. Link: https://clang.llvm.org/docs/LanguageExtensions.html#extended-integer-types Link: https://reviews.llvm.org/D93103 Signed-off-by: Sean Young --- Documentation/bpf/btf.rst | 4 ++-- include/uapi/linux/btf.h | 2 +- tools/bpf/bpftool/btf_dumper.c | 39 ++ tools/include/uapi/linux/btf.h | 2 +- 4 files changed, 43 insertions(+), 4 deletions(-) Thanks for the patch. But the change is not enough and no tests in the patch set. For example, in kernel/bpf/btf.c, we BITS_PER_U128 to guard in various places where the number of integer bits must be <= 128 bits which is what we supported now. In function btf_type_int_is_regular(), # of int bits larger than 128 considered false. The extint like 256/512bits should be also regular int. extint permits non-power-of-2 bits (e.g., 192bits), to support them may not be necessary and this is not your use case. what do you think? lib/bpf/btf.c btf__and_int() function also has the following check, /* byte_sz must be power of 2 */ if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16) return -EINVAL; So Extint 256 bits will fail here. Please do add some selftests tools/testing/selftests/bpf directories: - to ensure btf with newly supported int types loaded successfully in kernel - to ensure bpftool map [pretty] print working fine with new types - to ensure kernel map pretty print works fine (tests at tools/testing/selftests/bpf/prog_tests/btf.c) - to ensure btf manipulation APIs works with new types. 
diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 44dc789de2b4..784f1743dbc7 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -132,7 +132,7 @@ The following sections detail encoding of each kind. #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f00) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff) >> 16) - #define BTF_INT_BITS(VAL) ((VAL) & 0x00ff) + #define BTF_INT_BITS(VAL) ((VAL) & 0x03ff) The ``BTF_INT_ENCODING`` has the following attributes:: @@ -147,7 +147,7 @@ pretty print. At most one encoding can be specified for the int type. The ``BTF_INT_BITS()`` specifies the number of actual bits held by this int type. For example, a 4-bit bitfield encodes ``BTF_INT_BITS()`` equals to 4. The ``btf_type.size * 8`` must be equal to or greater than ``BTF_INT_BITS()`` -for the type. The maximum value of ``BTF_INT_BITS()`` is 128. +for the type. The maximum value of ``BTF_INT_BITS()`` is 512. The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values for this int. 
For example, a bitfield struct member has: diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 5a667107ad2c..1696fd02b302 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -84,7 +84,7 @@ struct btf_type { */ #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f00) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff) >> 16) -#define BTF_INT_BITS(VAL) ((VAL) & 0x00ff) +#define BTF_INT_BITS(VAL) ((VAL) & 0x03ff) /* Attributes stored in the BTF_INT_ENCODING */ #define BTF_INT_SIGNED(1 << 0) diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 0e9310727281..45ed45ea9962 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -271,6 +271,40 @@ static void btf_int128_print(json_writer_t *jw, const void *data, } } +static void btf_bigint_print(json_writer_t *jw, const void *data, int nr_bits, +bool is_plain_text) +{ + char buf[nr_bits / 4 + 1]; + bool first = true; + int i; + +#ifdef __BIG_ENDIAN_BITFIELD + for (i = 0; i < nr_bits / 64; i++) { +#else + for (i = nr_bits / 64 - 1; i >= 0; i++) { +#endif + __u64 v = ((__u64 *)data)[i]; + + if (first) { + if (!v) + continue; + + snprintf(buf, sizeof(buf), "%llx", v); + + first = false; + } else { + size_t off = strlen(buf); + + snprintf(buf + off, sizeof(buf) - off, "%016llx", v); + } + } + + if (is_plain_text) + jsonw_printf(jw, "0x%s", buf); + else + jsonw_printf(jw, "\"0x%s\"", buf); +} + static void btf_int128_shift(__u64 *print_num, __u16 left_shift_bits, __u16 right_shift_bits) { @@
Re: [PATCH bpf-next 1/2] bpf: Add a bpf_kallsyms_lookup helper
On 12/17/20 7:31 AM, Florent Revest wrote: On Mon, Dec 14, 2020 at 7:47 AM Yonghong Song wrote: On 12/11/20 6:40 AM, Florent Revest wrote: On Wed, Dec 2, 2020 at 10:18 PM Alexei Starovoitov wrote: I still think that adopting printk/vsnprintf for this instead of reinventing the wheel is more flexible and easier to maintain long term. Almost the same layout can be done with vsnprintf with exception of \0 char. More meaningful names, etc. See Documentation/core-api/printk-formats.rst I agree this would be nice. I finally got a bit of time to experiment with this and I noticed a few things: First of all, because helpers only have 5 arguments, if we use two for the output buffer and its size and two for the format string and its size, we are only left with one argument for a modifier. This is still enough for our usecase (where we'd only use "%ps" for example) but it does not strictly-speaking allow for the same layout that Andrii proposed. See helper bpf_seq_printf. It packs all arguments for format string and puts them into an array. bpf_seq_printf will unpack them as it parsed through the format string. So it should be doable to have more than "%ps" in format string. This could be a nice trick, thank you for the suggestion Yonghong :) My understanding is that this would also require two extra args (one for the array of arguments and one for the size of this array) so it would still not fit the 5 arguments limit I described in my previous email. eg: this would not be possible: long bpf_snprintf(const char *out, u32 out_size, const char *fmt, u32 fmt_size, const void *data, u32 data_len) Right. bpf allows only up to 5 parameters. Would you then suggest that we also put the format string and its length in the first and second cells of this array and have something along the line of: long bpf_snprintf(const char *out, u32 out_size, const void *args, u32 args_len) ? This seems like a fairly opaque signature to me and harder to verify. 
One way is to define an explicit type for args, something like struct bpf_fmt_str_data { char *fmt; u64 fmt_len; u64 data[]; }; The bpf_snprintf signature can be long bpf_snprintf(const char *out, u32 out_size, const struct bpf_fmt_str_data *fmt_data, u32 fmt_data_len); Internally you can have one argument type for "struct bpf_fmt_str_data" like PTR_TO_FMT_DATA as a verifier reg state. If bpf_snprintf is used, when you try to verify PTR_TO_FMT_DATA, you can just verify fmt_data->fmt and fmt_data->fmt_len, which satisfies mem constraints. The rest of the data can be passed to the helper as is. Yes, still some verifier work. But it may be useful for this and future format string related helpers.
Re: [PATCH bpf-next v4 10/11] bpf: Add tests for new BPF atomic operations
On 12/16/20 3:51 AM, Brendan Jackman wrote: On Wed, 16 Dec 2020 at 08:19, Yonghong Song wrote: On 12/15/20 3:12 AM, Brendan Jackman wrote: On Tue, Dec 08, 2020 at 10:15:35AM -0800, Yonghong Song wrote: On 12/8/20 8:59 AM, Brendan Jackman wrote: On Tue, Dec 08, 2020 at 08:38:04AM -0800, Yonghong Song wrote: On 12/8/20 4:41 AM, Brendan Jackman wrote: On Mon, Dec 07, 2020 at 07:18:57PM -0800, Yonghong Song wrote: On 12/7/20 8:07 AM, Brendan Jackman wrote: The prog_test that's added depends on Clang/LLVM features added by Yonghong in commit 286daafd6512 (was https://reviews.llvm.org/D72184 ). Note the use of a define called ENABLE_ATOMICS_TESTS: this is used to: - Avoid breaking the build for people on old versions of Clang - Avoid needing separate lists of test objects for no_alu32, where atomics are not supported even if Clang has the feature. The atomics_test.o BPF object is built unconditionally both for test_progs and test_progs-no_alu32. For test_progs, if Clang supports atomics, ENABLE_ATOMICS_TESTS is defined, so it includes the proper test code. Otherwise, progs and global vars are defined anyway, as stubs; this means that the skeleton user code still builds. The atomics_test.o userspace object is built once and used for both test_progs and test_progs-no_alu32. A variable called skip_tests is defined in the BPF object's data section, which tells the userspace object whether to skip the atomics test. Signed-off-by: Brendan Jackman Ack with minor comments below. 
Acked-by: Yonghong Song --- tools/testing/selftests/bpf/Makefile | 10 + .../selftests/bpf/prog_tests/atomics.c| 246 ++ tools/testing/selftests/bpf/progs/atomics.c | 154 +++ .../selftests/bpf/verifier/atomic_and.c | 77 ++ .../selftests/bpf/verifier/atomic_cmpxchg.c | 96 +++ .../selftests/bpf/verifier/atomic_fetch_add.c | 106 .../selftests/bpf/verifier/atomic_or.c| 77 ++ .../selftests/bpf/verifier/atomic_xchg.c | 46 .../selftests/bpf/verifier/atomic_xor.c | 77 ++ 9 files changed, 889 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomics.c create mode 100644 tools/testing/selftests/bpf/progs/atomics.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_and.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_fetch_add.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_or.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xor.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ac25ba5d0d6c..13bc1d736164 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -239,6 +239,12 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ -I$(abspath $(OUTPUT)/../usr/include) +# BPF atomics support was added to Clang in llvm-project commit 286daafd6512 +# (release 12.0.0). +BPF_ATOMICS_SUPPORTED = $(shell \ + echo "int x = 0; int foo(void) { return __sync_val_compare_and_swap(, 1, 2); }" \ + | $(CLANG) -x cpp-output -S -target bpf -mcpu=v3 - -o /dev/null && echo 1 || echo 0) '-x c' here more intuitive? 
+ CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types @@ -399,11 +405,15 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko\ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +ifeq ($(BPF_ATOMICS_SUPPORTED),1) + TRUNNER_BPF_CFLAGS += -DENABLE_ATOMICS_TESTS +endif TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c new file mode 100644 index ..c841a3abc2f7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "atomics.skel.h" + +static void test_add(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.add); + if (
Re: [PATCH bpf-next v5 11/11] bpf: Document new atomic instructions
On 12/16/20 3:44 AM, Brendan Jackman wrote: On Wed, 16 Dec 2020 at 08:08, Yonghong Song wrote: On 12/15/20 4:18 AM, Brendan Jackman wrote: Document new atomic instructions. Signed-off-by: Brendan Jackman Ack with minor comments below. Acked-by: Yonghong Song --- Documentation/networking/filter.rst | 26 ++ 1 file changed, 26 insertions(+) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 1583d59d806d..26d508a5e038 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1053,6 +1053,32 @@ encoding. .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg +The basic atomic operations supported (from architecture v4 onwards) are: Remove "(from architecture v4 onwards)". Oops, thanks. + +BPF_ADD +BPF_AND +BPF_OR +BPF_XOR + +Each having equivalent semantics with the ``BPF_ADD`` example, that is: the +memory location addresed by ``dst_reg + off`` is atomically modified, with +``src_reg`` as the other operand. If the ``BPF_FETCH`` flag is set in the +immediate, then these operations also overwrite ``src_reg`` with the +value that was in memory before it was modified. + +The more special operations are: + +BPF_XCHG + +This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg + +off``. + +BPF_CMPXCHG + +This atomically compares the value addressed by ``dst_reg + off`` with +``R0``. If they match it is replaced with ``src_reg``, The value that was there +before is loaded back to ``R0``. + Note that 1 and 2 byte atomic operations are not supported. Adding something like below. Except xadd for legacy reason, all other 4 byte atomic operations require alu32 mode. The alu32 mode can be enabled with clang flags "-Xclang -target-feature -Xclang +alu32" or "-mcpu=v3". The cpu version 3 has alu32 mode on by default. 
Thanks, I've written it as: Except ``BPF_ADD`` _without_ ``BPF_FETCH`` (for legacy reasons), all 4 byte atomic operations require alu32 mode. Clang enables this mode by default in architecture v3 (``-mcpu=v3``). For older versions it can be enabled with ``-Xclang -target-feature -Xclang +alu32``. Sounds good. thanks! You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to
Re: [PATCH bpf-next v4 10/11] bpf: Add tests for new BPF atomic operations
On 12/15/20 3:12 AM, Brendan Jackman wrote: On Tue, Dec 08, 2020 at 10:15:35AM -0800, Yonghong Song wrote: On 12/8/20 8:59 AM, Brendan Jackman wrote: On Tue, Dec 08, 2020 at 08:38:04AM -0800, Yonghong Song wrote: On 12/8/20 4:41 AM, Brendan Jackman wrote: On Mon, Dec 07, 2020 at 07:18:57PM -0800, Yonghong Song wrote: On 12/7/20 8:07 AM, Brendan Jackman wrote: The prog_test that's added depends on Clang/LLVM features added by Yonghong in commit 286daafd6512 (was https://reviews.llvm.org/D72184). Note the use of a define called ENABLE_ATOMICS_TESTS: this is used to: - Avoid breaking the build for people on old versions of Clang - Avoid needing separate lists of test objects for no_alu32, where atomics are not supported even if Clang has the feature. The atomics_test.o BPF object is built unconditionally both for test_progs and test_progs-no_alu32. For test_progs, if Clang supports atomics, ENABLE_ATOMICS_TESTS is defined, so it includes the proper test code. Otherwise, progs and global vars are defined anyway, as stubs; this means that the skeleton user code still builds. The atomics_test.o userspace object is built once and used for both test_progs and test_progs-no_alu32. A variable called skip_tests is defined in the BPF object's data section, which tells the userspace object whether to skip the atomics test. Signed-off-by: Brendan Jackman Ack with minor comments below. 
Acked-by: Yonghong Song --- tools/testing/selftests/bpf/Makefile | 10 + .../selftests/bpf/prog_tests/atomics.c| 246 ++ tools/testing/selftests/bpf/progs/atomics.c | 154 +++ .../selftests/bpf/verifier/atomic_and.c | 77 ++ .../selftests/bpf/verifier/atomic_cmpxchg.c | 96 +++ .../selftests/bpf/verifier/atomic_fetch_add.c | 106 .../selftests/bpf/verifier/atomic_or.c| 77 ++ .../selftests/bpf/verifier/atomic_xchg.c | 46 .../selftests/bpf/verifier/atomic_xor.c | 77 ++ 9 files changed, 889 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomics.c create mode 100644 tools/testing/selftests/bpf/progs/atomics.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_and.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_fetch_add.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_or.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xor.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ac25ba5d0d6c..13bc1d736164 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -239,6 +239,12 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ -I$(abspath $(OUTPUT)/../usr/include) +# BPF atomics support was added to Clang in llvm-project commit 286daafd6512 +# (release 12.0.0). +BPF_ATOMICS_SUPPORTED = $(shell \ + echo "int x = 0; int foo(void) { return __sync_val_compare_and_swap(, 1, 2); }" \ + | $(CLANG) -x cpp-output -S -target bpf -mcpu=v3 - -o /dev/null && echo 1 || echo 0) '-x c' here more intuitive? 
+ CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types @@ -399,11 +405,15 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko\ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +ifeq ($(BPF_ATOMICS_SUPPORTED),1) + TRUNNER_BPF_CFLAGS += -DENABLE_ATOMICS_TESTS +endif TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c new file mode 100644 index ..c841a3abc2f7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "atomics.skel.h" + +static void test_add(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.add); + if (CHECK(IS_ERR(link), "attach(add)", "err: %ld\n", PTR_ERR(link))) + return; + + prog
Re: [PATCH bpf-next v5 11/11] bpf: Document new atomic instructions
On 12/15/20 4:18 AM, Brendan Jackman wrote: Document new atomic instructions. Signed-off-by: Brendan Jackman Ack with minor comments below. Acked-by: Yonghong Song --- Documentation/networking/filter.rst | 26 ++ 1 file changed, 26 insertions(+) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 1583d59d806d..26d508a5e038 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1053,6 +1053,32 @@ encoding. .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg +The basic atomic operations supported (from architecture v4 onwards) are: Remove "(from architecture v4 onwards)". + +BPF_ADD +BPF_AND +BPF_OR +BPF_XOR + +Each having equivalent semantics with the ``BPF_ADD`` example, that is: the +memory location addresed by ``dst_reg + off`` is atomically modified, with +``src_reg`` as the other operand. If the ``BPF_FETCH`` flag is set in the +immediate, then these operations also overwrite ``src_reg`` with the +value that was in memory before it was modified. + +The more special operations are: + +BPF_XCHG + +This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg + +off``. + +BPF_CMPXCHG + +This atomically compares the value addressed by ``dst_reg + off`` with +``R0``. If they match it is replaced with ``src_reg``, The value that was there +before is loaded back to ``R0``. + Note that 1 and 2 byte atomic operations are not supported. Adding something like below. Except xadd for legacy reason, all other 4 byte atomic operations require alu32 mode. The alu32 mode can be enabled with clang flags "-Xclang -target-feature -Xclang +alu32" or "-mcpu=v3". The cpu version 3 has alu32 mode on by default. You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to
Re: [PATCH bpf-next v5 09/11] bpf: Add bitwise atomic instructions
On 12/15/20 4:18 AM, Brendan Jackman wrote: This adds instructions for atomic[64]_[fetch_]and atomic[64]_[fetch_]or atomic[64]_[fetch_]xor All these operations are isomorphic enough to implement with the same verifier, interpreter, and x86 JIT code, hence being a single commit. The main interesting thing here is that x86 doesn't directly support the fetch_ version these operations, so we need to generate a CMPXCHG loop in the JIT. This requires the use of two temporary registers, IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose. Signed-off-by: Brendan Jackman Acked-by: Yonghong Song
Re: [PATCH bpf-next v5 07/11] bpf: Add instructions for atomic_[cmp]xchg
On 12/15/20 4:18 AM, Brendan Jackman wrote: This adds two atomic opcodes, both of which include the BPF_FETCH flag. XCHG without the BPF_FETCH flag would naturally encode atomic_set. This is not supported because it would be of limited value to userspace (it doesn't imply any barriers). CMPXCHG without BPF_FETCH would be an atomic compare-and-write. We don't have such an operation in the kernel so it isn't provided to BPF either. There are two significant design decisions made for the CMPXCHG instruction: - To solve the issue that this operation fundamentally has 3 operands, but we only have two register fields. Therefore the operand we compare against (the kernel's API calls it 'old') is hard-coded to be R0. x86 has a similar design (and A64 doesn't have this problem). A potential alternative might be to encode the other operand's register number in the immediate field. - The kernel's atomic_cmpxchg returns the old value, while the C11 userspace APIs return a boolean indicating the comparison result. Which should BPF do? A64 returns the old value. x86 returns the old value in the hard-coded register (and also sets a flag). That means return-old-value is easier to JIT, so that's what we use. Signed-off-by: Brendan Jackman Ack with a minor comment below. 
Acked-by: Yonghong Song --- arch/x86/net/bpf_jit_comp.c| 8 include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 4 +++- kernel/bpf/core.c | 20 kernel/bpf/disasm.c| 15 +++ kernel/bpf/verifier.c | 19 +-- tools/include/linux/filter.h | 2 ++ tools/include/uapi/linux/bpf.h | 4 +++- 8 files changed, 70 insertions(+), 4 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index eea7d8b0bb12..308241187582 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -815,6 +815,14 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ EMIT2(0x0F, 0xC1); break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; default: pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); return -EFAULT; diff --git a/include/linux/filter.h b/include/linux/filter.h index c3e87a63e0b8..16e0ba5e8937 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -265,6 +265,8 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 760ae333a5ed..538b95472c8f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90/* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01/* fetch previous value into src reg */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG(0xf0 | BPF_FETCH) /* atomic compare-and-write */ +#define BPF_FETCH 0x01/* not 
an opcode on its own, used to build others */ Although the above code works fine, I would suggest to put BPF_FETCH definition before BPF_XCHG and BPF_CMPXCHG, which makes more sense intuitively. /* Register numbers */ enum { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c [...] \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 760ae333a5ed..538b95472c8f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90/* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01/* fetch previous value into src reg */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG(0xf0 | BPF_FETCH) /* atomic compare-and-write */ +#define BPF_FETCH 0x01/* not an opcode on its own, used to build others */ same here. /* Register numbers */ enum {
Re: [PATCH bpf-next 1/2] bpf: Add a bpf_kallsyms_lookup helper
On 12/11/20 6:40 AM, Florent Revest wrote: On Wed, Dec 2, 2020 at 10:18 PM Alexei Starovoitov wrote: I still think that adopting printk/vsnprintf for this instead of reinventing the wheel is more flexible and easier to maintain long term. Almost the same layout can be done with vsnprintf with exception of \0 char. More meaningful names, etc. See Documentation/core-api/printk-formats.rst I agree this would be nice. I finally got a bit of time to experiment with this and I noticed a few things: First of all, because helpers only have 5 arguments, if we use two for the output buffer and its size and two for the format string and its size, we are only left with one argument for a modifier. This is still enough for our usecase (where we'd only use "%ps" for example) but it does not strictly-speaking allow for the same layout that Andrii proposed. See helper bpf_seq_printf. It packs all arguments for format string and puts them into an array. bpf_seq_printf will unpack them as it parsed through the format string. So it should be doable to have more than "%ps" in format string. If we force fmt to come from readonly map then bpf_trace_printk()-like run-time check of fmt string can be moved into load time check and performance won't suffer. Regarding this bit, I have the impression that this would not be possible, but maybe I'm missing something ? :) The iteration that bpf_trace_printk does over the format string argument is not only used for validation. It is also used to remember what extra operations need to be done based on the modifier types. For example, it remembers whether an arg should be interpreted as 32bits or 64bits. In the case of string printing, it also remembers whether it is a kernel-space or user-space pointer so that bpf_trace_copy_string can be called with the right arg. If we were to run the iteration over the format string in the verifier, how would you recommend that we "remember" the modifier type until the helper gets called ?
Re: [PATCH bpf-next v4 10/11] bpf: Add tests for new BPF atomic operations
On 12/8/20 8:59 AM, Brendan Jackman wrote: On Tue, Dec 08, 2020 at 08:38:04AM -0800, Yonghong Song wrote: On 12/8/20 4:41 AM, Brendan Jackman wrote: On Mon, Dec 07, 2020 at 07:18:57PM -0800, Yonghong Song wrote: On 12/7/20 8:07 AM, Brendan Jackman wrote: The prog_test that's added depends on Clang/LLVM features added by Yonghong in commit 286daafd6512 (was https://reviews.llvm.org/D72184 ). Note the use of a define called ENABLE_ATOMICS_TESTS: this is used to: - Avoid breaking the build for people on old versions of Clang - Avoid needing separate lists of test objects for no_alu32, where atomics are not supported even if Clang has the feature. The atomics_test.o BPF object is built unconditionally both for test_progs and test_progs-no_alu32. For test_progs, if Clang supports atomics, ENABLE_ATOMICS_TESTS is defined, so it includes the proper test code. Otherwise, progs and global vars are defined anyway, as stubs; this means that the skeleton user code still builds. The atomics_test.o userspace object is built once and used for both test_progs and test_progs-no_alu32. A variable called skip_tests is defined in the BPF object's data section, which tells the userspace object whether to skip the atomics test. Signed-off-by: Brendan Jackman Ack with minor comments below. 
Acked-by: Yonghong Song --- tools/testing/selftests/bpf/Makefile | 10 + .../selftests/bpf/prog_tests/atomics.c| 246 ++ tools/testing/selftests/bpf/progs/atomics.c | 154 +++ .../selftests/bpf/verifier/atomic_and.c | 77 ++ .../selftests/bpf/verifier/atomic_cmpxchg.c | 96 +++ .../selftests/bpf/verifier/atomic_fetch_add.c | 106 .../selftests/bpf/verifier/atomic_or.c| 77 ++ .../selftests/bpf/verifier/atomic_xchg.c | 46 .../selftests/bpf/verifier/atomic_xor.c | 77 ++ 9 files changed, 889 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomics.c create mode 100644 tools/testing/selftests/bpf/progs/atomics.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_and.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_fetch_add.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_or.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xor.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ac25ba5d0d6c..13bc1d736164 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -239,6 +239,12 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ -I$(abspath $(OUTPUT)/../usr/include) +# BPF atomics support was added to Clang in llvm-project commit 286daafd6512 +# (release 12.0.0). +BPF_ATOMICS_SUPPORTED = $(shell \ + echo "int x = 0; int foo(void) { return __sync_val_compare_and_swap(, 1, 2); }" \ + | $(CLANG) -x cpp-output -S -target bpf -mcpu=v3 - -o /dev/null && echo 1 || echo 0) '-x c' here more intuitive? 
+ CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types @@ -399,11 +405,15 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko\ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +ifeq ($(BPF_ATOMICS_SUPPORTED),1) + TRUNNER_BPF_CFLAGS += -DENABLE_ATOMICS_TESTS +endif TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c new file mode 100644 index ..c841a3abc2f7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "atomics.skel.h" + +static void test_add(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.add); + if (CHECK(IS_ERR(link), "attach(add)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NUL
Re: [PATCH bpf-next v4 10/11] bpf: Add tests for new BPF atomic operations
On 12/8/20 4:41 AM, Brendan Jackman wrote: On Mon, Dec 07, 2020 at 07:18:57PM -0800, Yonghong Song wrote: On 12/7/20 8:07 AM, Brendan Jackman wrote: The prog_test that's added depends on Clang/LLVM features added by Yonghong in commit 286daafd6512 (was https://reviews.llvm.org/D72184 ). Note the use of a define called ENABLE_ATOMICS_TESTS: this is used to: - Avoid breaking the build for people on old versions of Clang - Avoid needing separate lists of test objects for no_alu32, where atomics are not supported even if Clang has the feature. The atomics_test.o BPF object is built unconditionally both for test_progs and test_progs-no_alu32. For test_progs, if Clang supports atomics, ENABLE_ATOMICS_TESTS is defined, so it includes the proper test code. Otherwise, progs and global vars are defined anyway, as stubs; this means that the skeleton user code still builds. The atomics_test.o userspace object is built once and used for both test_progs and test_progs-no_alu32. A variable called skip_tests is defined in the BPF object's data section, which tells the userspace object whether to skip the atomics test. Signed-off-by: Brendan Jackman Ack with minor comments below. 
Acked-by: Yonghong Song --- tools/testing/selftests/bpf/Makefile | 10 + .../selftests/bpf/prog_tests/atomics.c| 246 ++ tools/testing/selftests/bpf/progs/atomics.c | 154 +++ .../selftests/bpf/verifier/atomic_and.c | 77 ++ .../selftests/bpf/verifier/atomic_cmpxchg.c | 96 +++ .../selftests/bpf/verifier/atomic_fetch_add.c | 106 .../selftests/bpf/verifier/atomic_or.c| 77 ++ .../selftests/bpf/verifier/atomic_xchg.c | 46 .../selftests/bpf/verifier/atomic_xor.c | 77 ++ 9 files changed, 889 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomics.c create mode 100644 tools/testing/selftests/bpf/progs/atomics.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_and.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_fetch_add.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_or.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xor.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ac25ba5d0d6c..13bc1d736164 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -239,6 +239,12 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ -I$(abspath $(OUTPUT)/../usr/include) +# BPF atomics support was added to Clang in llvm-project commit 286daafd6512 +# (release 12.0.0). +BPF_ATOMICS_SUPPORTED = $(shell \ + echo "int x = 0; int foo(void) { return __sync_val_compare_and_swap(, 1, 2); }" \ + | $(CLANG) -x cpp-output -S -target bpf -mcpu=v3 - -o /dev/null && echo 1 || echo 0) '-x c' here more intuitive? 
+ CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types @@ -399,11 +405,15 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko\ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +ifeq ($(BPF_ATOMICS_SUPPORTED),1) + TRUNNER_BPF_CFLAGS += -DENABLE_ATOMICS_TESTS +endif TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c new file mode 100644 index ..c841a3abc2f7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "atomics.skel.h" + +static void test_add(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.add); + if (CHECK(IS_ERR(link), "attach(add)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, , ); + if (CHECK(err || retval, "test_run add", + "err %d errno %d retval %d duration %d\n", err, errno, retva
Re: [PATCH bpf-next v4 11/11] bpf: Document new atomic instructions
On 12/7/20 8:07 AM, Brendan Jackman wrote: Document new atomic instructions. Signed-off-by: Brendan Jackman Ack with minor comments below. Acked-by: Yonghong Song --- Documentation/networking/filter.rst | 26 ++ 1 file changed, 26 insertions(+) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 1583d59d806d..26d508a5e038 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1053,6 +1053,32 @@ encoding. .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg +The basic atomic operations supported (from architecture v4 onwards) are: No "v4" any more. Just say The basic atomic operations supported are: + +BPF_ADD +BPF_AND +BPF_OR +BPF_XOR + +Each having equivalent semantics with the ``BPF_ADD`` example, that is: the +memory location addressed by ``dst_reg + off`` is atomically modified, with +``src_reg`` as the other operand. If the ``BPF_FETCH`` flag is set in the +immediate, then these operations also overwrite ``src_reg`` with the +value that was in memory before it was modified. For 4-byte operations, except BPF_ADD, alu32 mode is required. alu32 is implied with -mcpu=v3. + +The more special operations are: + +BPF_XCHG + +This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg + +off``. + +BPF_CMPXCHG + +This atomically compares the value addressed by ``dst_reg + off`` with +``R0``. If they match it is replaced with ``src_reg``. The value that was there +before is loaded back to ``R0``. + Note that 1 and 2 byte atomic operations are not supported. You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to
Re: [PATCH bpf-next v4 10/11] bpf: Add tests for new BPF atomic operations
On 12/7/20 8:07 AM, Brendan Jackman wrote: The prog_test that's added depends on Clang/LLVM features added by Yonghong in commit 286daafd6512 (was https://reviews.llvm.org/D72184 ). Note the use of a define called ENABLE_ATOMICS_TESTS: this is used to: - Avoid breaking the build for people on old versions of Clang - Avoid needing separate lists of test objects for no_alu32, where atomics are not supported even if Clang has the feature. The atomics_test.o BPF object is built unconditionally both for test_progs and test_progs-no_alu32. For test_progs, if Clang supports atomics, ENABLE_ATOMICS_TESTS is defined, so it includes the proper test code. Otherwise, progs and global vars are defined anyway, as stubs; this means that the skeleton user code still builds. The atomics_test.o userspace object is built once and used for both test_progs and test_progs-no_alu32. A variable called skip_tests is defined in the BPF object's data section, which tells the userspace object whether to skip the atomics test. Signed-off-by: Brendan Jackman Ack with minor comments below. 
Acked-by: Yonghong Song --- tools/testing/selftests/bpf/Makefile | 10 + .../selftests/bpf/prog_tests/atomics.c| 246 ++ tools/testing/selftests/bpf/progs/atomics.c | 154 +++ .../selftests/bpf/verifier/atomic_and.c | 77 ++ .../selftests/bpf/verifier/atomic_cmpxchg.c | 96 +++ .../selftests/bpf/verifier/atomic_fetch_add.c | 106 .../selftests/bpf/verifier/atomic_or.c| 77 ++ .../selftests/bpf/verifier/atomic_xchg.c | 46 .../selftests/bpf/verifier/atomic_xor.c | 77 ++ 9 files changed, 889 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomics.c create mode 100644 tools/testing/selftests/bpf/progs/atomics.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_and.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_fetch_add.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_or.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xor.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ac25ba5d0d6c..13bc1d736164 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -239,6 +239,12 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ -I$(abspath $(OUTPUT)/../usr/include) +# BPF atomics support was added to Clang in llvm-project commit 286daafd6512 +# (release 12.0.0). +BPF_ATOMICS_SUPPORTED = $(shell \ + echo "int x = 0; int foo(void) { return __sync_val_compare_and_swap(, 1, 2); }" \ + | $(CLANG) -x cpp-output -S -target bpf -mcpu=v3 - -o /dev/null && echo 1 || echo 0) '-x c' here more intuitive? 
+ CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \ -Wno-compare-distinct-pointer-types @@ -399,11 +405,15 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +ifeq ($(BPF_ATOMICS_SUPPORTED),1) + TRUNNER_BPF_CFLAGS += -DENABLE_ATOMICS_TESTS +endif TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c new file mode 100644 index ..c841a3abc2f7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "atomics.skel.h" + +static void test_add(struct atomics *skel) +{ + int err, prog_fd; + __u32 duration = 0, retval; + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.add); + if (CHECK(IS_ERR(link), "attach(add)", "err: %ld\n", PTR_ERR(link))) + return; + + prog_fd = bpf_program__fd(skel->progs.add); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, + NULL, NULL, , ); + if (CHECK(err || retval, "test_run add", + "err %d errno %d retval %d duration %d\n", err, errno, retval, duration)) + goto cleanup; + + ASSERT_EQ(skel->data->add64_value, 3, "add64_value"); +
Re: [PATCH bpf-next v4 09/11] bpf: Add bitwise atomic instructions
On 12/7/20 8:07 AM, Brendan Jackman wrote: This adds instructions for atomic[64]_[fetch_]and atomic[64]_[fetch_]or atomic[64]_[fetch_]xor All these operations are isomorphic enough to implement with the same verifier, interpreter, and x86 JIT code, hence being a single commit. The main interesting thing here is that x86 doesn't directly support the fetch_ version these operations, so we need to generate a CMPXCHG loop in the JIT. This requires the use of two temporary registers, IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose. Signed-off-by: Brendan Jackman --- arch/x86/net/bpf_jit_comp.c | 50 ++- include/linux/filter.h | 66 kernel/bpf/core.c| 3 ++ kernel/bpf/disasm.c | 21 +--- kernel/bpf/verifier.c| 6 tools/include/linux/filter.h | 66 6 files changed, 207 insertions(+), 5 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 308241187582..1d4d50199293 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -808,6 +808,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* emit opcode */ switch (atomic_op) { case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: /* lock *(u32/u64*)(dst_reg + off) = src_reg */ EMIT1(simple_alu_opcodes[atomic_op]); break; [...] 
diff --git a/include/linux/filter.h b/include/linux/filter.h index e1e1fc946a7c..e100c71555a4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -264,7 +264,13 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ @@ -295,6 +301,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = OFF, \ .imm = BPF_ADD }) +/* Atomic memory and, *(uint *)(dst_reg + off16) &= src_reg */ + +#define BPF_ATOMIC_AND(SIZE, DST, SRC, OFF)\ + ((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_AND }) + +/* Atomic memory and with fetch, src_reg = atomic_fetch_and(dst_reg + off, src_reg); */ + +#define BPF_ATOMIC_FETCH_AND(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_AND | BPF_FETCH }) + +/* Atomic memory or, *(uint *)(dst_reg + off16) |= src_reg */ + +#define BPF_ATOMIC_OR(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_OR }) + +/* Atomic memory or with fetch, src_reg = atomic_fetch_or(dst_reg + off, src_reg); */ + +#define BPF_ATOMIC_FETCH_OR(SIZE, DST, SRC, OFF) \ + 
((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_OR | BPF_FETCH }) + +/* Atomic memory xor, *(uint *)(dst_reg + off16) ^= src_reg */ + +#define BPF_ATOMIC_XOR(SIZE, DST, SRC, OFF)
Re: [PATCH bpf-next v4 07/11] bpf: Add instructions for atomic_[cmp]xchg
On 12/7/20 8:07 AM, Brendan Jackman wrote: This adds two atomic opcodes, both of which include the BPF_FETCH flag. XCHG without the BPF_FETCH flag would naturally encode atomic_set. This is not supported because it would be of limited value to userspace (it doesn't imply any barriers). CMPXCHG without BPF_FETCH would be an atomic compare-and-write. We don't have such an operation in the kernel so it isn't provided to BPF either. There are two significant design decisions made for the CMPXCHG instruction: - To solve the issue that this operation fundamentally has 3 operands, but we only have two register fields. Therefore the operand we compare against (the kernel's API calls it 'old') is hard-coded to be R0. x86 has similar design (and A64 doesn't have this problem). A potential alternative might be to encode the other operand's register number in the immediate field. - The kernel's atomic_cmpxchg returns the old value, while the C11 userspace APIs return a boolean indicating the comparison result. Which should BPF do? A64 returns the old value. x86 returns the old value in the hard-coded register (and also sets a flag). That means return-old-value is easier to JIT. 
Signed-off-by: Brendan Jackman --- arch/x86/net/bpf_jit_comp.c| 8 include/linux/filter.h | 22 ++ include/uapi/linux/bpf.h | 4 +++- kernel/bpf/core.c | 20 kernel/bpf/disasm.c| 15 +++ kernel/bpf/verifier.c | 19 +-- tools/include/linux/filter.h | 22 ++ tools/include/uapi/linux/bpf.h | 4 +++- 8 files changed, 110 insertions(+), 4 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index eea7d8b0bb12..308241187582 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -815,6 +815,14 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ EMIT2(0x0F, 0xC1); break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; default: pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); return -EFAULT; diff --git a/include/linux/filter.h b/include/linux/filter.h index b5258bca10d2..e1e1fc946a7c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -265,6 +265,8 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC64(OP, DST, SRC, OFF)\ @@ -293,6 +295,26 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = OFF, \ .imm = BPF_ADD }) +/* Atomic exchange, src_reg = atomic_xchg(dst_reg + off, src_reg) */ + +#define BPF_ATOMIC_XCHG(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_XCHG }) + +/* Atomic compare-exchange, r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg) */ + +#define 
BPF_ATOMIC_CMPXCHG(SIZE, DST, SRC, OFF)\ + ((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_CMPXCHG }) Define BPF_ATOMIC_{XCHG, CMPXCHG} based on BPF_ATOMIC macro? + /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM)\ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d5389119291e..b733af50a5b9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90/* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01/* fetch previous value into src reg */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /*
Re: [PATCH bpf-next v4 06/11] bpf: Add BPF_FETCH field / create atomic_fetch_add instruction
On 12/7/20 8:07 AM, Brendan Jackman wrote: The BPF_FETCH field can be set in bpf_insn.imm, for BPF_ATOMIC instructions, in order to have the previous value of the atomically-modified memory location loaded into the src register after an atomic op is carried out. Suggested-by: Yonghong Song Signed-off-by: Brendan Jackman --- arch/x86/net/bpf_jit_comp.c| 4 include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 3 +++ kernel/bpf/core.c | 13 + kernel/bpf/disasm.c| 7 +++ kernel/bpf/verifier.c | 33 - tools/include/linux/filter.h | 11 +++ tools/include/uapi/linux/bpf.h | 3 +++ 8 files changed, 66 insertions(+), 9 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c [...] index f345f12c1ff8..4e0100ba52c2 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -173,6 +173,7 @@ * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); */ #define BPF_ATOMIC64(OP, DST, SRC, OFF)\ @@ -201,6 +202,16 @@ .off = OFF, \ .imm = BPF_ADD }) +/* Atomic memory add with fetch, src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ + +#define BPF_ATOMIC_FETCH_ADD(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_ADD | BPF_FETCH }) Not sure whether it is a good idea or not to fold this into BPF_ATOMIC macro. At least you can define BPF_ATOMIC macro and #define BPF_ATOMIC_FETCH_ADD(SIZE, DST, SRC, OFF) \ BPF_ATOMIC(SIZE, DST, SRC, OFF, BPF_ADD | BPF_FETCH) to avoid too many code duplications? 
+ /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM)\ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 98161e2d389f..d5389119291e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -44,6 +44,9 @@ #define BPF_CALL 0x80/* function call */ #define BPF_EXIT 0x90/* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01/* fetch previous value into src reg */ + /* Register numbers */ enum { BPF_REG_0 = 0,
Re: [PATCH bpf-next v4 05/11] bpf: Move BPF_STX reserved field check into BPF_STX verifier code
On 12/7/20 8:07 AM, Brendan Jackman wrote: I can't find a reason why this code is in resolve_pseudo_ldimm64; since I'll be modifying it in a subsequent commit, tidy it up. Signed-off-by: Brendan Jackman Acked-by: Yonghong Song
Re: [PATCH] bpf: propagate __user annotations properly
On 12/7/20 4:37 AM, Lukas Bulwahn wrote: __htab_map_lookup_and_delete_batch() stores a user pointer in the local variable ubatch and uses that in copy_{from,to}_user(), but ubatch misses a __user annotation. So, sparse warns in the various assignments and uses of ubatch: kernel/bpf/hashtab.c:1415:24: warning: incorrect type in initializer (different address spaces) kernel/bpf/hashtab.c:1415:24:expected void *ubatch kernel/bpf/hashtab.c:1415:24:got void [noderef] __user * kernel/bpf/hashtab.c:1444:46: warning: incorrect type in argument 2 (different address spaces) kernel/bpf/hashtab.c:1444:46:expected void const [noderef] __user *from kernel/bpf/hashtab.c:1444:46:got void *ubatch kernel/bpf/hashtab.c:1608:16: warning: incorrect type in assignment (different address spaces) kernel/bpf/hashtab.c:1608:16:expected void *ubatch kernel/bpf/hashtab.c:1608:16:got void [noderef] __user * kernel/bpf/hashtab.c:1609:26: warning: incorrect type in argument 1 (different address spaces) kernel/bpf/hashtab.c:1609:26:expected void [noderef] __user *to kernel/bpf/hashtab.c:1609:26:got void *ubatch Add the __user annotation to repair this chain of propagating __user annotations in __htab_map_lookup_and_delete_batch(). Add fix tag? Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Signed-off-by: Lukas Bulwahn Thanks for the fix. LGTM. I guess either bpf or bpf-next tree is fine since this is not a correctness issue. Acked-by: Yonghong Song
Re: [PATCH bpf-next v3 10/14] bpf: Add bitwise atomic instructions
On 12/7/20 3:28 AM, Brendan Jackman wrote: On Fri, Dec 04, 2020 at 07:21:22AM -0800, Yonghong Song wrote: On 12/4/20 1:36 AM, Brendan Jackman wrote: On Thu, Dec 03, 2020 at 10:42:19PM -0800, Yonghong Song wrote: On 12/3/20 8:02 AM, Brendan Jackman wrote: This adds instructions for atomic[64]_[fetch_]and atomic[64]_[fetch_]or atomic[64]_[fetch_]xor All these operations are isomorphic enough to implement with the same verifier, interpreter, and x86 JIT code, hence being a single commit. The main interesting thing here is that x86 doesn't directly support the fetch_ version these operations, so we need to generate a CMPXCHG loop in the JIT. This requires the use of two temporary registers, IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose. Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4 Signed-off-by: Brendan Jackman --- arch/x86/net/bpf_jit_comp.c | 50 +- include/linux/filter.h | 60 kernel/bpf/core.c| 5 ++- kernel/bpf/disasm.c | 21 ++--- kernel/bpf/verifier.c| 6 tools/include/linux/filter.h | 60 6 files changed, 196 insertions(+), 6 deletions(-) [...] diff --git a/include/linux/filter.h b/include/linux/filter.h index 6186280715ed..698f82897b0d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -280,6 +280,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) [...] +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_XOR | BPF_FETCH }) + /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */ Looks like BPF_ATOMIC_XOR/OR/AND/... all similar to each other. The same is for BPF_ATOMIC_FETCH_XOR/OR/AND/... I am wondering whether it makes sence to have to BPF_ATOMIC_BOP(BOP, SIZE, DST, SRC, OFF) and BPF_ATOMIC_FETCH_BOP(BOP, SIZE, DST, SRC, OFF) can have less number of macros? 
Hmm yeah I think that's probably a good idea, it would be consistent with the macros for non-atomic ALU ops. I don't think 'BOP' would be very clear though, 'ALU' might be more obvious. BPF_ATOMIC_ALU and BPF_ATOMIC_FETCH_ALU indeed better. On second thoughts I think it feels right (i.e. it would be roughly consistent with the level of abstraction of the rest of this macro API) to go further and just have two macros BPF_ATOMIC64 and BPF_ATOMIC32: /* * Atomic ALU ops: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_AND *(uint *) (dst_reg + off16) &= src_reg * BPF_OR *(uint *) (dst_reg + off16) |= src_reg * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg "uint *" => "size_type *"? and give an explanation that "size_type" is either "u32" or "u64"? * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC64(OP, DST, SRC, OFF) \ ((struct bpf_insn) {\ .code = BPF_STX | BPF_DW | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = OP }) #define BPF_ATOMIC32(OP, DST, SRC, OFF) \ ((struct bpf_insn) {\ .code = BPF_STX | BPF_W | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = OP }) You could have BPF_ATOMIC(OP, SIZE, DST, SRC, OFF) where SIZE is BPF_DW or BPF_W. The downside compared to what's currently in the patc
Re: [PATCH v2 bpf-next 0/3] bpf: support module BTF in BTF display helpers
On 12/5/20 4:43 PM, Alan Maguire wrote: On Sat, 5 Dec 2020, Yonghong Song wrote: __builtin_btf_type_id() is really only supported in llvm12 and 64bit return value support is pushed to llvm12 trunk a while back. The builtin is introduced in llvm11 but has a corner bug, so llvm12 is recommended. So if people use the builtin, you can assume 64bit return value. libbpf support is required here. So in my opinion, there is no need to do feature detection. Andrii has a patch to support 64bit return value for __builtin_btf_type_id() and I assume that one should be landed before or together with your patch. Just for your info. The following is an example you could use to determine whether __builtin_btf_type_id() supports btf object id at llvm level. -bash-4.4$ cat t.c int test(int arg) { return __builtin_btf_type_id(arg, 1); } Compile to generate assembly code with latest llvm12 trunk: clang -target bpf -O2 -S -g -mcpu=v3 t.c In the asm code, you should see one line with r0 = 1 ll Or you can generate obj code: clang -target bpf -O2 -c -g -mcpu=v3 t.c and then you disassemble the obj file llvm-objdump -d --no-show-raw-insn --no-leading-addr t.o You should see below in the output r0 = 1 ll Use earlier version of llvm12 trunk, the builtin has 32bit return value, you will see r0 = 1 which is a 32bit imm to r0, while "r0 = 1 ll" is 64bit imm to r0. Thanks for this Yonghong! I'm thinking the way I'll tackle it is to simply verify that the upper 32 bits specifying the veth module object id are non-zero; if they are zero, we'll skip the test (I think a skip probably makes sense as not everyone will have llvm12). Does that seem reasonable? This should work too and we do not need to add a note in README.rst for this test then. With the additional few minor changes on top of Andrii's patch, the use of __builtin_btf_type_id() worked perfectly. Thanks! Alan
Re: [PATCH v2 bpf-next 0/3] bpf: support module BTF in BTF display helpers
On 12/5/20 12:35 PM, Yonghong Song wrote: On 12/4/20 10:48 AM, Alan Maguire wrote: This series aims to add support to bpf_snprintf_btf() and bpf_seq_printf_btf() allowing them to store string representations of module-specific types, as well as the kernel-specific ones they currently support. Patch 1 removes the btf_module_mutex, as since we will need to look up module BTF during BPF program execution, we don't want to risk sleeping in the various contexts in which BPF can run. The access patterns to the btf module list seem to conform to classic list RCU usage so with a few minor tweaks this seems workable. Patch 2 replaces the unused flags field in struct btf_ptr with an obj_id field, allowing the specification of the id of a BTF module. If the value is 0, the core kernel vmlinux is assumed to contain the type's BTF information. Otherwise the module with that id is used to identify the type. If the object-id based lookup fails, we again fall back to vmlinux BTF. Patch 3 is a selftest that uses veth (when built as a module) and a kprobe to display both a module-specific and kernel-specific type; both are arguments to veth_stats_rx(). Currently it looks up the module-specific type and object ids using libbpf; in future, these lookups will likely be supported directly in the BPF program via __builtin_btf_type_id(); but I need to determine a good test to determine if that builtin supports object ids. __builtin_btf_type_id() is really only supported in llvm12 and 64bit return value support is pushed to llvm12 trunk a while back. The builtin is introduced in llvm11 but has a corner bug, so llvm12 is recommended. So if people use the builtin, you can assume 64bit return value. libbpf support is required here. So in my opinion, there is no need to do feature detection. if people use llvm11 which may cause test to fail, we can add an entry in selftest README file to warn people this specific test needs llvm12. 
Andrii has a patch to support 64bit return value for __builtin_btf_type_id() and I assume that one should be landed before or together with your patch. Just for your info. The following is an example you could use to determine whether __builtin_btf_type_id() supports btf object id at llvm level. -bash-4.4$ cat t.c int test(int arg) { return __builtin_btf_type_id(arg, 1); } Compile to generate assembly code with latest llvm12 trunk: clang -target bpf -O2 -S -g -mcpu=v3 t.c In the asm code, you should see one line with r0 = 1 ll Or you can generate obj code: clang -target bpf -O2 -c -g -mcpu=v3 t.c and then you disassemble the obj file llvm-objdump -d --no-show-raw-insn --no-leading-addr t.o You should see below in the output r0 = 1 ll Use earlier version of llvm12 trunk, the builtin has 32bit return value, you will see r0 = 1 which is a 32bit imm to r0, while "r0 = 1 ll" is 64bit imm to r0. Changes since RFC - add patch to remove module mutex - modify to use obj_id instead of module name as identifier in "struct btf_ptr" (Andrii) Alan Maguire (3): bpf: eliminate btf_module_mutex as RCU synchronization can be used bpf: add module support to btf display helpers selftests/bpf: verify module-specific types can be shown via bpf_snprintf_btf include/linux/btf.h | 12 ++ include/uapi/linux/bpf.h | 13 ++- kernel/bpf/btf.c | 49 +--- kernel/trace/bpf_trace.c | 44 ++-- tools/include/uapi/linux/bpf.h | 13 ++- .../selftests/bpf/prog_tests/snprintf_btf_mod.c | 124 + tools/testing/selftests/bpf/progs/bpf_iter.h | 2 +- tools/testing/selftests/bpf/progs/btf_ptr.h | 2 +- tools/testing/selftests/bpf/progs/veth_stats_rx.c | 72 9 files changed, 292 insertions(+), 39 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/snprintf_btf_mod.c create mode 100644 tools/testing/selftests/bpf/progs/veth_stats_rx.c
Re: [PATCH v2 bpf-next 0/3] bpf: support module BTF in BTF display helpers
On 12/4/20 10:48 AM, Alan Maguire wrote: This series aims to add support to bpf_snprintf_btf() and bpf_seq_printf_btf() allowing them to store string representations of module-specific types, as well as the kernel-specific ones they currently support. Patch 1 removes the btf_module_mutex, as since we will need to look up module BTF during BPF program execution, we don't want to risk sleeping in the various contexts in which BPF can run. The access patterns to the btf module list seem to conform to classic list RCU usage so with a few minor tweaks this seems workable. Patch 2 replaces the unused flags field in struct btf_ptr with an obj_id field, allowing the specification of the id of a BTF module. If the value is 0, the core kernel vmlinux is assumed to contain the type's BTF information. Otherwise the module with that id is used to identify the type. If the object-id based lookup fails, we again fall back to vmlinux BTF. Patch 3 is a selftest that uses veth (when built as a module) and a kprobe to display both a module-specific and kernel-specific type; both are arguments to veth_stats_rx(). Currently it looks up the module-specific type and object ids using libbpf; in future, these lookups will likely be supported directly in the BPF program via __builtin_btf_type_id(); but I need to determine a good test to determine if that builtin supports object ids. __builtin_btf_type_id() is really only supported in llvm12 and 64bit return value support is pushed to llvm12 trunk a while back. The builtin is introduced in llvm11 but has a corner bug, so llvm12 is recommended. So if people use the builtin, you can assume 64bit return value. libbpf support is required here. So in my opinion, there is no need to do feature detection. Andrii has a patch to support 64bit return value for __builtin_btf_type_id() and I assume that one should be landed before or together with your patch. Just for your info. 
The following is an example you could use to determine whether __builtin_btf_type_id() supports btf object id at llvm level. -bash-4.4$ cat t.c int test(int arg) { return __builtin_btf_type_id(arg, 1); } Compile to generate assembly code with latest llvm12 trunk: clang -target bpf -O2 -S -g -mcpu=v3 t.c In the asm code, you should see one line with r0 = 1 ll Or you can generate obj code: clang -target bpf -O2 -c -g -mcpu=v3 t.c and then you disassemble the obj file llvm-objdump -d --no-show-raw-insn --no-leading-addr t.o You should see below in the output r0 = 1 ll Use earlier version of llvm12 trunk, the builtin has 32bit return value, you will see r0 = 1 which is a 32bit imm to r0, while "r0 = 1 ll" is 64bit imm to r0. Changes since RFC - add patch to remove module mutex - modify to use obj_id instead of module name as identifier in "struct btf_ptr" (Andrii) Alan Maguire (3): bpf: eliminate btf_module_mutex as RCU synchronization can be used bpf: add module support to btf display helpers selftests/bpf: verify module-specific types can be shown via bpf_snprintf_btf include/linux/btf.h| 12 ++ include/uapi/linux/bpf.h | 13 ++- kernel/bpf/btf.c | 49 +--- kernel/trace/bpf_trace.c | 44 ++-- tools/include/uapi/linux/bpf.h | 13 ++- .../selftests/bpf/prog_tests/snprintf_btf_mod.c| 124 + tools/testing/selftests/bpf/progs/bpf_iter.h | 2 +- tools/testing/selftests/bpf/progs/btf_ptr.h| 2 +- tools/testing/selftests/bpf/progs/veth_stats_rx.c | 72 9 files changed, 292 insertions(+), 39 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/snprintf_btf_mod.c create mode 100644 tools/testing/selftests/bpf/progs/veth_stats_rx.c
Re: [PATCH bpf-next v3 13/14] bpf: Add tests for new BPF atomic operations
On 12/4/20 1:45 AM, Brendan Jackman wrote: On Thu, Dec 03, 2020 at 11:06:31PM -0800, Yonghong Song wrote: On 12/3/20 8:02 AM, Brendan Jackman wrote: [...] diff --git a/tools/testing/selftests/bpf/prog_tests/atomics_test.c b/tools/testing/selftests/bpf/prog_tests/atomics_test.c new file mode 100644 index ..66f0ccf4f4ec --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomics_test.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + + +#include "atomics_test.skel.h" + +static struct atomics_test *setup(void) +{ + struct atomics_test *atomics_skel; + __u32 duration = 0, err; + + atomics_skel = atomics_test__open_and_load(); + if (CHECK(!atomics_skel, "atomics_skel_load", "atomics skeleton failed\n")) + return NULL; + + if (atomics_skel->data->skip_tests) { + printf("%s:SKIP:no ENABLE_ATOMICS_TEST (missing Clang BPF atomics support)", + __func__); + test__skip(); + goto err; + } + + err = atomics_test__attach(atomics_skel); + if (CHECK(err, "atomics_attach", "atomics attach failed: %d\n", err)) + goto err; + + return atomics_skel; + +err: + atomics_test__destroy(atomics_skel); + return NULL; +} + +static void test_add(void) +{ + struct atomics_test *atomics_skel; + int err, prog_fd; + __u32 duration = 0, retval; + + atomics_skel = setup(); When running the test, I observed a noticeable delay between skel load and skel attach. The reason is the bpf program object file contains multiple programs and the above setup() tries to do attachment for ALL programs but actually below only "add" program is tested. This will unnecessarily increase test_progs running time. The best is for setup() here only load and attach program "add". The libbpf API bpf_program__set_autoload() can set a particular program not autoload. You can call attach function explicitly for one specific program. This should be able to reduce test running time. Interesting, thanks a lot - I'll try this out next week. 
Maybe we can actually load all the progs once at the beginning (i.e. in If you have subtests, people expect each subtest to be individually runnable. This will complicate your logic. test_atomics_test) then attach/detach each prog individually as needed... Sorry, I haven't got much of a grip on libbpf yet. One alternative is not to do subtests. There is nothing wrong with having just one bpf program instead of many. This way, you load all and attach once, then do all the test verification.
Re: [PATCH bpf-next v3 10/14] bpf: Add bitwise atomic instructions
On 12/4/20 1:36 AM, Brendan Jackman wrote: On Thu, Dec 03, 2020 at 10:42:19PM -0800, Yonghong Song wrote: On 12/3/20 8:02 AM, Brendan Jackman wrote: This adds instructions for atomic[64]_[fetch_]and atomic[64]_[fetch_]or atomic[64]_[fetch_]xor All these operations are isomorphic enough to implement with the same verifier, interpreter, and x86 JIT code, hence being a single commit. The main interesting thing here is that x86 doesn't directly support the fetch_ version these operations, so we need to generate a CMPXCHG loop in the JIT. This requires the use of two temporary registers, IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose. Change-Id: I340b10cecebea8cb8a52e3606010cde547a10ed4 Signed-off-by: Brendan Jackman --- arch/x86/net/bpf_jit_comp.c | 50 +- include/linux/filter.h | 60 kernel/bpf/core.c| 5 ++- kernel/bpf/disasm.c | 21 ++--- kernel/bpf/verifier.c| 6 tools/include/linux/filter.h | 60 6 files changed, 196 insertions(+), 6 deletions(-) [...] diff --git a/include/linux/filter.h b/include/linux/filter.h index 6186280715ed..698f82897b0d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -280,6 +280,66 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) [...] +#define BPF_ATOMIC_FETCH_XOR(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) {\ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = BPF_XOR | BPF_FETCH }) + /* Atomic exchange, src_reg = atomic_xchg((dst_reg + off), src_reg) */ Looks like BPF_ATOMIC_XOR/OR/AND/... all similar to each other. The same is for BPF_ATOMIC_FETCH_XOR/OR/AND/... I am wondering whether it makes sence to have to BPF_ATOMIC_BOP(BOP, SIZE, DST, SRC, OFF) and BPF_ATOMIC_FETCH_BOP(BOP, SIZE, DST, SRC, OFF) can have less number of macros? Hmm yeah I think that's probably a good idea, it would be consistent with the macros for non-atomic ALU ops. I don't think 'BOP' would be very clear though, 'ALU' might be more obvious. 
BPF_ATOMIC_ALU and BPF_ATOMIC_FETCH_ALU are indeed better.
Re: [PATCH bpf-next v3 09/14] bpf: Pull out a macro for interpreting atomic ALU operations
On 12/4/20 1:29 AM, Brendan Jackman wrote: On Thu, Dec 03, 2020 at 10:30:18PM -0800, Yonghong Song wrote: On 12/3/20 8:02 AM, Brendan Jackman wrote: Since the atomic operations that are added in subsequent commits are all isomorphic with BPF_ADD, pull out a macro to avoid the interpreter becoming dominated by lines of atomic-related code. Note that this sacrificies interpreter performance (combining STX_ATOMIC_W and STX_ATOMIC_DW into single switch case means that we need an extra conditional branch to differentiate them) in favour of compact and (relatively!) simple C code. Change-Id: I8cae5b66e75f34393de6063b91c05a8006fdd9e6 Signed-off-by: Brendan Jackman Ack with a minor suggestion below. Acked-by: Yonghong Song --- kernel/bpf/core.c | 79 +++ 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 28f960bc2e30..498d3f067be7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1618,55 +1618,52 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) LDX_PROBE(DW, 8) #undef LDX_PROBE - STX_ATOMIC_W: - switch (IMM) { - case BPF_ADD: - /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - break; - case BPF_ADD | BPF_FETCH: - SRC = (u32) atomic_fetch_add( - (u32) SRC, - (atomic_t *)(unsigned long) (DST + insn->off)); - break; - case BPF_XCHG: - SRC = (u32) atomic_xchg( - (atomic_t *)(unsigned long) (DST + insn->off), - (u32) SRC); - break; - case BPF_CMPXCHG: - BPF_R0 = (u32) atomic_cmpxchg( - (atomic_t *)(unsigned long) (DST + insn->off), - (u32) BPF_R0, (u32) SRC); +#define ATOMIC(BOP, KOP) \ ATOMIC a little bit generic. Maybe ATOMIC_FETCH_BOP? Well it doesn't fetch in all cases and "BOP" is intended to differentiate from KOP i.e. BOP = BPF operation KOP = Kernel operation. Could go with ATOMIC_ALU_OP? ATOMIC_ALU_OP sounds good. 
+ case BOP: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ +(DST + insn->off)); \ + else\ + atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ + (DST + insn->off)); \ + break; \ + case BOP | BPF_FETCH: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + SRC = (u32) atomic_fetch_##KOP( \ + (u32) SRC, \ + (atomic_t *)(unsigned long) (DST + insn->off)); \ + else\ + SRC = (u64) atomic64_fetch_##KOP( \ + (u64) SRC, \ + (atomic64_t *)(s64) (DST + insn->off)); \ break; - default: - goto default_label; - } - CONT; STX_ATOMIC_DW: + STX_ATOMIC_W: switch (IMM) { - case BPF_ADD: - /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) -(DST + insn->off)); - break; - case BPF_ADD | BPF_FETCH: - SRC = (u64) atomic64_fetch_add( - (u64) SRC, - (atomic64_t *)(s64) (DST + insn->off)); - break; + ATOMIC(BPF_ADD, add) + case BPF_XCHG: - SRC = (u64) atomic64_xchg( - (atomic64_t *)(u64) (DST + insn->off), - (u64) SRC); + if (BPF_SIZE(insn->code) == BPF_W) + SRC = (u32) atomic_xchg( +
Re: [PATCH bpf-next v3 13/14] bpf: Add tests for new BPF atomic operations
On 12/3/20 8:02 AM, Brendan Jackman wrote: This relies on the work done by Yonghong Song in https://reviews.llvm.org/D72184 Note the use of a define called ENABLE_ATOMICS_TESTS: this is used to: - Avoid breaking the build for people on old versions of Clang - Avoid needing separate lists of test objects for no_alu32, where atomics are not supported even if Clang has the feature. The atomics_test.o BPF object is built unconditionally both for test_progs and test_progs-no_alu32. For test_progs, if Clang supports atomics, ENABLE_ATOMICS_TESTS is defined, so it includes the proper test code. Otherwise, progs and global vars are defined anyway, as stubs; this means that the skeleton user code still builds. The atomics_test.o userspace object is built once and used for both test_progs and test_progs-no_alu32. A variable called skip_tests is defined in the BPF object's data section, which tells the userspace object whether to skip the atomics test. Change-Id: Iecc12f35f0ded4a1dd805cce1be576e7b27917ef Signed-off-by: Brendan Jackman --- tools/testing/selftests/bpf/Makefile | 4 + .../selftests/bpf/prog_tests/atomics_test.c | 262 ++ .../selftests/bpf/progs/atomics_test.c| 154 ++ .../selftests/bpf/verifier/atomic_and.c | 77 + .../selftests/bpf/verifier/atomic_cmpxchg.c | 96 +++ .../selftests/bpf/verifier/atomic_fetch_add.c | 106 +++ .../selftests/bpf/verifier/atomic_or.c| 77 + .../selftests/bpf/verifier/atomic_xchg.c | 46 +++ .../selftests/bpf/verifier/atomic_xor.c | 77 + 9 files changed, 899 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomics_test.c create mode 100644 tools/testing/selftests/bpf/progs/atomics_test.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_and.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_fetch_add.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_or.c create mode 100644 
tools/testing/selftests/bpf/verifier/atomic_xchg.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_xor.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f21c4841a612..448a9eb1a56c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -431,11 +431,15 @@ TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) +ifeq ($(feature-clang-bpf-atomics),1) + TRUNNER_BPF_CFLAGS += -DENABLE_ATOMICS_TESTS +endif TRUNNER_BPF_LDFLAGS := -mattr=+alu32 $(eval $(call DEFINE_TEST_RUNNER,test_progs)) # Define test_progs-no_alu32 test runner. TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE +TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) TRUNNER_BPF_LDFLAGS := $(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32)) diff --git a/tools/testing/selftests/bpf/prog_tests/atomics_test.c b/tools/testing/selftests/bpf/prog_tests/atomics_test.c new file mode 100644 index ..66f0ccf4f4ec --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomics_test.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + + +#include "atomics_test.skel.h" + +static struct atomics_test *setup(void) +{ + struct atomics_test *atomics_skel; + __u32 duration = 0, err; + + atomics_skel = atomics_test__open_and_load(); + if (CHECK(!atomics_skel, "atomics_skel_load", "atomics skeleton failed\n")) + return NULL; + + if (atomics_skel->data->skip_tests) { + printf("%s:SKIP:no ENABLE_ATOMICS_TEST (missing Clang BPF atomics support)", + __func__); + test__skip(); + goto err; + } + + err = atomics_test__attach(atomics_skel); + if (CHECK(err, "atomics_attach", "atomics attach failed: %d\n", err)) + goto err; + + return atomics_skel; + +err: + atomics_test__destroy(atomics_skel); + return NULL; +} + +static void test_add(void) +{ + struct atomics_test *atomics_skel; 
+ int err, prog_fd; + __u32 duration = 0, retval; + + atomics_skel = setup(); When running the test, I observed a noticeable delay between skel load and skel attach. The reason is the bpf program object file contains multiple programs and the above setup() tries to do attachment for ALL programs but actually below only "add" program is tested. This will unnecessarily increase test_progs running time. The best is for setup() here only load and attach program "add". The libbpf API bpf_program__set_autoloa