From cf7a5cba86fc2d3000c555b9568f7dd0f43bf0d4 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Wed, 11 Aug 2021 12:36:27 +0000 Subject: selftests/bpf: Fix running of XDP bonding tests An "innocent" cleanup in the last version of the XDP bonding patchset moved the "test__start_subtest" calls to the test main function, but I forgot to reverse the condition, which led to all tests being skipped. Fix it. Fixes: 6aab1c81b98a ("selftests/bpf: Add tests for XDP bonding") Signed-off-by: Jussi Maki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210811123627.20223-1-joamaki@gmail.com --- tools/testing/selftests/bpf/prog_tests/xdp_bonding.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c index 6b186b4238d0..370d220288a6 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c @@ -493,20 +493,20 @@ void test_xdp_bonding(void) "xdp_redirect_multi_kern__open_and_load")) goto out; - if (!test__start_subtest("xdp_bonding_attach")) + if (test__start_subtest("xdp_bonding_attach")) test_xdp_bonding_attach(&skeletons); for (i = 0; i < ARRAY_SIZE(bond_test_cases); i++) { struct bond_test_case *test_case = &bond_test_cases[i]; - if (!test__start_subtest(test_case->name)) + if (test__start_subtest(test_case->name)) test_xdp_bonding_with_mode( &skeletons, test_case->mode, test_case->xmit_policy); } - if (!test__start_subtest("xdp_bonding_redirect_multi")) + if (test__start_subtest("xdp_bonding_redirect_multi")) test_xdp_bonding_redirect_multi(&skeletons); out: -- cgit v1.2.3 From 2211c825e7b6b99bbcabab4e0130a2779275dcc3 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 11 Aug 2021 17:38:19 -0700 Subject: libbpf: Support weak typed ksyms. Currently weak typeless ksyms have default value zero, when they don't exist in the kernel. 
However, weak typed ksyms are rejected by libbpf if they can not be resolved. This means that if a bpf object contains the declaration of a nonexistent weak typed ksym, it will be rejected even if there is no program that references the symbol. Nonexistent weak typed ksyms can also default to zero just like typeless ones. This allows programs that access weak typed ksyms to be accepted by verifier, if the accesses are guarded. For example, extern const int bpf_link_fops3 __ksym __weak; /* then in BPF program */ if (&bpf_link_fops3) { /* use bpf_link_fops3 */ } If actual use of nonexistent typed ksym is not guarded properly, verifier would see that register is not PTR_TO_BTF_ID and wouldn't allow to use it for direct memory reads or passing it to BPF helpers. Signed-off-by: Hao Luo Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210812003819.2439037-1-haoluo@google.com --- tools/lib/bpf/libbpf.c | 16 ++++--- tools/testing/selftests/bpf/prog_tests/ksyms_btf.c | 31 ++++++++++++ .../testing/selftests/bpf/progs/test_ksyms_weak.c | 56 ++++++++++++++++++++++ 3 files changed, 96 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_ksyms_weak.c diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index cb106e8c42cb..ff3c0ee79d85 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5277,11 +5277,11 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } insn[1].imm = ext->kcfg.data_off; } else /* EXT_KSYM */ { - if (ext->ksym.type_id) { /* typed ksyms */ + if (ext->ksym.type_id && ext->is_set) { /* typed ksyms */ insn[0].src_reg = BPF_PSEUDO_BTF_ID; insn[0].imm = ext->ksym.kernel_btf_id; insn[1].imm = ext->ksym.kernel_btf_obj_fd; - } else { /* typeless ksyms */ + } else { /* typeless ksyms or unresolved typed ksyms */ insn[0].imm = (__u32)ext->ksym.addr; insn[1].imm = ext->ksym.addr >> 32; } @@ -6608,11 +6608,8 @@ static int find_ksym_btf_id(struct bpf_object *obj, const 
char *ksym_name, break; } } - if (id <= 0) { - pr_warn("extern (%s ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", - __btf_kind_str(kind), ksym_name); + if (id <= 0) return -ESRCH; - } *res_btf = btf; *res_btf_fd = btf_fd; @@ -6629,8 +6626,13 @@ static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, struct btf *btf = NULL; id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &btf_fd); - if (id < 0) + if (id == -ESRCH && ext->is_weak) { + return 0; + } else if (id < 0) { + pr_warn("extern (var ksym) '%s': not found in kernel BTF\n", + ext->name); return id; + } /* find local type_id */ local_type_id = ext->ksym.type_id; diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index 67bebd324147..cf3acfa5a91d 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -6,6 +6,7 @@ #include #include "test_ksyms_btf.skel.h" #include "test_ksyms_btf_null_check.skel.h" +#include "test_ksyms_weak.skel.h" static int duration; @@ -81,6 +82,33 @@ static void test_null_check(void) test_ksyms_btf_null_check__destroy(skel); } +static void test_weak_syms(void) +{ + struct test_ksyms_weak *skel; + struct test_ksyms_weak__data *data; + int err; + + skel = test_ksyms_weak__open_and_load(); + if (CHECK(!skel, "test_ksyms_weak__open_and_load", "failed\n")) + return; + + err = test_ksyms_weak__attach(skel); + if (CHECK(err, "test_ksyms_weak__attach", "skeleton attach failed: %d\n", err)) + goto cleanup; + + /* trigger tracepoint */ + usleep(1); + + data = skel->data; + ASSERT_EQ(data->out__existing_typed, 0, "existing typed ksym"); + ASSERT_NEQ(data->out__existing_typeless, -1, "existing typeless ksym"); + ASSERT_EQ(data->out__non_existent_typeless, 0, "nonexistent typeless ksym"); + ASSERT_EQ(data->out__non_existent_typed, 0, "nonexistent typed ksym"); + +cleanup: + test_ksyms_weak__destroy(skel); +} + void test_ksyms_btf(void) { int 
percpu_datasec; @@ -105,4 +133,7 @@ void test_ksyms_btf(void) if (test__start_subtest("null_check")) test_null_check(); + + if (test__start_subtest("weak_ksyms")) + test_weak_syms(); } diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_weak.c b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c new file mode 100644 index 000000000000..5f8379aadb29 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test weak ksyms. + * + * Copyright (c) 2021 Google + */ + +#include "vmlinux.h" + +#include + +int out__existing_typed = -1; +__u64 out__existing_typeless = -1; + +__u64 out__non_existent_typeless = -1; +__u64 out__non_existent_typed = -1; + +/* existing weak symbols */ + +/* test existing weak symbols can be resolved. */ +extern const struct rq runqueues __ksym __weak; /* typed */ +extern const void bpf_prog_active __ksym __weak; /* typeless */ + + +/* non-existent weak symbols. */ + +/* typeless symbols, default to zero. */ +extern const void bpf_link_fops1 __ksym __weak; + +/* typed symbols, default to zero. */ +extern const int bpf_link_fops2 __ksym __weak; + +SEC("raw_tp/sys_enter") +int pass_handler(const void *ctx) +{ + struct rq *rq; + + /* tests existing symbols. */ + rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0); + if (rq) + out__existing_typed = rq->cpu; + out__existing_typeless = (__u64)&bpf_prog_active; + + /* tests non-existent symbols. */ + out__non_existent_typeless = (__u64)&bpf_link_fops1; + + /* tests non-existent symbols. 
*/ + out__non_existent_typed = (__u64)&bpf_link_fops2; + + if (&bpf_link_fops2) /* can't happen */ + out__non_existent_typed = (__u64)bpf_per_cpu_ptr(&bpf_link_fops2, 0); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From d164dd9a5c08c16a883b3de97d13948c7be7fa4d Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 13 Aug 2021 00:48:14 +0200 Subject: selftests/bpf: Fix test_core_autosize on big-endian machines The "probed" part of test_core_autosize copies an integer using bpf_core_read() into an integer of a potentially different size. On big-endian machines a destination offset is required for this to produce a sensible result. Fixes: 888d83b961f6 ("selftests/bpf: Validate libbpf's auto-sizing of LD/ST/STX instructions") Signed-off-by: Ilya Leoshkevich Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210812224814.187460-1-iii@linux.ibm.com --- .../testing/selftests/bpf/progs/test_core_autosize.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_core_autosize.c b/tools/testing/selftests/bpf/progs/test_core_autosize.c index 44f5aa2e8956..9a7829c5e4a7 100644 --- a/tools/testing/selftests/bpf/progs/test_core_autosize.c +++ b/tools/testing/selftests/bpf/progs/test_core_autosize.c @@ -125,6 +125,16 @@ int handle_downsize(void *ctx) return 0; } +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define bpf_core_read_int bpf_core_read +#else +#define bpf_core_read_int(dst, sz, src) ({ \ + /* Prevent "subtraction from stack pointer prohibited" */ \ + volatile long __off = sizeof(*dst) - (sz); \ + bpf_core_read((char *)(dst) + __off, sz, src); \ +}) +#endif + SEC("raw_tp/sys_enter") int handle_probed(void *ctx) { @@ -132,23 +142,23 @@ int handle_probed(void *ctx) __u64 tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->ptr), &in->ptr); + bpf_core_read_int(&tmp, bpf_core_field_size(in->ptr), &in->ptr); ptr_probed = tmp; tmp = 0; - 
bpf_core_read(&tmp, bpf_core_field_size(in->val1), &in->val1); + bpf_core_read_int(&tmp, bpf_core_field_size(in->val1), &in->val1); val1_probed = tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->val2), &in->val2); + bpf_core_read_int(&tmp, bpf_core_field_size(in->val2), &in->val2); val2_probed = tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->val3), &in->val3); + bpf_core_read_int(&tmp, bpf_core_field_size(in->val3), &in->val3); val3_probed = tmp; tmp = 0; - bpf_core_read(&tmp, bpf_core_field_size(in->val4), &in->val4); + bpf_core_read_int(&tmp, bpf_core_field_size(in->val4), &in->val4); val4_probed = tmp; return 0; -- cgit v1.2.3 From f1248dee954c2ddb0ece47a13591e5d55d422d22 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 13 Aug 2021 16:05:29 -0700 Subject: bpf: Allow bpf_get_netns_cookie in BPF_PROG_TYPE_CGROUP_SOCKOPT This is similar to existing BPF_PROG_TYPE_CGROUP_SOCK and BPF_PROG_TYPE_CGROUP_SOCK_ADDR. Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210813230530.333779-2-sdf@google.com --- kernel/bpf/cgroup.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b567ca46555c..9f6070369caa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1846,11 +1846,29 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; +#ifdef CONFIG_NET +BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx) +{ + const struct net *net = ctx ? 
sock_net(ctx->sk) : &init_net; + + return net->net_cookie; +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { + .func = bpf_get_netns_cookie_sockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; +#endif + static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #ifdef CONFIG_NET + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sockopt_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: -- cgit v1.2.3 From 6a3a3dcc3f0e5dde3c9417f0419ff8efbab60c60 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 13 Aug 2021 16:05:30 -0700 Subject: selftests/bpf: Verify bpf_get_netns_cookie in BPF_PROG_TYPE_CGROUP_SOCKOPT Add extra calls to sockopt_sk.c. Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210813230530.333779-3-sdf@google.com --- tools/testing/selftests/bpf/progs/sockopt_sk.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index 8acdb99b5959..79c8139b63b8 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -33,6 +33,14 @@ int _getsockopt(struct bpf_sockopt *ctx) __u8 *optval = ctx->optval; struct sockopt_sk *storage; + /* Make sure bpf_get_netns_cookie is callable. + */ + if (bpf_get_netns_cookie(NULL) == 0) + return 0; + + if (bpf_get_netns_cookie(ctx) == 0) + return 0; + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { /* Not interested in SOL_IP:IP_TOS; * let next BPF program in the cgroup chain or kernel @@ -123,6 +131,14 @@ int _setsockopt(struct bpf_sockopt *ctx) __u8 *optval = ctx->optval; struct sockopt_sk *storage; + /* Make sure bpf_get_netns_cookie is callable. 
+ */ + if (bpf_get_netns_cookie(NULL) == 0) + return 0; + + if (bpf_get_netns_cookie(ctx) == 0) + return 0; + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { /* Not interested in SOL_IP:IP_TOS; * let next BPF program in the cgroup chain or kernel -- cgit v1.2.3 From d1bf7c4d5deae6685a42463f4d29418fd2515d05 Mon Sep 17 00:00:00 2001 From: Muhammad Falak R Wani Date: Sun, 15 Aug 2021 12:20:13 +0530 Subject: samples/bpf: Define MAX_ENTRIES instead of a magic number in offwaketime Define MAX_ENTRIES instead of using 10000 as a magic number in various places. Signed-off-by: Muhammad Falak R Wani Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210815065013.15411-1-falakreyaz@gmail.com --- samples/bpf/offwaketime_kern.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c index 14b792915a9c..4866afd054da 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime_kern.c @@ -20,6 +20,7 @@ }) #define MINBLOCK_US 1 +#define MAX_ENTRIES 10000 struct key_t { char waker[TASK_COMM_LEN]; @@ -32,14 +33,14 @@ struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, struct key_t); __type(value, u64); - __uint(max_entries, 10000); + __uint(max_entries, MAX_ENTRIES); } counts SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, u32); __type(value, u64); - __uint(max_entries, 10000); + __uint(max_entries, MAX_ENTRIES); } start SEC(".maps"); struct wokeby_t { @@ -51,14 +52,14 @@ struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, u32); __type(value, struct wokeby_t); - __uint(max_entries, 10000); + __uint(max_entries, MAX_ENTRIES); } wokeby SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_STACK_TRACE); __uint(key_size, sizeof(u32)); __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); - __uint(max_entries, 10000); + __uint(max_entries, MAX_ENTRIES); } stackmap SEC(".maps"); #define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) -- cgit v1.2.3 From 
2c860a43dd77f969bb959336a2f743d7103a8f63 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 14 Aug 2021 10:57:15 +0900 Subject: bpf: af_unix: Implement BPF iterator for UNIX domain socket. This patch implements the BPF iterator for the UNIX domain socket. Currently, the batch optimisation introduced for the TCP iterator in the commit 04c7820b776f ("bpf: tcp: Bpf iter batching and lock_sock") is not used for the UNIX domain socket. It will require replacing the big lock for the hash table with small locks for each hash list not to block other processes. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210814015718.42704-2-kuniyu@amazon.co.jp --- include/linux/btf_ids.h | 3 +- net/unix/af_unix.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 57890b357f85..bed4b9964581 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -172,7 +172,8 @@ extern struct btf_id_set name; BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock) \ BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock) \ - BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock) \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UNIX, unix_sock) enum { #define BTF_SOCK_TYPE(name, str) name, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1c2224f05b51..bad8f19174e3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -113,6 +113,7 @@ #include #include #include +#include #include "scm.h" @@ -3143,6 +3144,64 @@ static const struct seq_operations unix_seq_ops = { .stop = unix_seq_stop, .show = unix_seq_show, }; + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__unix { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct unix_sock *, unix_sk); + uid_t uid __aligned(8); +}; + 
+static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) +{ + struct bpf_iter__unix ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.unix_sk = unix_sk; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return unix_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)unix_prog_seq_show(prog, &meta, v, 0); + } + + unix_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_unix_seq_ops = { + .start = unix_seq_start, + .next = unix_seq_next, + .stop = bpf_iter_unix_seq_stop, + .show = bpf_iter_unix_seq_show, +}; +#endif #endif static const struct net_proto_family unix_family_ops = { @@ -3183,6 +3242,35 @@ static struct pernet_operations unix_net_ops = { .exit = unix_net_exit, }; +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) + +static const struct bpf_iter_seq_info unix_seq_info = { + .seq_ops = &bpf_iter_unix_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct seq_net_private), +}; + +static struct bpf_iter_reg unix_reg_info = { + .target = "unix", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__unix, unix_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &unix_seq_info, +}; + +static void 
__init bpf_iter_register(void) +{ + unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; + if (bpf_iter_reg_target(&unix_reg_info)) + pr_warn("Warning: could not register bpf iterator unix\n"); +} +#endif + static int __init af_unix_init(void) { int rc = -1; @@ -3198,6 +3286,11 @@ static int __init af_unix_init(void) sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); unix_bpf_build_proto(); + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif + out: return rc; } -- cgit v1.2.3 From 3478cfcfcddff0f3aad82891be2992e51c4f7936 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 14 Aug 2021 10:57:16 +0900 Subject: bpf: Support "%c" in bpf_bprintf_prepare(). /proc/net/unix uses "%c" to print a single-byte character to escape '\0' in the name of the abstract UNIX domain socket. The following selftest uses it, so this patch adds support for "%c". Note that it does not support wide character ("%lc" and "%llc") for simplicity. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210814015718.42704-3-kuniyu@amazon.co.jp --- kernel/bpf/helpers.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 32761be48143..4e8540716187 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -907,6 +907,20 @@ fmt_str: tmp_buf += err; num_spec++; + continue; + } else if (fmt[i] == 'c') { + if (!tmp_buf) + goto nocopy_fmt; + + if (tmp_buf_end == tmp_buf) { + err = -ENOSPC; + goto out; + } + + *tmp_buf = raw_args[num_spec]; + tmp_buf++; + num_spec++; + continue; } -- cgit v1.2.3 From 04e928180c14332fb15a1b8c64418b602978a51e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 14 Aug 2021 10:57:17 +0900 Subject: selftest/bpf: Implement sample UNIX domain socket iterator program. 
The iterator can output almost the same result compared to /proc/net/unix. The header line is aligned, and the Inode column uses "%8lu" because "%5lu" can be easily overflown. # cat /sys/fs/bpf/unix Num RefCount Protocol Flags Type St Inode Path ffff963c06689800: 00000002 00000000 00010000 0001 01 18697 private/defer ffff963c7c979c00: 00000002 00000000 00000000 0001 01 598245 @Hello@World@ # cat /proc/net/unix Num RefCount Protocol Flags Type St Inode Path ffff963c06689800: 00000002 00000000 00010000 0001 01 18697 private/defer ffff963c7c979c00: 00000002 00000000 00000000 0001 01 598245 @Hello@World@ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210814015718.42704-4-kuniyu@amazon.co.jp --- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 16 +++++ tools/testing/selftests/bpf/progs/bpf_iter.h | 8 +++ tools/testing/selftests/bpf/progs/bpf_iter_unix.c | 80 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/bpf_tracing_net.h | 4 ++ 4 files changed, 108 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_unix.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 1f1aade56504..77ac24b191d4 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -13,6 +13,7 @@ #include "bpf_iter_tcp6.skel.h" #include "bpf_iter_udp4.skel.h" #include "bpf_iter_udp6.skel.h" +#include "bpf_iter_unix.skel.h" #include "bpf_iter_test_kern1.skel.h" #include "bpf_iter_test_kern2.skel.h" #include "bpf_iter_test_kern3.skel.h" @@ -313,6 +314,19 @@ static void test_udp6(void) bpf_iter_udp6__destroy(skel); } +static void test_unix(void) +{ + struct bpf_iter_unix *skel; + + skel = bpf_iter_unix__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_unix__open_and_load")) + return; + + do_dummy_read(skel->progs.dump_unix); + + bpf_iter_unix__destroy(skel); +} + /* 
The expected string is less than 16 bytes */ static int do_read_with_fd(int iter_fd, const char *expected, bool read_one_char) @@ -1255,6 +1269,8 @@ void test_bpf_iter(void) test_udp4(); if (test__start_subtest("udp6")) test_udp6(); + if (test__start_subtest("unix")) + test_unix(); if (test__start_subtest("anon")) test_anon_iter(false); if (test__start_subtest("anon-read-one-char")) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index 3d83b185c4bc..8cfaeba1ddbf 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -12,6 +12,7 @@ #define tcp6_sock tcp6_sock___not_used #define bpf_iter__udp bpf_iter__udp___not_used #define udp6_sock udp6_sock___not_used +#define bpf_iter__unix bpf_iter__unix___not_used #define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used #define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used #define bpf_iter__sockmap bpf_iter__sockmap___not_used @@ -32,6 +33,7 @@ #undef tcp6_sock #undef bpf_iter__udp #undef udp6_sock +#undef bpf_iter__unix #undef bpf_iter__bpf_map_elem #undef bpf_iter__bpf_sk_storage_map #undef bpf_iter__sockmap @@ -103,6 +105,12 @@ struct udp6_sock { struct ipv6_pinfo inet6; } __attribute__((preserve_access_index)); +struct bpf_iter__unix { + struct bpf_iter_meta *meta; + struct unix_sock *unix_sk; + uid_t uid; +} __attribute__((preserve_access_index)); + struct bpf_iter__bpf_map_elem { struct bpf_iter_meta *meta; struct bpf_map *map; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_unix.c b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c new file mode 100644 index 000000000000..94423902685d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_unix.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. 
*/ +#include "bpf_iter.h" +#include "bpf_tracing_net.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +static long sock_i_ino(const struct sock *sk) +{ + const struct socket *sk_socket = sk->sk_socket; + const struct inode *inode; + unsigned long ino; + + if (!sk_socket) + return 0; + + inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode; + bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino); + return ino; +} + +SEC("iter/unix") +int dump_unix(struct bpf_iter__unix *ctx) +{ + struct unix_sock *unix_sk = ctx->unix_sk; + struct sock *sk = (struct sock *)unix_sk; + struct seq_file *seq; + __u32 seq_num; + + if (!unix_sk) + return 0; + + seq = ctx->meta->seq; + seq_num = ctx->meta->seq_num; + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, "Num RefCount Protocol Flags Type St Inode Path\n"); + + BPF_SEQ_PRINTF(seq, "%pK: %08X %08X %08X %04X %02X %8lu", + unix_sk, + sk->sk_refcnt.refs.counter, + 0, + sk->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, + sk->sk_type, + sk->sk_socket ? + (sk->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : + (sk->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), + sock_i_ino(sk)); + + if (unix_sk->addr) { + if (!UNIX_ABSTRACT(unix_sk)) { + BPF_SEQ_PRINTF(seq, " %s", unix_sk->addr->name->sun_path); + } else { + /* The name of the abstract UNIX domain socket starts + * with '\0' and can contain '\0'. The null bytes + * should be escaped as done in unix_seq_show(). + */ + __u64 i, len; + + len = unix_sk->addr->len - sizeof(short); + + BPF_SEQ_PRINTF(seq, " @"); + + for (i = 1; i < len; i++) { + /* unix_mkname() tests this upper bound. 
*/ + if (i >= sizeof(struct sockaddr_un)) + break; + + BPF_SEQ_PRINTF(seq, "%c", + unix_sk->addr->name->sun_path[i] ?: + '@'); + } + } + } + + BPF_SEQ_PRINTF(seq, "\n"); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 3af0998a0623..eef5646ddb19 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -5,6 +5,10 @@ #define AF_INET 2 #define AF_INET6 10 +#define __SO_ACCEPTCON (1 << 16) +#define UNIX_HASH_SIZE 256 +#define UNIX_ABSTRACT(unix_sk) (unix_sk->addr->hash < UNIX_HASH_SIZE) + #define SOL_TCP 6 #define TCP_CONGESTION 13 #define TCP_CA_NAME_MAX 16 -- cgit v1.2.3 From ce547335d4a42e645320402b24aeadb39531f73c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 14 Aug 2021 10:57:18 +0900 Subject: selftest/bpf: Extend the bpf_snprintf() test for "%c". This patch adds various "positive" patterns for "%c" and two "negative" patterns for wide character. 
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210814015718.42704-5-kuniyu@amazon.co.jp --- tools/testing/selftests/bpf/prog_tests/snprintf.c | 4 +++- tools/testing/selftests/bpf/progs/test_snprintf.c | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c index dffbcaa1ec98..8fd1b4b29a0e 100644 --- a/tools/testing/selftests/bpf/prog_tests/snprintf.c +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -19,7 +19,7 @@ #define EXP_ADDR_OUT "0000000000000000 ffff00000add4e55 " #define EXP_ADDR_RET sizeof(EXP_ADDR_OUT "unknownhashedptr") -#define EXP_STR_OUT "str1 longstr" +#define EXP_STR_OUT "str1 a b c d e longstr" #define EXP_STR_RET sizeof(EXP_STR_OUT) #define EXP_OVER_OUT "%over" @@ -114,6 +114,8 @@ void test_snprintf_negative(void) ASSERT_ERR(load_single_snprintf("%"), "invalid specifier 3"); ASSERT_ERR(load_single_snprintf("%12345678"), "invalid specifier 4"); ASSERT_ERR(load_single_snprintf("%--------"), "invalid specifier 5"); + ASSERT_ERR(load_single_snprintf("%lc"), "invalid specifier 6"); + ASSERT_ERR(load_single_snprintf("%llc"), "invalid specifier 7"); ASSERT_ERR(load_single_snprintf("\x80"), "non ascii character"); ASSERT_ERR(load_single_snprintf("\x1"), "non printable character"); } diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c index e2ad26150f9b..8fda07544023 100644 --- a/tools/testing/selftests/bpf/progs/test_snprintf.c +++ b/tools/testing/selftests/bpf/progs/test_snprintf.c @@ -59,9 +59,9 @@ int handler(const void *ctx) /* Kernel pointers */ addr_ret = BPF_SNPRINTF(addr_out, sizeof(addr_out), "%pK %px %p", 0, 0xFFFF00000ADD4E55, 0xFFFF00000ADD4E55); - /* Strings embedding */ - str_ret = BPF_SNPRINTF(str_out, sizeof(str_out), "%s %+05s", - str1, longstr); + /* Strings and single-byte character embedding */ + 
str_ret = BPF_SNPRINTF(str_out, sizeof(str_out), "%s % 9c %+2c %-3c %04c %0c %+05s", + str1, 'a', 'b', 'c', 'd', 'e', longstr); /* Overflow */ over_ret = BPF_SNPRINTF(over_out, sizeof(over_out), "%%overflow"); /* Padding of fixed width numbers */ -- cgit v1.2.3 From 1bda52f80471260bcc7391f4e6919effedfc88d8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 15 Aug 2021 22:39:50 +0100 Subject: bpf, tests: Fix spelling mistake "shoft" -> "shift" There is a spelling mistake in a literal string. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815213950.47751-1-colin.king@canonical.com --- lib/test_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 44d8197bbffb..77fe6fde56c5 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -5163,7 +5163,7 @@ static struct bpf_test tests[] = { { { 0, -1 } } }, { - "ALU64_ARSH_K: Zero shoft", + "ALU64_ARSH_K: Zero shift", .u.insns_int = { BPF_LD_IMM64(R0, 0x8123456789abcdefLL), BPF_ALU64_IMM(BPF_ARSH, R0, 0), -- cgit v1.2.3 From fb7dd8bca0139fd73d3f4a6cd257b11731317ded Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:54 -0700 Subject: bpf: Refactor BPF_PROG_RUN into a function Turn BPF_PROG_RUN into a proper always inlined function. No functional or performance changes are intended, but it makes it much easier to understand what's going on with how BPF programs actually get executed. It's more obvious what types and callbacks are expected. Also extra () around input parameters can be dropped, as well as `__` variable prefixes intended to avoid naming collisions, which makes the code simpler to read and write. This refactoring also highlighted one extra issue. BPF_PROG_RUN is both a macro and an enum value (BPF_PROG_RUN == BPF_PROG_TEST_RUN). Turning BPF_PROG_RUN into a function causes a naming conflict compilation error. 
So rename BPF_PROG_RUN into lower-case bpf_prog_run(), similar to bpf_prog_run_xdp(), bpf_prog_run_pin_on_cpu(), etc. All existing callers of BPF_PROG_RUN, the macro, are switched to bpf_prog_run() explicitly. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-2-andrii@kernel.org --- Documentation/networking/filter.rst | 4 +-- drivers/media/rc/bpf-lirc.c | 2 +- drivers/net/ppp/ppp_generic.c | 8 ++--- drivers/net/team/team_mode_loadbalance.c | 2 +- include/linux/bpf.h | 2 +- include/linux/filter.h | 61 +++++++++++++++++++------------- kernel/bpf/bpf_iter.c | 2 +- kernel/bpf/cgroup.c | 16 ++++----- kernel/bpf/core.c | 2 +- kernel/bpf/trampoline.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/events/core.c | 2 +- kernel/trace/bpf_trace.c | 4 +-- lib/test_bpf.c | 2 +- net/bpf/test_run.c | 6 ++-- net/core/filter.c | 4 +-- net/core/ptp_classifier.c | 2 +- net/netfilter/xt_bpf.c | 2 +- net/sched/act_bpf.c | 4 +-- net/sched/cls_bpf.c | 4 +-- 20 files changed, 73 insertions(+), 60 deletions(-) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 5f13905b12e0..ce2b8e8bb9ab 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -638,8 +638,8 @@ extension, PTP dissector/classifier, and much more. They are all internally converted by the kernel into the new instruction set representation and run in the eBPF interpreter. For in-kernel handlers, this all works transparently by using bpf_prog_create() for setting up the filter, resp. -bpf_prog_destroy() for destroying it. The macro -BPF_PROG_RUN(filter, ctx) transparently invokes eBPF interpreter or JITed +bpf_prog_destroy() for destroying it. The function +bpf_prog_run(filter, ctx) transparently invokes eBPF interpreter or JITed code to run the filter. 'filter' is a pointer to struct bpf_prog that we got from bpf_prog_create(), and 'ctx' the given context (e.g. 
skb pointer). All constraints and restrictions from bpf_check_classic() apply diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index afae0afe3f81..bb5a9dc78f1b 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -217,7 +217,7 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample) raw->bpf_sample = sample; if (raw->progs) - BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN); + BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, bpf_prog_run); } /* diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index e9e81573f21e..fb52cd175b45 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1744,7 +1744,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) a four-byte PPP header on each packet */ *(u8 *)skb_push(skb, 2) = 1; if (ppp->pass_filter && - BPF_PROG_RUN(ppp->pass_filter, skb) == 0) { + bpf_prog_run(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "PPP: outbound frame " @@ -1754,7 +1754,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) } /* if this packet passes the active filter, record the time */ if (!(ppp->active_filter && - BPF_PROG_RUN(ppp->active_filter, skb) == 0)) + bpf_prog_run(ppp->active_filter, skb) == 0)) ppp->last_xmit = jiffies; skb_pull(skb, 2); #else @@ -2468,7 +2468,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) *(u8 *)skb_push(skb, 2) = 0; if (ppp->pass_filter && - BPF_PROG_RUN(ppp->pass_filter, skb) == 0) { + bpf_prog_run(ppp->pass_filter, skb) == 0) { if (ppp->debug & 1) netdev_printk(KERN_DEBUG, ppp->dev, "PPP: inbound frame " @@ -2477,7 +2477,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) return; } if (!(ppp->active_filter && - BPF_PROG_RUN(ppp->active_filter, skb) == 0)) + bpf_prog_run(ppp->active_filter, skb) == 0)) ppp->last_recv = jiffies; __skb_pull(skb, 2); } else diff --git a/drivers/net/team/team_mode_loadbalance.c 
b/drivers/net/team/team_mode_loadbalance.c index 32aef8ac4a14..b095a4b4957b 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -197,7 +197,7 @@ static unsigned int lb_get_skb_hash(struct lb_priv *lb_priv, fp = rcu_dereference_bh(lb_priv->fp); if (unlikely(!fp)) return 0; - lhash = BPF_PROG_RUN(fp, skb); + lhash = bpf_prog_run(fp, skb); c = (char *) &lhash; return c[0] ^ c[1] ^ c[2] ^ c[3]; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c8cc09013210..968fea98087a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1103,7 +1103,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, /* an array of programs to be executed under rcu_lock. * * Typical usage: - * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN); + * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run); * * the structure returned by bpf_prog_array_alloc() should be populated * with program pointers and the last pointer must be NULL. 
diff --git a/include/linux/filter.h b/include/linux/filter.h index 1797e8506929..954373db20e7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -600,25 +600,38 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); -#define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 __ret; \ - cant_migrate(); \ - if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *__stats; \ - u64 __start = sched_clock(); \ - __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - __stats = this_cpu_ptr(prog->stats); \ - u64_stats_update_begin(&__stats->syncp); \ - __stats->cnt++; \ - __stats->nsecs += sched_clock() - __start; \ - u64_stats_update_end(&__stats->syncp); \ - } else { \ - __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - } \ - __ret; }) - -#define BPF_PROG_RUN(prog, ctx) \ - __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) +typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, + const struct bpf_insn *insnsi, + unsigned int (*bpf_func)(const void *, + const struct bpf_insn *)); + +static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, + const void *ctx, + bpf_dispatcher_fn dfunc) +{ + u32 ret; + + cant_migrate(); + if (static_branch_unlikely(&bpf_stats_enabled_key)) { + struct bpf_prog_stats *stats; + u64 start = sched_clock(); + + ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + stats = this_cpu_ptr(prog->stats); + u64_stats_update_begin(&stats->syncp); + stats->cnt++; + stats->nsecs += sched_clock() - start; + u64_stats_update_end(&stats->syncp); + } else { + ret = dfunc(ctx, prog->insnsi, prog->bpf_func); + } + return ret; +} + +static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) +{ + return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); +} /* * Use in preemptible and therefore migratable context to make sure that @@ -637,7 +650,7 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, u32 ret; migrate_disable(); - ret = 
__BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func); + ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } @@ -742,7 +755,7 @@ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, memset(cb_data, 0, sizeof(cb_saved)); } - res = BPF_PROG_RUN(prog, skb); + res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); @@ -787,7 +800,7 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ - u32 act = __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); + u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) @@ -1440,7 +1453,7 @@ static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, }; u32 act; - act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN); + act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; @@ -1478,7 +1491,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, }; u32 act; - act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN); + act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2e9d47bb40ff..b2ee45064e06 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -686,7 +686,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) rcu_read_lock(); migrate_disable(); - ret = BPF_PROG_RUN(prog, ctx); + ret = bpf_prog_run(prog, ctx); migrate_enable(); rcu_read_unlock(); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f6070369caa..16dc467adfa0 100644 --- a/kernel/bpf/cgroup.c +++ 
b/kernel/bpf/cgroup.c @@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1091,7 +1091,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, - BPF_PROG_RUN, flags); + bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; } @@ -1121,7 +1121,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int ret; ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, - BPF_PROG_RUN); + bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1140,7 +1140,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, - BPF_PROG_RUN); + bpf_prog_run); rcu_read_unlock(); return !allow; @@ -1271,7 +1271,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1386,7 +1386,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1496,7 +1496,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); release_sock(sk); if 
(!ret) { @@ -1557,7 +1557,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, */ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 82af6279992d..5ee2ec27c3d4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1879,7 +1879,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. - * The BPF program will be executed via BPF_PROG_RUN() macro. + * The BPF program will be executed via bpf_prog_run() function. * * Return: the &fp argument along with &err set to 0 for success or * a negative errno code on failure diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index b2535acfe9db..fe1e857324e6 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -548,7 +548,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) u64_stats_update_end(&stats->syncp); } -/* The logic is similar to BPF_PROG_RUN, but with an explicit +/* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into * call __bpf_prog_enter diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ea2238a6656..f5a0077c9981 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12383,7 +12383,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - /* BPF_PROG_RUN doesn't call subprogs directly, + /* bpf_prog_run() doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. 
* subprogs don't have IDs and not reachable via prog_get_next_id * func[i]->stats will never be accessed and stays NULL diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cb1f9b8392e..7d20743b48e1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9913,7 +9913,7 @@ static void bpf_overflow_handler(struct perf_event *event, if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = BPF_PROG_RUN(event->prog, &ctx); + ret = bpf_prog_run(event->prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0da94e1d6af9..05a5a556671d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, bpf_prog_run); out: __this_cpu_dec(bpf_prog_active); @@ -1816,7 +1816,7 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); rcu_read_lock(); - (void) BPF_PROG_RUN(prog, args); + (void) bpf_prog_run(prog, args); rcu_read_unlock(); } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 77fe6fde56c5..830a18ecffc8 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -8616,7 +8616,7 @@ static int __run_one(const struct bpf_prog *fp, const void *data, start = ktime_get_ns(); for (i = 0; i < runs; i++) - ret = BPF_PROG_RUN(fp, data); + ret = bpf_prog_run(fp, data); finish = ktime_get_ns(); migrate_enable(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 4b855af267b1..2eb0e55ef54d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -116,7 +116,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else - *retval = 
BPF_PROG_RUN(prog, ctx); + *retval = bpf_prog_run(prog, ctx); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); @@ -327,7 +327,7 @@ __bpf_prog_test_run_raw_tp(void *data) struct bpf_raw_tp_test_run_info *info = data; rcu_read_lock(); - info->retval = BPF_PROG_RUN(info->prog, info->ctx); + info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); } @@ -989,7 +989,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; - retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); bpf_test_timer_leave(&t); diff --git a/net/core/filter.c b/net/core/filter.c index 3aca07c44fad..5cf38e8886f1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to BPF_PROG_RUN. It returns 0 if the packet should + * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. 
* */ @@ -10115,7 +10115,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); - action = BPF_PROG_RUN(prog, &reuse_kern); + action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index e33fde06d528..dd4cf01d1e0a 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -103,7 +103,7 @@ static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return BPF_PROG_RUN(ptp_insns, skb); + return bpf_prog_run(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 13cf3f9b5938..849ac552a154 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; - return BPF_PROG_RUN(info->filter, skb); + return bpf_prog_run(info->filter, skb); } static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 040807aa15b9..5c36013339e1 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -47,11 +47,11 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, if (at_ingress) { __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); } if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 3b472bafdc9d..df19a847829e 100644 --- a/net/sched/cls_bpf.c +++ 
b/net/sched/cls_bpf.c @@ -96,11 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); } if (prog->exts_integrated) { -- cgit v1.2.3 From 7d08c2c9117113fee118487425ed55efa50cbfa9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:55 -0700 Subject: bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions Similar to BPF_PROG_RUN, turn BPF_PROG_RUN_ARRAY macros into proper functions with all the same readability and maintainability benefits. Making them into functions required shuffling around bpf_set_run_ctx/bpf_reset_run_ctx functions. Also, explicitly specifying the type of the BPF prog run callback required adjusting __bpf_prog_run_save_cb() to accept const void *, casted internally to const struct sk_buff. Further, split out a cgroup-specific BPF_PROG_RUN_ARRAY_CG and BPF_PROG_RUN_ARRAY_CG_FLAGS from the more generic BPF_PROG_RUN_ARRAY due to the differences in bpf_run_ctx used for those two different use cases. I think BPF_PROG_RUN_ARRAY_CG would benefit from further refactoring to accept struct cgroup and enum bpf_attach_type instead of bpf_prog_array, fetching cgrp->bpf.effective[type] and RCU-dereferencing it internally. But that required including include/linux/cgroup-defs.h, which I wasn't sure is ok with everyone. The remaining generic BPF_PROG_RUN_ARRAY function will be extended to pass-through user-provided context value in the next patch. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-3-andrii@kernel.org --- include/linux/bpf.h | 179 +++++++++++++++++++++++++++-------------------- include/linux/filter.h | 5 +- kernel/bpf/cgroup.c | 32 ++++----- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 124 insertions(+), 94 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 968fea98087a..344e0d4d8ef6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1146,67 +1146,116 @@ struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; - struct bpf_prog_array_item *prog_item; + const struct bpf_prog_array_item *prog_item; }; +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx = NULL; + +#ifdef CONFIG_BPF_SYSCALL + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; +#endif + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ +#ifdef CONFIG_BPF_SYSCALL + current->bpf_ctx = old_ctx; +#endif +} + /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. 
*/ #define BPF_RET_SET_CN (1 << 0) -#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ - ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - struct bpf_run_ctx *old_run_ctx; \ - struct bpf_cg_run_ctx run_ctx; \ - u32 _ret = 1; \ - u32 func_ret; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - _item = &_array->items[0]; \ - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ - while ((_prog = READ_ONCE(_item->prog))) { \ - run_ctx.prog_item = _item; \ - func_ret = func(_prog, ctx); \ - _ret &= (func_ret & 1); \ - *(ret_flags) |= (func_ret >> 1); \ - _item++; \ - } \ - bpf_reset_run_ctx(old_run_ctx); \ - rcu_read_unlock(); \ - migrate_enable(); \ - _ret; \ - }) - -#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ - ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - struct bpf_run_ctx *old_run_ctx; \ - struct bpf_cg_run_ctx run_ctx; \ - u32 _ret = 1; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - if (unlikely(check_non_null && !_array))\ - goto _out; \ - _item = &_array->items[0]; \ - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ - while ((_prog = READ_ONCE(_item->prog))) { \ - run_ctx.prog_item = _item; \ - _ret &= func(_prog, ctx); \ - _item++; \ - } \ - bpf_reset_run_ctx(old_run_ctx); \ -_out: \ - rcu_read_unlock(); \ - migrate_enable(); \ - _ret; \ - }) +typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); + +static __always_inline u32 +BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog, + u32 *ret_flags) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 ret = 1; + u32 func_ret; + + migrate_disable(); + 
rcu_read_lock(); + array = rcu_dereference(array_rcu); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + func_ret = run_prog(prog, ctx); + ret &= (func_ret & 1); + *(ret_flags) |= (func_ret >> 1); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return ret; +} + +static __always_inline u32 +BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 ret = 1; + + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(array_rcu); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + ret &= run_prog(prog, ctx); + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return ret; +} + +static __always_inline u32 +BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, + const void *ctx, bpf_prog_run_fn run_prog) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + u32 ret = 1; + + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(array_rcu); + if (unlikely(!array)) + goto out; + item = &array->items[0]; + while ((prog = READ_ONCE(item->prog))) { + ret &= run_prog(prog, ctx); + item++; + } +out: + rcu_read_unlock(); + migrate_enable(); + return ret; +} /* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs * so BPF programs can request cwr for TCP packets. 
@@ -1235,7 +1284,7 @@ _out: \ u32 _flags = 0; \ bool _cn; \ u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \ + _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ if (_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ @@ -1244,12 +1293,6 @@ _out: \ _ret; \ }) -#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true) - -#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ - __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false) - #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; @@ -1284,20 +1327,6 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } -static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) -{ - struct bpf_run_ctx *old_ctx; - - old_ctx = current->bpf_ctx; - current->bpf_ctx = new_ctx; - return old_ctx; -} - -static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) -{ - current->bpf_ctx = old_ctx; -} - extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/filter.h b/include/linux/filter.h index 954373db20e7..7d248941ecea 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -723,7 +723,7 @@ static inline void bpf_restore_data_end( cb->data_end = saved_data_end; } -static inline u8 *bpf_skb_cb(struct sk_buff *skb) +static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. 
Since this also needs to work with @@ -744,8 +744,9 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb) /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, - struct sk_buff *skb) + const void *ctx) { + const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 16dc467adfa0..a1dedba4c174 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1012,8 +1012,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } bpf_restore_data_end(skb, saved_data_end); @@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1090,8 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, - bpf_prog_run, flags); + ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx, + bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; } @@ -1120,8 +1120,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, - bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops, + bpf_prog_run); return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1139,8 +1139,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, - bpf_prog_run); + allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, + bpf_prog_run); rcu_read_unlock(); return !allow; @@ -1271,7 +1271,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1385,8 +1385,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1495,8 +1495,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1556,8 +1556,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". 
*/ - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 05a5a556671d..91867b14b222 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run); out: __this_cpu_dec(bpf_prog_active); -- cgit v1.2.3 From 652c1b17b85b9c195978c051aa283027529db1fe Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:56 -0700 Subject: bpf: Refactor perf_event_set_bpf_prog() to use struct bpf_prog input Make internal perf_event_set_bpf_prog() use struct bpf_prog pointer as an input argument, which makes it easier to re-use for other internal uses (coming up for BPF link in the next patch). BPF program FD is not as convenient and in some cases it's not available. So switch to struct bpf_prog, move the refcounting out to the caller, and let the caller do bpf_prog_put() in case of an error. This follows the approach of most of the other BPF internal functions.
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-4-andrii@kernel.org --- kernel/events/core.c | 61 ++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 7d20743b48e1..2f07718bd41c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5574,7 +5574,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -5637,7 +5637,22 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: - return perf_event_set_bpf_prog(event, arg); + { + struct bpf_prog *prog; + int err; + + prog = bpf_prog_get(arg); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = perf_event_set_bpf_prog(event, prog); + if (err) { + bpf_prog_put(prog); + return err; + } + + return 0; + } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; @@ -9923,10 +9938,8 @@ out: event->orig_overflow_handler(event, data, regs); } -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) { - struct bpf_prog *prog; - if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; @@ -9934,9 +9947,8 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) if (event->prog) 
return -EEXIST; - prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); - if (IS_ERR(prog)) - return PTR_ERR(prog); + if (prog->type != BPF_PROG_TYPE_PERF_EVENT) + return -EINVAL; if (event->attr.precise_ip && prog->call_get_stack && @@ -9952,7 +9964,6 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ - bpf_prog_put(prog); return -EPROTO; } @@ -9974,7 +9985,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) { return -EOPNOTSUPP; } @@ -10002,14 +10013,12 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { bool is_kprobe, is_tracepoint, is_syscall_tp; - struct bpf_prog *prog; - int ret; if (!perf_event_is_tracing(event)) - return perf_event_set_bpf_handler(event, prog_fd); + return perf_event_set_bpf_handler(event, prog); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; @@ -10018,38 +10027,24 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - prog = bpf_prog_get(prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || - (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { - /* valid fd, but invalid bpf program type */ - bpf_prog_put(prog); + (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return 
-EINVAL; - } /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) return -EINVAL; - } if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); - if (prog->aux->max_ctx_offset > off) { - bpf_prog_put(prog); + if (prog->aux->max_ctx_offset > off) return -EACCES; - } } - ret = perf_event_attach_bpf_prog(event, prog); - if (ret) - bpf_prog_put(prog); - return ret; + return perf_event_attach_bpf_prog(event, prog); } static void perf_event_free_bpf_prog(struct perf_event *event) @@ -10071,7 +10066,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -ENOENT; } -- cgit v1.2.3 From b89fbfbb854c9afc3047e8273cc3a694650b802e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:57 -0700 Subject: bpf: Implement minimal BPF perf link Introduce a new type of BPF link - BPF perf link. This brings perf_event-based BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into the common BPF link infrastructure, allowing to list all active perf_event based attachments, auto-detaching BPF program from perf_event when link's FD is closed, get generic BPF link fdinfo/get_info functionality. BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags are currently supported. Force-detaching and atomic BPF program updates are not yet implemented, but with perf_event-based BPF links we now have common framework for this without the need to extend ioctl()-based perf_event interface. One interesting consideration is a new value for bpf_attach_type, which BPF_LINK_CREATE command expects. 
Generally, it's either 1-to-1 mapping from bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to define a single BPF_PERF_EVENT attach type for all of them and adjust link_create()'s logic for checking correspondence between attach type and program type. The alternative would be to define three new attach types (e.g., BPF_KPROBE, BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by libbpf. I chose to not do this to avoid unnecessary proliferation of bpf_attach_type enum values and not have to deal with naming conflicts. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-5-andrii@kernel.org --- include/linux/bpf_types.h | 3 ++ include/linux/trace_events.h | 3 ++ include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 105 ++++++++++++++++++++++++++++++++++++++--- kernel/events/core.c | 10 ++-- tools/include/uapi/linux/bpf.h | 2 + 6 files changed, 112 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index ae3ac3a2018c..9c81724e4b98 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -136,3 +136,6 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #endif +#ifdef CONFIG_PERF_EVENTS +BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) +#endif diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index ad413b382a3c..8ac92560d3a3 100644 --- a/include/linux/trace_events.h +++ 
b/include/linux/trace_events.h @@ -803,6 +803,9 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +void perf_event_free_bpf_prog(struct perf_event *event); + void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a2068e39d23..80c03bedd6e6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2906,6 +2906,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = { .fill_link_info = bpf_raw_tp_link_fill_link_info, }; +#ifdef CONFIG_PERF_EVENTS +struct bpf_perf_link { + struct bpf_link link; + struct file *perf_file; +}; + +static void bpf_perf_link_release(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + struct perf_event *event = perf_link->perf_file->private_data; + + perf_event_free_bpf_prog(event); + fput(perf_link->perf_file); +} + +static void bpf_perf_link_dealloc(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + + kfree(perf_link); +} + +static const struct bpf_link_ops bpf_perf_link_lops = { + .release = 
bpf_perf_link_release, + .dealloc = bpf_perf_link_dealloc, +}; + +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_perf_link *link; + struct perf_event *event; + struct file *perf_file; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + perf_file = perf_event_get(attr->link_create.target_fd); + if (IS_ERR(perf_file)) + return PTR_ERR(perf_file); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_file; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + link->perf_file = perf_file; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_file; + } + + event = perf_file->private_data; + err = perf_event_set_bpf_prog(event, prog); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_file; + } + /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out_put_file: + fput(perf_file); + return err; +} +#endif /* CONFIG_PERF_EVENTS */ + #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) @@ -4147,15 +4220,26 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (ret) goto out; - if (prog->type == BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_EXT: ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; - } - - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { - ret = -EINVAL; - goto out; + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + if (attr->link_create.attach_type != BPF_PERF_EVENT) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; + default: + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if 
(ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } + break; } switch (ptype) { @@ -4179,6 +4263,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; +#endif +#ifdef CONFIG_PERF_EVENTS + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_KPROBE: + ret = bpf_perf_link_attach(attr, prog); + break; #endif default: ret = -EINVAL; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2f07718bd41c..9fd65667bcb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4697,7 +4697,6 @@ errout: } static void perf_event_free_filter(struct perf_event *event); -static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -5574,7 +5573,6 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -10013,7 +10011,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { bool is_kprobe, is_tracepoint, is_syscall_tp; @@ -10047,7 +10045,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *pr return perf_event_attach_bpf_prog(event, prog); } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); @@ -10066,12 +10064,12 @@ static void 
perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -ENOENT; } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3 From 82e6b1eee6a8875ef4eacfd60711cce6965c6b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:58 -0700 Subject: bpf: Allow to specify user-provided bpf_cookie for BPF perf links Add ability for users to specify custom u64 value (bpf_cookie) when creating BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event, tracepoints). This is useful for cases when the same BPF program is used for attaching and processing invocation of different tracepoints/kprobes/uprobes in a generic fashion, but such that each invocation is distinguished from each other (e.g., BPF program can look up additional information associated with a specific kernel function without having to rely on function IP lookups). This enables new use cases to be implemented simply and efficiently that previously were possible only through code generation (and thus multiple instances of almost identical BPF program) or compilation at runtime (BCC-style) on target hosts (even more expensive resource-wise). 
For uprobes it is not even possible in some cases to know function IP beforehand (e.g., when attaching to shared library without PID filtering, in which case base load address is not known for a library). This is done by storing u64 bpf_cookie in struct bpf_prog_array_item, corresponding to each attached and run BPF program. Given cgroup BPF programs already use two 8-byte pointers for their needs and cgroup BPF programs don't have (yet?) support for bpf_cookie, reuse that space through union of cgroup_storage and new bpf_cookie field. Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx. This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF program execution code, which luckily is now also split from BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper giving access to this user-provided cookie value from inside a BPF program. Generic perf_event BPF programs will access this value from perf_event itself through the passed-in BPF program context. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org --- drivers/media/rc/bpf-lirc.c | 4 ++-- include/linux/bpf.h | 16 +++++++++++++++- include/linux/perf_event.h | 1 + include/linux/trace_events.h | 6 +++--- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/core.c | 29 ++++++++++++++++++----------- kernel/bpf/syscall.c | 2 +- kernel/events/core.c | 21 ++++++++++++++------- kernel/trace/bpf_trace.c | 8 +++++--- tools/include/uapi/linux/bpf.h | 7 +++++++ 10 files changed, 73 insertions(+), 28 deletions(-) diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index bb5a9dc78f1b..3eff08d7b8e5 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -160,7 +160,7 @@ static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog) goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, 0, &new_array); if (ret < 0) goto unlock; @@ -193,7 +193,7 @@ static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog) } old_array = lirc_rcu_dereference(raw->progs); - ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, prog, NULL, 0, &new_array); /* * Do not use bpf_prog_array_delete_safe() as we would end up * with a dummy entry in the array, and the we would free the diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 344e0d4d8ef6..83c3cc5e90df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1114,7 +1114,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, */ struct bpf_prog_array_item { struct bpf_prog *prog; - struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + union { + struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + u64 bpf_cookie; + }; }; struct bpf_prog_array 
{ @@ -1140,6 +1143,7 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array); struct bpf_run_ctx {}; @@ -1149,6 +1153,11 @@ struct bpf_cg_run_ctx { const struct bpf_prog_array_item *prog_item; }; +struct bpf_trace_run_ctx { + struct bpf_run_ctx run_ctx; + u64 bpf_cookie; +}; + static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) { struct bpf_run_ctx *old_ctx = NULL; @@ -1239,6 +1248,8 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; u32 ret = 1; migrate_disable(); @@ -1246,11 +1257,14 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu, array = rcu_dereference(array_rcu); if (unlikely(!array)) goto out; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { + run_ctx.bpf_cookie = item->bpf_cookie; ret &= run_prog(prog, ctx); item++; } + bpf_reset_run_ctx(old_run_ctx); out: rcu_read_unlock(); migrate_enable(); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2d510ad750ed..fe156a8170aa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -762,6 +762,7 @@ struct perf_event { #ifdef CONFIG_BPF_SYSCALL perf_overflow_handler_t orig_overflow_handler; struct bpf_prog *prog; + u64 bpf_cookie; #endif #ifdef CONFIG_EVENT_TRACING diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8ac92560d3a3..8e0631a4b046 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -675,7 +675,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct 
trace_event_call *call, void *ctx); -int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); @@ -692,7 +692,7 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c } static inline int -perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -803,7 +803,7 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ee2ec27c3d4..91f24c7b38a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2119,13 +2119,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, 
int index, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog_array_item *existing; + struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; - int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to * the new array. @@ -2162,20 +2162,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; + new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; - for (; existing->prog; existing++) - if (existing->prog != exclude_prog && - existing->prog != &dummy_bpf_prog.prog) { - array->items[new_prog_idx++].prog = - existing->prog; - } + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog || + existing->prog == &dummy_bpf_prog.prog) + continue; + + new->prog = existing->prog; + new->bpf_cookie = existing->bpf_cookie; + new++; + } } - if (include_prog) - array->items[new_prog_idx++].prog = include_prog; - array->items[new_prog_idx].prog = NULL; + if (include_prog) { + new->prog = include_prog; + new->bpf_cookie = bpf_cookie; + new++; + } + new->prog = NULL; *new_array = array; return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 80c03bedd6e6..7420e1334ab2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2963,7 +2963,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro } event = perf_file->private_data; - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); if (err) { bpf_link_cleanup(&link_primer); goto out_put_file; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9fd65667bcb2..2d1e63dd97f2 100644 --- 
a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5643,7 +5643,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -9936,7 +9936,9 @@ out: event->orig_overflow_handler(event, data, regs); } -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ @@ -9966,6 +9968,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog } event->prog = prog; + event->bpf_cookie = bpf_cookie; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; @@ -9983,7 +9986,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -10011,12 +10016,13 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) - return perf_event_set_bpf_handler(event, prog); + return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; @@ -10042,7 +10048,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, 
struct bpf_prog *prog) return -EACCES; } - return perf_event_attach_bpf_prog(event, prog); + return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) @@ -10064,7 +10070,8 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { return -ENOENT; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91867b14b222..57879d28f824 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1675,7 +1675,8 @@ static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 int perf_event_attach_bpf_prog(struct perf_event *event, - struct bpf_prog *prog) + struct bpf_prog *prog, + u64 bpf_cookie) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; @@ -1702,12 +1703,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event, goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); if (ret < 0) goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; + event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); @@ -1728,7 +1730,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); - ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret == -ENOENT) goto unlock; if (ret < 0) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union 
bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; -- cgit v1.2.3 From 7adfc6c9b315e174cf8743b21b7b691c8766791b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:59 -0700 Subject: bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs to get access to a user-provided bpf_cookie value, specified during BPF program attachment (BPF link creation) time. Naming is hard, though. With the concept being named "BPF cookie", I've considered calling the helper: - bpf_get_cookie() -- seems too unspecific and easily mistaken with socket cookie; - bpf_get_bpf_cookie() -- too much tautology; - bpf_get_link_cookie() -- would be ok, but while we create a BPF link to attach BPF program to BPF hook, it's still an "attachment" and the bpf_cookie is associated with BPF program attachment to a hook, not a BPF link itself. Technically, we could support bpf_cookie with old-style cgroup programs. So I ultimately rejected it in favor of bpf_get_attach_cookie(). Currently all perf_event-backed BPF program types support bpf_get_attach_cookie() helper. Follow-up patches will add support for fentry/fexit programs as well. While at it, mark bpf_tracing_func_proto() as static to make it obvious that it's only used from within the kernel/trace/bpf_trace.c. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org --- include/linux/bpf.h | 3 --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 4 files changed, 66 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83c3cc5e90df..f4c16f19f83e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2102,9 +2102,6 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; -const struct bpf_func_proto *bpf_tracing_func_proto( - enum bpf_func_id func_id, const struct bpf_prog *prog); - const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 57879d28f824..cbc73c08c4a4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -975,7 +975,34 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .arg1_type = ARG_PTR_TO_CTX, }; -const struct bpf_func_proto * +BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { + .func = bpf_get_attach_cookie_trace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) +{ + return ctx->event->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { + .func = bpf_get_attach_cookie_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -1109,6 +1136,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_kprobe; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1219,6 +1248,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; + case BPF_FUNC_get_attach_cookie: + return 
&bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1326,6 +1357,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_pe; default: return bpf_tracing_func_proto(func_id, prog); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 61c7aa5020e98ac2fdcf07d07eec1baf2e9f0a08 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:00 -0700 Subject: libbpf: Re-build libbpf.so when libbpf.map changes Ensure libbpf.so is re-built whenever libbpf.map is modified. 
Without this, changes to libbpf.map are not detected and versioned symbols mismatch error will be reported until `make clean && make` is used, which is a suboptimal developer experience. Fixes: 306b267cb3c4 ("libbpf: Verify versioned symbols") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-8-andrii@kernel.org --- tools/lib/bpf/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index ec14aa725bb0..74c3b73a5fbe 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -4,8 +4,9 @@ RM ?= rm srctree = $(abs_srctree) +VERSION_SCRIPT := libbpf.map LIBBPF_VERSION := $(shell \ - grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | \ + grep -oE '^LIBBPF_([0-9.]+)' $(VERSION_SCRIPT) | \ sort -rV | head -n1 | cut -d'_' -f2) LIBBPF_MAJOR_VERSION := $(firstword $(subst ., ,$(LIBBPF_VERSION))) @@ -110,7 +111,6 @@ SHARED_OBJDIR := $(OUTPUT)sharedobjs/ STATIC_OBJDIR := $(OUTPUT)staticobjs/ BPF_IN_SHARED := $(SHARED_OBJDIR)libbpf-in.o BPF_IN_STATIC := $(STATIC_OBJDIR)libbpf-in.o -VERSION_SCRIPT := libbpf.map BPF_HELPER_DEFS := $(OUTPUT)bpf_helper_defs.h LIB_TARGET := $(addprefix $(OUTPUT),$(LIB_TARGET)) @@ -163,10 +163,10 @@ $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) -$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN_SHARED) +$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN_SHARED) $(VERSION_SCRIPT) $(QUIET_LINK)$(CC) $(LDFLAGS) \ --shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \ - -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -lz -o $@ + -Wl,--version-script=$(VERSION_SCRIPT) $< -lelf -lz -o $@ @ln -sf $(@F) $(OUTPUT)libbpf.so @ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION) @@ -181,7 +181,7 @@ $(OUTPUT)libbpf.pc: check: check_abi -check_abi: $(OUTPUT)libbpf.so +check_abi: $(OUTPUT)libbpf.so $(VERSION_SCRIPT) @if [ 
"$(GLOBAL_SYM_COUNT)" != "$(VERSIONED_SYM_COUNT)" ]; then \ echo "Warning: Num of global symbols in $(BPF_IN_SHARED)" \ "($(GLOBAL_SYM_COUNT)) does NOT match with num of" \ -- cgit v1.2.3 From d88b71d4a91669f0b06693cd094dcd68f67ac58d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:01 -0700 Subject: libbpf: Remove unused bpf_link's destroy operation, but add dealloc bpf_link->destroy() isn't used by any code, so remove it. Instead, add ability to override deallocation procedure, with default doing plain free(link). This is necessary for cases when we want to "subclass" struct bpf_link to keep extra information, as is the case in the next patch adding struct bpf_link_perf. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-9-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ff3c0ee79d85..d30e3282bfc7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8810,7 +8810,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, struct bpf_link { int (*detach)(struct bpf_link *link); - int (*destroy)(struct bpf_link *link); + void (*dealloc)(struct bpf_link *link); char *pin_path; /* NULL, if not pinned */ int fd; /* hook FD, -1 if not applicable */ bool disconnected; @@ -8849,11 +8849,12 @@ int bpf_link__destroy(struct bpf_link *link) if (!link->disconnected && link->detach) err = link->detach(link); - if (link->destroy) - link->destroy(link); if (link->pin_path) free(link->pin_path); - free(link); + if (link->dealloc) + link->dealloc(link); + else + free(link); return libbpf_err(err); } -- cgit v1.2.3 From 668ace0ea5ab5acdb33cff0b66fcd8f41c16a0b0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:02 -0700 Subject: libbpf: Use BPF perf link when supported by kernel Detect kernel support for BPF perf link and 
prefer it when attaching to perf_event, tracepoint, kprobe/uprobe. Underlying perf_event FD will be kept open until BPF link is destroyed, at which point both perf_event FD and BPF link FD will be closed. This preserves current behavior in which perf_event FD is open for the duration of bpf_link's lifetime and user is able to "disconnect" bpf_link from underlying FD (with bpf_link__disconnect()), so that bpf_link__destroy() doesn't close underlying perf_event FD. When BPF perf link is used, disconnect will keep both perf_event and bpf_link FDs open, so it will be up to (advanced) user to close them. This approach is demonstrated in bpf_cookie.c selftests, added in this patch set. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-10-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 111 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 90 insertions(+), 21 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d30e3282bfc7..5dc15f5b4b78 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -193,6 +193,8 @@ enum kern_feature_id { FEAT_MODULE_BTF, /* BTF_KIND_FLOAT support */ FEAT_BTF_FLOAT, + /* BPF perf link support */ + FEAT_PERF_LINK, __FEAT_CNT, }; @@ -4337,6 +4339,37 @@ static int probe_module_btf(void) return !err; } +static int probe_perf_link(void) +{ + struct bpf_load_program_attr attr; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, link_fd, err; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_TRACEPOINT; + attr.insns = insns; + attr.insns_cnt = ARRAY_SIZE(insns); + attr.license = "GPL"; + prog_fd = bpf_load_program_xattr(&attr, NULL, 0); + if (prog_fd < 0) + return -errno; + + /* use invalid perf_event FD to get EBADF, if link is supported; + * otherwise EINVAL should be returned + */ + link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL); + err = -errno; /*
close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EBADF; +} + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -4387,6 +4420,9 @@ static struct kern_feature_desc { [FEAT_BTF_FLOAT] = { "BTF_KIND_FLOAT support", probe_kern_btf_float, }, + [FEAT_PERF_LINK] = { + "BPF perf link support", probe_perf_link, + }, }; static bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) @@ -8951,23 +8987,38 @@ int bpf_link__unpin(struct bpf_link *link) return 0; } -static int bpf_link__detach_perf_event(struct bpf_link *link) +struct bpf_link_perf { + struct bpf_link link; + int perf_event_fd; +}; + +static int bpf_link_perf_detach(struct bpf_link *link) { - int err; + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + int err = 0; - err = ioctl(link->fd, PERF_EVENT_IOC_DISABLE, 0); - if (err) + if (ioctl(perf_link->perf_event_fd, PERF_EVENT_IOC_DISABLE, 0) < 0) err = -errno; + if (perf_link->perf_event_fd != link->fd) + close(perf_link->perf_event_fd); close(link->fd); + return libbpf_err(err); } +static void bpf_link_perf_dealloc(struct bpf_link *link) +{ + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + free(perf_link); +} + struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pfd) { char errmsg[STRERR_BUFSIZE]; - struct bpf_link *link; - int prog_fd, err; + struct bpf_link_perf *link; + int prog_fd, link_fd = -1, err; if (pfd < 0) { pr_warn("prog '%s': invalid perf event FD %d\n", @@ -8984,27 +9035,45 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pf link = calloc(1, sizeof(*link)); if (!link) return libbpf_err_ptr(-ENOMEM); - link->detach = &bpf_link__detach_perf_event; - link->fd = pfd; + link->link.detach = &bpf_link_perf_detach; + link->link.dealloc = &bpf_link_perf_dealloc; + link->perf_event_fd = pfd; - if (ioctl(pfd, 
PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { - err = -errno; - free(link); - pr_warn("prog '%s': failed to attach to pfd %d: %s\n", - prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - if (err == -EPROTO) - pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", - prog->name, pfd); - return libbpf_err_ptr(err); + if (kernel_supports(prog->obj, FEAT_PERF_LINK)) { + link_fd = bpf_link_create(prog_fd, pfd, BPF_PERF_EVENT, NULL); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create BPF link for perf_event FD %d: %d (%s)\n", + prog->name, pfd, + err, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + link->link.fd = link_fd; + } else { + if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach to perf_event FD %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + if (err == -EPROTO) + pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", + prog->name, pfd); + goto err_out; + } + link->link.fd = pfd; } if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) { err = -errno; - free(link); - pr_warn("prog '%s': failed to enable pfd %d: %s\n", + pr_warn("prog '%s': failed to enable perf_event FD %d: %s\n", prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - return libbpf_err_ptr(err); + goto err_out; } - return link; + + return &link->link; +err_out: + if (link_fd >= 0) + close(link_fd); + free(link); + return libbpf_err_ptr(err); } /* -- cgit v1.2.3 From 3ec84f4b1638495ebff068a668dc417b4de5727e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:03 -0700 Subject: libbpf: Add bpf_cookie support to bpf_link_create() API Add ability to specify bpf_cookie value when creating BPF perf link with bpf_link_create() low-level API. 
Given BPF_LINK_CREATE command is growing and keeps getting new fields that are specific to the type of BPF_LINK, extend libbpf side of bpf_link_create() API and corresponding OPTS struct to accommodate such changes. Add extra checks to prevent using incompatible/unexpected combinations of fields. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-11-andrii@kernel.org --- tools/lib/bpf/bpf.c | 32 +++++++++++++++++++++++++------- tools/lib/bpf/bpf.h | 8 +++++++- tools/lib/bpf/libbpf_internal.h | 32 ++++++++++++++++++++++---------- 3 files changed, 54 insertions(+), 18 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 86dcac44f32f..2401fad090c5 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -684,8 +684,13 @@ int bpf_link_create(int prog_fd, int target_fd, iter_info_len = OPTS_GET(opts, iter_info_len, 0); target_btf_id = OPTS_GET(opts, target_btf_id, 0); - if (iter_info_len && target_btf_id) - return libbpf_err(-EINVAL); + /* validate we don't have unexpected combinations of non-zero fields */ + if (iter_info_len || target_btf_id) { + if (iter_info_len && target_btf_id) + return libbpf_err(-EINVAL); + if (!OPTS_ZEROED(opts, target_btf_id)) + return libbpf_err(-EINVAL); + } memset(&attr, 0, sizeof(attr)); attr.link_create.prog_fd = prog_fd; @@ -693,14 +698,27 @@ int bpf_link_create(int prog_fd, int target_fd, attr.link_create.attach_type = attach_type; attr.link_create.flags = OPTS_GET(opts, flags, 0); - if (iter_info_len) { - attr.link_create.iter_info = - ptr_to_u64(OPTS_GET(opts, iter_info, (void *)0)); - attr.link_create.iter_info_len = iter_info_len; - } else if (target_btf_id) { + if (target_btf_id) { attr.link_create.target_btf_id = target_btf_id; + goto proceed; } + switch (attach_type) { + case BPF_TRACE_ITER: + attr.link_create.iter_info = ptr_to_u64(OPTS_GET(opts, iter_info, (void *)0)); + attr.link_create.iter_info_len = iter_info_len; + break; + case
BPF_PERF_EVENT: + attr.link_create.perf_event.bpf_cookie = OPTS_GET(opts, perf_event.bpf_cookie, 0); + if (!OPTS_ZEROED(opts, perf_event)) + return libbpf_err(-EINVAL); + break; + default: + if (!OPTS_ZEROED(opts, flags)) + return libbpf_err(-EINVAL); + break; + } +proceed: fd = sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); return libbpf_err_errno(fd); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 4f758f8f50cd..6fffb3cdf39b 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -177,8 +177,14 @@ struct bpf_link_create_opts { union bpf_iter_link_info *iter_info; __u32 iter_info_len; __u32 target_btf_id; + union { + struct { + __u64 bpf_cookie; + } perf_event; + }; + size_t :0; }; -#define bpf_link_create_opts__last_field target_btf_id +#define bpf_link_create_opts__last_field perf_event LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, enum bpf_attach_type attach_type, diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index f7b691d5f9eb..533b0211f40a 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -196,6 +196,17 @@ void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t cur_cnt, size_t max_cnt, size_t add_cnt); int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); +static inline bool libbpf_is_mem_zeroed(const char *p, ssize_t len) +{ + while (len > 0) { + if (*p) + return false; + p++; + len--; + } + return true; +} + static inline bool libbpf_validate_opts(const char *opts, size_t opts_sz, size_t user_sz, const char *type_name) @@ -204,16 +215,9 @@ static inline bool libbpf_validate_opts(const char *opts, pr_warn("%s size (%zu) is too small\n", type_name, user_sz); return false; } - if (user_sz > opts_sz) { - size_t i; - - for (i = opts_sz; i < user_sz; i++) { - if (opts[i]) { - pr_warn("%s has non-zero extra bytes\n", - type_name); - return false; - } - } + if (!libbpf_is_mem_zeroed(opts + opts_sz, (ssize_t)user_sz 
- opts_sz)) { + pr_warn("%s has non-zero extra bytes\n", type_name); + return false; } return true; } @@ -233,6 +237,14 @@ static inline bool libbpf_validate_opts(const char *opts, (opts)->field = value; \ } while (0) +#define OPTS_ZEROED(opts, last_nonzero_field) \ +({ \ + ssize_t __off = offsetofend(typeof(*(opts)), last_nonzero_field); \ + !(opts) || libbpf_is_mem_zeroed((const void *)opts + __off, \ + (opts)->sz - __off); \ +}) + + int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, -- cgit v1.2.3 From 47faff371755ba0f1ad76e2df7f5003377d974a5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:04 -0700 Subject: libbpf: Add bpf_cookie to perf_event, kprobe, uprobe, and tp attach APIs Wire through bpf_cookie for all attach APIs that use perf_event_open under the hood: - for kprobes, extend existing bpf_kprobe_opts with bpf_cookie field; - for perf_event, uprobe, and tracepoint APIs, add their _opts variants and pass bpf_cookie through opts. For kernel that don't support BPF_LINK_CREATE for perf_events, and thus bpf_cookie is not supported either, return error and log warning for user. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-12-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 78 ++++++++++++++++++++++++++++++++++++++++-------- tools/lib/bpf/libbpf.h | 71 +++++++++++++++++++++++++++++++++++-------- tools/lib/bpf/libbpf.map | 3 ++ 3 files changed, 127 insertions(+), 25 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5dc15f5b4b78..62ce878cb8e0 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9014,12 +9014,16 @@ static void bpf_link_perf_dealloc(struct bpf_link *link) free(perf_link); } -struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pfd) +struct bpf_link *bpf_program__attach_perf_event_opts(struct bpf_program *prog, int pfd, + const struct bpf_perf_event_opts *opts) { char errmsg[STRERR_BUFSIZE]; struct bpf_link_perf *link; int prog_fd, link_fd = -1, err; + if (!OPTS_VALID(opts, bpf_perf_event_opts)) + return libbpf_err_ptr(-EINVAL); + if (pfd < 0) { pr_warn("prog '%s': invalid perf event FD %d\n", prog->name, pfd); @@ -9040,7 +9044,10 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pf link->perf_event_fd = pfd; if (kernel_supports(prog->obj, FEAT_PERF_LINK)) { - link_fd = bpf_link_create(prog_fd, pfd, BPF_PERF_EVENT, NULL); + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts, + .perf_event.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0)); + + link_fd = bpf_link_create(prog_fd, pfd, BPF_PERF_EVENT, &link_opts); if (link_fd < 0) { err = -errno; pr_warn("prog '%s': failed to create BPF link for perf_event FD %d: %d (%s)\n", @@ -9050,6 +9057,12 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pf } link->link.fd = link_fd; } else { + if (OPTS_GET(opts, bpf_cookie, 0)) { + pr_warn("prog '%s': user context value is not supported\n", prog->name); + err = -EOPNOTSUPP; + goto err_out; + } + if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 
0) { err = -errno; pr_warn("prog '%s': failed to attach to perf_event FD %d: %s\n", @@ -9076,6 +9089,11 @@ err_out: return libbpf_err_ptr(err); } +struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int pfd) +{ + return bpf_program__attach_perf_event_opts(prog, pfd, NULL); +} + /* * this function is expected to parse integer in the range of [0, 2^31-1] from * given file using scanf format string fmt. If actual parsed value is @@ -9184,8 +9202,9 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, struct bpf_link * bpf_program__attach_kprobe_opts(struct bpf_program *prog, const char *func_name, - struct bpf_kprobe_opts *opts) + const struct bpf_kprobe_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; unsigned long offset; @@ -9197,6 +9216,7 @@ bpf_program__attach_kprobe_opts(struct bpf_program *prog, retprobe = OPTS_GET(opts, retprobe, false); offset = OPTS_GET(opts, offset, 0); + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, offset, -1 /* pid */); @@ -9206,7 +9226,7 @@ bpf_program__attach_kprobe_opts(struct bpf_program *prog, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return libbpf_err_ptr(pfd); } - link = bpf_program__attach_perf_event(prog, pfd); + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); err = libbpf_get_error(link); if (err) { close(pfd); @@ -9261,14 +9281,22 @@ static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, return link; } -struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, - bool retprobe, pid_t pid, - const char *binary_path, - size_t func_offset) +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_opts(struct bpf_program *prog, pid_t pid, + const char *binary_path, size_t func_offset, + const struct bpf_uprobe_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); char 
errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int pfd, err; + bool retprobe; + + if (!OPTS_VALID(opts, bpf_uprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + retprobe = OPTS_GET(opts, retprobe, false); + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, func_offset, pid); @@ -9279,7 +9307,7 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return libbpf_err_ptr(pfd); } - link = bpf_program__attach_perf_event(prog, pfd); + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); err = libbpf_get_error(link); if (err) { close(pfd); @@ -9292,6 +9320,16 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, return link; } +struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, + bool retprobe, pid_t pid, + const char *binary_path, + size_t func_offset) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts, .retprobe = retprobe); + + return bpf_program__attach_uprobe_opts(prog, pid, binary_path, func_offset, &opts); +} + static int determine_tracepoint_id(const char *tp_category, const char *tp_name) { @@ -9342,14 +9380,21 @@ static int perf_event_open_tracepoint(const char *tp_category, return pfd; } -struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, - const char *tp_category, - const char *tp_name) +struct bpf_link *bpf_program__attach_tracepoint_opts(struct bpf_program *prog, + const char *tp_category, + const char *tp_name, + const struct bpf_tracepoint_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int pfd, err; + if (!OPTS_VALID(opts, bpf_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + pfd = perf_event_open_tracepoint(tp_category, tp_name); if (pfd < 0) { pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n", @@ 
-9357,7 +9402,7 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return libbpf_err_ptr(pfd); } - link = bpf_program__attach_perf_event(prog, pfd); + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); err = libbpf_get_error(link); if (err) { close(pfd); @@ -9369,6 +9414,13 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, return link; } +struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, + const char *tp_category, + const char *tp_name) +{ + return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL); +} + static struct bpf_link *attach_tp(const struct bpf_sec_def *sec, struct bpf_program *prog) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1271d99bb7aa..1f4a67285365 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -104,17 +104,6 @@ struct bpf_object_open_opts { }; #define bpf_object_open_opts__last_field btf_custom_path -struct bpf_kprobe_opts { - /* size of this struct, for forward/backward compatiblity */ - size_t sz; - /* function's offset to install kprobe to */ - unsigned long offset; - /* kprobe is return probe */ - bool retprobe; - size_t :0; -}; -#define bpf_kprobe_opts__last_field retprobe - LIBBPF_API struct bpf_object *bpf_object__open(const char *path); LIBBPF_API struct bpf_object * bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts); @@ -255,24 +244,82 @@ LIBBPF_API int bpf_link__destroy(struct bpf_link *link); LIBBPF_API struct bpf_link * bpf_program__attach(struct bpf_program *prog); + +struct bpf_perf_event_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; +}; +#define bpf_perf_event_opts__last_field bpf_cookie + LIBBPF_API struct bpf_link * bpf_program__attach_perf_event(struct bpf_program 
*prog, int pfd); + +LIBBPF_API struct bpf_link * +bpf_program__attach_perf_event_opts(struct bpf_program *prog, int pfd, + const struct bpf_perf_event_opts *opts); + +struct bpf_kprobe_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* function's offset to install kprobe to */ + unsigned long offset; + /* kprobe is return probe */ + bool retprobe; + size_t :0; +}; +#define bpf_kprobe_opts__last_field retprobe + LIBBPF_API struct bpf_link * bpf_program__attach_kprobe(struct bpf_program *prog, bool retprobe, const char *func_name); LIBBPF_API struct bpf_link * bpf_program__attach_kprobe_opts(struct bpf_program *prog, const char *func_name, - struct bpf_kprobe_opts *opts); + const struct bpf_kprobe_opts *opts); + +struct bpf_uprobe_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* uprobe is return probe, invoked at function return time */ + bool retprobe; + size_t :0; +}; +#define bpf_uprobe_opts__last_field retprobe + LIBBPF_API struct bpf_link * bpf_program__attach_uprobe(struct bpf_program *prog, bool retprobe, pid_t pid, const char *binary_path, size_t func_offset); +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_opts(struct bpf_program *prog, pid_t pid, + const char *binary_path, size_t func_offset, + const struct bpf_uprobe_opts *opts); + +struct bpf_tracepoint_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; +}; +#define bpf_tracepoint_opts__last_field bpf_cookie + LIBBPF_API struct bpf_link * bpf_program__attach_tracepoint(struct bpf_program *prog, const char *tp_category, const char *tp_name); LIBBPF_API struct bpf_link * 
+bpf_program__attach_tracepoint_opts(struct bpf_program *prog, + const char *tp_category, + const char *tp_name, + const struct bpf_tracepoint_opts *opts); + +LIBBPF_API struct bpf_link * bpf_program__attach_raw_tracepoint(struct bpf_program *prog, const char *tp_name); LIBBPF_API struct bpf_link * diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 58e0fb2c482f..bbc53bb25f68 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -374,6 +374,9 @@ LIBBPF_0.5.0 { bpf_map__pin_path; bpf_map_lookup_and_delete_elem_flags; bpf_program__attach_kprobe_opts; + bpf_program__attach_perf_event_opts; + bpf_program__attach_tracepoint_opts; + bpf_program__attach_uprobe_opts; bpf_object__gen_loader; btf__load_from_kernel_by_id; btf__load_from_kernel_by_id_split; -- cgit v1.2.3 From f36d3557a132ec0ccb8a3536d3ebd778049d48ca Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:05 -0700 Subject: selftests/bpf: Test low-level perf BPF link API Add tests utilizing low-level bpf_link_create() API to create perf BPF link. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-13-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/perf_link.c | 89 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_perf_link.c | 16 ++++ 2 files changed, 105 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/perf_link.c create mode 100644 tools/testing/selftests/bpf/progs/test_perf_link.c diff --git a/tools/testing/selftests/bpf/prog_tests/perf_link.c b/tools/testing/selftests/bpf/prog_tests/perf_link.c new file mode 100644 index 000000000000..b1abd0c46607 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_link.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include +#include +#include +#include "test_perf_link.skel.h" + +static void burn_cpu(void) +{ + volatile int j = 0; + cpu_set_t cpu_set; + int i, err; + + /* generate some branches on cpu 0 */ + CPU_ZERO(&cpu_set); + CPU_SET(0, &cpu_set); + err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); + ASSERT_OK(err, "set_thread_affinity"); + + /* spin the loop for a while (random high number) */ + for (i = 0; i < 1000000; ++i) + ++j; +} + +void test_perf_link(void) +{ + struct test_perf_link *skel = NULL; + struct perf_event_attr attr; + int pfd = -1, link_fd = -1, err; + int run_cnt_before, run_cnt_after; + struct bpf_link_info info; + __u32 info_len = sizeof(info); + + /* create perf event */ + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 4000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; + + skel = test_perf_link__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + link_fd = 
bpf_link_create(bpf_program__fd(skel->progs.handler), pfd, + BPF_PERF_EVENT, NULL); + if (!ASSERT_GE(link_fd, 0, "link_fd")) + goto cleanup; + + memset(&info, 0, sizeof(info)); + err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len); + if (!ASSERT_OK(err, "link_get_info")) + goto cleanup; + + ASSERT_EQ(info.type, BPF_LINK_TYPE_PERF_EVENT, "link_type"); + ASSERT_GT(info.id, 0, "link_id"); + ASSERT_GT(info.prog_id, 0, "link_prog_id"); + + /* ensure we get at least one perf_event prog execution */ + burn_cpu(); + ASSERT_GT(skel->bss->run_cnt, 0, "run_cnt"); + + /* perf_event is still active, but we close link and BPF program + * shouldn't be executed anymore + */ + close(link_fd); + link_fd = -1; + + /* make sure there are no stragglers */ + kern_sync_rcu(); + + run_cnt_before = skel->bss->run_cnt; + burn_cpu(); + run_cnt_after = skel->bss->run_cnt; + + ASSERT_EQ(run_cnt_before, run_cnt_after, "run_cnt_before_after"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + if (pfd >= 0) + close(pfd); + test_perf_link__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_perf_link.c b/tools/testing/selftests/bpf/progs/test_perf_link.c new file mode 100644 index 000000000000..c1db9fd98d0c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_link.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include +#include + +int run_cnt = 0; + +SEC("perf_event") +int handler(struct pt_regs *ctx) +{ + __sync_fetch_and_add(&run_cnt, 1); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a549aaa67395eea89c2b9d2bea01ab0455b18408 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:06 -0700 Subject: selftests/bpf: Extract uprobe-related helpers into trace_helpers.{c,h} Extract two helpers used for working with uprobes into trace_helpers.{c,h} to be re-used between multiple uprobe-using selftests. 
Also rename get_offset() into more appropriate get_uprobe_offset(). Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-14-andrii@kernel.org --- .../selftests/bpf/prog_tests/attach_probe.c | 61 +------------------- tools/testing/selftests/bpf/trace_helpers.c | 66 ++++++++++++++++++++++ tools/testing/selftests/bpf/trace_helpers.h | 3 + 3 files changed, 70 insertions(+), 60 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index ec11e20d2b92..e40b41c44f8b 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -2,65 +2,6 @@ #include #include "test_attach_probe.skel.h" -#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 - -#define OP_RT_RA_MASK 0xffff0000UL -#define LIS_R2 0x3c400000UL -#define ADDIS_R2_R12 0x3c4c0000UL -#define ADDI_R2_R2 0x38420000UL - -static ssize_t get_offset(ssize_t addr, ssize_t base) -{ - u32 *insn = (u32 *) addr; - - /* - * A PPC64 ABIv2 function may have a local and a global entry - * point. We need to use the local entry point when patching - * functions, so identify and step over the global entry point - * sequence. 
- * - * The global entry point sequence is always of the form: - * - * addis r2,r12,XXXX - * addi r2,r2,XXXX - * - * A linker optimisation may convert the addis to lis: - * - * lis r2,XXXX - * addi r2,r2,XXXX - */ - if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || - ((*insn & OP_RT_RA_MASK) == LIS_R2)) && - ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) - return (ssize_t)(insn + 2) - base; - else - return addr - base; -} -#else -#define get_offset(addr, base) (addr - base) -#endif - -ssize_t get_base_addr() { - size_t start, offset; - char buf[256]; - FILE *f; - - f = fopen("/proc/self/maps", "r"); - if (!f) - return -errno; - - while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n", - &start, buf, &offset) == 3) { - if (strcmp(buf, "r-xp") == 0) { - fclose(f); - return start - offset; - } - } - - fclose(f); - return -EINVAL; -} - void test_attach_probe(void) { int duration = 0; @@ -74,7 +15,7 @@ void test_attach_probe(void) if (CHECK(base_addr < 0, "get_base_addr", "failed to find base addr: %zd", base_addr)) return; - uprobe_offset = get_offset((size_t)&get_base_addr, base_addr); + uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr); skel = test_attach_probe__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 1bbd1d9830c8..381dafce1d8f 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -136,3 +136,69 @@ void read_trace_pipe(void) } } } + +#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 + +#define OP_RT_RA_MASK 0xffff0000UL +#define LIS_R2 0x3c400000UL +#define ADDIS_R2_R12 0x3c4c0000UL +#define ADDI_R2_R2 0x38420000UL + +ssize_t get_uprobe_offset(const void *addr, ssize_t base) +{ + u32 *insn = (u32 *)(uintptr_t)addr; + + /* + * A PPC64 ABIv2 function may have a local and a global entry + * point. 
We need to use the local entry point when patching + * functions, so identify and step over the global entry point + * sequence. + * + * The global entry point sequence is always of the form: + * + * addis r2,r12,XXXX + * addi r2,r2,XXXX + * + * A linker optimisation may convert the addis to lis: + * + * lis r2,XXXX + * addi r2,r2,XXXX + */ + if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || + ((*insn & OP_RT_RA_MASK) == LIS_R2)) && + ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) + return (ssize_t)(insn + 2) - base; + else + return (uintptr_t)addr - base; +} + +#else + +ssize_t get_uprobe_offset(const void *addr, ssize_t base) +{ + return (uintptr_t)addr - base; +} + +#endif + +ssize_t get_base_addr(void) +{ + size_t start, offset; + char buf[256]; + FILE *f; + + f = fopen("/proc/self/maps", "r"); + if (!f) + return -errno; + + while (fscanf(f, "%zx-%*x %s %zx %*[^\n]\n", + &start, buf, &offset) == 3) { + if (strcmp(buf, "r-xp") == 0) { + fclose(f); + return start - offset; + } + } + + fclose(f); + return -EINVAL; +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index f62fdef9e589..3d9435b3dd3b 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -18,4 +18,7 @@ int kallsyms_find(const char *sym, unsigned long long *addr); void read_trace_pipe(void); +ssize_t get_uprobe_offset(const void *addr, ssize_t base); +ssize_t get_base_addr(void); + #endif -- cgit v1.2.3 From 0a80cf67f34cab7c18d74c28bb59e131670dc268 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:07 -0700 Subject: selftests/bpf: Add bpf_cookie selftests for high-level APIs Add selftest with few subtests testing proper bpf_cookie usage. Kprobe and uprobe subtests are pretty straightforward and just validate that the same BPF program attached with different bpf_cookie will be triggered with those different bpf_cookie values. 
Tracepoint subtest is a bit more interesting, as it is the only perf_event-based BPF hook that shares bpf_prog_array between multiple perf_events internally. This means that the same BPF program can't be attached to the same tracepoint multiple times. So we have 3 identical copies. This arrangement allows to test bpf_prog_array_copy()'s handling of bpf_prog_array list manipulation logic when programs are attached and detached. The test validates that bpf_cookie isn't mixed up and isn't lost during such list manipulations. Perf_event subtest validates that two BPF links can be created against the same perf_event (but not at the same time, only one BPF program can be attached to perf_event itself), and that for each we can specify different bpf_cookie value. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-15-andrii@kernel.org --- .../testing/selftests/bpf/prog_tests/bpf_cookie.c | 254 +++++++++++++++++++++ .../testing/selftests/bpf/progs/test_bpf_cookie.c | 85 +++++++ 2 files changed, 339 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_cookie.c create mode 100644 tools/testing/selftests/bpf/progs/test_bpf_cookie.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c new file mode 100644 index 000000000000..5eea3c3a40fe --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "test_bpf_cookie.skel.h" + +static void kprobe_subtest(struct test_bpf_cookie *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts); + struct bpf_link *link1 = NULL, *link2 = NULL; + struct bpf_link *retlink1 = NULL, *retlink2 = NULL; + + /* attach two kprobes */ + opts.bpf_cookie = 0x1; + opts.retprobe = false; + link1 = 
bpf_program__attach_kprobe_opts(skel->progs.handle_kprobe, + SYS_NANOSLEEP_KPROBE_NAME, &opts); + if (!ASSERT_OK_PTR(link1, "link1")) + goto cleanup; + + opts.bpf_cookie = 0x2; + opts.retprobe = false; + link2 = bpf_program__attach_kprobe_opts(skel->progs.handle_kprobe, + SYS_NANOSLEEP_KPROBE_NAME, &opts); + if (!ASSERT_OK_PTR(link2, "link2")) + goto cleanup; + + /* attach two kretprobes */ + opts.bpf_cookie = 0x10; + opts.retprobe = true; + retlink1 = bpf_program__attach_kprobe_opts(skel->progs.handle_kretprobe, + SYS_NANOSLEEP_KPROBE_NAME, &opts); + if (!ASSERT_OK_PTR(retlink1, "retlink1")) + goto cleanup; + + opts.bpf_cookie = 0x20; + opts.retprobe = true; + retlink2 = bpf_program__attach_kprobe_opts(skel->progs.handle_kretprobe, + SYS_NANOSLEEP_KPROBE_NAME, &opts); + if (!ASSERT_OK_PTR(retlink2, "retlink2")) + goto cleanup; + + /* trigger kprobe && kretprobe */ + usleep(1); + + ASSERT_EQ(skel->bss->kprobe_res, 0x1 | 0x2, "kprobe_res"); + ASSERT_EQ(skel->bss->kretprobe_res, 0x10 | 0x20, "kretprobe_res"); + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(retlink1); + bpf_link__destroy(retlink2); +} + +static void uprobe_subtest(struct test_bpf_cookie *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct bpf_link *link1 = NULL, *link2 = NULL; + struct bpf_link *retlink1 = NULL, *retlink2 = NULL; + size_t uprobe_offset; + ssize_t base_addr; + + base_addr = get_base_addr(); + uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr); + + /* attach two uprobes */ + opts.bpf_cookie = 0x100; + opts.retprobe = false; + link1 = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe, 0 /* self pid */, + "/proc/self/exe", uprobe_offset, &opts); + if (!ASSERT_OK_PTR(link1, "link1")) + goto cleanup; + + opts.bpf_cookie = 0x200; + opts.retprobe = false; + link2 = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe, -1 /* any pid */, + "/proc/self/exe", uprobe_offset, &opts); + if (!ASSERT_OK_PTR(link2, "link2")) + 
goto cleanup; + + /* attach two uretprobes */ + opts.bpf_cookie = 0x1000; + opts.retprobe = true; + retlink1 = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe, -1 /* any pid */, + "/proc/self/exe", uprobe_offset, &opts); + if (!ASSERT_OK_PTR(retlink1, "retlink1")) + goto cleanup; + + opts.bpf_cookie = 0x2000; + opts.retprobe = true; + retlink2 = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe, 0 /* self pid */, + "/proc/self/exe", uprobe_offset, &opts); + if (!ASSERT_OK_PTR(retlink2, "retlink2")) + goto cleanup; + + /* trigger uprobe && uretprobe */ + get_base_addr(); + + ASSERT_EQ(skel->bss->uprobe_res, 0x100 | 0x200, "uprobe_res"); + ASSERT_EQ(skel->bss->uretprobe_res, 0x1000 | 0x2000, "uretprobe_res"); + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(retlink1); + bpf_link__destroy(retlink2); +} + +static void tp_subtest(struct test_bpf_cookie *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_tracepoint_opts, opts); + struct bpf_link *link1 = NULL, *link2 = NULL, *link3 = NULL; + + /* attach first tp prog */ + opts.bpf_cookie = 0x10000; + link1 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp1, + "syscalls", "sys_enter_nanosleep", &opts); + if (!ASSERT_OK_PTR(link1, "link1")) + goto cleanup; + + /* attach second tp prog */ + opts.bpf_cookie = 0x20000; + link2 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp2, + "syscalls", "sys_enter_nanosleep", &opts); + if (!ASSERT_OK_PTR(link2, "link2")) + goto cleanup; + + /* trigger tracepoints */ + usleep(1); + + ASSERT_EQ(skel->bss->tp_res, 0x10000 | 0x20000, "tp_res1"); + + /* now we detach first prog and will attach third one, which causes + * two internal calls to bpf_prog_array_copy(), shuffling + * bpf_prog_array_items around. We test here that we don't lose track + * of associated bpf_cookies. 
+ */ + bpf_link__destroy(link1); + link1 = NULL; + kern_sync_rcu(); + skel->bss->tp_res = 0; + + /* attach third tp prog */ + opts.bpf_cookie = 0x40000; + link3 = bpf_program__attach_tracepoint_opts(skel->progs.handle_tp3, + "syscalls", "sys_enter_nanosleep", &opts); + if (!ASSERT_OK_PTR(link3, "link3")) + goto cleanup; + + /* trigger tracepoints */ + usleep(1); + + ASSERT_EQ(skel->bss->tp_res, 0x20000 | 0x40000, "tp_res2"); + +cleanup: + bpf_link__destroy(link1); + bpf_link__destroy(link2); + bpf_link__destroy(link3); +} + +static void burn_cpu(void) +{ + volatile int j = 0; + cpu_set_t cpu_set; + int i, err; + + /* generate some branches on cpu 0 */ + CPU_ZERO(&cpu_set); + CPU_SET(0, &cpu_set); + err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); + ASSERT_OK(err, "set_thread_affinity"); + + /* spin the loop for a while (random high number) */ + for (i = 0; i < 1000000; ++i) + ++j; +} + +static void pe_subtest(struct test_bpf_cookie *skel) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, opts); + struct bpf_link *link = NULL; + struct perf_event_attr attr; + int pfd = -1; + + /* create perf event */ + memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 4000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; + + opts.bpf_cookie = 0x100000; + link = bpf_program__attach_perf_event_opts(skel->progs.handle_pe, pfd, &opts); + if (!ASSERT_OK_PTR(link, "link1")) + goto cleanup; + + burn_cpu(); /* trigger BPF prog */ + + ASSERT_EQ(skel->bss->pe_res, 0x100000, "pe_res1"); + + /* prevent bpf_link__destroy() closing pfd itself */ + bpf_link__disconnect(link); + /* close BPF link's FD explicitly */ + close(bpf_link__fd(link)); + /* free up memory used by struct bpf_link */ + bpf_link__destroy(link); + link = NULL; + kern_sync_rcu(); + skel->bss->pe_res = 0; + 
+ opts.bpf_cookie = 0x200000; + link = bpf_program__attach_perf_event_opts(skel->progs.handle_pe, pfd, &opts); + if (!ASSERT_OK_PTR(link, "link2")) + goto cleanup; + + burn_cpu(); /* trigger BPF prog */ + + ASSERT_EQ(skel->bss->pe_res, 0x200000, "pe_res2"); + +cleanup: + close(pfd); + bpf_link__destroy(link); +} + +void test_bpf_cookie(void) +{ + struct test_bpf_cookie *skel; + + skel = test_bpf_cookie__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->bss->my_tid = syscall(SYS_gettid); + + if (test__start_subtest("kprobe")) + kprobe_subtest(skel); + if (test__start_subtest("uprobe")) + uprobe_subtest(skel); + if (test__start_subtest("tracepoint")) + tp_subtest(skel); + if (test__start_subtest("perf_event")) + pe_subtest(skel); + + test_bpf_cookie__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c new file mode 100644 index 000000000000..2d3a7710e2ce --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include +#include + +int my_tid; + +int kprobe_res; +int kprobe_multi_res; +int kretprobe_res; +int uprobe_res; +int uretprobe_res; +int tp_res; +int pe_res; + +static void update(void *ctx, int *res) +{ + if (my_tid != (u32)bpf_get_current_pid_tgid()) + return; + + *res |= bpf_get_attach_cookie(ctx); +} + +SEC("kprobe/sys_nanosleep") +int handle_kprobe(struct pt_regs *ctx) +{ + update(ctx, &kprobe_res); + return 0; +} + +SEC("kretprobe/sys_nanosleep") +int handle_kretprobe(struct pt_regs *ctx) +{ + update(ctx, &kretprobe_res); + return 0; +} + +SEC("uprobe/trigger_func") +int handle_uprobe(struct pt_regs *ctx) +{ + update(ctx, &uprobe_res); + return 0; +} + +SEC("uretprobe/trigger_func") +int handle_uretprobe(struct pt_regs *ctx) +{ + update(ctx, &uretprobe_res); + return 0; +} + +/* bpf_prog_array, used by kernel 
internally to keep track of attached BPF + * programs to a given BPF hook (e.g., for tracepoints) doesn't allow the same + * BPF program to be attached multiple times. So have three identical copies + * ready to attach to the same tracepoint. + */ +SEC("tp/syscalls/sys_enter_nanosleep") +int handle_tp1(struct pt_regs *ctx) +{ + update(ctx, &tp_res); + return 0; +} +SEC("tp/syscalls/sys_enter_nanosleep") +int handle_tp2(struct pt_regs *ctx) +{ + update(ctx, &tp_res); + return 0; +} +SEC("tp/syscalls/sys_enter_nanosleep") +int handle_tp3(void *ctx) +{ + update(ctx, &tp_res); + return 1; +} + +SEC("perf_event") +int handle_pe(struct pt_regs *ctx) +{ + update(ctx, &pe_res); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 5e3b8356de3623987ace530b1977ffeb9ecf5a8a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:08 -0700 Subject: libbpf: Add uprobe ref counter offset support for USDT semaphores When attaching to uprobes through perf subsystem, it's possible to specify offset of a so-called USDT semaphore, which is just a reference counted u16, used by kernel to keep track of how many tracers are attached to a given location. Support for this feature was added in [0], so just wire this through uprobe_opts. This is important to enable implementing USDT attachment and tracing through libbpf's bpf_program__attach_uprobe_opts() API. 
[0] a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-16-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 17 +++++++++++++---- tools/lib/bpf/libbpf.h | 4 ++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 62ce878cb8e0..88d8825fc6f6 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -9152,13 +9152,19 @@ static int determine_uprobe_retprobe_bit(void) return parse_uint_from_file(file, "config:%d\n"); } +#define PERF_UPROBE_REF_CTR_OFFSET_BITS 32 +#define PERF_UPROBE_REF_CTR_OFFSET_SHIFT 32 + static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, - uint64_t offset, int pid) + uint64_t offset, int pid, size_t ref_ctr_off) { struct perf_event_attr attr = {}; char errmsg[STRERR_BUFSIZE]; int type, pfd, err; + if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS)) + return -EINVAL; + type = uprobe ? determine_uprobe_perf_type() : determine_kprobe_perf_type(); if (type < 0) { @@ -9181,6 +9187,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, } attr.size = sizeof(attr); attr.type = type; + attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT; attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */ attr.config2 = offset; /* kprobe_addr or probe_offset */ @@ -9219,7 +9226,7 @@ bpf_program__attach_kprobe_opts(struct bpf_program *prog, pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, - offset, -1 /* pid */); + offset, -1 /* pid */, 0 /* ref_ctr_off */); if (pfd < 0) { pr_warn("prog '%s': failed to create %s '%s' perf event: %s\n", prog->name, retprobe ? 
"kretprobe" : "kprobe", func_name, @@ -9289,6 +9296,7 @@ bpf_program__attach_uprobe_opts(struct bpf_program *prog, pid_t pid, DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; + size_t ref_ctr_off; int pfd, err; bool retprobe; @@ -9296,10 +9304,11 @@ bpf_program__attach_uprobe_opts(struct bpf_program *prog, pid_t pid, return libbpf_err_ptr(-EINVAL); retprobe = OPTS_GET(opts, retprobe, false); + ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); - pfd = perf_event_open_probe(true /* uprobe */, retprobe, - binary_path, func_offset, pid); + pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, + func_offset, pid, ref_ctr_off); if (pfd < 0) { pr_warn("prog '%s': failed to create %s '%s:0x%zx' perf event: %s\n", prog->name, retprobe ? "uretprobe" : "uprobe", diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 1f4a67285365..f177d897c5f7 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -284,6 +284,10 @@ bpf_program__attach_kprobe_opts(struct bpf_program *prog, struct bpf_uprobe_opts { /* size of this struct, for forward/backward compatiblity */ size_t sz; + /* offset of kernel reference counted USDT semaphore, added in + * a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") + */ + size_t ref_ctr_offset; /* custom user-provided value fetchable through bpf_get_attach_cookie() */ __u64 bpf_cookie; /* uprobe is return probe, invoked at function return time */ -- cgit v1.2.3 From 4bd11e08e0bb1862fe72495014e33795e412bffb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:06:09 -0700 Subject: selftests/bpf: Add ref_ctr_offset selftests Extend attach_probe selftests to specify ref_ctr_offset for uprobe/uretprobe and validate that its value is incremented from zero. 
Turns out that once uprobe is attached with ref_ctr_offset, uretprobe for the same location/function *has* to use ref_ctr_offset as well, otherwise perf_event_open() fails with -EINVAL. So this test uses ref_ctr_offset for both uprobe and uretprobe, even though for the purpose of test uprobe would be enough. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210815070609.987780-17-andrii@kernel.org --- .../selftests/bpf/prog_tests/attach_probe.c | 39 ++++++++++++++++------ tools/testing/selftests/bpf/trace_helpers.c | 21 ++++++++++++ tools/testing/selftests/bpf/trace_helpers.h | 1 + 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index e40b41c44f8b..bf307bb9e446 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -2,14 +2,18 @@ #include #include "test_attach_probe.skel.h" +/* this is how USDT semaphore is actually defined, except volatile modifier */ +volatile unsigned short uprobe_ref_ctr __attribute__((unused)) __attribute((section(".probes"))); + void test_attach_probe(void) { + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); int duration = 0; struct bpf_link *kprobe_link, *kretprobe_link; struct bpf_link *uprobe_link, *uretprobe_link; struct test_attach_probe* skel; size_t uprobe_offset; - ssize_t base_addr; + ssize_t base_addr, ref_ctr_offset; base_addr = get_base_addr(); if (CHECK(base_addr < 0, "get_base_addr", @@ -17,6 +21,10 @@ void test_attach_probe(void) return; uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr); + ref_ctr_offset = get_rel_offset((uintptr_t)&uprobe_ref_ctr); + if (!ASSERT_GE(ref_ctr_offset, 0, "ref_ctr_offset")) + return; + skel = test_attach_probe__open_and_load(); if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) return; @@ -37,20 +45,28 @@ void 
test_attach_probe(void) goto cleanup; skel->links.handle_kretprobe = kretprobe_link; - uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe, - false /* retprobe */, - 0 /* self pid */, - "/proc/self/exe", - uprobe_offset); + ASSERT_EQ(uprobe_ref_ctr, 0, "uprobe_ref_ctr_before"); + + uprobe_opts.retprobe = false; + uprobe_opts.ref_ctr_offset = ref_ctr_offset; + uprobe_link = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe, + 0 /* self pid */, + "/proc/self/exe", + uprobe_offset, + &uprobe_opts); if (!ASSERT_OK_PTR(uprobe_link, "attach_uprobe")) goto cleanup; skel->links.handle_uprobe = uprobe_link; - uretprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uretprobe, - true /* retprobe */, - -1 /* any pid */, - "/proc/self/exe", - uprobe_offset); + ASSERT_GT(uprobe_ref_ctr, 0, "uprobe_ref_ctr_after"); + + /* if uprobe uses ref_ctr, uretprobe has to use ref_ctr as well */ + uprobe_opts.retprobe = true; + uprobe_opts.ref_ctr_offset = ref_ctr_offset; + uretprobe_link = bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe, + -1 /* any pid */, + "/proc/self/exe", + uprobe_offset, &uprobe_opts); if (!ASSERT_OK_PTR(uretprobe_link, "attach_uretprobe")) goto cleanup; skel->links.handle_uretprobe = uretprobe_link; @@ -77,4 +93,5 @@ void test_attach_probe(void) cleanup: test_attach_probe__destroy(skel); + ASSERT_EQ(uprobe_ref_ctr, 0, "uprobe_ref_ctr_cleanup"); } diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 381dafce1d8f..e7a19b04d4ea 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -202,3 +202,24 @@ ssize_t get_base_addr(void) fclose(f); return -EINVAL; } + +ssize_t get_rel_offset(uintptr_t addr) +{ + size_t start, end, offset; + char buf[256]; + FILE *f; + + f = fopen("/proc/self/maps", "r"); + if (!f) + return -errno; + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { + if (addr >= 
start && addr < end) { + fclose(f); + return (size_t)addr - start + offset; + } + } + + fclose(f); + return -EINVAL; +} diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 3d9435b3dd3b..d907b445524d 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -20,5 +20,6 @@ void read_trace_pipe(void); ssize_t get_uprobe_offset(const void *addr, ssize_t base); ssize_t get_base_addr(void); +ssize_t get_rel_offset(uintptr_t addr); #endif -- cgit v1.2.3 From bb57164920d738076bf91d65af35d8724526c21b Mon Sep 17 00:00:00 2001 From: grantseltzer Date: Mon, 9 Aug 2021 22:05:08 -0400 Subject: bpf: Reconfigure libbpf docs to remove unversioned API This removes the libbpf_api.rst file from the kernel documentation. The intention for this file was to pull documentation from comments above API functions in libbpf. However, due to limitations of the kernel documentation system, this API documentation could not be versioned, which is counterintuitive to how users expect to use it. There are also currently no doc comments, making this a blank page. Once the kernel comment documentation is actually contributed, it will still exist in the kernel repository, just in the code itself. A separate site is being spun up to generate documentation from those comments in a way in which it can be versioned properly. This also reconfigures the bpf documentation index page to make it easier to sync to the previously mentioned documentation site.
Signed-off-by: Grant Seltzer Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210810020508.280639-1-grantseltzer@gmail.com --- Documentation/bpf/index.rst | 10 +--------- Documentation/bpf/libbpf/libbpf_api.rst | 27 --------------------------- 2 files changed, 1 insertion(+), 36 deletions(-) delete mode 100644 Documentation/bpf/libbpf/libbpf_api.rst diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index baea6c2abba5..1ceb5d704a97 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -15,15 +15,7 @@ that goes into great technical depth about the BPF Architecture. libbpf ====== -Libbpf is a userspace library for loading and interacting with bpf programs. - -.. toctree:: - :maxdepth: 1 - - libbpf/libbpf - libbpf/libbpf_api - libbpf/libbpf_build - libbpf/libbpf_naming_convention +Documentation/bpf/libbpf/libbpf.rst is a userspace library for loading and interacting with bpf programs. BPF Type Format (BTF) ===================== diff --git a/Documentation/bpf/libbpf/libbpf_api.rst b/Documentation/bpf/libbpf/libbpf_api.rst deleted file mode 100644 index f07eecd054da..000000000000 --- a/Documentation/bpf/libbpf/libbpf_api.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) - -API -=== - -This documentation is autogenerated from header files in libbpf, tools/lib/bpf - -.. kernel-doc:: tools/lib/bpf/libbpf.h - :internal: - -.. kernel-doc:: tools/lib/bpf/bpf.h - :internal: - -.. kernel-doc:: tools/lib/bpf/btf.h - :internal: - -.. kernel-doc:: tools/lib/bpf/xsk.h - :internal: - -.. kernel-doc:: tools/lib/bpf/bpf_tracing.h - :internal: - -.. kernel-doc:: tools/lib/bpf/bpf_core_read.h - :internal: - -.. 
kernel-doc:: tools/lib/bpf/bpf_endian.h - :internal: \ No newline at end of file -- cgit v1.2.3 From edce1a248670397f8d0be1f6967eab3c5c082013 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Sun, 15 Aug 2021 16:10:35 +0800 Subject: selftests/bpf: Test btf__load_vmlinux_btf/btf__load_module_btf APIs Add test for btf__load_vmlinux_btf/btf__load_module_btf APIs. The test loads bpf_testmod module BTF and check existence of a symbol which is known to exist. Signed-off-by: Hengqi Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210815081035.205879-1-hengqi.chen@gmail.com --- .../testing/selftests/bpf/prog_tests/btf_module.c | 34 ++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_module.c diff --git a/tools/testing/selftests/bpf/prog_tests/btf_module.c b/tools/testing/selftests/bpf/prog_tests/btf_module.c new file mode 100644 index 000000000000..2239d1fe0332 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_module.c @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021 Hengqi Chen */ + +#include +#include + +static const char *module_name = "bpf_testmod"; +static const char *symbol_name = "bpf_testmod_test_read"; + +void test_btf_module() +{ + struct btf *vmlinux_btf, *module_btf; + __s32 type_id; + + if (!env.has_testmod) { + test__skip(); + return; + } + + vmlinux_btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(vmlinux_btf, "could not load vmlinux BTF")) + return; + + module_btf = btf__load_module_btf(module_name, vmlinux_btf); + if (!ASSERT_OK_PTR(module_btf, "could not load module BTF")) + goto cleanup; + + type_id = btf__find_by_name(module_btf, symbol_name); + ASSERT_GT(type_id, 0, "func not found"); + +cleanup: + btf__free(module_btf); + btf__free(vmlinux_btf); +} -- cgit v1.2.3 From 77462de14a43f4d98dbd8de0f5743a4e02450b1d Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Mon, 16 Aug 2021 19:03:20 +0000 Subject: af_unix: Add read_sock 
for stream socket types To support sockmap for af_unix stream type, implement read_sock, which is similar to the read_sock for unix dgram sockets. Signed-off-by: Jiang Wang Signed-off-by: Andrii Nakryiko Reviewed-by: Cong Wang Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210816190327.2739291-2-jiang.wang@bytedance.com --- net/unix/af_unix.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bad8f19174e3..4455b62317d4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -679,6 +679,8 @@ static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -732,6 +734,7 @@ static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, + .read_sock = unix_stream_read_sock, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -2491,6 +2494,15 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } #endif +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + return -ENOTCONN; + + return unix_read_sock(sk, desc, recv_actor); +} + static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { -- cgit v1.2.3 From 94531cfcbe79c3598acf96806627b2137ca32eb9 Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Mon, 16 Aug 2021 19:03:21 +0000 Subject: af_unix: Add 
unix_stream_proto for sockmap Previously, sockmap for AF_UNIX protocol only supports dgram type. This patch add unix stream type support, which is similar to unix_dgram_proto. To support sockmap, dgram and stream cannot share the same unix_proto anymore, because they have different implementations, such as unhash for stream type (which will remove closed or disconnected sockets from the map), so rename unix_proto to unix_dgram_proto and add a new unix_stream_proto. Also implement stream related sockmap functions. And add dgram key words to those dgram specific functions. Signed-off-by: Jiang Wang Signed-off-by: Andrii Nakryiko Reviewed-by: Cong Wang Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210816190327.2739291-3-jiang.wang@bytedance.com --- include/net/af_unix.h | 8 +++-- net/core/sock_map.c | 1 + net/unix/af_unix.c | 83 ++++++++++++++++++++++++++++++++++++++------- net/unix/unix_bpf.c | 93 +++++++++++++++++++++++++++++++++++++++------------ 4 files changed, 148 insertions(+), 37 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 4757d7f53f13..7d142e8a0550 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -87,6 +87,8 @@ long unix_outq_len(struct sock *sk); int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags); +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, + int flags); #ifdef CONFIG_SYSCTL int unix_sysctl_register(struct net *net); void unix_sysctl_unregister(struct net *net); @@ -96,9 +98,11 @@ static inline void unix_sysctl_unregister(struct net *net) {} #endif #ifdef CONFIG_BPF_SYSCALL -extern struct proto unix_proto; +extern struct proto unix_dgram_proto; +extern struct proto unix_stream_proto; -int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +int unix_stream_bpf_update_proto(struct sock *sk, 
struct sk_psock *psock, bool restore); void __init unix_bpf_build_proto(void); #else static inline void __init unix_bpf_build_proto(void) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ae5fa4338d9c..e252b8ec2b85 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1494,6 +1494,7 @@ void sock_map_unhash(struct sock *sk) rcu_read_unlock(); saved_unhash(sk); } +EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_close(struct sock *sk, long timeout) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4455b62317d4..443c49081636 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -798,17 +798,35 @@ static void unix_close(struct sock *sk, long timeout) */ } -struct proto unix_proto = { - .name = "UNIX", +static void unix_unhash(struct sock *sk) +{ + /* Nothing to do here, unix socket does not need a ->unhash(). + * This is merely for sockmap. + */ +} + +struct proto unix_dgram_proto = { + .name = "UNIX-DGRAM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct unix_sock), + .close = unix_close, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = unix_dgram_bpf_update_proto, +#endif +}; + +struct proto unix_stream_proto = { + .name = "UNIX-STREAM", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, + .unhash = unix_unhash, #ifdef CONFIG_BPF_SYSCALL - .psock_update_sk_prot = unix_bpf_update_proto, + .psock_update_sk_prot = unix_stream_bpf_update_proto, #endif }; -static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { struct sock *sk = NULL; struct unix_sock *u; @@ -817,7 +835,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); + if (type == SOCK_STREAM) + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); + 
else /*dgram and seqpacket */ + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); + if (!sk) goto out; @@ -879,7 +901,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern, sock->type) ? 0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -1293,7 +1315,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL, 0); + newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); if (newsk == NULL) goto out; @@ -2323,8 +2345,10 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si struct sock *sk = sock->sk; #ifdef CONFIG_BPF_SYSCALL - if (sk->sk_prot != &unix_proto) - return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_dgram_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, NULL); #endif return __unix_dgram_recvmsg(sk, msg, size, flags); @@ -2728,6 +2752,20 @@ static int unix_stream_read_actor(struct sk_buff *skb, return ret ?: chunk; } +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sk->sk_socket, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state, true); +} + static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -2739,6 +2777,14 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, .flags = flags }; +#ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_stream_proto) + return prot->recvmsg(sk, msg, 
size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, NULL); +#endif return unix_stream_read_generic(&state, true); } @@ -2799,7 +2845,9 @@ static int unix_shutdown(struct socket *sock, int mode) (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; + const struct proto *prot = READ_ONCE(other->sk_prot); + prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) @@ -2808,10 +2856,12 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - if (peer_mode == SHUTDOWN_MASK) + if (peer_mode == SHUTDOWN_MASK) { sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); - else if (peer_mode & RCV_SHUTDOWN) + other->sk_state = TCP_CLOSE; + } else if (peer_mode & RCV_SHUTDOWN) { sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); + } } if (other) sock_put(other); @@ -3289,7 +3339,13 @@ static int __init af_unix_init(void) BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - rc = proto_register(&unix_proto, 1); + rc = proto_register(&unix_dgram_proto, 1); + if (rc != 0) { + pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); + goto out; + } + + rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; @@ -3310,7 +3366,8 @@ out: static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); - proto_unregister(&unix_proto); + proto_unregister(&unix_dgram_proto); + proto_unregister(&unix_stream_proto); unregister_pernet_subsys(&unix_net_ops); } diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 20f53575b5c9..b927e2baae50 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -38,9 +38,18 @@ static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } -static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, - int *addr_len) 
+static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + if (sk->sk_type == SOCK_DGRAM) + return __unix_dgram_recvmsg(sk, msg, len, flags); + else + return __unix_stream_recvmsg(sk, msg, len, flags); +} + +static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, + int *addr_len) { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; @@ -48,14 +57,14 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, psock = sk_psock_get(sk); if (unlikely(!psock)) - return __unix_dgram_recvmsg(sk, msg, len, flags); + return __unix_recvmsg(sk, msg, len, flags); mutex_lock(&u->iolock); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { mutex_unlock(&u->iolock); sk_psock_put(sk, psock); - return __unix_dgram_recvmsg(sk, msg, len, flags); + return __unix_recvmsg(sk, msg, len, flags); } msg_bytes_ready: @@ -71,7 +80,7 @@ msg_bytes_ready: goto msg_bytes_ready; mutex_unlock(&u->iolock); sk_psock_put(sk, psock); - return __unix_dgram_recvmsg(sk, msg, len, flags); + return __unix_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } @@ -80,30 +89,55 @@ msg_bytes_ready: return copied; } -static struct proto *unix_prot_saved __read_mostly; -static DEFINE_SPINLOCK(unix_prot_lock); -static struct proto unix_bpf_prot; +static struct proto *unix_dgram_prot_saved __read_mostly; +static DEFINE_SPINLOCK(unix_dgram_prot_lock); +static struct proto unix_dgram_bpf_prot; + +static struct proto *unix_stream_prot_saved __read_mostly; +static DEFINE_SPINLOCK(unix_stream_prot_lock); +static struct proto unix_stream_bpf_prot; -static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base) +static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; - prot->recvmsg = unix_dgram_bpf_recvmsg; + prot->recvmsg = unix_bpf_recvmsg; +} + +static void unix_stream_bpf_rebuild_protos(struct 
proto *prot, + const struct proto *base) +{ + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = unix_bpf_recvmsg; + prot->unhash = sock_map_unhash; +} + +static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops) +{ + if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) { + spin_lock_bh(&unix_dgram_prot_lock); + if (likely(ops != unix_dgram_prot_saved)) { + unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops); + smp_store_release(&unix_dgram_prot_saved, ops); + } + spin_unlock_bh(&unix_dgram_prot_lock); + } } -static void unix_bpf_check_needs_rebuild(struct proto *ops) +static void unix_stream_bpf_check_needs_rebuild(struct proto *ops) { - if (unlikely(ops != smp_load_acquire(&unix_prot_saved))) { - spin_lock_bh(&unix_prot_lock); - if (likely(ops != unix_prot_saved)) { - unix_bpf_rebuild_protos(&unix_bpf_prot, ops); - smp_store_release(&unix_prot_saved, ops); + if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) { + spin_lock_bh(&unix_stream_prot_lock); + if (likely(ops != unix_stream_prot_saved)) { + unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops); + smp_store_release(&unix_stream_prot_saved, ops); } - spin_unlock_bh(&unix_prot_lock); + spin_unlock_bh(&unix_stream_prot_lock); } } -int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) +int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { if (sk->sk_type != SOCK_DGRAM) return -EOPNOTSUPP; @@ -114,12 +148,27 @@ int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) return 0; } - unix_bpf_check_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &unix_bpf_prot); + unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); + WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot); + return 0; +} + +int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) +{ + if (restore) { + sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_prot, 
psock->sk_proto); + return 0; + } + + unix_stream_bpf_check_needs_rebuild(psock->sk_proto); + WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot); return 0; } void __init unix_bpf_build_proto(void) { - unix_bpf_rebuild_protos(&unix_bpf_prot, &unix_proto); + unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto); + unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto); + } -- cgit v1.2.3 From 9b03152bd4691ba80f181413799b7fd36e83e564 Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Mon, 16 Aug 2021 19:03:22 +0000 Subject: selftest/bpf: Add tests for sockmap with unix stream type. Add two tests for unix stream to unix stream redirection in sockmap tests. Signed-off-by: Jiang Wang Signed-off-by: Andrii Nakryiko Reviewed-by: Cong Wang Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210816190327.2739291-4-jiang.wang@bytedance.com --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index a9f1bf9d5dff..7a976d43281a 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -2020,11 +2020,13 @@ void test_sockmap_listen(void) run_tests(skel, skel->maps.sock_map, AF_INET); run_tests(skel, skel->maps.sock_map, AF_INET6); test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM); + test_unix_redir(skel, skel->maps.sock_map, SOCK_STREAM); skel->bss->test_sockmap = false; run_tests(skel, skel->maps.sock_hash, AF_INET); run_tests(skel, skel->maps.sock_hash, AF_INET6); test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM); + test_unix_redir(skel, skel->maps.sock_hash, SOCK_STREAM); test_sockmap_listen__destroy(skel); } -- cgit v1.2.3 From 75e0e27db6cf0d1eaa9f79c4bfab63e209394c0f Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Mon, 16 Aug 2021 19:03:23 +0000 Subject: 
selftest/bpf: Change udp to inet in some function names This is to prepare for adding new unix stream tests. Mostly renames, also pass the socket types as an argument. Signed-off-by: Jiang Wang Signed-off-by: Andrii Nakryiko Reviewed-by: Cong Wang Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210816190327.2739291-5-jiang.wang@bytedance.com --- .../selftests/bpf/prog_tests/sockmap_listen.c | 30 ++++++++++++---------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 7a976d43281a..07ed8081f9ae 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1692,14 +1692,14 @@ static void test_reuseport(struct test_sockmap_listen *skel, } } -static int udp_socketpair(int family, int *s, int *c) +static int inet_socketpair(int family, int type, int *s, int *c) { struct sockaddr_storage addr; socklen_t len; int p0, c0; int err; - p0 = socket_loopback(family, SOCK_DGRAM | SOCK_NONBLOCK); + p0 = socket_loopback(family, type | SOCK_NONBLOCK); if (p0 < 0) return p0; @@ -1708,7 +1708,7 @@ static int udp_socketpair(int family, int *s, int *c) if (err) goto close_peer0; - c0 = xsocket(family, SOCK_DGRAM | SOCK_NONBLOCK, 0); + c0 = xsocket(family, type | SOCK_NONBLOCK, 0); if (c0 < 0) { err = c0; goto close_peer0; @@ -1747,10 +1747,10 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, zero_verdict_count(verd_mapfd); - err = udp_socketpair(family, &p0, &c0); + err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0); if (err) return; - err = udp_socketpair(family, &p1, &c1); + err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1); if (err) goto close_cli0; @@ -1825,7 +1825,7 @@ static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map udp_skb_redir_to_connected(skel, map, family); } -static 
void udp_unix_redir_to_connected(int family, int sock_mapfd, +static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { const char *log_prefix = redir_mode_str(mode); @@ -1843,7 +1843,7 @@ static void udp_unix_redir_to_connected(int family, int sock_mapfd, return; c0 = sfd[0], p0 = sfd[1]; - err = udp_socketpair(family, &p1, &c1); + err = inet_socketpair(family, SOCK_DGRAM, &p1, &c1); if (err) goto close; @@ -1897,14 +1897,16 @@ static void udp_unix_skb_redir_to_connected(struct test_sockmap_listen *skel, return; skel->bss->test_ingress = false; - udp_unix_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS); + inet_unix_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_EGRESS); skel->bss->test_ingress = true; - udp_unix_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS); + inet_unix_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_INGRESS); xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); } -static void unix_udp_redir_to_connected(int family, int sock_mapfd, +static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, int verd_mapfd, enum redir_mode mode) { const char *log_prefix = redir_mode_str(mode); @@ -1917,7 +1919,7 @@ static void unix_udp_redir_to_connected(int family, int sock_mapfd, zero_verdict_count(verd_mapfd); - err = udp_socketpair(family, &p0, &c0); + err = inet_socketpair(family, SOCK_DGRAM, &p0, &c0); if (err) return; @@ -1972,9 +1974,11 @@ static void unix_udp_skb_redir_to_connected(struct test_sockmap_listen *skel, return; skel->bss->test_ingress = false; - unix_udp_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS); + unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_EGRESS); skel->bss->test_ingress = true; - unix_udp_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS); + unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, 
verdict_map, + REDIR_INGRESS); xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); } -- cgit v1.2.3 From 31c50aeed5a12ded8856312c13bb6dc9e64dc87f Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Mon, 16 Aug 2021 19:03:24 +0000 Subject: selftest/bpf: Add new tests in sockmap for unix stream to tcp. Add two new test cases in sockmap tests, where unix stream is redirected to tcp and vice versa. Signed-off-by: Jiang Wang Signed-off-by: Andrii Nakryiko Reviewed-by: Cong Wang Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210816190327.2739291-6-jiang.wang@bytedance.com --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 07ed8081f9ae..afa14fb66f08 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1884,7 +1884,7 @@ close: xclose(p0); } -static void udp_unix_skb_redir_to_connected(struct test_sockmap_listen *skel, +static void inet_unix_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family) { int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); @@ -1899,9 +1899,13 @@ static void udp_unix_skb_redir_to_connected(struct test_sockmap_listen *skel, skel->bss->test_ingress = false; inet_unix_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, REDIR_EGRESS); + inet_unix_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, + REDIR_EGRESS); skel->bss->test_ingress = true; inet_unix_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, REDIR_INGRESS); + inet_unix_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, + REDIR_INGRESS); xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); } @@ -1961,7 +1965,7 @@ close_cli0: } -static void 
unix_udp_skb_redir_to_connected(struct test_sockmap_listen *skel, +static void unix_inet_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family) { int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); @@ -1976,9 +1980,13 @@ static void unix_udp_skb_redir_to_connected(struct test_sockmap_listen *skel, skel->bss->test_ingress = false; unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, REDIR_EGRESS); + unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, + REDIR_EGRESS); skel->bss->test_ingress = true; unix_inet_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, REDIR_INGRESS); + unix_inet_redir_to_connected(family, SOCK_STREAM, sock_map, verdict_map, + REDIR_INGRESS); xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); } @@ -1994,8 +2002,8 @@ static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); if (!test__start_subtest(s)) return; - udp_unix_skb_redir_to_connected(skel, map, family); - unix_udp_skb_redir_to_connected(skel, map, family); + inet_unix_skb_redir_to_connected(skel, map, family); + unix_inet_skb_redir_to_connected(skel, map, family); } static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, -- cgit v1.2.3 From 3c3bd542ffbb2ac09631313ede46ae66660ae550 Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Mon, 16 Aug 2021 10:52:50 -0700 Subject: selftests/bpf: Add exponential backoff to map_update_retriable in test_maps Using a fixed delay of 1 microsecond has proven flaky in slow CPU environment, e.g. Github Actions CI system. This patch adds exponential backoff with a cap of 50ms to reduce the flakiness of the test. Initial delay is chosen at random in the range [0ms, 5ms). 
Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210816175250.296110-1-fallentree@fb.com --- tools/testing/selftests/bpf/test_maps.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 14cea869235b..2caf58b40d40 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1396,15 +1396,22 @@ static void test_map_stress(void) #define DO_DELETE 0 #define MAP_RETRIES 20 +#define MAX_DELAY_US 50000 +#define MIN_DELAY_RANGE_US 5000 static int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts) { + int delay = rand() % MIN_DELAY_RANGE_US; + while (bpf_map_update_elem(map_fd, key, value, flags)) { if (!attempts || (errno != EAGAIN && errno != EBUSY)) return -errno; - usleep(1); + if (delay <= MAX_DELAY_US / 2) + delay *= 2; + + usleep(delay); attempts--; } -- cgit v1.2.3 From 857f75ea845706a0ec65ce2239da519214a4451a Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Mon, 16 Aug 2021 21:57:13 -0700 Subject: selftests/bpf: Add exponential backoff to map_delete_retriable in test_maps Using a fixed delay of 1 microsecond has proven flaky in slow CPU environment, e.g. Github Actions CI system. This patch adds exponential backoff with a cap of 50ms to reduce the flakiness of the test. Initial delay is chosen at random in the range [0ms, 5ms). 
Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817045713.3307985-1-fallentree@fb.com --- tools/testing/selftests/bpf/test_maps.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 2caf58b40d40..340695d5d652 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1420,11 +1420,16 @@ static int map_update_retriable(int map_fd, const void *key, const void *value, static int map_delete_retriable(int map_fd, const void *key, int attempts) { + int delay = rand() % MIN_DELAY_RANGE_US; + while (bpf_map_delete_elem(map_fd, key)) { if (!attempts || (errno != EAGAIN && errno != EBUSY)) return -errno; - usleep(1); + if (delay <= MAX_DELAY_US / 2) + delay *= 2; + + usleep(delay); attempts--; } -- cgit v1.2.3 From 26d82640d5ba2c3b32d79597be2dcf820ed78b16 Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Mon, 16 Aug 2021 21:47:29 -0700 Subject: selftests/bpf: Skip loading bpf_testmod when using -l to list tests. When using "-l", test_progs often is executed as non-root user, load_bpf_testmod() will fail and output errors. This patch skips loading bpf testmod when "-l" is specified, making output cleaner. Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817044732.3263066-2-fallentree@fb.com --- tools/testing/selftests/bpf/test_progs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 6f103106a39b..532af3353edf 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -755,7 +755,7 @@ int main(int argc, char **argv) save_netns(); stdio_hijack(); env.has_testmod = true; - if (load_bpf_testmod()) { + if (!env.list_test_names && load_bpf_testmod()) { fprintf(env.stderr, "WARNING! 
Selftests relying on bpf_testmod.ko will be skipped.\n"); env.has_testmod = false; } @@ -803,7 +803,7 @@ int main(int argc, char **argv) if (test->need_cgroup_cleanup) cleanup_cgroup_environment(); } - if (env.has_testmod) + if (!env.list_test_names && env.has_testmod) unload_bpf_testmod(); stdio_restore(); -- cgit v1.2.3 From f667d1d66760fcb27aee6c9964eefde39a464afe Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Mon, 16 Aug 2021 21:47:30 -0700 Subject: selftests/bpf: Correctly display subtest skip status In skip_account(), test->skip_cnt is set to 0 at the end, this makes next print statement never display SKIP status for the subtest. This patch moves the accounting logic after the print statement, fixing the issue. This patch also added SKIP status display for normal tests. Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817044732.3263066-3-fallentree@fb.com --- tools/testing/selftests/bpf/test_progs.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 532af3353edf..f0fbead40883 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -148,18 +148,18 @@ void test__end_subtest() struct prog_test_def *test = env.test; int sub_error_cnt = test->error_cnt - test->old_error_cnt; - if (sub_error_cnt) - env.fail_cnt++; - else if (test->skip_cnt == 0) - env.sub_succ_cnt++; - skip_account(); - dump_test_log(test, sub_error_cnt); fprintf(env.stdout, "#%d/%d %s:%s\n", test->test_num, test->subtest_num, test->subtest_name, sub_error_cnt ? "FAIL" : (test->skip_cnt ? 
"SKIP" : "OK")); + if (sub_error_cnt) + env.fail_cnt++; + else if (test->skip_cnt == 0) + env.sub_succ_cnt++; + skip_account(); + free(test->subtest_name); test->subtest_name = NULL; } @@ -786,17 +786,18 @@ int main(int argc, char **argv) test__end_subtest(); test->tested = true; - if (test->error_cnt) - env.fail_cnt++; - else - env.succ_cnt++; - skip_account(); dump_test_log(test, test->error_cnt); fprintf(env.stdout, "#%d %s:%s\n", test->test_num, test->test_name, - test->error_cnt ? "FAIL" : "OK"); + test->error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK")); + + if (test->error_cnt) + env.fail_cnt++; + else + env.succ_cnt++; + skip_account(); reset_affinity(); restore_netns(); -- cgit v1.2.3 From 99c4fd8b92b3dc6db1afa0e252d3054d501a03ca Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Mon, 16 Aug 2021 21:47:31 -0700 Subject: selftests/bpf: Also print test name in subtest status message This patch add test name in subtest status message line, making it possible to grep ':OK' in the output to generate a list of passed test+subtest names, which can be processed to generate argument list to be used with "-a", "-d" exact string matching. Example: #1/1 align/mov:OK .. 
#1/12 align/pointer variable subtraction:OK #1 align:OK Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817044732.3263066-4-fallentree@fb.com --- tools/testing/selftests/bpf/test_progs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index f0fbead40883..90539b15b744 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -150,8 +150,8 @@ void test__end_subtest() dump_test_log(test, sub_error_cnt); - fprintf(env.stdout, "#%d/%d %s:%s\n", - test->test_num, test->subtest_num, test->subtest_name, + fprintf(env.stdout, "#%d/%d %s/%s:%s\n", + test->test_num, test->subtest_num, test->test_name, test->subtest_name, sub_error_cnt ? "FAIL" : (test->skip_cnt ? "SKIP" : "OK")); if (sub_error_cnt) -- cgit v1.2.3 From 74339a8f866cdcca3f701c859b43b538890d905b Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Mon, 16 Aug 2021 21:47:32 -0700 Subject: selftests/bpf: Support glob matching for test selector. This patch adds '-a' and '-d' arguments supporting both exact string match as well as using '*' wildcard in test/subtests selection. '-a' and '-t' can co-exist, same as '-d' and '-b', in which case they just add to the list of allowed or denied test selectors. Caveat: Same as the current substring matching mechanism, test and subtest selectors apply independently, 'a*/b*' will execute all tests matching "a*", and with subtest name matching "b*", but tests matching "a*" that have no subtests will also be executed. 
Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817044732.3263066-5-fallentree@fb.com --- tools/testing/selftests/bpf/test_progs.c | 78 +++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 90539b15b744..cc1cd240445d 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -13,6 +13,28 @@ #include /* backtrace */ #include +/* Adapted from perf/util/string.c */ +static bool glob_match(const char *str, const char *pat) +{ + while (*str && *pat && *pat != '*') { + if (*str != *pat) + return false; + str++; + pat++; + } + /* Check wild card */ + if (*pat == '*') { + while (*pat == '*') + pat++; + if (!*pat) /* Tail wild card matches all */ + return true; + while (*str) + if (glob_match(str++, pat)) + return true; + } + return !*str && !*pat; +} + #define EXIT_NO_TEST 2 #define EXIT_ERR_SETUP_INFRA 3 @@ -55,12 +77,12 @@ static bool should_run(struct test_selector *sel, int num, const char *name) int i; for (i = 0; i < sel->blacklist.cnt; i++) { - if (strstr(name, sel->blacklist.strs[i])) + if (glob_match(name, sel->blacklist.strs[i])) return false; } for (i = 0; i < sel->whitelist.cnt; i++) { - if (strstr(name, sel->whitelist.strs[i])) + if (glob_match(name, sel->whitelist.strs[i])) return true; } @@ -450,6 +472,8 @@ enum ARG_KEYS { ARG_VERBOSE = 'v', ARG_GET_TEST_CNT = 'c', ARG_LIST_TEST_NAMES = 'l', + ARG_TEST_NAME_GLOB_ALLOWLIST = 'a', + ARG_TEST_NAME_GLOB_DENYLIST = 'd', }; static const struct argp_option opts[] = { @@ -467,6 +491,10 @@ static const struct argp_option opts[] = { "Get number of selected top-level tests " }, { "list", ARG_LIST_TEST_NAMES, NULL, 0, "List test names that would run (without running them) " }, + { "allow", ARG_TEST_NAME_GLOB_ALLOWLIST, "NAMES", 0, + "Run tests with name matching the pattern (supports '*' wildcard)." 
}, + { "deny", ARG_TEST_NAME_GLOB_DENYLIST, "NAMES", 0, + "Don't run tests with name matching the pattern (supports '*' wildcard)." }, {}, }; @@ -491,36 +519,48 @@ static void free_str_set(const struct str_set *set) free(set->strs); } -static int parse_str_list(const char *s, struct str_set *set) +static int parse_str_list(const char *s, struct str_set *set, bool is_glob_pattern) { char *input, *state = NULL, *next, **tmp, **strs = NULL; - int cnt = 0; + int i, cnt = 0; input = strdup(s); if (!input) return -ENOMEM; - set->cnt = 0; - set->strs = NULL; - while ((next = strtok_r(state ? NULL : input, ",", &state))) { tmp = realloc(strs, sizeof(*strs) * (cnt + 1)); if (!tmp) goto err; strs = tmp; - strs[cnt] = strdup(next); - if (!strs[cnt]) - goto err; + if (is_glob_pattern) { + strs[cnt] = strdup(next); + if (!strs[cnt]) + goto err; + } else { + strs[cnt] = malloc(strlen(next) + 2 + 1); + if (!strs[cnt]) + goto err; + sprintf(strs[cnt], "*%s*", next); + } cnt++; } - set->cnt = cnt; - set->strs = (const char **)strs; + tmp = realloc(set->strs, sizeof(*strs) * (cnt + set->cnt)); + if (!tmp) + goto err; + memcpy(tmp + set->cnt, strs, sizeof(*strs) * cnt); + set->strs = (const char **)tmp; + set->cnt += cnt; + free(input); + free(strs); return 0; err: + for (i = 0; i < cnt; i++) + free(strs[i]); free(strs); free(input); return -ENOMEM; @@ -553,29 +593,35 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) } break; } + case ARG_TEST_NAME_GLOB_ALLOWLIST: case ARG_TEST_NAME: { char *subtest_str = strchr(arg, '/'); if (subtest_str) { *subtest_str = '\0'; if (parse_str_list(subtest_str + 1, - &env->subtest_selector.whitelist)) + &env->subtest_selector.whitelist, + key == ARG_TEST_NAME_GLOB_ALLOWLIST)) return -ENOMEM; } - if (parse_str_list(arg, &env->test_selector.whitelist)) + if (parse_str_list(arg, &env->test_selector.whitelist, + key == ARG_TEST_NAME_GLOB_ALLOWLIST)) return -ENOMEM; break; } + case ARG_TEST_NAME_GLOB_DENYLIST: case 
ARG_TEST_NAME_BLACKLIST: { char *subtest_str = strchr(arg, '/'); if (subtest_str) { *subtest_str = '\0'; if (parse_str_list(subtest_str + 1, - &env->subtest_selector.blacklist)) + &env->subtest_selector.blacklist, + key == ARG_TEST_NAME_GLOB_DENYLIST)) return -ENOMEM; } - if (parse_str_list(arg, &env->test_selector.blacklist)) + if (parse_str_list(arg, &env->test_selector.blacklist, + key == ARG_TEST_NAME_GLOB_DENYLIST)) return -ENOMEM; break; } -- cgit v1.2.3 From 6f6cc426451bb15a85896efc7c85665b59af04ae Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 17 Aug 2021 12:09:18 -0700 Subject: selftests/bpf: Replace CHECK with ASSERT_* macros in send_signal.c Replace CHECK in send_signal.c with ASSERT_* macros as ASSERT_* macros are generally preferred. There is no functionality change. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817190918.3186400-1-yhs@fb.com --- .../testing/selftests/bpf/prog_tests/send_signal.c | 45 ++++++++++------------ 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index 023cc532992d..41e158ae888e 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -10,29 +10,25 @@ static void sigusr1_handler(int signum) } static void test_send_signal_common(struct perf_event_attr *attr, - bool signal_thread, - const char *test_name) + bool signal_thread) { struct test_send_signal_kern *skel; int pipe_c2p[2], pipe_p2c[2]; int err = -1, pmu_fd = -1; - __u32 duration = 0; char buf[256]; pid_t pid; - if (CHECK(pipe(pipe_c2p), test_name, - "pipe pipe_c2p error: %s\n", strerror(errno))) + if (!ASSERT_OK(pipe(pipe_c2p), "pipe_c2p")) return; - if (CHECK(pipe(pipe_p2c), test_name, - "pipe pipe_p2c error: %s\n", strerror(errno))) { + if (!ASSERT_OK(pipe(pipe_p2c), "pipe_p2c")) { close(pipe_c2p[0]); 
close(pipe_c2p[1]); return; } pid = fork(); - if (CHECK(pid < 0, test_name, "fork error: %s\n", strerror(errno))) { + if (!ASSERT_GE(pid, 0, "fork")) { close(pipe_c2p[0]); close(pipe_c2p[1]); close(pipe_p2c[0]); @@ -48,19 +44,19 @@ static void test_send_signal_common(struct perf_event_attr *attr, close(pipe_p2c[1]); /* close write */ /* notify parent signal handler is installed */ - CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno); + ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); /* make sure parent enabled bpf program to send_signal */ - CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno); + ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); /* wait a little for signal handler */ sleep(1); buf[0] = sigusr1_received ? '2' : '0'; - CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno); + ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); /* wait for parent notification and exit */ - CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno); + ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); close(pipe_c2p[1]); close(pipe_p2c[0]); @@ -71,20 +67,19 @@ static void test_send_signal_common(struct perf_event_attr *attr, close(pipe_p2c[0]); /* close read */ skel = test_send_signal_kern__open_and_load(); - if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n")) + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) goto skel_open_load_failure; if (!attr) { err = test_send_signal_kern__attach(skel); - if (CHECK(err, "skel_attach", "skeleton attach failed\n")) { + if (!ASSERT_OK(err, "skel_attach")) { err = -1; goto destroy_skel; } } else { pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1, -1 /* group id */, 0 /* flags */); - if (CHECK(pmu_fd < 0, test_name, "perf_event_open error: %s\n", - strerror(errno))) { + if (!ASSERT_GE(pmu_fd, 0, "perf_event_open")) { err = -1; goto destroy_skel; } @@ -96,7 +91,7 @@ static void test_send_signal_common(struct 
perf_event_attr *attr, } /* wait until child signal handler installed */ - CHECK(read(pipe_c2p[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno); + ASSERT_EQ(read(pipe_c2p[0], buf, 1), 1, "pipe_read"); /* trigger the bpf send_signal */ skel->bss->pid = pid; @@ -104,21 +99,21 @@ static void test_send_signal_common(struct perf_event_attr *attr, skel->bss->signal_thread = signal_thread; /* notify child that bpf program can send_signal now */ - CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno); + ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); /* wait for result */ err = read(pipe_c2p[0], buf, 1); - if (CHECK(err < 0, test_name, "reading pipe error: %s\n", strerror(errno))) + if (!ASSERT_GE(err, 0, "reading pipe")) goto disable_pmu; - if (CHECK(err == 0, test_name, "reading pipe error: size 0\n")) { + if (!ASSERT_GT(err, 0, "reading pipe error: size 0")) { err = -1; goto disable_pmu; } - CHECK(buf[0] != '2', test_name, "incorrect result\n"); + ASSERT_EQ(buf[0], '2', "incorrect result"); /* notify child safe to exit */ - CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno); + ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write"); disable_pmu: close(pmu_fd); @@ -132,7 +127,7 @@ skel_open_load_failure: static void test_send_signal_tracepoint(bool signal_thread) { - test_send_signal_common(NULL, signal_thread, "tracepoint"); + test_send_signal_common(NULL, signal_thread); } static void test_send_signal_perf(bool signal_thread) @@ -143,7 +138,7 @@ static void test_send_signal_perf(bool signal_thread) .config = PERF_COUNT_SW_CPU_CLOCK, }; - test_send_signal_common(&attr, signal_thread, "perf_sw_event"); + test_send_signal_common(&attr, signal_thread); } static void test_send_signal_nmi(bool signal_thread) @@ -172,7 +167,7 @@ static void test_send_signal_nmi(bool signal_thread) close(pmu_fd); } - test_send_signal_common(&attr, signal_thread, "perf_hw_event"); + test_send_signal_common(&attr, signal_thread); } void 
test_send_signal(void) -- cgit v1.2.3 From b16ac5bf732a5e23d164cf908ec7742d6a6120d3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 17 Aug 2021 12:09:23 -0700 Subject: selftests/bpf: Fix flaky send_signal test libbpf CI has reported send_signal test is flaky although I am not able to reproduce it in my local environment. But I am able to reproduce with on-demand libbpf CI ([1]). Through code analysis, the following is possible reason. The failed subtest runs bpf program in softirq environment. Since bpf_send_signal() only sends to a fork of "test_progs" process. If the underlying current task is not "test_progs", bpf_send_signal() will not be triggered and the subtest will fail. To reduce the chances where the underlying process is not the intended one, this patch boosted scheduling priority to -20 (highest allowed by setpriority() call). And I did 10 runs with on-demand libbpf CI with this patch and I didn't observe any failures. [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817190923.3186725-1-yhs@fb.com --- tools/testing/selftests/bpf/prog_tests/send_signal.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c index 41e158ae888e..776916b61c40 100644 --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include +#include #include "test_send_signal_kern.skel.h" int sigusr1_received = 0; @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr, } if (pid == 0) { + int old_prio; + /* install signal handler and notify parent */ signal(SIGUSR1, sigusr1_handler); close(pipe_c2p[0]); /* close read */ close(pipe_p2c[1]); /* close write */ + /* boost with a high priority 
so we got a higher chance + * that if an interrupt happens, the underlying task + * is this process. + */ + errno = 0; + old_prio = getpriority(PRIO_PROCESS, 0); + ASSERT_OK(errno, "getpriority"); + ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority"); + /* notify parent signal handler is installed */ ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write"); @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr, /* wait for parent notification and exit */ ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read"); + /* restore the old priority */ + ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority"); + close(pipe_c2p[1]); close(pipe_p2c[0]); exit(0); -- cgit v1.2.3 From 8cacfc85b615cc0bae01241593c4b25da6570efc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Aug 2021 18:08:42 +0100 Subject: bpf: Remove redundant initialization of variable allow The variable allow is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817170842.495440-1-colin.king@canonical.com --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a1dedba4c174..9f35928bab0a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1135,7 +1135,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, .major = major, .minor = minor, }; - int allow = 1; + int allow; rcu_read_lock(); cgrp = task_dfl_cgroup(current); -- cgit v1.2.3 From d20b41115ad53293201cc07ee429a38740cb056b Mon Sep 17 00:00:00 2001 From: Grant Seltzer Date: Wed, 18 Aug 2021 11:13:13 -0400 Subject: libbpf: Rename libbpf documentation index file This patch renames a documentation libbpf.rst to index.rst. 
In order for readthedocs.org to pick this file up and properly build the documentation site. It also changes the title type of the ABI subsection in the naming convention doc. This is so that readthedocs.org doesn't treat this section as a separate document. Signed-off-by: Grant Seltzer Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210818151313.49992-1-grantseltzer@gmail.com --- Documentation/bpf/libbpf/index.rst | 22 ++++++++++++++++++++++ Documentation/bpf/libbpf/libbpf.rst | 14 -------------- .../bpf/libbpf/libbpf_naming_convention.rst | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) create mode 100644 Documentation/bpf/libbpf/index.rst delete mode 100644 Documentation/bpf/libbpf/libbpf.rst diff --git a/Documentation/bpf/libbpf/index.rst b/Documentation/bpf/libbpf/index.rst new file mode 100644 index 000000000000..4f8adfc3ab83 --- /dev/null +++ b/Documentation/bpf/libbpf/index.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +libbpf +====== + +For API documentation see the `versioned API documentation site `_. + +.. toctree:: + :maxdepth: 1 + + libbpf_naming_convention + libbpf_build + +This is documentation for libbpf, a userspace library for loading and +interacting with bpf programs. + +All general BPF questions, including kernel functionality, libbpf APIs and +their application, should be sent to bpf@vger.kernel.org mailing list. +You can `subscribe `_ to the +mailing list search its `archive `_. +Please search the archive before asking new questions. It very well might +be that this was already addressed or answered before. diff --git a/Documentation/bpf/libbpf/libbpf.rst b/Documentation/bpf/libbpf/libbpf.rst deleted file mode 100644 index 1b1e61d5ead1..000000000000 --- a/Documentation/bpf/libbpf/libbpf.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. 
SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) - -libbpf -====== - -This is documentation for libbpf, a userspace library for loading and -interacting with bpf programs. - -All general BPF questions, including kernel functionality, libbpf APIs and -their application, should be sent to bpf@vger.kernel.org mailing list. -You can `subscribe `_ to the -mailing list search its `archive `_. -Please search the archive before asking new questions. It very well might -be that this was already addressed or answered before. diff --git a/Documentation/bpf/libbpf/libbpf_naming_convention.rst b/Documentation/bpf/libbpf/libbpf_naming_convention.rst index 6bf9c5ac7576..9c68d5014ff1 100644 --- a/Documentation/bpf/libbpf/libbpf_naming_convention.rst +++ b/Documentation/bpf/libbpf/libbpf_naming_convention.rst @@ -69,7 +69,7 @@ functions. These can be mixed and matched. Note that these functions are not reentrant for performance reasons. ABI -========== +--- libbpf can be both linked statically or used as DSO. To avoid possible conflicts with other libraries an application is linked with, all -- cgit v1.2.3 From 6cf1770d63dd2d0d0d4048e7b3ee360336c072d9 Mon Sep 17 00:00:00 2001 From: Xu Liu Date: Wed, 18 Aug 2021 18:58:19 +0800 Subject: bpf: Allow bpf_get_netns_cookie in BPF_PROG_TYPE_SOCK_OPS We'd like to be able to identify netns from sockops hooks to accelerate local process communication from different netns.
Signed-off-by: Xu Liu Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818105820.91894-2-liuxu623@gmail.com --- net/core/filter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 5cf38e8886f1..59b8f5050180 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4676,6 +4676,18 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; +BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { + .func = bpf_get_netns_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -7491,6 +7503,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; -- cgit v1.2.3 From 374e74de96310cc63b9e3cde876e031107e6af6c Mon Sep 17 00:00:00 2001 From: Xu Liu Date: Wed, 18 Aug 2021 18:58:20 +0800 Subject: selftests/bpf: Test for get_netns_cookie Add test to use get_netns_cookie() from BPF_PROG_TYPE_SOCK_OPS. 
Signed-off-by: Xu Liu Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818105820.91894-3-liuxu623@gmail.com --- .../selftests/bpf/prog_tests/netns_cookie.c | 61 ++++++++++++++++++++++ .../selftests/bpf/progs/netns_cookie_prog.c | 39 ++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/netns_cookie.c create mode 100644 tools/testing/selftests/bpf/progs/netns_cookie_prog.c diff --git a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c new file mode 100644 index 000000000000..6f3cd472fb65 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "netns_cookie_prog.skel.h" +#include "network_helpers.h" + +#ifndef SO_NETNS_COOKIE +#define SO_NETNS_COOKIE 71 +#endif + +static int duration; + +void test_netns_cookie(void) +{ + int server_fd = 0, client_fd = 0, cgroup_fd = 0, err = 0, val = 0; + struct netns_cookie_prog *skel; + uint64_t cookie_expected_value; + socklen_t vallen = sizeof(cookie_expected_value); + + skel = netns_cookie_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + cgroup_fd = test__join_cgroup("/netns_cookie"); + if (CHECK(cgroup_fd < 0, "join_cgroup", "cgroup creation failed\n")) + goto out; + + skel->links.get_netns_cookie_sockops = bpf_program__attach_cgroup( + skel->progs.get_netns_cookie_sockops, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.get_netns_cookie_sockops, "prog_attach")) + goto close_cgroup_fd; + + server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno)) + goto close_cgroup_fd; + + client_fd = connect_to_fd(server_fd, 0); + if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno)) + goto close_server_fd; + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.netns_cookies), + &client_fd, &val); 
+ if (!ASSERT_OK(err, "map_lookup(socket_cookies)")) + goto close_client_fd; + + err = getsockopt(client_fd, SOL_SOCKET, SO_NETNS_COOKIE, + &cookie_expected_value, &vallen); + if (!ASSERT_OK(err, "getsockopt)")) + goto close_client_fd; + + ASSERT_EQ(val, cookie_expected_value, "cookie_value"); + +close_client_fd: + close(client_fd); +close_server_fd: + close(server_fd); +close_cgroup_fd: + close(cgroup_fd); +out: + netns_cookie_prog__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c new file mode 100644 index 000000000000..4ed8d75aa299 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" + +#include + +#define AF_INET6 10 + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} netns_cookies SEC(".maps"); + +SEC("sockops") +int get_netns_cookie_sockops(struct bpf_sock_ops *ctx) +{ + struct bpf_sock *sk = ctx->sk; + int *cookie; + + if (ctx->family != AF_INET6) + return 1; + + if (ctx->op != BPF_SOCK_OPS_TCP_CONNECT_CB) + return 1; + + if (!sk) + return 1; + + cookie = bpf_sk_storage_get(&netns_cookies, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!cookie) + return 1; + + *cookie = bpf_get_netns_cookie(ctx); + + return 1; +} -- cgit v1.2.3 From f9dabe016b63c9629e152bf876c126c29de223cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 19 Aug 2021 15:59:33 +0200 Subject: bpf: Undo off-by-one in interpreter tail call count limit The BPF interpreter as well as x86-64 BPF JIT were both in line by allowing up to 33 tail calls (however odd that number may be!). 
Recently, this was changed for the interpreter to reduce it down to 32 with the assumption that this should have been the actual limit "which is in line with the behavior of the x86 JITs" according to b61a28cf11d61 ("bpf: Fix off-by-one in tail call count limiting"). Paul recently reported: I'm a bit surprised by this because I had previously tested the tail call limit of several JIT compilers and found it to be 33 (i.e., allowing chains of up to 34 programs). I've just extended a test program I had to validate this again on the x86-64 JIT, and found a limit of 33 tail calls again [1]. Also note we had previously changed the RISC-V and MIPS JITs to allow up to 33 tail calls [2, 3], for consistency with other JITs and with the interpreter. We had decided to increase these two to 33 rather than decrease the other JITs to 32 for backward compatibility, though that probably doesn't matter much as I'd expect few people to actually use 33 tail calls. [1] https://github.com/pchaigno/tail-call-bench/commit/ae7887482985b4b1745c9b2ef7ff9ae506c82886 [2] 96bc4432f5ad ("bpf, riscv: Limit to 33 tail calls") [3] e49e6f6db04e ("bpf, mips: Limit to 33 tail calls") Therefore, revert b61a28cf11d61 to re-align interpreter to limit a maximum of 33 tail calls. While it is unlikely to hit the limit for the vast majority, programs in the wild could one way or another depend on this, so lets rather be a bit more conservative, and lets align the small remainder of JITs to 33. If needed in future, this limit could be slightly increased, but not decreased. 
Fixes: b61a28cf11d61 ("bpf: Fix off-by-one in tail call count limiting") Reported-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Johan Almbladh Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/CAO5pjwTWrC0_dzTbTHFPSqDwA56aVH+4KFGVqdq8=ASs0MqZGQ@mail.gmail.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 91f24c7b38a1..9f4636d021b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1564,7 +1564,7 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) + if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; -- cgit v1.2.3 From 594286b7574c6e8217b1c233cc0d0650f2268a77 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 19 Aug 2021 08:52:09 -0700 Subject: bpf: Fix NULL event->prog pointer access in bpf_overflow_handler Andrii reported that libbpf CI hit the following oops when running selftest send_signal: [ 1243.160719] BUG: kernel NULL pointer dereference, address: 0000000000000030 [ 1243.161066] #PF: supervisor read access in kernel mode [ 1243.161066] #PF: error_code(0x0000) - not-present page [ 1243.161066] PGD 0 P4D 0 [ 1243.161066] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 1243.161066] CPU: 1 PID: 882 Comm: new_name Tainted: G O 5.14.0-rc5 #1 [ 1243.161066] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 1243.161066] RIP: 0010:bpf_overflow_handler+0x9a/0x1e0 [ 1243.161066] Code: 5a 84 c0 0f 84 06 01 00 00 be 66 02 00 00 48 c7 c7 6d 96 07 82 48 8b ab 18 05 00 00 e8 df 55 eb ff 66 90 48 8d 75 48 48 89 e7 55 30 41 89 c4 e8 fb c1 f0 ff 84 c0 0f 84 94 00 00 00 e8 6e 0f [ 1243.161066] RSP: 0018:ffffc900000c0d80 EFLAGS: 00000046 [ 1243.161066] RAX: 0000000000000002 RBX: ffff8881002e0dd0 RCX: 00000000b4b47cf8 [ 1243.161066] RDX: ffffffff811dcb06 RSI: 0000000000000048 RDI: ffffc900000c0d80 [ 
1243.161066] RBP: 0000000000000000 R08: 0000000000000000 R09: 1a9d56bb00000000 [ 1243.161066] R10: 0000000000000001 R11: 0000000000080000 R12: 0000000000000000 [ 1243.161066] R13: ffffc900000c0e00 R14: ffffc900001c3c68 R15: 0000000000000082 [ 1243.161066] FS: 00007fc0be2d3380(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 [ 1243.161066] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1243.161066] CR2: 0000000000000030 CR3: 0000000104f8e000 CR4: 00000000000006e0 [ 1243.161066] Call Trace: [ 1243.161066] [ 1243.161066] __perf_event_overflow+0x4f/0xf0 [ 1243.161066] perf_swevent_hrtimer+0x116/0x130 [ 1243.161066] ? __lock_acquire+0x378/0x2730 [ 1243.161066] ? __lock_acquire+0x372/0x2730 [ 1243.161066] ? lock_is_held_type+0xd5/0x130 [ 1243.161066] ? find_held_lock+0x2b/0x80 [ 1243.161066] ? lock_is_held_type+0xd5/0x130 [ 1243.161066] ? perf_event_groups_first+0x80/0x80 [ 1243.161066] ? perf_event_groups_first+0x80/0x80 [ 1243.161066] __hrtimer_run_queues+0x1a3/0x460 [ 1243.161066] hrtimer_interrupt+0x110/0x220 [ 1243.161066] __sysvec_apic_timer_interrupt+0x8a/0x260 [ 1243.161066] sysvec_apic_timer_interrupt+0x89/0xc0 [ 1243.161066] [ 1243.161066] asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 1243.161066] RIP: 0010:finish_task_switch+0xaf/0x250 [ 1243.161066] Code: 31 f6 68 90 2a 09 81 49 8d 7c 24 18 e8 aa d6 03 00 4c 89 e7 e8 12 ff ff ff 4c 89 e7 e8 ca 9c 80 00 e8 35 af 0d 00 fb 4d 85 f6 <58> 74 1d 65 48 8b 04 25 c0 6d 01 00 4c 3b b0 a0 04 00 00 74 37 f0 [ 1243.161066] RSP: 0018:ffffc900001c3d18 EFLAGS: 00000282 [ 1243.161066] RAX: 000000000000031f RBX: ffff888104cf4980 RCX: 0000000000000000 [ 1243.161066] RDX: 0000000000000000 RSI: ffffffff82095460 RDI: ffffffff820adc4e [ 1243.161066] RBP: ffffc900001c3d58 R08: 0000000000000001 R09: 0000000000000001 [ 1243.161066] R10: 0000000000000001 R11: 0000000000080000 R12: ffff88813bd2bc80 [ 1243.161066] R13: ffff8881002e8000 R14: ffff88810022ad80 R15: 0000000000000000 [ 1243.161066] ? 
finish_task_switch+0xab/0x250 [ 1243.161066] ? finish_task_switch+0x70/0x250 [ 1243.161066] __schedule+0x36b/0xbb0 [ 1243.161066] ? _raw_spin_unlock_irqrestore+0x2d/0x50 [ 1243.161066] ? lockdep_hardirqs_on+0x79/0x100 [ 1243.161066] schedule+0x43/0xe0 [ 1243.161066] pipe_read+0x30b/0x450 [ 1243.161066] ? wait_woken+0x80/0x80 [ 1243.161066] new_sync_read+0x164/0x170 [ 1243.161066] vfs_read+0x122/0x1b0 [ 1243.161066] ksys_read+0x93/0xd0 [ 1243.161066] do_syscall_64+0x35/0x80 [ 1243.161066] entry_SYSCALL_64_after_hwframe+0x44/0xae The oops can also be reproduced with the following steps: ./vmtest.sh -s # at qemu shell cd /root/bpf && while true; do ./test_progs -t send_signal Further analysis showed that the failure is introduced with commit b89fbfbb854c ("bpf: Implement minimal BPF perf link"). With the above commit, the following scenario becomes possible: cpu1 cpu2 hrtimer_interrupt -> bpf_overflow_handler (due to closing link_fd) bpf_perf_link_release -> perf_event_free_bpf_prog -> perf_event_free_bpf_handler -> WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler) event->prog = NULL bpf_prog_run(event->prog, &ctx) In the above case, the event->prog is NULL for bpf_prog_run, hence causing oops. To fix the issue, check whether event->prog is NULL or not. If it is, do not call bpf_prog_run. This seems working as the above reproducible step runs more than one hour and I didn't see any failures. 
Fixes: b89fbfbb854c ("bpf: Implement minimal BPF perf link") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210819155209.1927994-1-yhs@fb.com --- kernel/events/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d1e63dd97f2..011cc5069b7b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9920,13 +9920,16 @@ static void bpf_overflow_handler(struct perf_event *event, .data = data, .event = event, }; + struct bpf_prog *prog; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = bpf_prog_run(event->prog, &ctx); + prog = READ_ONCE(event->prog); + if (prog) + ret = bpf_prog_run(prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); -- cgit v1.2.3 From 3666b167ea68997b73dd5b78678a1c3f0d6730bb Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Thu, 19 Aug 2021 09:36:09 -0700 Subject: selftests/bpf: Adding delay in socketmap_listen to reduce flakyness This patch adds a 1ms delay to reduce flakiness of the test. Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210819163609.2583758-1-fallentree@fb.com --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index afa14fb66f08..6a5df28f9a3d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1603,8 +1603,10 @@ static void unix_redir_to_connected(int sotype, int sock_mapfd, again: n = read(mode == REDIR_INGRESS ?
p0 : c0, &b, 1); if (n < 0) { - if (errno == EAGAIN && retries--) + if (errno == EAGAIN && retries--) { + usleep(1000); goto again; + } FAIL_ERRNO("%s: read", log_prefix); } if (n == 0) @@ -1776,8 +1778,10 @@ static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd, again: n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); if (n < 0) { - if (errno == EAGAIN && retries--) + if (errno == EAGAIN && retries--) { + usleep(1000); goto again; + } FAIL_ERRNO("%s: read", log_prefix); } if (n == 0) @@ -1869,8 +1873,10 @@ static void inet_unix_redir_to_connected(int family, int type, int sock_mapfd, again: n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); if (n < 0) { - if (errno == EAGAIN && retries--) + if (errno == EAGAIN && retries--) { + usleep(1000); goto again; + } FAIL_ERRNO("%s: read", log_prefix); } if (n == 0) -- cgit v1.2.3 From f0dce1d9b7c81fc3dc9d0cc0bc7ef9b3eae22584 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 18 Aug 2021 16:52:15 -0700 Subject: bpf: Use kvmalloc for map values in syscall Use kvmalloc/kvfree for temporary value when manipulating a map via syscall. kmalloc might not be sufficient for percpu maps where the value is big (and further multiplied by hundreds of CPUs). Can be reproduced with netcnt test on qemu with "-smp 255". 
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818235216.1159202-1-sdf@google.com --- kernel/bpf/syscall.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7420e1334ab2..075f650d297a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1076,7 +1076,7 @@ static int map_lookup_elem(union bpf_attr *attr) value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1091,7 +1091,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = 0; free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: @@ -1137,16 +1137,10 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) - value_size = round_up(map->value_size, 8) * num_possible_cpus(); - else - value_size = map->value_size; + value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1157,7 +1151,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) err = bpf_map_update_value(map, f, key, value, attr->flags); free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: @@ -1367,7 +1361,7 @@ int generic_map_update_batch(struct bpf_map *map, if (!key) return -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { kfree(key); return -ENOMEM; @@ -1390,7 +1384,7 @@ int generic_map_update_batch(struct bpf_map *map, 
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; - kfree(value); + kvfree(value); kfree(key); return err; } @@ -1429,7 +1423,7 @@ int generic_map_lookup_batch(struct bpf_map *map, if (!buf_prevkey) return -ENOMEM; - buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); + buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { kfree(buf_prevkey); return -ENOMEM; @@ -1492,7 +1486,7 @@ int generic_map_lookup_batch(struct bpf_map *map, free_buf: kfree(buf_prevkey); - kfree(buf); + kvfree(buf); return err; } @@ -1547,7 +1541,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1579,7 +1573,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) err = 0; free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: -- cgit v1.2.3 From 44779a4b85abd1d1dab9e5b90bd5e6adcfc8143a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 18 Aug 2021 16:52:16 -0700 Subject: bpf: Use kvmalloc for map keys in syscalls Same as previous patch but for the keys. memdup_bpfptr is renamed to kvmemdup_bpfptr (and converted to kvmalloc). 
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818235216.1159202-2-sdf@google.com --- include/linux/bpfptr.h | 12 ++++++++++-- kernel/bpf/syscall.c | 34 +++++++++++++++++----------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 5cdeab497cb3..546e27fc6d46 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -62,9 +62,17 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size); } -static inline void *memdup_bpfptr(bpfptr_t src, size_t len) +static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len) { - return memdup_sockptr((sockptr_t) src, len); + void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN); + + if (!p) + return ERR_PTR(-ENOMEM); + if (copy_from_bpfptr(p, src, len)) { + kvfree(p); + return ERR_PTR(-EFAULT); + } + return p; } static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 075f650d297a..4e50c0bfdb7d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1013,7 +1013,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) static void *__bpf_copy_key(void __user *ukey, u64 key_size) { if (key_size) - return memdup_user(ukey, key_size); + return vmemdup_user(ukey, key_size); if (ukey) return ERR_PTR(-EINVAL); @@ -1024,7 +1024,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) { if (key_size) - return memdup_bpfptr(ukey, key_size); + return kvmemdup_bpfptr(ukey, key_size); if (!bpfptr_is_null(ukey)) return ERR_PTR(-EINVAL); @@ -1093,7 +1093,7 @@ static int map_lookup_elem(union bpf_attr *attr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1153,7 +1153,7 @@ static int 
map_update_elem(union bpf_attr *attr, bpfptr_t uattr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1205,7 +1205,7 @@ static int map_delete_elem(union bpf_attr *attr) bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); out: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1247,7 +1247,7 @@ static int map_get_next_key(union bpf_attr *attr) } err = -ENOMEM; - next_key = kmalloc(map->key_size, GFP_USER); + next_key = kvmalloc(map->key_size, GFP_USER); if (!next_key) goto free_key; @@ -1270,9 +1270,9 @@ out: err = 0; free_next_key: - kfree(next_key); + kvfree(next_key); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1299,7 +1299,7 @@ int generic_map_delete_batch(struct bpf_map *map, if (!max_count) return 0; - key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; @@ -1326,7 +1326,7 @@ int generic_map_delete_batch(struct bpf_map *map, if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; - kfree(key); + kvfree(key); return err; } @@ -1357,13 +1357,13 @@ int generic_map_update_batch(struct bpf_map *map, if (!max_count) return 0; - key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { - kfree(key); + kvfree(key); return -ENOMEM; } @@ -1385,7 +1385,7 @@ int generic_map_update_batch(struct bpf_map *map, err = -EFAULT; kvfree(value); - kfree(key); + kvfree(key); return err; } @@ -1419,13 +1419,13 @@ int generic_map_lookup_batch(struct bpf_map *map, if (put_user(0, &uattr->batch.count)) return -EFAULT; - buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!buf_prevkey) return -ENOMEM; buf = kvmalloc(map->key_size + value_size, GFP_USER | 
__GFP_NOWARN); if (!buf) { - kfree(buf_prevkey); + kvfree(buf_prevkey); return -ENOMEM; } @@ -1485,7 +1485,7 @@ int generic_map_lookup_batch(struct bpf_map *map, err = -EFAULT; free_buf: - kfree(buf_prevkey); + kvfree(buf_prevkey); kvfree(buf); return err; } @@ -1575,7 +1575,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; -- cgit v1.2.3 From 2c531639deb5e3ddfd6e8123b82052b2d9fbc6e5 Mon Sep 17 00:00:00 2001 From: Prankur Gupta Date: Tue, 17 Aug 2021 15:42:20 -0700 Subject: bpf: Add support for {set|get} socket options from setsockopt BPF Add logic to call bpf_setsockopt() and bpf_getsockopt() from setsockopt BPF programs. An example use case is when the user sets the IPV6_TCLASS socket option, we would also like to change the tcp-cc for that socket. We don't have any use case for calling bpf_setsockopt() from supposedly read-only sys_getsockopt(), so it is made available to BPF_CGROUP_SETSOCKOPT only at this point.
Signed-off-by: Prankur Gupta Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210817224221.3257826-2-prankgup@fb.com --- kernel/bpf/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f35928bab0a..8e9d99e2ade4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1873,6 +1873,14 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) + return &bpf_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) + return &bpf_sk_getsockopt_proto; + return NULL; #endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: -- cgit v1.2.3 From f2a6ee924d26527dc55a745dc917a820f34e64e5 Mon Sep 17 00:00:00 2001 From: Prankur Gupta Date: Tue, 17 Aug 2021 15:42:21 -0700 Subject: selftests/bpf: Add tests for {set|get} socket option from setsockopt BPF Adding selftests for the newly added functionality to call bpf_setsockopt() and bpf_getsockopt() from setsockopt BPF programs. Test Details: 1. BPF Program Checks for changes in IPV6_TCLASS(SOL_IPV6) via setsockopt If the cca for the socket is not cubic do nothing If the newly set value for IPV6_TCLASS is 45 (0x2d) (as per our use-case) then change the cc from cubic to reno 2. 
User Space Program Creates an AF_INET6 socket and set the cca for that to be "cubic" Attach the program and set the IPV6_TCLASS to 0x2d using setsockopt Verify the cca for the socket changed to reno Signed-off-by: Prankur Gupta Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210817224221.3257826-3-prankgup@fb.com --- tools/testing/selftests/bpf/bpf_tcp_helpers.h | 18 ++++++ .../selftests/bpf/prog_tests/sockopt_qos_to_cc.c | 70 ++++++++++++++++++++++ .../selftests/bpf/progs/sockopt_qos_to_cc.c | 39 ++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c create mode 100644 tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 029589c008c9..c9f9bdad60c7 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -12,6 +12,10 @@ SEC("struct_ops/"#name) \ BPF_PROG(name, args) +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + #define tcp_jiffies32 ((__u32)bpf_jiffies64()) struct sock_common { @@ -203,6 +207,20 @@ static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); } +static __always_inline bool tcp_cc_eq(const char *a, const char *b) +{ + int i; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (a[i] != b[i]) + return false; + if (!a[i]) + break; + } + + return true; +} + extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c new file mode 100644 index 000000000000..6b53b3cb8dad --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_qos_to_cc.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 
+/* Copyright (c) 2021 Facebook */ +#include +#include +#include "sockopt_qos_to_cc.skel.h" + +static void run_setsockopt_test(int cg_fd, int sock_fd) +{ + socklen_t optlen; + char cc[16]; /* TCP_CA_NAME_MAX */ + int buf; + int err = -1; + + buf = 0x2D; + err = setsockopt(sock_fd, SOL_IPV6, IPV6_TCLASS, &buf, sizeof(buf)); + if (!ASSERT_OK(err, "setsockopt(sock_fd, IPV6_TCLASS)")) + return; + + /* Verify the setsockopt cc change */ + optlen = sizeof(cc); + err = getsockopt(sock_fd, SOL_TCP, TCP_CONGESTION, cc, &optlen); + if (!ASSERT_OK(err, "getsockopt(sock_fd, TCP_CONGESTION)")) + return; + + if (!ASSERT_STREQ(cc, "reno", "getsockopt(sock_fd, TCP_CONGESTION)")) + return; +} + +void test_sockopt_qos_to_cc(void) +{ + struct sockopt_qos_to_cc *skel; + char cc_cubic[16] = "cubic"; /* TCP_CA_NAME_MAX */ + int cg_fd = -1; + int sock_fd = -1; + int err; + + cg_fd = test__join_cgroup("/sockopt_qos_to_cc"); + if (!ASSERT_GE(cg_fd, 0, "cg-join(sockopt_qos_to_cc)")) + return; + + skel = sockopt_qos_to_cc__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto done; + + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(sock_fd, 0, "v6 socket open")) + goto done; + + err = setsockopt(sock_fd, SOL_TCP, TCP_CONGESTION, &cc_cubic, + sizeof(cc_cubic)); + if (!ASSERT_OK(err, "setsockopt(sock_fd, TCP_CONGESTION)")) + goto done; + + skel->links.sockopt_qos_to_cc = + bpf_program__attach_cgroup(skel->progs.sockopt_qos_to_cc, + cg_fd); + if (!ASSERT_OK_PTR(skel->links.sockopt_qos_to_cc, + "prog_attach(sockopt_qos_to_cc)")) + goto done; + + run_setsockopt_test(cg_fd, sock_fd); + +done: + if (sock_fd != -1) + close(sock_fd); + if (cg_fd != -1) + close(cg_fd); + /* destroy can take null and error pointer */ + sockopt_qos_to_cc__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c new file mode 100644 index 000000000000..1bce83b6e3a7 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/sockopt_qos_to_cc.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include +#include +#include +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; + +SEC("cgroup/setsockopt") +int sockopt_qos_to_cc(struct bpf_sockopt *ctx) +{ + void *optval_end = ctx->optval_end; + int *optval = ctx->optval; + char buf[TCP_CA_NAME_MAX]; + char cc_reno[TCP_CA_NAME_MAX] = "reno"; + char cc_cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (ctx->level != SOL_IPV6 || ctx->optname != IPV6_TCLASS) + return 1; + + if (optval + 1 > optval_end) + return 0; /* EPERM, bounds check */ + + if (bpf_getsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 0; + + if (!tcp_cc_eq(buf, cc_cubic)) + return 0; + + if (*optval == 0x2d) { + if (bpf_setsockopt(ctx->sk, SOL_TCP, TCP_CONGESTION, &cc_reno, + sizeof(cc_reno))) + return 0; + } + return 1; +} -- cgit v1.2.3 From d359902d5c357b280e7a0862bb8a1ba56b3fc197 Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Sat, 21 Aug 2021 18:07:36 +0000 Subject: af_unix: Fix NULL pointer bug in unix_shutdown Commit 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") introduced a bug for af_unix SEQPACKET type. In unix_shutdown, the unhash function will call prot->unhash(), which is NULL for SEQPACKET, and the kernel will panic. On ARM32, it will show the following messages (it likely affects x86 too). Fix the bug by first checking whether prot->unhash is NULL.
Kernel log: <--- cut here --- Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = 2fba1ffb *pgd=00000000 Internal error: Oops: 80000005 [#1] PREEMPT SMP THUMB2 Modules linked in: CPU: 1 PID: 1999 Comm: falkon Tainted: G W 5.14.0-rc5-01175-g94531cfcbe79-dirty #9240 Hardware name: NVIDIA Tegra SoC (Flattened Device Tree) PC is at 0x0 LR is at unix_shutdown+0x81/0x1a8 pc : [<00000000>] lr : [] psr: 600f0013 sp : e45aff70 ip : e463a3c0 fp : beb54f04 r10: 00000125 r9 : e45ae000 r8 : c4a56664 r7 : 00000001 r6 : c4a56464 r5 : 00000001 r4 : c4a56400 r3 : 00000000 r2 : c5a6b180 r1 : 00000000 r0 : c4a56400 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 50c5387d Table: 05aa804a DAC: 00000051 Register r0 information: slab PING start c4a56400 pointer offset 0 Register r1 information: NULL pointer Register r2 information: slab task_struct start c5a6b180 pointer offset 0 Register r3 information: NULL pointer Register r4 information: slab PING start c4a56400 pointer offset 0 Register r5 information: non-paged memory Register r6 information: slab PING start c4a56400 pointer offset 100 Register r7 information: non-paged memory Register r8 information: slab PING start c4a56400 pointer offset 612 Register r9 information: non-slab/vmalloc memory Register r10 information: non-paged memory Register r11 information: non-paged memory Register r12 information: slab filp start e463a3c0 pointer offset 0 Process falkon (pid: 1999, stack limit = 0x9ec48895) Stack: (0xe45aff70 to 0xe45b0000) ff60: e45ae000 c5f26a00 00000000 00000125 ff80: c0100264 c07f7fa3 beb54f04 fffffff7 00000001 e6f3fc0e b5e5e9ec beb54ec4 ffa0: b5da0ccc c010024b b5e5e9ec beb54ec4 0000000f 00000000 00000000 beb54ebc ffc0: b5e5e9ec beb54ec4 b5da0ccc 00000125 beb54f58 00785238 beb5529c beb54f04 ffe0: b5da1e24 beb54eac b301385c b62b6ee8 600f0030 0000000f 00000000 00000000 [] (unix_shutdown) from [] (__sys_shutdown+0x2f/0x50) [] (__sys_shutdown) from [] 
(__sys_trace_return+0x1/0x16) Exception stack(0xe45affa8 to 0xe45afff0) Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") Reported-by: Dmitry Osipenko Signed-off-by: Jiang Wang Signed-off-by: Daniel Borkmann Tested-by: Dmitry Osipenko Acked-by: Kuniyuki Iwashima Link: https://lore.kernel.org/bpf/20210821180738.1151155-1-jiang.wang@bytedance.com --- net/unix/af_unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 443c49081636..15c1e4e4012d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2847,7 +2847,8 @@ static int unix_shutdown(struct socket *sock, int mode) int peer_mode = 0; const struct proto *prot = READ_ONCE(other->sk_prot); - prot->unhash(other); + if (prot->unhash) + prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) -- cgit v1.2.3 From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 19 Aug 2021 02:24:20 -0700 Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type. Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots. As a result struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE. bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info. 
To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com --- include/linux/bpf-cgroup.h | 182 ++++++++++++++++++++++++++++------------- include/uapi/linux/bpf.h | 2 +- kernel/bpf/cgroup.c | 156 +++++++++++++++++++++-------------- net/ipv4/af_inet.c | 6 +- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 6 +- net/ipv6/udp.c | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 8 files changed, 226 insertions(+), 132 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a74cd1c3bd87..2746fd804216 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,9 +23,73 @@ struct ctl_table_header; struct task_struct; #ifdef CONFIG_CGROUP_BPF +enum cgroup_bpf_attach_type { + CGROUP_BPF_ATTACH_TYPE_INVALID = -1, + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS, + CGROUP_INET_SOCK_CREATE, + CGROUP_SOCK_OPS, + CGROUP_DEVICE, + CGROUP_INET4_BIND, + CGROUP_INET6_BIND, + CGROUP_INET4_CONNECT, + CGROUP_INET6_CONNECT, + CGROUP_INET4_POST_BIND, + CGROUP_INET6_POST_BIND, + CGROUP_UDP4_SENDMSG, + CGROUP_UDP6_SENDMSG, + CGROUP_SYSCTL, + CGROUP_UDP4_RECVMSG, + CGROUP_UDP6_RECVMSG, + CGROUP_GETSOCKOPT, + CGROUP_SETSOCKOPT, + CGROUP_INET4_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, + CGROUP_INET4_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, + CGROUP_INET_SOCK_RELEASE, + MAX_CGROUP_BPF_ATTACH_TYPE +}; + +#define CGROUP_ATYPE(type) \ + case BPF_##type: return type + +static inline enum cgroup_bpf_attach_type +to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + CGROUP_ATYPE(CGROUP_INET_INGRESS); + CGROUP_ATYPE(CGROUP_INET_EGRESS); + CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); + CGROUP_ATYPE(CGROUP_SOCK_OPS); + CGROUP_ATYPE(CGROUP_DEVICE); + CGROUP_ATYPE(CGROUP_INET4_BIND); + CGROUP_ATYPE(CGROUP_INET6_BIND); + 
CGROUP_ATYPE(CGROUP_INET4_CONNECT); + CGROUP_ATYPE(CGROUP_INET6_CONNECT); + CGROUP_ATYPE(CGROUP_INET4_POST_BIND); + CGROUP_ATYPE(CGROUP_INET6_POST_BIND); + CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); + CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); + CGROUP_ATYPE(CGROUP_SYSCTL); + CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); + CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); + CGROUP_ATYPE(CGROUP_GETSOCKOPT); + CGROUP_ATYPE(CGROUP_SETSOCKOPT); + CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); + CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); + CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); + default: + return CGROUP_BPF_ATTACH_TYPE_INVALID; + } +} + +#undef CGROUP_ATYPE -extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; -#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype]) #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -67,15 +131,15 @@ struct bpf_prog_array; struct cgroup_bpf { /* array of effective progs in this cgroup */ - struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog_array __rcu *effective[MAX_CGROUP_BPF_ATTACH_TYPE]; /* attached progs to this cgroup and attach flags * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will * have either zero or one element * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct list_head progs[MAX_BPF_ATTACH_TYPE]; - u32 flags[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_CGROUP_BPF_ATTACH_TYPE]; + u32 flags[MAX_CGROUP_BPF_ATTACH_TYPE]; /* list of cgroup shared storages */ struct list_head storages; @@ -115,28 +179,28 @@ int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff 
*skb, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type); + short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type); + enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, char __user *optval, @@ -179,9 +243,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ - BPF_CGROUP_INET_INGRESS); \ + CGROUP_INET_INGRESS); \ \ __ret; \ }) @@ -189,54 +253,54 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ - BPF_CGROUP_INET_EGRESS); \ + CGROUP_INET_EGRESS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ - 
if (cgroup_bpf_enabled(type)) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, type); \ + if (cgroup_bpf_enabled(atype)) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ - BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) -#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + if (cgroup_bpf_enabled(atype)) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, \ &__unused_flags); \ __ret; \ }) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ({ \ u32 __unused_flags; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ t_ctx, \ &__unused_flags); \ release_sock(sk); \ @@ -249,13 +313,13 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). 
*/ -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags) \ +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags) \ ({ \ u32 __flags = 0; \ int __ret = 0; \ - if (cgroup_bpf_enabled(type)) { \ + if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ @@ -265,33 +329,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ - ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ - cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) + 
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL) /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by @@ -311,33 +375,33 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ - BPF_CGROUP_SOCK_OPS); \ + CGROUP_SOCK_OPS); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ - __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ + if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ + __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ - BPF_CGROUP_DEVICE); \ + CGROUP_DEVICE); \ \ __ret; \ }) @@ -346,10 +410,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, 
count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ + if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ - BPF_CGROUP_SYSCTL); \ + CGROUP_SYSCTL); \ __ret; \ }) @@ -357,7 +421,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -368,7 +432,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -377,7 +441,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ @@ -392,7 +456,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ @@ -451,14 +515,14 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled(type) (0) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) +#define cgroup_bpf_enabled(atype) (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define 
BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) @@ -470,7 +534,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8e9d99e2ade4..03145d45e3d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); 
EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work) struct list_head *storages = &cgrp->bpf.storages; struct bpf_cgroup_storage *storage, *stmp; - unsigned int type; + unsigned int atype; mutex_lock(&cgroup_mutex); - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { - struct list_head *progs = &cgrp->bpf.progs[type]; + for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { + struct list_head *progs = &cgrp->bpf.progs[atype]; struct bpf_prog_list *pl, *pltmp; list_for_each_entry_safe(pl, pltmp, progs, node) { @@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); } old_array = rcu_dereference_protected( - cgrp->bpf.effective[type], + cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } @@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *p; @@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (!p) return true; do { - u32 flags = p->bpf.flags[type]; + u32 flags = p->bpf.flags[atype]; u32 cnt; if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[type]); + cnt = prog_list_length(&p->bpf.progs[atype]); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, * to programs in this cgroup */ static int compute_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array **array) 
{ struct bpf_prog_array_item *item; @@ -236,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp, /* count number of effective programs by walking parents */ do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[type]); + if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[atype]); p = cgroup_parent(p); } while (p); @@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[type], node) { + list_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp, } static void activate_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array *old_array) { - old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, + old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array @@ -328,7 +328,7 @@ cleanup: } static int update_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup_subsys_state *css; int err; @@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp, if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue; - err = compute_effective_progs(desc, type, &desc->bpf.inactive); + err = compute_effective_progs(desc, atype, &desc->bpf.inactive); if (err) goto cleanup; } @@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp, continue; } - activate_effective_progs(desc, type, desc->bpf.inactive); + activate_effective_progs(desc, atype, 
desc->bpf.inactive); desc->bpf.inactive = NULL; } @@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); - struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; + struct list_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type)) + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + + if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) + if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); - cgrp->bpf.flags[type] = saved_flags; + cgrp->bpf.flags[atype] = saved_flags; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key[type]); + static_branch_inc(&cgroup_bpf_enabled_key[atype]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -520,7 +527,7 @@ cleanup: * all descendant cgroups. This function is guaranteed to succeed. 
*/ static void replace_effective_prog(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_cgroup_link *link) { struct bpf_prog_array_item *item; @@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp, /* find position of link in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { - if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - head = &cg->bpf.progs[type]; + head = &cg->bpf.progs[atype]; list_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; @@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp, found: BUG_ON(!cg); progs = rcu_dereference_protected( - desc->bpf.effective[type], + desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); item = &progs->items[pos]; WRITE_ONCE(item->prog, link->link.prog); @@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, struct bpf_prog *new_prog) { - struct list_head *progs = &cgrp->bpf.progs[link->type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; + struct list_head *progs; bool found = false; + atype = to_cgroup_bpf_attach_type(link->type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + if (link->link.prog->type != new_prog->type) return -EINVAL; @@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, return -ENOENT; old_prog = xchg(&link->link.prog, new_prog); - replace_effective_prog(cgrp, link->type, link); + replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); return 0; } @@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type) { - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = 
cgrp->bpf.flags[type]; - struct bpf_prog_list *pl; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + struct list_head *progs; + u32 flags; int err; + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; + if (prog && link) /* only one of prog or link can be specified */ return -EINVAL; @@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; @@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ - cgrp->bpf.flags[type] = 0; + cgrp->bpf.flags[atype] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; cleanup: @@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; + struct list_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; + u32 flags; + + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; - effective = rcu_dereference_protected(cgrp->bpf.effective[type], + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) @@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup 
= cgrp; link->type = attr->link_create.attach_type; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_cgroup; } - err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, - BPF_F_ALLOW_MULTI); + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, + link->type, BPF_F_ALLOW_MULTI); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; @@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; @@ -1008,11 +1038,11 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - if (type == BPF_CGROUP_INET_EGRESS) { + if (atype == CGROUP_INET_EGRESS) { ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } @@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); * and if it returned != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1065,7 +1095,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags) { @@ -1090,7 +1120,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx, + ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; @@ -1115,19 +1145,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); */ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, bpf_prog_run); return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type) + short access, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp; struct bpf_cgroup_dev_ctx ctx = { @@ -1139,7 +1169,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, + allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); @@ -1231,7 +1261,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct bpf_sysctl_kern ctx = { .head = head, @@ -1271,7 +1301,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1289,7 +1319,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum bpf_attach_type attach_type) + enum cgroup_bpf_attach_type attach_type) { struct bpf_prog_array *prog_array; bool empty; @@ -1364,7 +1394,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. 
*/ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1385,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1460,7 +1490,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; @@ -1495,7 +1525,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1556,7 +1586,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0e4d758c2585..1d816a5fd3eb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. 
*/ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET4_BIND, &flags); + CGROUP_INET4_BIND, &flags); if (err) return err; @@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETPEERNAME, + CGROUP_INET4_GETPEERNAME, NULL); } else { __be32 addr = inet->inet_rcv_saddr; @@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETSOCKNAME, + CGROUP_INET4_GETSOCKNAME, NULL); } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1a742b710e54..8851c9463b4b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1143,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d92c90d97763..b5878bb8e419 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -455,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. 
*/ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET6_BIND, &flags); + CGROUP_INET6_BIND, &flags); if (err) return err; @@ -532,7 +532,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -541,7 +541,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, NULL); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c5e15e94bb00..ea53847b5b7e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1475,7 +1475,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { -- cgit v1.2.3 From a6258837c8a81dcd9b0f1b061bd35302ad4d5914 Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Mon, 23 Aug 2021 14:36:29 -0700 Subject: selftests/bpf: Reduce flakyness in timer_mim This patch extends wait time in timer_mim. 
As observed in a slow CI environment, it is possible to have interrupt/preemption long enough to cause the test to fail, almost 1 failure in 5 runs. Signed-off-by: Yucong Sun Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210823213629.3519641-1-fallentree@fb.com --- tools/testing/selftests/bpf/prog_tests/timer_mim.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c index f5acbcbe33a4..ced8f6cf347c 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_mim.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c @@ -23,8 +23,12 @@ static int timer_mim(struct timer_mim *timer_skel) /* check that timer_cb[12] are incrementing 'cnt' */ cnt1 = READ_ONCE(timer_skel->bss->cnt); - usleep(200); /* 100 times more than interval */ - cnt2 = READ_ONCE(timer_skel->bss->cnt); + for (int i = 0; i < 100; i++) { + cnt2 = READ_ONCE(timer_skel->bss->cnt); + if (cnt2 != cnt1) + break; + usleep(200); /* 100 times more than interval */ + } ASSERT_GT(cnt2, cnt1, "cnt"); ASSERT_EQ(timer_skel->bss->err, 0, "err"); @@ -37,8 +41,12 @@ static int timer_mim(struct timer_mim *timer_skel) /* check that timer_cb[12] are no longer running */ cnt1 = READ_ONCE(timer_skel->bss->cnt); - usleep(200); - cnt2 = READ_ONCE(timer_skel->bss->cnt); + for (int i = 0; i < 100; i++) { + usleep(200); /* 100 times more than interval */ + cnt2 = READ_ONCE(timer_skel->bss->cnt); + if (cnt2 == cnt1) + break; + } ASSERT_EQ(cnt2, cnt1, "cnt"); return 0; -- cgit v1.2.3 From 2d82d73da35b72b53fe0d96350a2b8d929d07e42 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 20 Aug 2021 09:55:53 +0800 Subject: selftests/bpf: Enlarge select() timeout for test_maps 0Day robot observed that it easily times out on a heavily loaded host.
------------------- # selftests: bpf: test_maps # Fork 1024 tasks to 'test_update_delete' # Fork 1024 tasks to 'test_update_delete' # Fork 100 tasks to 'test_hashmap' # Fork 100 tasks to 'test_hashmap_percpu' # Fork 100 tasks to 'test_hashmap_sizes' # Fork 100 tasks to 'test_hashmap_walk' # Fork 100 tasks to 'test_arraymap' # Fork 100 tasks to 'test_arraymap_percpu' # Failed sockmap unexpected timeout not ok 3 selftests: bpf: test_maps # exit=1 # selftests: bpf: test_lru_map # nr_cpus:8 ------------------- Since this test will be scheduled by 0Day to a random host that could have only a few cpus(2-8), enlarge the timeout to avoid a false NG report. In practice, I tried to pin it to only one cpu by 'taskset 0x01 ./test_maps', and found that 10s is likely enough, but I still prefer a larger value of 30. Reported-by: kernel test robot Signed-off-by: Li Zhijian Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210820015556.23276-2-lizhijian@cn.fujitsu.com --- tools/testing/selftests/bpf/test_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 340695d5d652..c7a36a9378f8 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -985,7 +985,7 @@ static void test_sockmap(unsigned int tasks, void *data) FD_ZERO(&w); FD_SET(sfd[3], &w); - to.tv_sec = 1; + to.tv_sec = 30; to.tv_usec = 0; s = select(sfd[3] + 1, &w, NULL, NULL, &to); if (s == -1) { -- cgit v1.2.3 From 5a980b5baf3942653d30c451416ca485ec09577f Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 20 Aug 2021 09:55:54 +0800 Subject: selftests/bpf: Make test_doc_build.sh work from script directory Previously, it fails as below: ------------- root@lkp-skl-d01 /opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf# ./test_doc_build.sh ++ realpath --relative-to=/opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf ./test_doc_build.sh +
SCRIPT_REL_PATH=test_doc_build.sh ++ dirname test_doc_build.sh + SCRIPT_REL_DIR=. ++ realpath /opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/./../../../../ + KDIR_ROOT_DIR=/opt/rootfs/v5.14-rc4 + cd /opt/rootfs/v5.14-rc4 + for tgt in docs docs-clean + make -s -C /opt/rootfs/v5.14-rc4/. docs make: *** No rule to make target 'docs'. Stop. + for tgt in docs docs-clean + make -s -C /opt/rootfs/v5.14-rc4/. docs-clean make: *** No rule to make target 'docs-clean'. Stop. ----------- Reported-by: kernel test robot Signed-off-by: Li Zhijian Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210820015556.23276-3-lizhijian@cn.fujitsu.com --- tools/testing/selftests/bpf/test_doc_build.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh index ed12111cd2f0..d67ced95a6cf 100755 --- a/tools/testing/selftests/bpf/test_doc_build.sh +++ b/tools/testing/selftests/bpf/test_doc_build.sh @@ -4,9 +4,10 @@ set -e # Assume script is located under tools/testing/selftests/bpf/. We want to start # build attempts from the top of kernel repository. -SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0) +SCRIPT_REL_PATH=$(realpath $0) SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH) -KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) +KDIR_ROOT_DIR=$(realpath $SCRIPT_REL_DIR/../../../../) +SCRIPT_REL_DIR=$(dirname $(realpath --relative-to=$KDIR_ROOT_DIR $SCRIPT_REL_PATH)) cd $KDIR_ROOT_DIR for tgt in docs docs-clean; do -- cgit v1.2.3 From 7a3bdca20b10fb93a34aca22e19c4b27c5602edb Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 20 Aug 2021 09:55:55 +0800 Subject: selftests/bpf: Add default bpftool built by selftests to PATH For 'make run_tests': selftests will build bpftool into tools/testing/selftests/bpf/tools/sbin/bpftool by default. 
================== root@lkp-skl-d01 /opt/rootfs/v5.14-rc4# make -C tools/testing/selftests/bpf run_tests make: Entering directory '/opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf' MKDIR include MKDIR libbpf MKDIR bpftool [...] GEN /opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/tools/build/bpftool/profiler.skel.h CC /opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/tools/build/bpftool/prog.o GEN /opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/tools/build/bpftool/pid_iter.skel.h CC /opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/tools/build/bpftool/pids.o LINK /opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/tools/build/bpftool/bpftool INSTALL bpftool GEN vmlinux.h [...] # test_feature_dev_json (test_bpftool.TestBpftool) ... ERROR # test_feature_kernel (test_bpftool.TestBpftool) ... ERROR # test_feature_kernel_full (test_bpftool.TestBpftool) ... ERROR # test_feature_kernel_full_vs_not_full (test_bpftool.TestBpftool) ... ERROR # test_feature_macros (test_bpftool.TestBpftool) ... Error: bug: failed to retrieve CAP_BPF status: Invalid argument # ERROR # # ====================================================================== # ERROR: test_feature_dev_json (test_bpftool.TestBpftool) # ---------------------------------------------------------------------- # Traceback (most recent call last): # File "/opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/test_bpftool.py", line 57, in wrapper # return f(*args, iface, **kwargs) # File "/opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/test_bpftool.py", line 82, in test_feature_dev_json # res = bpftool_json(["feature", "probe", "dev", iface]) # File "/opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/test_bpftool.py", line 42, in bpftool_json # res = _bpftool(args) # File "/opt/rootfs/v5.14-rc4/tools/testing/selftests/bpf/test_bpftool.py", line 34, in _bpftool # return subprocess.check_output(_args) # File "/usr/lib/python3.7/subprocess.py", line 395, in check_output # **kwargs).stdout # File 
"/usr/lib/python3.7/subprocess.py", line 487, in run # output=stdout, stderr=stderr) # subprocess.CalledProcessError: Command '['bpftool', '-j', 'feature', 'probe', 'dev', 'dummy0']' returned non-zero exit status 255. # ================== Signed-off-by: Li Zhijian Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210820015556.23276-4-lizhijian@cn.fujitsu.com --- tools/testing/selftests/bpf/test_bpftool.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/bpf/test_bpftool.sh b/tools/testing/selftests/bpf/test_bpftool.sh index 66690778e36d..6b7ba19be1d0 100755 --- a/tools/testing/selftests/bpf/test_bpftool.sh +++ b/tools/testing/selftests/bpf/test_bpftool.sh @@ -2,4 +2,9 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (c) 2020 SUSE LLC. +SCRIPT_DIR=$(dirname $(realpath $0)) + +# 'make -C tools/testing/selftests/bpf' will install to BPFTOOL_INSTALL_PATH +BPFTOOL_INSTALL_PATH="$SCRIPT_DIR"/tools/sbin +export PATH=$BPFTOOL_INSTALL_PATH:$PATH python3 -m unittest -v test_bpftool.TestBpftool -- cgit v1.2.3 From 404bd9ff5d7ccb938ab033f6971c6ee2b8384387 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 20 Aug 2021 09:55:56 +0800 Subject: selftests/bpf: Add missing files required by test_bpftool.sh for installing test_bpftool.sh relies on bpftool and test_bpftool.py. 'make install' will install bpftool to INSTALL_PATH/bpf/bpftool, and export it to PATH so that it can be used after installing. 
Signed-off-by: Li Zhijian Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820015556.23276-5-lizhijian@cn.fujitsu.com --- tools/testing/selftests/bpf/Makefile | 4 +++- tools/testing/selftests/bpf/test_bpftool.sh | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2a58b7b5aea4..866531c08e4f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -79,7 +79,7 @@ TEST_PROGS := test_kmod.sh \ TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ - test_xdp_vlan.sh + test_xdp_vlan.sh test_bpftool.py # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ @@ -187,6 +187,8 @@ $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \ cp $(SCRATCH_DIR)/runqslower $@ +TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL) + $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ) $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c diff --git a/tools/testing/selftests/bpf/test_bpftool.sh b/tools/testing/selftests/bpf/test_bpftool.sh index 6b7ba19be1d0..718f59692ccb 100755 --- a/tools/testing/selftests/bpf/test_bpftool.sh +++ b/tools/testing/selftests/bpf/test_bpftool.sh @@ -2,9 +2,10 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright (c) 2020 SUSE LLC. 
+# 'make -C tools/testing/selftests/bpf install' will install to SCRIPT_DIR SCRIPT_DIR=$(dirname $(realpath $0)) # 'make -C tools/testing/selftests/bpf' will install to BPFTOOL_INSTALL_PATH BPFTOOL_INSTALL_PATH="$SCRIPT_DIR"/tools/sbin -export PATH=$BPFTOOL_INSTALL_PATH:$PATH +export PATH=$SCRIPT_DIR:$BPFTOOL_INSTALL_PATH:$PATH python3 -m unittest -v test_bpftool.TestBpftool -- cgit v1.2.3 From 00e1116031e154098c55441e4936b32e4b20b31c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 20 Aug 2021 10:55:49 +0800 Subject: selftests/bpf: Exit with KSFT_SKIP if no Makefile found This would happen when we run the tests after installing kselftests root@lkp-skl-d01 ~# /kselftests/run_kselftest.sh -t bpf:test_doc_build.sh TAP version 13 1..1 # selftests: bpf: test_doc_build.sh perl: warning: Setting locale failed. perl: warning: Please check that your locale settings: LANGUAGE = (unset), LC_ALL = (unset), LC_ADDRESS = "en_US.UTF-8", LC_NAME = "en_US.UTF-8", LC_MONETARY = "en_US.UTF-8", LC_PAPER = "en_US.UTF-8", LC_IDENTIFICATION = "en_US.UTF-8", LC_TELEPHONE = "en_US.UTF-8", LC_MEASUREMENT = "en_US.UTF-8", LC_TIME = "en_US.UTF-8", LC_NUMERIC = "en_US.UTF-8", LANG = "en_US.UTF-8" are supported and installed on your system. perl: warning: Falling back to the standard locale ("C"). # skip: bpftool files not found!
# ok 1 selftests: bpf: test_doc_build.sh # SKIP Signed-off-by: Li Zhijian Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820025549.28325-1-lizhijian@cn.fujitsu.com --- tools/testing/selftests/bpf/test_bpftool_build.sh | 2 +- tools/testing/selftests/bpf/test_doc_build.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index ac349a5cea7e..b03a87571592 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -22,7 +22,7 @@ KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../) cd $KDIR_ROOT_DIR if [ ! -e tools/bpf/bpftool/Makefile ]; then echo -e "skip: bpftool files not found!\n" - exit 0 + exit 4 # KSFT_SKIP=4 fi ERROR=0 diff --git a/tools/testing/selftests/bpf/test_doc_build.sh b/tools/testing/selftests/bpf/test_doc_build.sh index d67ced95a6cf..679cf968c7d1 100755 --- a/tools/testing/selftests/bpf/test_doc_build.sh +++ b/tools/testing/selftests/bpf/test_doc_build.sh @@ -10,6 +10,11 @@ KDIR_ROOT_DIR=$(realpath $SCRIPT_REL_DIR/../../../../) SCRIPT_REL_DIR=$(dirname $(realpath --relative-to=$KDIR_ROOT_DIR $SCRIPT_REL_PATH)) cd $KDIR_ROOT_DIR +if [ ! -e $PWD/$SCRIPT_REL_DIR/Makefile ]; then + echo -e "skip: bpftool files not found!\n" + exit 4 # KSFT_SKIP=4 +fi + for tgt in docs docs-clean; do make -s -C $PWD/$SCRIPT_REL_DIR $tgt; done -- cgit v1.2.3 From fab60e29fcc6d60396da20d63d45fd0d305ba4e4 Mon Sep 17 00:00:00 2001 From: Xu Liu Date: Fri, 20 Aug 2021 15:17:11 +0800 Subject: bpf: Allow bpf_get_netns_cookie in BPF_PROG_TYPE_SK_MSG We'd like to be able to identify netns from sk_msg hooks to accelerate local process communication from different netns.
Signed-off-by: Xu Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820071712.52852-2-liuxu623@gmail.com --- net/core/filter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 59b8f5050180..cfbd01167eb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4688,6 +4688,18 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; +BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { + .func = bpf_get_netns_cookie_sk_msg, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -7551,6 +7563,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sk_msg_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; -- cgit v1.2.3 From 6cbca1ee0d74ea14d7b6cff16745b66b8f0fda5c Mon Sep 17 00:00:00 2001 From: Xu Liu Date: Fri, 20 Aug 2021 15:17:12 +0800 Subject: selftests/bpf: Test for get_netns_cookie Add test to use get_netns_cookie() from BPF_PROG_TYPE_SK_MSG. 
Signed-off-by: Xu Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820071712.52852-3-liuxu623@gmail.com --- .../selftests/bpf/prog_tests/netns_cookie.c | 57 ++++++++++++++-------- .../selftests/bpf/progs/netns_cookie_prog.c | 55 +++++++++++++++++++-- 2 files changed, 88 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c index 6f3cd472fb65..71d8f3ba7d6b 100644 --- a/tools/testing/selftests/bpf/prog_tests/netns_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/netns_cookie.c @@ -12,10 +12,12 @@ static int duration; void test_netns_cookie(void) { - int server_fd = 0, client_fd = 0, cgroup_fd = 0, err = 0, val = 0; + int server_fd = -1, client_fd = -1, cgroup_fd = -1; + int err, val, ret, map, verdict; struct netns_cookie_prog *skel; uint64_t cookie_expected_value; socklen_t vallen = sizeof(cookie_expected_value); + static const char send_msg[] = "message"; skel = netns_cookie_prog__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -23,39 +25,56 @@ void test_netns_cookie(void) cgroup_fd = test__join_cgroup("/netns_cookie"); if (CHECK(cgroup_fd < 0, "join_cgroup", "cgroup creation failed\n")) - goto out; + goto done; skel->links.get_netns_cookie_sockops = bpf_program__attach_cgroup( skel->progs.get_netns_cookie_sockops, cgroup_fd); if (!ASSERT_OK_PTR(skel->links.get_netns_cookie_sockops, "prog_attach")) - goto close_cgroup_fd; + goto done; + + verdict = bpf_program__fd(skel->progs.get_netns_cookie_sk_msg); + map = bpf_map__fd(skel->maps.sock_map); + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "prog_attach")) + goto done; server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno)) - goto close_cgroup_fd; + goto done; client_fd = connect_to_fd(server_fd, 0); if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno)) - goto 
close_server_fd; + goto done; + + ret = send(client_fd, send_msg, sizeof(send_msg), 0); + if (CHECK(ret != sizeof(send_msg), "send(msg)", "ret:%d\n", ret)) + goto done; - err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.netns_cookies), - &client_fd, &val); - if (!ASSERT_OK(err, "map_lookup(socket_cookies)")) - goto close_client_fd; + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sockops_netns_cookies), + &client_fd, &val); + if (!ASSERT_OK(err, "map_lookup(sockops_netns_cookies)")) + goto done; err = getsockopt(client_fd, SOL_SOCKET, SO_NETNS_COOKIE, - &cookie_expected_value, &vallen); - if (!ASSERT_OK(err, "getsockopt)")) - goto close_client_fd; + &cookie_expected_value, &vallen); + if (!ASSERT_OK(err, "getsockopt")) + goto done; + + ASSERT_EQ(val, cookie_expected_value, "cookie_value"); + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sk_msg_netns_cookies), + &client_fd, &val); + if (!ASSERT_OK(err, "map_lookup(sk_msg_netns_cookies)")) + goto done; ASSERT_EQ(val, cookie_expected_value, "cookie_value"); -close_client_fd: - close(client_fd); -close_server_fd: - close(server_fd); -close_cgroup_fd: - close(cgroup_fd); -out: +done: + if (server_fd != -1) + close(server_fd); + if (client_fd != -1) + close(client_fd); + if (cgroup_fd != -1) + close(cgroup_fd); netns_cookie_prog__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c index 4ed8d75aa299..aeff3a4f9287 100644 --- a/tools/testing/selftests/bpf/progs/netns_cookie_prog.c +++ b/tools/testing/selftests/bpf/progs/netns_cookie_prog.c @@ -11,29 +11,74 @@ struct { __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); __type(value, int); -} netns_cookies SEC(".maps"); +} sockops_netns_cookies SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_msg_netns_cookies SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); 
+ __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); SEC("sockops") int get_netns_cookie_sockops(struct bpf_sock_ops *ctx) { struct bpf_sock *sk = ctx->sk; int *cookie; + __u32 key = 0; if (ctx->family != AF_INET6) return 1; - if (ctx->op != BPF_SOCK_OPS_TCP_CONNECT_CB) + if (!sk) + return 1; + + switch (ctx->op) { + case BPF_SOCK_OPS_TCP_CONNECT_CB: + cookie = bpf_sk_storage_get(&sockops_netns_cookies, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!cookie) + return 1; + + *cookie = bpf_get_netns_cookie(ctx); + break; + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + bpf_sock_map_update(ctx, &sock_map, &key, BPF_NOEXIST); + break; + default: + break; + } + + return 1; +} + +SEC("sk_msg") +int get_netns_cookie_sk_msg(struct sk_msg_md *msg) +{ + struct bpf_sock *sk = msg->sk; + int *cookie; + + if (msg->family != AF_INET6) return 1; if (!sk) return 1; - cookie = bpf_sk_storage_get(&netns_cookies, sk, 0, - BPF_SK_STORAGE_GET_F_CREATE); + cookie = bpf_sk_storage_get(&sk_msg_netns_cookies, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); if (!cookie) return 1; - *cookie = bpf_get_netns_cookie(ctx); + *cookie = bpf_get_netns_cookie(msg); return 1; } + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From d7af7e497f0308bc97809cc48b58e8e0f13887e1 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 20 Aug 2021 09:39:35 -0700 Subject: bpf: Fix possible out of bound write in narrow load handling Fix a verifier bug found by smatch static checker in [0]. This problem has never been seen in prod to my best knowledge. Fixing it still seems to be a good idea since it's hard to say for sure whether it's possible or not to have a scenario where a combination of convert_ctx_access() and a narrow load would lead to an out of bound write. 
When narrow load is handled, one or two new instructions are added to
insn_buf array, but before it was only checked that

    cnt >= ARRAY_SIZE(insn_buf)

And it's safe to add a new instruction to insn_buf[cnt++] only once. The
second try will lead to out of bound write. And this is what can happen
if `shift` is set.

Fix it by making sure that if the BPF_RSH instruction has to be added in
addition to BPF_AND then there is enough space for two more instructions
in insn_buf.

The full report [0] is below:

kernel/bpf/verifier.c:12304 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array
kernel/bpf/verifier.c:12311 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array

kernel/bpf/verifier.c
    12282
    12283      insn->off = off & ~(size_default - 1);
    12284      insn->code = BPF_LDX | BPF_MEM | size_code;
    12285  }
    12286
    12287  target_size = 0;
    12288  cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
    12289                           &target_size);
    12290  if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Bounds check.
    12291      (ctx_field_size && !target_size)) {
    12292      verbose(env, "bpf verifier is misconfigured\n");
    12293      return -EINVAL;
    12294  }
    12295
    12296  if (is_narrower_load && size < target_size) {
    12297      u8 shift = bpf_ctx_narrow_access_offset(
    12298          off, size, size_default) * 8;
    12299      if (ctx_field_size <= 4) {
    12300          if (shift)
    12301              insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
                                ^^^^^ increment beyond end of array
    12302                                              insn->dst_reg,
    12303                                              shift);
--> 12304          insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
                            ^^^^^ out of bounds write
    12305                                          (1 << size * 8) - 1);
    12306      } else {
    12307          if (shift)
    12308              insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
    12309                                              insn->dst_reg,
    12310                                              shift);
    12311          insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
                   ^^^^^^^^^^^^^^^ Same.
12312 (1ULL << size * 8) - 1); 12313 } 12314 } 12315 12316 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 12317 if (!new_prog) 12318 return -ENOMEM; 12319 12320 delta += cnt - 1; 12321 12322 /* keep walking new program and skip insns we just inserted */ 12323 env->prog = new_prog; 12324 insn = new_prog->insnsi + i + delta; 12325 } 12326 12327 return 0; 12328 } [0] https://lore.kernel.org/bpf/20210817050843.GA21456@kili/ v1->v2: - clarify that problem was only seen by static checker but not in prod; Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0") Reported-by: Dan Carpenter Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820163935.1902398-1-rdna@fb.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f5a0077c9981..206c221453cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12295,6 +12295,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; + if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier narrow ctx load misconfigured\n"); + return -EINVAL; + } if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, -- cgit v1.2.3 From 50b796e645a5d217fd9d8648ec594241e6f1dd57 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:49 +0530 Subject: samples: bpf: Fix a couple of warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cookie_uid_helper_example.c: In function ‘main’: cookie_uid_helper_example.c:178:69: warning: ‘ -j ACCEPT’ directive writing 10 bytes into a region of size between 8 and 58 [-Wformat-overflow=] 178 | sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", | ^~~~~~~~~~ 
/home/kkd/src/linux/samples/bpf/cookie_uid_helper_example.c:178:9: note: ‘sprintf’ output between 53 and 103 bytes into a destination of size 100 178 | sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 179 | file); | ~~~~~ Fix by using snprintf and a sufficiently sized buffer. tracex4_user.c:35:15: warning: ‘write’ reading 12 bytes from a region of size 11 [-Wstringop-overread] 35 | key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use size as 11. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-2-memxor@gmail.com --- samples/bpf/cookie_uid_helper_example.c | 11 ++++++++--- samples/bpf/tracex4_user.c | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index cc3bce8d3aac..54958802c032 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -167,7 +167,7 @@ static void prog_load(void) static void prog_attach_iptables(char *file) { int ret; - char rules[100]; + char rules[256]; if (bpf_obj_pin(prog_fd, file)) error(1, errno, "bpf_obj_pin"); @@ -175,8 +175,13 @@ static void prog_attach_iptables(char *file) printf("file path too long: %s\n", file); exit(1); } - sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", - file); + ret = snprintf(rules, sizeof(rules), + "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", + file); + if (ret < 0 || ret >= sizeof(rules)) { + printf("error constructing iptables command\n"); + exit(1); + } ret = system(rules); if (ret < 0) { printf("iptables rule update failed: %d/n", WEXITSTATUS(ret)); diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index cea399424bca..566e6440e8c2 100644 --- a/samples/bpf/tracex4_user.c +++ 
b/samples/bpf/tracex4_user.c @@ -32,7 +32,7 @@ static void print_old_objects(int fd) __u64 key, next_key; struct pair v; - key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */ + key = write(1, "\e[1;1H\e[2J", 11); /* clear screen */ key = -1; while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { -- cgit v1.2.3 From f2e85d4a751663514c1e84ea65f334ce1ca13a28 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:50 +0530 Subject: tools: include: Add ethtool_drvinfo definition to UAPI header Instead of copying the whole header in, just add the struct definitions we need for now. In the future it can be synced as a copy of in-tree header if required. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-3-memxor@gmail.com --- tools/include/uapi/linux/ethtool.h | 53 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h index c86c3e942df9..47afae3895ec 100644 --- a/tools/include/uapi/linux/ethtool.h +++ b/tools/include/uapi/linux/ethtool.h @@ -48,4 +48,57 @@ struct ethtool_channels { __u32 combined_count; }; +#define ETHTOOL_FWVERS_LEN 32 +#define ETHTOOL_BUSINFO_LEN 32 +#define ETHTOOL_EROMVERS_LEN 32 + +/** + * struct ethtool_drvinfo - general driver and device information + * @cmd: Command number = %ETHTOOL_GDRVINFO + * @driver: Driver short name. This should normally match the name + * in its bus driver structure (e.g. pci_driver::name). Must + * not be an empty string. + * @version: Driver version string; may be an empty string + * @fw_version: Firmware version string; may be an empty string + * @erom_version: Expansion ROM version string; may be an empty string + * @bus_info: Device bus address. This should match the dev_name() + * string for the underlying bus device, if there is one. May be + * an empty string. 
+ * @reserved2: Reserved for future use; see the note on reserved space. + * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and + * %ETHTOOL_SPFLAGS commands; also the number of strings in the + * %ETH_SS_PRIV_FLAGS set + * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS + * command; also the number of strings in the %ETH_SS_STATS set + * @testinfo_len: Number of results returned by the %ETHTOOL_TEST + * command; also the number of strings in the %ETH_SS_TEST set + * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM + * and %ETHTOOL_SEEPROM commands, in bytes + * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS + * command, in bytes + * + * Users can use the %ETHTOOL_GSSET_INFO command to get the number of + * strings in any string set (from Linux 2.6.34). + * + * Drivers should set at most @driver, @version, @fw_version and + * @bus_info in their get_drvinfo() implementation. The ethtool + * core fills in the other fields using other driver operations. + */ +struct ethtool_drvinfo { + __u32 cmd; + char driver[32]; + char version[32]; + char fw_version[ETHTOOL_FWVERS_LEN]; + char bus_info[ETHTOOL_BUSINFO_LEN]; + char erom_version[ETHTOOL_EROMVERS_LEN]; + char reserved2[12]; + __u32 n_priv_flags; + __u32 n_stats; + __u32 testinfo_len; + __u32 eedump_len; + __u32 regdump_len; +}; + +#define ETHTOOL_GDRVINFO 0x00000003 + #endif /* _UAPI_LINUX_ETHTOOL_H */ -- cgit v1.2.3 From 156f886cf69715265f7b65cb4153bce8f8570326 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:51 +0530 Subject: samples: bpf: Add basic infrastructure for XDP samples This file implements some common helpers to consolidate differences in features and functionality between the various XDP samples and give them a consistent look, feel, and reporting capabilities. 
This commit only adds support for receive statistics, which does not rely on any tracepoint, but on the XDP program installed on the device by each XDP redirect sample. Some of the key features are: * A concise output format accompanied by helpful text explaining its fields. * An elaborate output format building upon the concise one, and folding out details in case of errors and staying out of view otherwise. * Printing driver names for devices redirecting packets. * Getting mac address for interface. * Printing summarized total statistics for the entire session. * Ability to dynamically switch between concise and verbose mode, using SIGQUIT (Ctrl + \). In later patches, the support will be extended for each tracepoint with its own custom output in concise and verbose mode. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-4-memxor@gmail.com --- samples/bpf/xdp_sample_shared.h | 17 + samples/bpf/xdp_sample_user.c | 838 ++++++++++++++++++++++++++++++++++++++++ samples/bpf/xdp_sample_user.h | 60 +++ 3 files changed, 915 insertions(+) create mode 100644 samples/bpf/xdp_sample_shared.h create mode 100644 samples/bpf/xdp_sample_user.c create mode 100644 samples/bpf/xdp_sample_user.h diff --git a/samples/bpf/xdp_sample_shared.h b/samples/bpf/xdp_sample_shared.h new file mode 100644 index 000000000000..8a7669a5d563 --- /dev/null +++ b/samples/bpf/xdp_sample_shared.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef _XDP_SAMPLE_SHARED_H +#define _XDP_SAMPLE_SHARED_H + +struct datarec { + size_t processed; + size_t dropped; + size_t issue; + union { + size_t xdp_pass; + size_t info; + }; + size_t xdp_drop; + size_t xdp_redirect; +} __attribute__((aligned(64))); + +#endif diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c new file mode 100644 index 000000000000..073aa3424e4b --- /dev/null +++ b/samples/bpf/xdp_sample_user.c @@ -0,0 +1,838 @@ +// 
SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_util.h" +#include "xdp_sample_user.h" + +#define __sample_print(fmt, cond, ...) \ + ({ \ + if (cond) \ + printf(fmt, ##__VA_ARGS__); \ + }) + +#define print_always(fmt, ...) __sample_print(fmt, 1, ##__VA_ARGS__) +#define print_default(fmt, ...) \ + __sample_print(fmt, sample_log_level & LL_DEFAULT, ##__VA_ARGS__) +#define __print_err(err, fmt, ...) \ + ({ \ + __sample_print(fmt, err > 0 || sample_log_level & LL_DEFAULT, \ + ##__VA_ARGS__); \ + sample_err_exp = sample_err_exp ? true : err > 0; \ + }) +#define print_err(err, fmt, ...) __print_err(err, fmt, ##__VA_ARGS__) + +#define __COLUMN(x) "%'10" x " %-13s" +#define FMT_COLUMNf __COLUMN(".0f") +#define FMT_COLUMNd __COLUMN("d") +#define FMT_COLUMNl __COLUMN("llu") +#define RX(rx) rx, "rx/s" +#define PPS(pps) pps, "pkt/s" +#define DROP(drop) drop, "drop/s" +#define ERR(err) err, "error/s" +#define HITS(hits) hits, "hit/s" +#define XMIT(xmit) xmit, "xmit/s" +#define PASS(pass) pass, "pass/s" +#define REDIR(redir) redir, "redir/s" +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ + +#define XDP_UNKNOWN (XDP_REDIRECT + 1) +#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) +#define XDP_REDIRECT_ERR_MAX 7 + +enum map_type { + MAP_RX, + NUM_MAP, +}; + +enum log_level { + LL_DEFAULT = 1U << 0, + LL_SIMPLE = 1U << 1, + LL_DEBUG = 1U << 2, +}; + +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; + +struct map_entry { + struct hlist_node node; + __u64 pair; + struct record val; +}; + +struct stats_record { + struct record rx_cnt; +}; + +struct sample_output { + struct { + __u64 rx; + } totals; + struct { + __u64 pps; + 
__u64 drop; + __u64 err; + } rx_cnt; +}; + +struct xdp_desc { + int ifindex; + __u32 prog_id; + int flags; +} sample_xdp_progs[32]; + +struct datarec *sample_mmap[NUM_MAP]; +struct bpf_map *sample_map[NUM_MAP]; +size_t sample_map_count[NUM_MAP]; +enum log_level sample_log_level; +struct sample_output sample_out; +unsigned long sample_interval; +bool sample_err_exp; +int sample_xdp_cnt; +int sample_n_cpus; +int sample_sig_fd; +int sample_mask; + +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + return UINT64_MAX; + } + return (__u64)t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +static void sample_print_help(int mask) +{ + printf("Output format description\n\n" + "By default, redirect success statistics are disabled, use -s to enable.\n" + "The terse output mode is default, verbose mode can be activated using -v\n" + "Use SIGQUIT (Ctrl + \\) to switch the mode dynamically at runtime\n\n" + "Terse mode displays at most the following fields:\n" + " rx/s Number of packets received per second\n" + " redir/s Number of packets successfully redirected per second\n" + " err,drop/s Aggregated count of errors per second (including dropped packets)\n" + " xmit/s Number of packets transmitted on the output device per second\n\n" + "Output description for verbose mode:\n" + " FIELD DESCRIPTION\n"); + + if (mask & SAMPLE_RX_CNT) { + printf(" receive\t\tDisplays the number of packets received & errors encountered\n" + " \t\t\tWhenever an error or packet drop occurs, details of per CPU error\n" + " \t\t\tand drop statistics will be expanded inline in terse mode.\n" + " \t\t\t\tpkt/s - Packets received per second\n" + " \t\t\t\tdrop/s - Packets dropped per second\n" + " \t\t\t\terror/s - Errors encountered per second\n\n"); + } +} + +void sample_usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error) +{ + int 
i; + + if (!error) + sample_print_help(mask); + + printf("\n%s\nOption for %s:\n", doc, argv[0]); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-15s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value: %d)", + *long_options[i].flag); + else + printf("\t short-option: -%c", long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = libbpf_num_possible_cpus(); + struct datarec *array; + + array = calloc(nr_cpus, sizeof(*array)); + if (!array) { + fprintf(stderr, "Failed to allocate memory (nr_cpus: %u)\n", + nr_cpus); + return NULL; + } + return array; +} + +static int map_entry_init(struct map_entry *e, __u64 pair) +{ + e->pair = pair; + INIT_HLIST_NODE(&e->node); + e->val.timestamp = gettime(); + e->val.cpu = alloc_record_per_cpu(); + if (!e->val.cpu) + return -ENOMEM; + return 0; +} + +static void map_collect_percpu(struct datarec *values, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = libbpf_num_possible_cpus(); + __u64 sum_xdp_redirect = 0; + __u64 sum_processed = 0; + __u64 sum_xdp_pass = 0; + __u64 sum_xdp_drop = 0; + __u64 sum_dropped = 0; + __u64 sum_issue = 0; + int i; + + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = READ_ONCE(values[i].processed); + rec->cpu[i].dropped = READ_ONCE(values[i].dropped); + rec->cpu[i].issue = READ_ONCE(values[i].issue); + rec->cpu[i].xdp_pass = READ_ONCE(values[i].xdp_pass); + rec->cpu[i].xdp_drop = READ_ONCE(values[i].xdp_drop); + rec->cpu[i].xdp_redirect = READ_ONCE(values[i].xdp_redirect); + + sum_processed += rec->cpu[i].processed; + sum_dropped += rec->cpu[i].dropped; + sum_issue += rec->cpu[i].issue; + sum_xdp_pass += rec->cpu[i].xdp_pass; + sum_xdp_drop += 
rec->cpu[i].xdp_drop; + sum_xdp_redirect += rec->cpu[i].xdp_redirect; + } + + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.issue = sum_issue; + rec->total.xdp_pass = sum_xdp_pass; + rec->total.xdp_drop = sum_xdp_drop; + rec->total.xdp_redirect = sum_xdp_redirect; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int i; + + rec = calloc(1, sizeof(*rec) + sample_n_cpus * sizeof(struct record)); + if (!rec) { + fprintf(stderr, "Failed to allocate memory\n"); + return NULL; + } + + if (sample_mask & SAMPLE_RX_CNT) { + rec->rx_cnt.cpu = alloc_record_per_cpu(); + if (!rec->rx_cnt.cpu) { + fprintf(stderr, + "Failed to allocate rx_cnt per-CPU array\n"); + goto end_rec; + } + } + + return rec; +end_rec: + free(rec); + return NULL; +} + +static void free_stats_record(struct stats_record *r) +{ + struct hlist_node *tmp; + struct map_entry *e; + int i; + + free(r->rx_cnt.cpu); + free(r); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double)period / NANOSEC_PER_SEC); + + return period_; +} + +static double sample_round(double val) +{ + if (val - floor(val) < 0.5) + return floor(val); + return ceil(val); +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->dropped - p->dropped; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - 
p->issue; + pps = sample_round(packets / period_); + } + return pps; +} + +static __u64 calc_info_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->info - p->info; + pps = sample_round(packets / period_); + } + return pps; +} + +static void calc_xdp_pps(struct datarec *r, struct datarec *p, double *xdp_pass, + double *xdp_drop, double *xdp_redirect, double period_) +{ + *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0; + if (period_ > 0) { + *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_; + *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_; + *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_; + } +} + +static void stats_get_rx_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, struct sample_output *out) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i; + + rec = &stats_rec->rx_cnt; + prev = &stats_prev->rx_cnt; + t = calc_period(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PPS(pps), DROP(drop), ERR(err)); + } + + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + out->rx_cnt.pps = pps; + out->rx_cnt.drop = drop; + out->rx_cnt.err = err; + out->totals.rx += pps; + out->totals.drop += drop; + out->totals.err += err; + } +} + + +static void stats_print(const char *prefix, int mask, struct stats_record *r, + struct stats_record *p, struct sample_output *out) +{ + int nr_cpus = libbpf_num_possible_cpus(); + const char *str; + + print_always("%-23s", 
prefix ?: "Summary"); + if (mask & SAMPLE_RX_CNT) + print_always(FMT_COLUMNl, RX(out->totals.rx)); + printf("\n"); + + if (mask & SAMPLE_RX_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->rx_cnt.pps ? + "receive total" : + "receive"; + print_err((out->rx_cnt.err || out->rx_cnt.drop), + " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl "\n", + str, PPS(out->rx_cnt.pps), DROP(out->rx_cnt.drop), + ERR(out->rx_cnt.err)); + + stats_get_rx_cnt(r, p, nr_cpus, NULL); + } + + if (sample_log_level & LL_DEFAULT || + ((sample_log_level & LL_SIMPLE) && sample_err_exp)) { + sample_err_exp = false; + printf("\n"); + } +} + +int sample_setup_maps(struct bpf_map **maps) +{ + sample_n_cpus = libbpf_num_possible_cpus(); + + for (int i = 0; i < NUM_MAP; i++) { + sample_map[i] = maps[i]; + + switch (i) { + case MAP_RX: + sample_map_count[i] = sample_n_cpus; + break; + default: + return -EINVAL; + } + if (bpf_map__resize(sample_map[i], sample_map_count[i]) < 0) + return -errno; + } + return 0; +} + +static int sample_setup_maps_mappings(void) +{ + for (int i = 0; i < NUM_MAP; i++) { + size_t size = sample_map_count[i] * sizeof(struct datarec); + + sample_mmap[i] = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, bpf_map__fd(sample_map[i]), 0); + if (sample_mmap[i] == MAP_FAILED) + return -errno; + } + return 0; +} + +int __sample_init(int mask) +{ + sigset_t st; + + sigemptyset(&st); + sigaddset(&st, SIGQUIT); + sigaddset(&st, SIGINT); + sigaddset(&st, SIGTERM); + + if (sigprocmask(SIG_BLOCK, &st, NULL) < 0) + return -errno; + + sample_sig_fd = signalfd(-1, &st, SFD_CLOEXEC | SFD_NONBLOCK); + if (sample_sig_fd < 0) + return -errno; + + sample_mask = mask; + + return sample_setup_maps_mappings(); +} + +static int __sample_remove_xdp(int ifindex, __u32 prog_id, int xdp_flags) +{ + __u32 cur_prog_id = 0; + int ret; + + if (prog_id) { + ret = bpf_get_link_xdp_id(ifindex, &cur_prog_id, xdp_flags); + if (ret < 0) + return -errno; + + if (prog_id != cur_prog_id) { + print_always( + 
"Program on ifindex %d does not match installed " + "program, skipping unload\n", + ifindex); + return -ENOENT; + } + } + + return bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); +} + +int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic, + bool force) +{ + int ret, xdp_flags = 0; + __u32 prog_id = 0; + + if (sample_xdp_cnt == 32) { + fprintf(stderr, + "Total limit for installed XDP programs in a sample reached\n"); + return -ENOTSUP; + } + + xdp_flags |= !force ? XDP_FLAGS_UPDATE_IF_NOEXIST : 0; + xdp_flags |= generic ? XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; + ret = bpf_set_link_xdp_fd(ifindex, bpf_program__fd(xdp_prog), + xdp_flags); + if (ret < 0) { + ret = -errno; + fprintf(stderr, + "Failed to install program \"%s\" on ifindex %d, mode = %s, " + "force = %s: %s\n", + bpf_program__name(xdp_prog), ifindex, + generic ? "skb" : "native", force ? "true" : "false", + strerror(-ret)); + return ret; + } + + ret = bpf_get_link_xdp_id(ifindex, &prog_id, xdp_flags); + if (ret < 0) { + ret = -errno; + fprintf(stderr, + "Failed to get XDP program id for ifindex %d, removing program: %s\n", + ifindex, strerror(errno)); + __sample_remove_xdp(ifindex, 0, xdp_flags); + return ret; + } + sample_xdp_progs[sample_xdp_cnt++] = + (struct xdp_desc){ ifindex, prog_id, xdp_flags }; + + return 0; +} + +static void sample_summary_print(void) +{ + double period = sample_out.rx_cnt.pps; + + if (sample_out.totals.rx) { + double pkts = sample_out.totals.rx; + + print_always(" Packets received : %'-10llu\n", + sample_out.totals.rx); + print_always(" Average packets/s : %'-10.0f\n", + sample_round(pkts / period)); + } +} + +void sample_exit(int status) +{ + size_t size; + + for (int i = 0; i < NUM_MAP; i++) { + size = sample_map_count[i] * sizeof(**sample_mmap); + munmap(sample_mmap[i], size); + } + while (sample_xdp_cnt--) { + int i = sample_xdp_cnt, ifindex, xdp_flags; + __u32 prog_id; + + prog_id = sample_xdp_progs[i].prog_id; + ifindex = 
sample_xdp_progs[i].ifindex; + xdp_flags = sample_xdp_progs[i].flags; + + __sample_remove_xdp(ifindex, prog_id, xdp_flags); + } + sample_summary_print(); + close(sample_sig_fd); + exit(status); +} + +static int sample_stats_collect(struct stats_record *rec) +{ + int i; + + if (sample_mask & SAMPLE_RX_CNT) + map_collect_percpu(sample_mmap[MAP_RX], &rec->rx_cnt); + + return 0; +} + +static void sample_summary_update(struct sample_output *out, int interval) +{ + sample_out.totals.rx += out->totals.rx; + sample_out.rx_cnt.pps += interval; +} + +static void sample_stats_print(int mask, struct stats_record *cur, + struct stats_record *prev, char *prog_name, + int interval) +{ + struct sample_output out = {}; + + if (mask & SAMPLE_RX_CNT) + stats_get_rx_cnt(cur, prev, 0, &out); + sample_summary_update(&out, interval); + + stats_print(prog_name, mask, cur, prev, &out); +} + +void sample_switch_mode(void) +{ + sample_log_level ^= LL_DEBUG - 1; +} + +static int sample_signal_cb(void) +{ + struct signalfd_siginfo si; + int r; + + r = read(sample_sig_fd, &si, sizeof(si)); + if (r < 0) + return -errno; + + switch (si.ssi_signo) { + case SIGQUIT: + sample_switch_mode(); + printf("\n"); + break; + default: + printf("\n"); + return 1; + } + + return 0; +} + +/* Pointer swap trick */ +static void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static int sample_timer_cb(int timerfd, struct stats_record **rec, + struct stats_record **prev, int interval) +{ + char line[64] = "Summary"; + int ret; + __u64 t; + + ret = read(timerfd, &t, sizeof(t)); + if (ret < 0) + return -errno; + + swap(prev, rec); + ret = sample_stats_collect(*rec); + if (ret < 0) + return ret; + + if (sample_xdp_cnt == 2) { + char fi[IFNAMSIZ]; + char to[IFNAMSIZ]; + const char *f, *t; + + f = t = NULL; + if (if_indextoname(sample_xdp_progs[0].ifindex, fi)) + f = fi; + if (if_indextoname(sample_xdp_progs[1].ifindex, to)) + t = to; + + 
snprintf(line, sizeof(line), "%s->%s", f ?: "?", t ?: "?"); + } + + sample_stats_print(sample_mask, *rec, *prev, line, interval); + return 0; +} + +int sample_run(int interval, void (*post_cb)(void *), void *ctx) +{ + struct timespec ts = { interval, 0 }; + struct itimerspec its = { ts, ts }; + struct stats_record *rec, *prev; + struct pollfd pfd[2] = {}; + int timerfd, ret; + + if (!interval) { + fprintf(stderr, "Incorrect interval 0\n"); + return -EINVAL; + } + sample_interval = interval; + /* Pretty print numbers */ + setlocale(LC_NUMERIC, "en_US.UTF-8"); + + timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC | TFD_NONBLOCK); + if (timerfd < 0) + return -errno; + timerfd_settime(timerfd, 0, &its, NULL); + + pfd[0].fd = sample_sig_fd; + pfd[0].events = POLLIN; + + pfd[1].fd = timerfd; + pfd[1].events = POLLIN; + + ret = -ENOMEM; + rec = alloc_stats_record(); + if (!rec) + goto end; + prev = alloc_stats_record(); + if (!prev) + goto end_rec; + + ret = sample_stats_collect(rec); + if (ret < 0) + goto end_rec_prev; + + for (;;) { + ret = poll(pfd, 2, -1); + if (ret < 0) { + if (errno == EINTR) + continue; + else + break; + } + + if (pfd[0].revents & POLLIN) + ret = sample_signal_cb(); + else if (pfd[1].revents & POLLIN) + ret = sample_timer_cb(timerfd, &rec, &prev, interval); + + if (ret) + break; + + if (post_cb) + post_cb(ctx); + } + +end_rec_prev: + free_stats_record(prev); +end_rec: + free_stats_record(rec); +end: + close(timerfd); + + return ret; +} + +const char *get_driver_name(int ifindex) +{ + struct ethtool_drvinfo drv = {}; + char ifname[IF_NAMESIZE]; + static char drvname[32]; + struct ifreq ifr = {}; + int fd, r = 0; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return "[error]"; + + if (!if_indextoname(ifindex, ifname)) + goto end; + + drv.cmd = ETHTOOL_GDRVINFO; + safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + ifr.ifr_data = (void *)&drv; + + r = ioctl(fd, SIOCETHTOOL, &ifr); + if (r) + goto end; + + safe_strncpy(drvname, 
drv.driver, sizeof(drvname)); + + close(fd); + return drvname; + +end: + r = errno; + close(fd); + return r == EOPNOTSUPP ? "loopback" : "[error]"; +} + +int get_mac_addr(int ifindex, void *mac_addr) +{ + char ifname[IF_NAMESIZE]; + struct ifreq ifr = {}; + int fd, r; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return -errno; + + if (!if_indextoname(ifindex, ifname)) { + r = -errno; + goto end; + } + + safe_strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + + r = ioctl(fd, SIOCGIFHWADDR, &ifr); + if (r) { + r = -errno; + goto end; + } + + memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char)); + +end: + close(fd); + return r; +} + +__attribute__((constructor)) static void sample_ctor(void) +{ + if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) { + fprintf(stderr, "Failed to set libbpf strict mode: %s\n", + strerror(errno)); + /* Just exit, nothing to cleanup right now */ + exit(EXIT_FAIL_BPF); + } +} diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h new file mode 100644 index 000000000000..d630998df547 --- /dev/null +++ b/samples/bpf/xdp_sample_user.h @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef XDP_SAMPLE_USER_H +#define XDP_SAMPLE_USER_H + +#include +#include + +#include "xdp_sample_shared.h" + +enum stats_mask { + _SAMPLE_REDIRECT_MAP = 1U << 0, + SAMPLE_RX_CNT = 1U << 1, +}; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +int sample_setup_maps(struct bpf_map **maps); +int __sample_init(int mask); +void sample_exit(int status); +int sample_run(int interval, void (*post_cb)(void *), void *ctx); + +void sample_switch_mode(void); +int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic, + bool force); +void sample_usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error); + +const char *get_driver_name(int 
ifindex); +int get_mac_addr(int ifindex, void *mac_addr); + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-truncation" +__attribute__((unused)) +static inline char *safe_strncpy(char *dst, const char *src, size_t size) +{ + if (!size) + return dst; + strncpy(dst, src, size - 1); + dst[size - 1] = '\0'; + return dst; +} +#pragma GCC diagnostic pop + +#define DEFINE_SAMPLE_INIT(name) \ + static int sample_init(struct name *skel, int mask) \ + { \ + int ret; \ + ret = __sample_init(mask); \ + if (ret < 0) \ + return ret; \ + return 0; \ + } + +#endif -- cgit v1.2.3 From 323140389405e5d5d2020b2e3e04863d12cf3e32 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:52 +0530 Subject: samples: bpf: Add BPF support for redirect tracepoint This adds the shared BPF file that will be used going forward for sharing tracepoint programs among XDP redirect samples. Since vmlinux.h conflicts with tools/include for READ_ONCE/WRITE_ONCE and ARRAY_SIZE, they are copied in to xdp_sample.bpf.h along with other helpers that will be required. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-5-memxor@gmail.com --- samples/bpf/xdp_sample.bpf.c | 112 ++++++++++++++++++++++++++++++++++ samples/bpf/xdp_sample.bpf.h | 141 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+) create mode 100644 samples/bpf/xdp_sample.bpf.c create mode 100644 samples/bpf/xdp_sample.bpf.h diff --git a/samples/bpf/xdp_sample.bpf.c b/samples/bpf/xdp_sample.bpf.c new file mode 100644 index 000000000000..e22f2a97a988 --- /dev/null +++ b/samples/bpf/xdp_sample.bpf.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. 
*/ +#include "xdp_sample.bpf.h" + +#include +#include +#include + +array_map rx_cnt SEC(".maps"); +array_map redir_err_cnt SEC(".maps"); + +const volatile int nr_cpus = 0; + +/* These can be set before loading so that redundant comparisons can be DCE'd by + * the verifier, and only actual matches are tried after loading tp_btf program. + * This allows sample to filter tracepoint stats based on net_device. + */ +const volatile int from_match[32] = {}; +const volatile int to_match[32] = {}; + +/* Find if b is part of set a, but if a is empty set then evaluate to true */ +#define IN_SET(a, b) \ + ({ \ + bool __res = !(a)[0]; \ + for (int i = 0; i < ARRAY_SIZE(a) && (a)[i]; i++) { \ + __res = (a)[i] == (b); \ + if (__res) \ + break; \ + } \ + __res; \ + }) + +static __always_inline __u32 xdp_get_err_key(int err) +{ + switch (err) { + case 0: + return 0; + case -EINVAL: + return 2; + case -ENETDOWN: + return 3; + case -EMSGSIZE: + return 4; + case -EOPNOTSUPP: + return 5; + case -ENOSPC: + return 6; + default: + return 1; + } +} + +static __always_inline int xdp_redirect_collect_stat(int from, int err) +{ + u32 cpu = bpf_get_smp_processor_id(); + u32 key = XDP_REDIRECT_ERROR; + struct datarec *rec; + u32 idx; + + if (!IN_SET(from_match, from)) + return 0; + + key = xdp_get_err_key(err); + + idx = key * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&redir_err_cnt, &idx); + if (!rec) + return 0; + if (key) + NO_TEAR_INC(rec->dropped); + else + NO_TEAR_INC(rec->processed); + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in practice as stopping perf-record also unloads this + * bpf_prog. Plus, there is additional overhead of doing so.
+ */ +} + +SEC("tp_btf/xdp_redirect_err") +int BPF_PROG(tp_xdp_redirect_err, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect_map_err") +int BPF_PROG(tp_xdp_redirect_map_err, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect") +int BPF_PROG(tp_xdp_redirect, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} + +SEC("tp_btf/xdp_redirect_map") +int BPF_PROG(tp_xdp_redirect_map, const struct net_device *dev, + const struct bpf_prog *xdp, const void *tgt, int err, + const struct bpf_map *map, u32 index) +{ + return xdp_redirect_collect_stat(dev->ifindex, err); +} diff --git a/samples/bpf/xdp_sample.bpf.h b/samples/bpf/xdp_sample.bpf.h new file mode 100644 index 000000000000..25b1dbe9b37b --- /dev/null +++ b/samples/bpf/xdp_sample.bpf.h @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _XDP_SAMPLE_BPF_H +#define _XDP_SAMPLE_BPF_H + +#include "vmlinux.h" +#include +#include +#include + +#include "xdp_sample_shared.h" + +#define ETH_ALEN 6 +#define ETH_P_802_3_MIN 0x0600 +#define ETH_P_8021Q 0x8100 +#define ETH_P_8021AD 0x88A8 +#define ETH_P_IP 0x0800 +#define ETH_P_IPV6 0x86DD +#define ETH_P_ARP 0x0806 +#define IPPROTO_ICMPV6 58 + +#define EINVAL 22 +#define ENETDOWN 100 +#define EMSGSIZE 90 +#define EOPNOTSUPP 95 +#define ENOSPC 28 + +typedef struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_MMAPABLE); + __type(key, unsigned int); + __type(value, struct datarec); +} array_map; + +extern array_map rx_cnt; +extern const volatile int nr_cpus; + +enum { + XDP_REDIRECT_SUCCESS = 0, + 
XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline void swap_src_dst_mac(void *data) +{ + unsigned short *p = data; + unsigned short dst[3]; + + dst[0] = p[0]; + dst[1] = p[1]; + dst[2] = p[2]; + p[0] = p[3]; + p[1] = p[4]; + p[2] = p[5]; + p[3] = dst[0]; + p[4] = dst[1]; + p[5] = dst[2]; +} + +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define bpf_ntohs(x) __builtin_bswap16(x) +#define bpf_htons(x) __builtin_bswap16(x) +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \ + __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define bpf_ntohs(x) (x) +#define bpf_htons(x) (x) +#else +# error "Endianness detection needs to be set up for your compiler?!" +#endif + +/* + * Note: including linux/compiler.h or linux/kernel.h for the macros below + * conflicts with vmlinux.h include in BPF files, so we define them here. + * + * Following functions are taken from kernel sources and + * break aliasing rules in their original form. + * + * While kernel is compiled with -fno-strict-aliasing, + * perf uses -Wstrict-aliasing=3 which makes build fail + * under gcc 4.4. + * + * Using extra __may_alias__ type to allow aliasing + * in this case. 
+ */ +typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; +typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; +typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; +typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; + +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; + case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; + case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; + case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; + default: + asm volatile ("" : : : "memory"); + __builtin_memcpy((void *)res, (const void *)p, size); + asm volatile ("" : : : "memory"); + } +} + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; + case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; + case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; + case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; + default: + asm volatile ("" : : : "memory"); + __builtin_memcpy((void *)p, (const void *)res, size); + asm volatile ("" : : : "memory"); + } +} + +#define READ_ONCE(x) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +/* Add a value using relaxed read and relaxed write. Less expensive than + * fetch_add when there is no write concurrency. 
+ */ +#define NO_TEAR_ADD(x, val) WRITE_ONCE((x), READ_ONCE(x) + (val)) +#define NO_TEAR_INC(x) NO_TEAR_ADD((x), 1) + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#endif -- cgit v1.2.3 From 1d930fd2cdbf5e156c32c73ea7f3d5b12bdc41d7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:53 +0530 Subject: samples: bpf: Add redirect tracepoint statistics support This implements per-errno reporting (for the ones we explicitly recognize), adds some help output, and implements the stats retrieval and printing functions. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-6-memxor@gmail.com --- samples/bpf/xdp_sample_user.c | 194 ++++++++++++++++++++++++++++++++++++++++++ samples/bpf/xdp_sample_user.h | 21 +++++ 2 files changed, 215 insertions(+) diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c index 073aa3424e4b..c34592566825 100644 --- a/samples/bpf/xdp_sample_user.c +++ b/samples/bpf/xdp_sample_user.c @@ -73,6 +73,7 @@ enum map_type { MAP_RX, + MAP_REDIRECT_ERR, NUM_MAP, }; @@ -96,17 +97,24 @@ struct map_entry { struct stats_record { struct record rx_cnt; + struct record redir_err[XDP_REDIRECT_ERR_MAX]; }; struct sample_output { struct { __u64 rx; + __u64 redir; + __u64 err; } totals; struct { __u64 pps; __u64 drop; __u64 err; } rx_cnt; + struct { + __u64 suc; + __u64 err; + } redir_cnt; }; struct xdp_desc { @@ -127,6 +135,27 @@ int sample_n_cpus; int sample_sig_fd; int sample_mask; +static const char *xdp_redirect_err_names[XDP_REDIRECT_ERR_MAX] = { + /* Key=1 keeps unknown errors */ + "Success", + "Unknown", + "EINVAL", + "ENETDOWN", + "EMSGSIZE", + "EOPNOTSUPP", + "ENOSPC", +}; + +/* Keyed from Unknown */ +static const char *xdp_redirect_err_help[XDP_REDIRECT_ERR_MAX - 1] = { + "Unknown error", + "Invalid redirection", + "Device being redirected to is down", + "Packet length too large for device", + "Operation not supported", + 
"No space in ptr_ring of cpumap kthread", +}; + static __u64 gettime(void) { struct timespec t; @@ -162,6 +191,21 @@ static void sample_print_help(int mask) " \t\t\t\tdrop/s - Packets dropped per second\n" " \t\t\t\terror/s - Errors encountered per second\n\n"); } + if (mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) { + printf(" redirect\t\tDisplays the number of packets successfully redirected\n" + " \t\t\tErrors encountered are expanded under redirect_err field\n" + " \t\t\tNote that passing -s to enable it has a per packet overhead\n" + " \t\t\t\tredir/s - Packets redirected successfully per second\n\n" + " redirect_err\t\tDisplays the number of packets that failed redirection\n" + " \t\t\tThe errno is expanded under this field with per CPU count\n" + " \t\t\tThe recognized errors are:\n"); + + for (int i = 2; i < XDP_REDIRECT_ERR_MAX; i++) + printf("\t\t\t %s: %s\n", xdp_redirect_err_names[i], + xdp_redirect_err_help[i - 1]); + + printf(" \n\t\t\t\terror/s - Packets that failed redirection per second\n\n"); + } } void sample_usage(char *argv[], const struct option *long_options, @@ -269,8 +313,25 @@ static struct stats_record *alloc_stats_record(void) goto end_rec; } } + if (sample_mask & (SAMPLE_REDIRECT_CNT | SAMPLE_REDIRECT_ERR_CNT)) { + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) { + rec->redir_err[i].cpu = alloc_record_per_cpu(); + if (!rec->redir_err[i].cpu) { + fprintf(stderr, + "Failed to allocate redir_err per-CPU array for " + "\"%s\" case\n", + xdp_redirect_err_names[i]); + while (i--) + free(rec->redir_err[i].cpu); + goto end_rx_cnt; + } + } + } return rec; + +end_rx_cnt: + free(rec->rx_cnt.cpu); end_rec: free(rec); return NULL; @@ -282,6 +343,8 @@ static void free_stats_record(struct stats_record *r) struct map_entry *e; int i; + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) + free(r->redir_err[i].cpu); free(r->rx_cnt.cpu); free(r); } @@ -407,6 +470,87 @@ static void stats_get_rx_cnt(struct stats_record *stats_rec, } } +static void 
stats_get_redirect_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + struct record *rec, *prev; + double t, pps; + int i; + + rec = &stats_rec->redir_err[0]; + prev = &stats_prev->redir_err[0]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + if (!pps) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf "\n", str, REDIR(pps)); + } + + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + out->redir_cnt.suc = pps; + out->totals.redir += pps; + } +} + +static void stats_get_redirect_err_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + struct record *rec, *prev; + double t, drop, sum = 0; + int rec_i, i; + + for (rec_i = 1; rec_i < XDP_REDIRECT_ERR_MAX; rec_i++) { + char str[64]; + + rec = &stats_rec->redir_err[rec_i]; + prev = &stats_prev->redir_err[rec_i]; + t = calc_period(rec, prev); + + drop = calc_drop_pps(&rec->total, &prev->total, t); + if (drop > 0 && !out) { + snprintf(str, sizeof(str), + sample_log_level & LL_DEFAULT ? 
"%s total" : + "%s", + xdp_redirect_err_names[rec_i]); + print_err(drop, " %-18s " FMT_COLUMNf "\n", str, + ERR(drop)); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + double drop; + + drop = calc_drop_pps(r, p, t); + if (!drop) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s" FMT_COLUMNf "\n", str, + ERR(drop)); + } + + sum += drop; + } + + if (out) { + out->redir_cnt.err = sum; + out->totals.err += sum; + } +} + static void stats_print(const char *prefix, int mask, struct stats_record *r, struct stats_record *p, struct sample_output *out) @@ -417,6 +561,8 @@ static void stats_print(const char *prefix, int mask, struct stats_record *r, print_always("%-23s", prefix ?: "Summary"); if (mask & SAMPLE_RX_CNT) print_always(FMT_COLUMNl, RX(out->totals.rx)); + if (mask & SAMPLE_REDIRECT_CNT) + print_always(FMT_COLUMNl, REDIR(out->totals.redir)); printf("\n"); if (mask & SAMPLE_RX_CNT) { @@ -431,6 +577,24 @@ static void stats_print(const char *prefix, int mask, struct stats_record *r, stats_get_rx_cnt(r, p, nr_cpus, NULL); } + if (mask & SAMPLE_REDIRECT_CNT) { + str = out->redir_cnt.suc ? "redirect total" : "redirect"; + print_default(" %-20s " FMT_COLUMNl "\n", str, + REDIR(out->redir_cnt.suc)); + + stats_get_redirect_cnt(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_REDIRECT_ERR_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->redir_cnt.err ? 
+ "redirect_err total" : + "redirect_err"; + print_err(out->redir_cnt.err, " %-20s " FMT_COLUMNl "\n", str, + ERR(out->redir_cnt.err)); + + stats_get_redirect_err_cnt(r, p, nr_cpus, NULL); + } + if (sample_log_level & LL_DEFAULT || ((sample_log_level & LL_SIMPLE) && sample_err_exp)) { sample_err_exp = false; @@ -449,6 +613,10 @@ int sample_setup_maps(struct bpf_map **maps) case MAP_RX: sample_map_count[i] = sample_n_cpus; break; + case MAP_REDIRECT_ERR: + sample_map_count[i] = + XDP_REDIRECT_ERR_MAX * sample_n_cpus; + break; default: return -EINVAL; } @@ -568,6 +736,17 @@ static void sample_summary_print(void) print_always(" Average packets/s : %'-10.0f\n", sample_round(pkts / period)); } + if (sample_out.totals.redir) { + double pkts = sample_out.totals.redir; + + print_always(" Packets redirected : %'-10llu\n", + sample_out.totals.redir); + print_always(" Average redir/s : %'-10.0f\n", + sample_round(pkts / period)); + } + if (sample_out.totals.err) + print_always(" Errors recorded : %'-10llu\n", + sample_out.totals.err); } void sample_exit(int status) @@ -600,12 +779,23 @@ static int sample_stats_collect(struct stats_record *rec) if (sample_mask & SAMPLE_RX_CNT) map_collect_percpu(sample_mmap[MAP_RX], &rec->rx_cnt); + if (sample_mask & SAMPLE_REDIRECT_CNT) + map_collect_percpu(sample_mmap[MAP_REDIRECT_ERR], &rec->redir_err[0]); + + if (sample_mask & SAMPLE_REDIRECT_ERR_CNT) { + for (i = 1; i < XDP_REDIRECT_ERR_MAX; i++) + map_collect_percpu(&sample_mmap[MAP_REDIRECT_ERR][i * sample_n_cpus], + &rec->redir_err[i]); + } + return 0; } static void sample_summary_update(struct sample_output *out, int interval) { sample_out.totals.rx += out->totals.rx; + sample_out.totals.redir += out->totals.redir; + sample_out.totals.err += out->totals.err; sample_out.rx_cnt.pps += interval; } @@ -617,6 +807,10 @@ static void sample_stats_print(int mask, struct stats_record *cur, if (mask & SAMPLE_RX_CNT) stats_get_rx_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_REDIRECT_CNT) + 
stats_get_redirect_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_REDIRECT_ERR_CNT) + stats_get_redirect_err_cnt(cur, prev, 0, &out); sample_summary_update(&out, interval); stats_print(prog_name, mask, cur, prev, &out); diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h index d630998df547..1935a0e2f85b 100644 --- a/samples/bpf/xdp_sample_user.h +++ b/samples/bpf/xdp_sample_user.h @@ -10,6 +10,10 @@ enum stats_mask { _SAMPLE_REDIRECT_MAP = 1U << 0, SAMPLE_RX_CNT = 1U << 1, + SAMPLE_REDIRECT_ERR_CNT = 1U << 2, + SAMPLE_REDIRECT_CNT = 1U << 7, + SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP, + SAMPLE_REDIRECT_ERR_MAP_CNT = SAMPLE_REDIRECT_ERR_CNT | _SAMPLE_REDIRECT_MAP, }; /* Exit return codes */ @@ -47,6 +51,15 @@ static inline char *safe_strncpy(char *dst, const char *src, size_t size) } #pragma GCC diagnostic pop +#define __attach_tp(name) \ + ({ \ + if (!bpf_program__is_tracing(skel->progs.name)) \ + return -EINVAL; \ + skel->links.name = bpf_program__attach(skel->progs.name); \ + if (!skel->links.name) \ + return -errno; \ + }) + #define DEFINE_SAMPLE_INIT(name) \ static int sample_init(struct name *skel, int mask) \ { \ @@ -54,6 +67,14 @@ static inline char *safe_strncpy(char *dst, const char *src, size_t size) ret = __sample_init(mask); \ if (ret < 0) \ return ret; \ + if (mask & SAMPLE_REDIRECT_MAP_CNT) \ + __attach_tp(tp_xdp_redirect_map); \ + if (mask & SAMPLE_REDIRECT_CNT) \ + __attach_tp(tp_xdp_redirect); \ + if (mask & SAMPLE_REDIRECT_ERR_MAP_CNT) \ + __attach_tp(tp_xdp_redirect_map_err); \ + if (mask & SAMPLE_REDIRECT_ERR_CNT) \ + __attach_tp(tp_xdp_redirect_err); \ return 0; \ } -- cgit v1.2.3 From 451588764e2f3e3ab197b23c7958f750707e2a24 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:54 +0530 Subject: samples: bpf: Add BPF support for xdp_exception tracepoint This would allow us to store stats for each XDP action, including their per-CPU counts. 
Consolidating this here allows all redirect samples to detect xdp_exception events. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-7-memxor@gmail.com --- samples/bpf/xdp_sample.bpf.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/samples/bpf/xdp_sample.bpf.c b/samples/bpf/xdp_sample.bpf.c index e22f2a97a988..53ab5a972405 100644 --- a/samples/bpf/xdp_sample.bpf.c +++ b/samples/bpf/xdp_sample.bpf.c @@ -8,6 +8,7 @@ array_map rx_cnt SEC(".maps"); array_map redir_err_cnt SEC(".maps"); +array_map exception_cnt SEC(".maps"); const volatile int nr_cpus = 0; @@ -110,3 +111,29 @@ int BPF_PROG(tp_xdp_redirect_map, const struct net_device *dev, { return xdp_redirect_collect_stat(dev->ifindex, err); } + +SEC("tp_btf/xdp_exception") +int BPF_PROG(tp_xdp_exception, const struct net_device *dev, + const struct bpf_prog *xdp, u32 act) +{ + u32 cpu = bpf_get_smp_processor_id(); + struct datarec *rec; + u32 key = act, idx; + + if (!IN_SET(from_match, dev->ifindex)) + return 0; + if (!IN_SET(to_match, dev->ifindex)) + return 0; + + if (key > XDP_REDIRECT) + key = XDP_REDIRECT + 1; + + idx = key * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&exception_cnt, &idx); + if (!rec) + return 0; + NO_TEAR_INC(rec->dropped); + + return 0; +} + -- cgit v1.2.3 From 82c450803a917da6edb34a0a769d0b5a0b10990c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:55 +0530 Subject: samples: bpf: Add xdp_exception tracepoint statistics support This implements the retrieval and printing, as well as the help output. 
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-8-memxor@gmail.com --- samples/bpf/xdp_sample_user.c | 113 ++++++++++++++++++++++++++++++++++++++++++ samples/bpf/xdp_sample_user.h | 3 ++ 2 files changed, 116 insertions(+) diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c index c34592566825..52a30fd1f2a3 100644 --- a/samples/bpf/xdp_sample_user.c +++ b/samples/bpf/xdp_sample_user.c @@ -74,6 +74,7 @@ enum map_type { MAP_RX, MAP_REDIRECT_ERR, + MAP_EXCEPTION, NUM_MAP, }; @@ -98,6 +99,7 @@ struct map_entry { struct stats_record { struct record rx_cnt; struct record redir_err[XDP_REDIRECT_ERR_MAX]; + struct record exception[XDP_ACTION_MAX]; }; struct sample_output { @@ -115,6 +117,9 @@ struct sample_output { __u64 suc; __u64 err; } redir_cnt; + struct { + __u64 hits; + } except_cnt; }; struct xdp_desc { @@ -156,6 +161,15 @@ static const char *xdp_redirect_err_help[XDP_REDIRECT_ERR_MAX - 1] = { "No space in ptr_ring of cpumap kthread", }; +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", + [XDP_UNKNOWN] = "XDP_UNKNOWN", +}; + static __u64 gettime(void) { struct timespec t; @@ -169,6 +183,13 @@ static __u64 gettime(void) return (__u64)t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; } +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} + static void sample_print_help(int mask) { printf("Output format description\n\n" @@ -206,6 +227,15 @@ static void sample_print_help(int mask) printf(" \n\t\t\t\terror/s - Packets that failed redirection per second\n\n"); } + + if (mask & SAMPLE_EXCEPTION_CNT) { + printf(" xdp_exception\t\tDisplays xdp_exception tracepoint events\n" + " \t\t\tThis can occur due to internal driver errors, unrecognized\n" + " \t\t\tXDP 
actions and due to explicit user trigger by use of XDP_ABORTED\n" + " \t\t\tEach action is expanded below this field with its count\n" + " \t\t\t\thit/s - Number of times the tracepoint was hit per second\n\n"); + } + } void sample_usage(char *argv[], const struct option *long_options, @@ -327,9 +357,26 @@ static struct stats_record *alloc_stats_record(void) } } } + if (sample_mask & SAMPLE_EXCEPTION_CNT) { + for (i = 0; i < XDP_ACTION_MAX; i++) { + rec->exception[i].cpu = alloc_record_per_cpu(); + if (!rec->exception[i].cpu) { + fprintf(stderr, + "Failed to allocate exception per-CPU array for " + "\"%s\" case\n", + action2str(i)); + while (i--) + free(rec->exception[i].cpu); + goto end_redir; + } + } + } return rec; +end_redir: + for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) + free(rec->redir_err[i].cpu); end_rx_cnt: free(rec->rx_cnt.cpu); end_rec: @@ -343,6 +390,8 @@ static void free_stats_record(struct stats_record *r) struct map_entry *e; int i; + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->exception[i].cpu); for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) free(r->redir_err[i].cpu); free(r->rx_cnt.cpu); @@ -551,6 +600,50 @@ static void stats_get_redirect_err_cnt(struct stats_record *stats_rec, } } +static void stats_get_exception_cnt(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + double t, drop, sum = 0; + struct record *rec, *prev; + int rec_i, i; + + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + rec = &stats_rec->exception[rec_i]; + prev = &stats_prev->exception[rec_i]; + t = calc_period(rec, prev); + + drop = calc_drop_pps(&rec->total, &prev->total, t); + /* Fold out errors after heading */ + sum += drop; + + if (drop > 0 && !out) { + print_always(" %-18s " FMT_COLUMNf "\n", + action2str(rec_i), ERR(drop)); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + double drop; + + drop = calc_drop_pps(r, p, t); 
+ if (!drop) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s" FMT_COLUMNf "\n", + str, ERR(drop)); + } + } + } + + if (out) { + out->except_cnt.hits = sum; + out->totals.err += sum; + } +} static void stats_print(const char *prefix, int mask, struct stats_record *r, struct stats_record *p, struct sample_output *out) @@ -595,6 +688,16 @@ static void stats_print(const char *prefix, int mask, struct stats_record *r, stats_get_redirect_err_cnt(r, p, nr_cpus, NULL); } + if (mask & SAMPLE_EXCEPTION_CNT) { + str = out->except_cnt.hits ? "xdp_exception total" : + "xdp_exception"; + + print_err(out->except_cnt.hits, " %-20s " FMT_COLUMNl "\n", str, + HITS(out->except_cnt.hits)); + + stats_get_exception_cnt(r, p, nr_cpus, NULL); + } + if (sample_log_level & LL_DEFAULT || ((sample_log_level & LL_SIMPLE) && sample_err_exp)) { sample_err_exp = false; @@ -617,6 +720,9 @@ int sample_setup_maps(struct bpf_map **maps) sample_map_count[i] = XDP_REDIRECT_ERR_MAX * sample_n_cpus; break; + case MAP_EXCEPTION: + sample_map_count[i] = XDP_ACTION_MAX * sample_n_cpus; + break; default: return -EINVAL; } @@ -788,6 +894,11 @@ static int sample_stats_collect(struct stats_record *rec) &rec->redir_err[i]); } + if (sample_mask & SAMPLE_EXCEPTION_CNT) + for (i = 0; i < XDP_ACTION_MAX; i++) + map_collect_percpu(&sample_mmap[MAP_EXCEPTION][i * sample_n_cpus], + &rec->exception[i]); + return 0; } @@ -811,6 +922,8 @@ static void sample_stats_print(int mask, struct stats_record *cur, stats_get_redirect_cnt(cur, prev, 0, &out); if (mask & SAMPLE_REDIRECT_ERR_CNT) stats_get_redirect_err_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_EXCEPTION_CNT) + stats_get_exception_cnt(cur, prev, 0, &out); sample_summary_update(&out, interval); stats_print(prog_name, mask, cur, prev, &out); diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h index 1935a0e2f85b..aa28e4bdd628 100644 --- a/samples/bpf/xdp_sample_user.h +++ b/samples/bpf/xdp_sample_user.h @@ -11,6 
+11,7 @@ enum stats_mask { _SAMPLE_REDIRECT_MAP = 1U << 0, SAMPLE_RX_CNT = 1U << 1, SAMPLE_REDIRECT_ERR_CNT = 1U << 2, + SAMPLE_EXCEPTION_CNT = 1U << 5, SAMPLE_REDIRECT_CNT = 1U << 7, SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP, SAMPLE_REDIRECT_ERR_MAP_CNT = SAMPLE_REDIRECT_ERR_CNT | _SAMPLE_REDIRECT_MAP, @@ -75,6 +76,8 @@ static inline char *safe_strncpy(char *dst, const char *src, size_t size) __attach_tp(tp_xdp_redirect_map_err); \ if (mask & SAMPLE_REDIRECT_ERR_CNT) \ __attach_tp(tp_xdp_redirect_err); \ + if (mask & SAMPLE_EXCEPTION_CNT) \ + __attach_tp(tp_xdp_exception); \ return 0; \ } -- cgit v1.2.3 From 0cf3c2fc4b1afbd8d9c376754af34c1d2bd56de7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:56 +0530 Subject: samples: bpf: Add BPF support for cpumap tracepoints These are invoked in two places, when the XDP frame or SKB (for generic XDP) is enqueued to the ptr_ring (cpumap_enqueue) and when kthread processes the frame after invoking the CPUMAP program for it (returning stats for the batch). We use cpumap_map_id to filter on the map_id as a way to avoid printing incorrect stats for parallel sessions of xdp_redirect_cpu. 
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-9-memxor@gmail.com --- samples/bpf/xdp_sample.bpf.c | 58 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/samples/bpf/xdp_sample.bpf.c b/samples/bpf/xdp_sample.bpf.c index 53ab5a972405..f01a5529751c 100644 --- a/samples/bpf/xdp_sample.bpf.c +++ b/samples/bpf/xdp_sample.bpf.c @@ -8,6 +8,8 @@ array_map rx_cnt SEC(".maps"); array_map redir_err_cnt SEC(".maps"); +array_map cpumap_enqueue_cnt SEC(".maps"); +array_map cpumap_kthread_cnt SEC(".maps"); array_map exception_cnt SEC(".maps"); const volatile int nr_cpus = 0; @@ -19,6 +21,8 @@ const volatile int nr_cpus = 0; const volatile int from_match[32] = {}; const volatile int to_match[32] = {}; +int cpumap_map_id = 0; + /* Find if b is part of set a, but if a is empty set then evaluate to true */ #define IN_SET(a, b) \ ({ \ @@ -112,6 +116,59 @@ int BPF_PROG(tp_xdp_redirect_map, const struct net_device *dev, return xdp_redirect_collect_stat(dev->ifindex, err); } +SEC("tp_btf/xdp_cpumap_enqueue") +int BPF_PROG(tp_xdp_cpumap_enqueue, int map_id, unsigned int processed, + unsigned int drops, int to_cpu) +{ + u32 cpu = bpf_get_smp_processor_id(); + struct datarec *rec; + u32 idx; + + if (cpumap_map_id && cpumap_map_id != map_id) + return 0; + + idx = to_cpu * nr_cpus + cpu; + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &idx); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, processed); + NO_TEAR_ADD(rec->dropped, drops); + /* Record bulk events, then userspace can calc average bulk size */ + if (processed > 0) + NO_TEAR_INC(rec->issue); + /* Inception: It's possible to detect overload situations, via + * this tracepoint. This can be used for creating a feedback + * loop to XDP, which can take appropriate actions to mitigate + * this overload situation. 
+ */ + return 0; +} + +SEC("tp_btf/xdp_cpumap_kthread") +int BPF_PROG(tp_xdp_cpumap_kthread, int map_id, unsigned int processed, + unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats) +{ + struct datarec *rec; + u32 cpu; + + if (cpumap_map_id && cpumap_map_id != map_id) + return 0; + + cpu = bpf_get_smp_processor_id(); + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &cpu); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, processed); + NO_TEAR_ADD(rec->dropped, drops); + NO_TEAR_ADD(rec->xdp_pass, xdp_stats->pass); + NO_TEAR_ADD(rec->xdp_drop, xdp_stats->drop); + NO_TEAR_ADD(rec->xdp_redirect, xdp_stats->redirect); + /* Count times kthread yielded CPU via schedule call */ + if (sched) + NO_TEAR_INC(rec->issue); + return 0; +} + SEC("tp_btf/xdp_exception") int BPF_PROG(tp_xdp_exception, const struct net_device *dev, const struct bpf_prog *xdp, u32 act) @@ -136,4 +193,3 @@ int BPF_PROG(tp_xdp_exception, const struct net_device *dev, return 0; } - -- cgit v1.2.3 From d771e217506adcfbfb08c693fb9332ee4859d61d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:57 +0530 Subject: samples: bpf: Add cpumap tracepoint statistics support This consolidates retrieval and printing into the XDP sample helper. For the kthread stats, it expands xdp_stats separately with its own per-CPU stats. For cpumap enqueue, we display FROM->TO stats also with its per-CPU stats. The help output explains in detail the various aspects of the output. 
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-10-memxor@gmail.com --- samples/bpf/xdp_sample_user.c | 219 +++++++++++++++++++++++++++++++++++++++++- samples/bpf/xdp_sample_user.h | 6 ++ 2 files changed, 224 insertions(+), 1 deletion(-) diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c index 52a30fd1f2a3..e2692dee1dbb 100644 --- a/samples/bpf/xdp_sample_user.c +++ b/samples/bpf/xdp_sample_user.c @@ -74,6 +74,8 @@ enum map_type { MAP_RX, MAP_REDIRECT_ERR, + MAP_CPUMAP_ENQUEUE, + MAP_CPUMAP_KTHREAD, MAP_EXCEPTION, NUM_MAP, }; @@ -99,13 +101,16 @@ struct map_entry { struct stats_record { struct record rx_cnt; struct record redir_err[XDP_REDIRECT_ERR_MAX]; + struct record kthread; struct record exception[XDP_ACTION_MAX]; + struct record enq[]; }; struct sample_output { struct { __u64 rx; __u64 redir; + __u64 drop; __u64 err; } totals; struct { @@ -228,6 +233,30 @@ static void sample_print_help(int mask) printf(" \n\t\t\t\terror/s - Packets that failed redirection per second\n\n"); } + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) { + printf(" enqueue to cpu N\tDisplays the number of packets enqueued to bulk queue of CPU N\n" + " \t\t\tExpands to cpu:FROM->N to display enqueue stats for each CPU enqueuing to CPU N\n" + " \t\t\tReceived packets can be associated with the CPU redirect program is enqueuing \n" + " \t\t\tpackets to.\n" + " \t\t\t\tpkt/s - Packets enqueued per second from other CPU to CPU N\n" + " \t\t\t\tdrop/s - Packets dropped when trying to enqueue to CPU N\n" + " \t\t\t\tbulk-avg - Average number of packets processed for each event\n\n"); + } + + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + printf(" kthread\t\tDisplays the number of packets processed in CPUMAP kthread for each CPU\n" + " \t\t\tPackets consumed from ptr_ring in kthread, and its xdp_stats (after calling \n" + " \t\t\tCPUMAP bpf prog) are expanded below this. 
xdp_stats are expanded as a total and\n" + " \t\t\tthen per-CPU to associate it to each CPU's pinned CPUMAP kthread.\n" + " \t\t\t\tpkt/s - Packets consumed per second from ptr_ring\n" + " \t\t\t\tdrop/s - Packets dropped per second in kthread\n" + " \t\t\t\tsched - Number of times kthread called schedule()\n\n" + " \t\t\txdp_stats (also expands to per-CPU counts)\n" + " \t\t\t\tpass/s - XDP_PASS count for CPUMAP program execution\n" + " \t\t\t\tdrop/s - XDP_DROP count for CPUMAP program execution\n" + " \t\t\t\tredir/s - XDP_REDIRECT count for CPUMAP program execution\n\n"); + } + if (mask & SAMPLE_EXCEPTION_CNT) { printf(" xdp_exception\t\tDisplays xdp_exception tracepoint events\n" " \t\t\tThis can occur due to internal driver errors, unrecognized\n" @@ -357,6 +386,14 @@ static struct stats_record *alloc_stats_record(void) } } } + if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + rec->kthread.cpu = alloc_record_per_cpu(); + if (!rec->kthread.cpu) { + fprintf(stderr, + "Failed to allocate kthread per-CPU array\n"); + goto end_redir; + } + } if (sample_mask & SAMPLE_EXCEPTION_CNT) { for (i = 0; i < XDP_ACTION_MAX; i++) { rec->exception[i].cpu = alloc_record_per_cpu(); @@ -367,13 +404,32 @@ static struct stats_record *alloc_stats_record(void) action2str(i)); while (i--) free(rec->exception[i].cpu); - goto end_redir; + goto end_kthread; + } + } + } + if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) { + for (i = 0; i < sample_n_cpus; i++) { + rec->enq[i].cpu = alloc_record_per_cpu(); + if (!rec->enq[i].cpu) { + fprintf(stderr, + "Failed to allocate enqueue per-CPU array for " + "CPU %d\n", + i); + while (i--) + free(rec->enq[i].cpu); + goto end_exception; } } } return rec; +end_exception: + for (i = 0; i < XDP_ACTION_MAX; i++) + free(rec->exception[i].cpu); +end_kthread: + free(rec->kthread.cpu); end_redir: for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) free(rec->redir_err[i].cpu); @@ -390,8 +446,11 @@ static void free_stats_record(struct stats_record *r) struct map_entry *e; 
int i; + for (i = 0; i < sample_n_cpus; i++) + free(r->enq[i].cpu); for (i = 0; i < XDP_ACTION_MAX; i++) free(r->exception[i].cpu); + free(r->kthread.cpu); for (i = 0; i < XDP_REDIRECT_ERR_MAX; i++) free(r->redir_err[i].cpu); free(r->rx_cnt.cpu); @@ -519,6 +578,137 @@ static void stats_get_rx_cnt(struct stats_record *stats_rec, } } +static void stats_get_cpumap_enqueue(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i, to_cpu; + + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < sample_n_cpus; to_cpu++) { + rec = &stats_rec->enq[to_cpu]; + prev = &stats_prev->enq[to_cpu]; + t = calc_period(rec, prev); + + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + if (pps > 0 || drop > 0) { + char str[64]; + + snprintf(str, sizeof(str), "enqueue to cpu %d", to_cpu); + + if (err > 0) + err = pps / err; /* calc average bulk size */ + + print_err(drop, + " %-20s " FMT_COLUMNf FMT_COLUMNf __COLUMN( + ".2f") "\n", + str, PPS(pps), DROP(drop), err, "bulk-avg"); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d->%d", i, to_cpu); + if (err > 0) + err = pps / err; /* calc average bulk size */ + print_default( + " %-18s " FMT_COLUMNf FMT_COLUMNf __COLUMN( + ".2f") "\n", + str, PPS(pps), DROP(drop), err, "bulk-avg"); + } + } +} + +static void stats_get_cpumap_remote(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + double xdp_pass, xdp_drop, xdp_redirect; + struct record *rec, *prev; + double t; + int i; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = 
calc_period(rec, prev); + + calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop, + &xdp_redirect, t); + if (xdp_pass || xdp_drop || xdp_redirect) { + print_err(xdp_drop, + " %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", + "xdp_stats", PASS(xdp_pass), DROP(xdp_drop), + REDIR(xdp_redirect)); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + calc_xdp_pps(r, p, &xdp_pass, &xdp_drop, &xdp_redirect, t); + if (!xdp_pass && !xdp_drop && !xdp_redirect) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-16s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PASS(xdp_pass), DROP(xdp_drop), + REDIR(xdp_redirect)); + } +} + +static void stats_get_cpumap_kthread(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus) +{ + struct record *rec, *prev; + double t, pps, drop, err; + int i; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = calc_period(rec, prev); + + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + + print_err(drop, " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf "\n", + pps ? 
"kthread total" : "kthread", PPS(pps), DROP(drop), err, + "sched"); + + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + print_default(" %-18s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + "\n", + str, PPS(pps), DROP(drop), err, "sched"); + } +} + static void stats_get_redirect_cnt(struct stats_record *stats_rec, struct stats_record *stats_prev, unsigned int nr_cpus, @@ -656,6 +846,9 @@ static void stats_print(const char *prefix, int mask, struct stats_record *r, print_always(FMT_COLUMNl, RX(out->totals.rx)); if (mask & SAMPLE_REDIRECT_CNT) print_always(FMT_COLUMNl, REDIR(out->totals.redir)); + printf(FMT_COLUMNl, + out->totals.err + out->totals.drop + out->totals.drop_xmit, + "err,drop/s"); printf("\n"); if (mask & SAMPLE_RX_CNT) { @@ -670,6 +863,14 @@ static void stats_print(const char *prefix, int mask, struct stats_record *r, stats_get_rx_cnt(r, p, nr_cpus, NULL); } + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) + stats_get_cpumap_enqueue(r, p, nr_cpus); + + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) { + stats_get_cpumap_kthread(r, p, nr_cpus); + stats_get_cpumap_remote(r, p, nr_cpus); + } + if (mask & SAMPLE_REDIRECT_CNT) { str = out->redir_cnt.suc ? 
"redirect total" : "redirect"; print_default(" %-20s " FMT_COLUMNl "\n", str, @@ -714,6 +915,7 @@ int sample_setup_maps(struct bpf_map **maps) switch (i) { case MAP_RX: + case MAP_CPUMAP_KTHREAD: sample_map_count[i] = sample_n_cpus; break; case MAP_REDIRECT_ERR: @@ -722,6 +924,8 @@ int sample_setup_maps(struct bpf_map **maps) break; case MAP_EXCEPTION: sample_map_count[i] = XDP_ACTION_MAX * sample_n_cpus; + case MAP_CPUMAP_ENQUEUE: + sample_map_count[i] = sample_n_cpus * sample_n_cpus; break; default: return -EINVAL; @@ -850,6 +1054,9 @@ static void sample_summary_print(void) print_always(" Average redir/s : %'-10.0f\n", sample_round(pkts / period)); } + if (sample_out.totals.drop) + print_always(" Rx dropped : %'-10llu\n", + sample_out.totals.drop); if (sample_out.totals.err) print_always(" Errors recorded : %'-10llu\n", sample_out.totals.err); @@ -894,6 +1101,15 @@ static int sample_stats_collect(struct stats_record *rec) &rec->redir_err[i]); } + if (sample_mask & SAMPLE_CPUMAP_ENQUEUE_CNT) + for (i = 0; i < sample_n_cpus; i++) + map_collect_percpu(&sample_mmap[MAP_CPUMAP_ENQUEUE][i * sample_n_cpus], + &rec->enq[i]); + + if (sample_mask & SAMPLE_CPUMAP_KTHREAD_CNT) + map_collect_percpu(sample_mmap[MAP_CPUMAP_KTHREAD], + &rec->kthread); + if (sample_mask & SAMPLE_EXCEPTION_CNT) for (i = 0; i < XDP_ACTION_MAX; i++) map_collect_percpu(&sample_mmap[MAP_EXCEPTION][i * sample_n_cpus], @@ -906,6 +1122,7 @@ static void sample_summary_update(struct sample_output *out, int interval) { sample_out.totals.rx += out->totals.rx; sample_out.totals.redir += out->totals.redir; + sample_out.totals.drop += out->totals.drop; sample_out.totals.err += out->totals.err; sample_out.rx_cnt.pps += interval; } diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h index aa28e4bdd628..203732615fee 100644 --- a/samples/bpf/xdp_sample_user.h +++ b/samples/bpf/xdp_sample_user.h @@ -11,6 +11,8 @@ enum stats_mask { _SAMPLE_REDIRECT_MAP = 1U << 0, SAMPLE_RX_CNT = 1U << 1, 
SAMPLE_REDIRECT_ERR_CNT = 1U << 2, + SAMPLE_CPUMAP_ENQUEUE_CNT = 1U << 3, + SAMPLE_CPUMAP_KTHREAD_CNT = 1U << 4, SAMPLE_EXCEPTION_CNT = 1U << 5, SAMPLE_REDIRECT_CNT = 1U << 7, SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP, @@ -76,6 +78,10 @@ static inline char *safe_strncpy(char *dst, const char *src, size_t size) __attach_tp(tp_xdp_redirect_map_err); \ if (mask & SAMPLE_REDIRECT_ERR_CNT) \ __attach_tp(tp_xdp_redirect_err); \ + if (mask & SAMPLE_CPUMAP_ENQUEUE_CNT) \ + __attach_tp(tp_xdp_cpumap_enqueue); \ + if (mask & SAMPLE_CPUMAP_KTHREAD_CNT) \ + __attach_tp(tp_xdp_cpumap_kthread); \ if (mask & SAMPLE_EXCEPTION_CNT) \ __attach_tp(tp_xdp_exception); \ return 0; \ -- cgit v1.2.3 From 5f116212f4018fc9aa7a2a828b27aab540b8e5fa Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:58 +0530 Subject: samples: bpf: Add BPF support for devmap_xmit tracepoint This adds support for the devmap_xmit tracepoint, and its multi device variant that can be used to obtain streams for each individual net_device to net_device redirection. This is useful for decomposing total xmit stats in xdp_monitor. 
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-11-memxor@gmail.com --- samples/bpf/xdp_sample.bpf.c | 71 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/samples/bpf/xdp_sample.bpf.c b/samples/bpf/xdp_sample.bpf.c index f01a5529751c..0eb7e1dcae22 100644 --- a/samples/bpf/xdp_sample.bpf.c +++ b/samples/bpf/xdp_sample.bpf.c @@ -11,6 +11,14 @@ array_map redir_err_cnt SEC(".maps"); array_map cpumap_enqueue_cnt SEC(".maps"); array_map cpumap_kthread_cnt SEC(".maps"); array_map exception_cnt SEC(".maps"); +array_map devmap_xmit_cnt SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 32 * 32); + __type(key, u64); + __type(value, struct datarec); +} devmap_xmit_cnt_multi SEC(".maps"); const volatile int nr_cpus = 0; @@ -193,3 +201,66 @@ int BPF_PROG(tp_xdp_exception, const struct net_device *dev, return 0; } + +SEC("tp_btf/xdp_devmap_xmit") +int BPF_PROG(tp_xdp_devmap_xmit, const struct net_device *from_dev, + const struct net_device *to_dev, int sent, int drops, int err) +{ + struct datarec *rec; + int idx_in, idx_out; + u32 cpu; + + idx_in = from_dev->ifindex; + idx_out = to_dev->ifindex; + + if (!IN_SET(from_match, idx_in)) + return 0; + if (!IN_SET(to_match, idx_out)) + return 0; + + cpu = bpf_get_smp_processor_id(); + rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &cpu); + if (!rec) + return 0; + NO_TEAR_ADD(rec->processed, sent); + NO_TEAR_ADD(rec->dropped, drops); + /* Record bulk events, then userspace can calc average bulk size */ + NO_TEAR_INC(rec->info); + /* Record error cases, where no frame were sent */ + /* Catch API error of drv ndo_xdp_xmit sent more than count */ + if (err || drops < 0) + NO_TEAR_INC(rec->issue); + return 0; +} + +SEC("tp_btf/xdp_devmap_xmit") +int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device *from_dev, + const struct net_device *to_dev, int sent, int drops, int err) 
+{ + struct datarec empty = {}; + struct datarec *rec; + int idx_in, idx_out; + u64 idx; + + idx_in = from_dev->ifindex; + idx_out = to_dev->ifindex; + idx = idx_in; + idx = idx << 32 | idx_out; + + if (!IN_SET(from_match, idx_in)) + return 0; + if (!IN_SET(to_match, idx_out)) + return 0; + + bpf_map_update_elem(&devmap_xmit_cnt_multi, &idx, &empty, BPF_NOEXIST); + rec = bpf_map_lookup_elem(&devmap_xmit_cnt_multi, &idx); + if (!rec) + return 0; + + NO_TEAR_ADD(rec->processed, sent); + NO_TEAR_ADD(rec->dropped, drops); + NO_TEAR_INC(rec->info); + if (err || drops < 0) + NO_TEAR_INC(rec->issue); + return 0; +} -- cgit v1.2.3 From af93d58c27b6ac4154f1651f47be2a159f8ce30f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:49:59 +0530 Subject: samples: bpf: Add devmap_xmit tracepoint statistics support This adds support for retrieval and printing for devmap_xmit total and multi mode tracepoint. For multi mode, we keep a hash map entry for each redirection stream, such that we can dynamically add and remove entries on output. The from_match and to_match will be set by individual samples when setting up the XDP program on these devices. The multi mode tracepoint is also handy for xdp_redirect_map_multi, where up to 32 devices can be specified. Also add sample_init_pre_load macro to finally set up the resized maps and mmap them in place for low overhead stats retrieval. 
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-12-memxor@gmail.com --- samples/bpf/xdp_sample_user.c | 317 +++++++++++++++++++++++++++++++++++++++++- samples/bpf/xdp_sample_user.h | 17 +++ 2 files changed, 331 insertions(+), 3 deletions(-) diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c index e2692dee1dbb..eb484c15492d 100644 --- a/samples/bpf/xdp_sample_user.c +++ b/samples/bpf/xdp_sample_user.c @@ -77,6 +77,8 @@ enum map_type { MAP_CPUMAP_ENQUEUE, MAP_CPUMAP_KTHREAD, MAP_EXCEPTION, + MAP_DEVMAP_XMIT, + MAP_DEVMAP_XMIT_MULTI, NUM_MAP, }; @@ -103,6 +105,8 @@ struct stats_record { struct record redir_err[XDP_REDIRECT_ERR_MAX]; struct record kthread; struct record exception[XDP_ACTION_MAX]; + struct record devmap_xmit; + DECLARE_HASHTABLE(xmit_map, 5); struct record enq[]; }; @@ -111,7 +115,9 @@ struct sample_output { __u64 rx; __u64 redir; __u64 drop; + __u64 drop_xmit; __u64 err; + __u64 xmit; } totals; struct { __u64 pps; @@ -125,6 +131,12 @@ struct sample_output { struct { __u64 hits; } except_cnt; + struct { + __u64 pps; + __u64 drop; + __u64 err; + double bavg; + } xmit_cnt; }; struct xdp_desc { @@ -265,6 +277,16 @@ static void sample_print_help(int mask) " \t\t\t\thit/s - Number of times the tracepoint was hit per second\n\n"); } + if (mask & SAMPLE_DEVMAP_XMIT_CNT) { + printf(" devmap_xmit\t\tDisplays devmap_xmit tracepoint events\n" + " \t\t\tThis tracepoint is invoked for successful transmissions on output\n" + " \t\t\tdevice but these statistics are not available for generic XDP mode,\n" + " \t\t\thence they will be omitted from the output when using SKB mode\n" + " \t\t\t\txmit/s - Number of packets that were transmitted per second\n" + " \t\t\t\tdrop/s - Number of packets that failed transmissions per second\n" + " \t\t\t\tdrv_err/s - Number of internal driver errors per second\n" + " \t\t\t\tbulk-avg - Average number of packets processed for each 
event\n\n"); + } } void sample_usage(char *argv[], const struct option *long_options, @@ -353,6 +375,74 @@ static void map_collect_percpu(struct datarec *values, struct record *rec) rec->total.xdp_redirect = sum_xdp_redirect; } +static int map_collect_percpu_devmap(int map_fd, struct stats_record *rec) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u32 batch, count = 32; + struct datarec *values; + bool init = false; + __u64 *keys; + int i, ret; + + keys = calloc(count, sizeof(__u64)); + if (!keys) + return -ENOMEM; + values = calloc(count * nr_cpus, sizeof(struct datarec)); + if (!values) { + free(keys); + return -ENOMEM; + } + + for (;;) { + bool exit = false; + + ret = bpf_map_lookup_batch(map_fd, init ? &batch : NULL, &batch, + keys, values, &count, NULL); + if (ret < 0 && errno != ENOENT) + break; + if (errno == ENOENT) + exit = true; + + init = true; + for (i = 0; i < count; i++) { + struct map_entry *e, *x = NULL; + __u64 pair = keys[i]; + struct datarec *arr; + + arr = &values[i * nr_cpus]; + hash_for_each_possible(rec->xmit_map, e, node, pair) { + if (e->pair == pair) { + x = e; + break; + } + } + if (!x) { + x = calloc(1, sizeof(*x)); + if (!x) + goto cleanup; + if (map_entry_init(x, pair) < 0) { + free(x); + goto cleanup; + } + hash_add(rec->xmit_map, &x->node, pair); + } + map_collect_percpu(arr, &x->val); + } + + if (exit) + break; + count = 32; + } + + free(values); + free(keys); + return 0; +cleanup: + free(values); + free(keys); + return -ENOMEM; +} + static struct stats_record *alloc_stats_record(void) { struct stats_record *rec; @@ -408,6 +498,16 @@ static struct stats_record *alloc_stats_record(void) } } } + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) { + rec->devmap_xmit.cpu = alloc_record_per_cpu(); + if (!rec->devmap_xmit.cpu) { + fprintf(stderr, + "Failed to allocate devmap_xmit per-CPU array\n"); + goto end_exception; + } + } + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + hash_init(rec->xmit_map); if (sample_mask & 
SAMPLE_CPUMAP_ENQUEUE_CNT) { for (i = 0; i < sample_n_cpus; i++) { rec->enq[i].cpu = alloc_record_per_cpu(); @@ -418,13 +518,15 @@ static struct stats_record *alloc_stats_record(void) i); while (i--) free(rec->enq[i].cpu); - goto end_exception; + goto end_devmap_xmit; } } } return rec; +end_devmap_xmit: + free(rec->devmap_xmit.cpu); end_exception: for (i = 0; i < XDP_ACTION_MAX; i++) free(rec->exception[i].cpu); @@ -448,6 +550,12 @@ static void free_stats_record(struct stats_record *r) for (i = 0; i < sample_n_cpus; i++) free(r->enq[i].cpu); + hash_for_each_safe(r->xmit_map, i, tmp, e, node) { + hash_del(&e->node); + free(e->val.cpu); + free(e); + } + free(r->devmap_xmit.cpu); for (i = 0; i < XDP_ACTION_MAX; i++) free(r->exception[i].cpu); free(r->kthread.cpu); @@ -835,6 +943,160 @@ static void stats_get_exception_cnt(struct stats_record *stats_rec, } } +static void stats_get_devmap_xmit(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out) +{ + double pps, drop, info, err; + struct record *rec, *prev; + double t; + int i; + + rec = &stats_rec->devmap_xmit; + prev = &stats_prev->devmap_xmit; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + char str[64]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + info = calc_info_pps(r, p, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", + str, XMIT(pps), DROP(drop), err, "drv_err/s", + info, "bulk-avg"); + } + if (out) { + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + info = calc_info_pps(&rec->total, &prev->total, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk 
*/ + err = calc_errs_pps(&rec->total, &prev->total, t); + + out->xmit_cnt.pps = pps; + out->xmit_cnt.drop = drop; + out->xmit_cnt.bavg = info; + out->xmit_cnt.err = err; + out->totals.xmit += pps; + out->totals.drop_xmit += drop; + out->totals.err += err; + } +} + +static void stats_get_devmap_xmit_multi(struct stats_record *stats_rec, + struct stats_record *stats_prev, + unsigned int nr_cpus, + struct sample_output *out, + bool xmit_total) +{ + double pps, drop, info, err; + struct map_entry *entry; + struct record *r, *p; + double t; + int bkt; + + hash_for_each(stats_rec->xmit_map, bkt, entry, node) { + struct map_entry *e, *x = NULL; + char ifname_from[IFNAMSIZ]; + char ifname_to[IFNAMSIZ]; + const char *fstr, *tstr; + unsigned long prev_time; + struct record beg = {}; + __u32 from_idx, to_idx; + char str[128]; + __u64 pair; + int i; + + prev_time = sample_interval * NANOSEC_PER_SEC; + + pair = entry->pair; + from_idx = pair >> 32; + to_idx = pair & 0xFFFFFFFF; + + r = &entry->val; + beg.timestamp = r->timestamp - prev_time; + + /* Find matching entry from stats_prev map */ + hash_for_each_possible(stats_prev->xmit_map, e, node, pair) { + if (e->pair == pair) { + x = e; + break; + } + } + if (x) + p = &x->val; + else + p = &beg; + t = calc_period(r, p); + pps = calc_pps(&r->total, &p->total, t); + drop = calc_drop_pps(&r->total, &p->total, t); + info = calc_info_pps(&r->total, &p->total, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + err = calc_errs_pps(&r->total, &p->total, t); + + if (out) { + /* We are responsible for filling out totals */ + out->totals.xmit += pps; + out->totals.drop_xmit += drop; + out->totals.err += err; + continue; + } + + fstr = tstr = NULL; + if (if_indextoname(from_idx, ifname_from)) + fstr = ifname_from; + if (if_indextoname(to_idx, ifname_to)) + tstr = ifname_to; + + snprintf(str, sizeof(str), "xmit %s->%s", fstr ?: "?", + tstr ?: "?"); + /* Skip idle streams of redirection */ + if (pps || drop || err) { + 
print_err(drop, + " %-20s " FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", str, XMIT(pps), DROP(drop), + err, "drv_err/s", info, "bulk-avg"); + } + + for (i = 0; i < nr_cpus; i++) { + struct datarec *rc = &r->cpu[i]; + struct datarec *pc, p_beg = {}; + char str[64]; + + pc = p == &beg ? &p_beg : &p->cpu[i]; + + pps = calc_pps(rc, pc, t); + drop = calc_drop_pps(rc, pc, t); + err = calc_errs_pps(rc, pc, t); + + if (!pps && !drop && !err) + continue; + + snprintf(str, sizeof(str), "cpu:%d", i); + info = calc_info_pps(rc, pc, t); + if (info > 0) + info = (pps + drop) / info; /* calc avg bulk */ + + print_default(" %-18s" FMT_COLUMNf FMT_COLUMNf FMT_COLUMNf + __COLUMN(".2f") "\n", str, XMIT(pps), + DROP(drop), err, "drv_err/s", info, "bulk-avg"); + } + } +} + static void stats_print(const char *prefix, int mask, struct stats_record *r, struct stats_record *p, struct sample_output *out) { @@ -849,6 +1111,9 @@ static void stats_print(const char *prefix, int mask, struct stats_record *r, printf(FMT_COLUMNl, out->totals.err + out->totals.drop + out->totals.drop_xmit, "err,drop/s"); + if (mask & SAMPLE_DEVMAP_XMIT_CNT || + mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + printf(FMT_COLUMNl, XMIT(out->totals.xmit)); printf("\n"); if (mask & SAMPLE_RX_CNT) { @@ -899,6 +1164,25 @@ static void stats_print(const char *prefix, int mask, struct stats_record *r, stats_get_exception_cnt(r, p, nr_cpus, NULL); } + if (mask & SAMPLE_DEVMAP_XMIT_CNT) { + str = (sample_log_level & LL_DEFAULT) && out->xmit_cnt.pps ? 
+ "devmap_xmit total" : + "devmap_xmit"; + + print_err(out->xmit_cnt.err || out->xmit_cnt.drop, + " %-20s " FMT_COLUMNl FMT_COLUMNl FMT_COLUMNl + __COLUMN(".2f") "\n", + str, XMIT(out->xmit_cnt.pps), + DROP(out->xmit_cnt.drop), out->xmit_cnt.err, + "drv_err/s", out->xmit_cnt.bavg, "bulk-avg"); + + stats_get_devmap_xmit(r, p, nr_cpus, NULL); + } + + if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + stats_get_devmap_xmit_multi(r, p, nr_cpus, NULL, + mask & SAMPLE_DEVMAP_XMIT_CNT); + if (sample_log_level & LL_DEFAULT || ((sample_log_level & LL_SIMPLE) && sample_err_exp)) { sample_err_exp = false; @@ -910,12 +1194,13 @@ int sample_setup_maps(struct bpf_map **maps) { sample_n_cpus = libbpf_num_possible_cpus(); - for (int i = 0; i < NUM_MAP; i++) { + for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) { sample_map[i] = maps[i]; switch (i) { case MAP_RX: case MAP_CPUMAP_KTHREAD: + case MAP_DEVMAP_XMIT: sample_map_count[i] = sample_n_cpus; break; case MAP_REDIRECT_ERR: @@ -933,12 +1218,13 @@ int sample_setup_maps(struct bpf_map **maps) if (bpf_map__resize(sample_map[i], sample_map_count[i]) < 0) return -errno; } + sample_map[MAP_DEVMAP_XMIT_MULTI] = maps[MAP_DEVMAP_XMIT_MULTI]; return 0; } static int sample_setup_maps_mappings(void) { - for (int i = 0; i < NUM_MAP; i++) { + for (int i = 0; i < MAP_DEVMAP_XMIT_MULTI; i++) { size_t size = sample_map_count[i] * sizeof(struct datarec); sample_mmap[i] = mmap(NULL, size, PROT_READ | PROT_WRITE, @@ -1057,9 +1343,20 @@ static void sample_summary_print(void) if (sample_out.totals.drop) print_always(" Rx dropped : %'-10llu\n", sample_out.totals.drop); + if (sample_out.totals.drop_xmit) + print_always(" Tx dropped : %'-10llu\n", + sample_out.totals.drop_xmit); if (sample_out.totals.err) print_always(" Errors recorded : %'-10llu\n", sample_out.totals.err); + if (sample_out.totals.xmit) { + double pkts = sample_out.totals.xmit; + + print_always(" Packets transmitted : %'-10llu\n", + sample_out.totals.xmit); + print_always(" Average transmit/s : 
%'-10.0f\n", + sample_round(pkts / period)); + } } void sample_exit(int status) @@ -1115,6 +1412,13 @@ static int sample_stats_collect(struct stats_record *rec) map_collect_percpu(&sample_mmap[MAP_EXCEPTION][i * sample_n_cpus], &rec->exception[i]); + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT) + map_collect_percpu(sample_mmap[MAP_DEVMAP_XMIT], &rec->devmap_xmit); + + if (sample_mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) { + if (map_collect_percpu_devmap(bpf_map__fd(sample_map[MAP_DEVMAP_XMIT_MULTI]), rec) < 0) + return -EINVAL; + } return 0; } @@ -1123,7 +1427,9 @@ static void sample_summary_update(struct sample_output *out, int interval) sample_out.totals.rx += out->totals.rx; sample_out.totals.redir += out->totals.redir; sample_out.totals.drop += out->totals.drop; + sample_out.totals.drop_xmit += out->totals.drop_xmit; sample_out.totals.err += out->totals.err; + sample_out.totals.xmit += out->totals.xmit; sample_out.rx_cnt.pps += interval; } @@ -1141,6 +1447,11 @@ static void sample_stats_print(int mask, struct stats_record *cur, stats_get_redirect_err_cnt(cur, prev, 0, &out); if (mask & SAMPLE_EXCEPTION_CNT) stats_get_exception_cnt(cur, prev, 0, &out); + if (mask & SAMPLE_DEVMAP_XMIT_CNT) + stats_get_devmap_xmit(cur, prev, 0, &out); + else if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) + stats_get_devmap_xmit_multi(cur, prev, 0, &out, + mask & SAMPLE_DEVMAP_XMIT_CNT); sample_summary_update(&out, interval); stats_print(prog_name, mask, cur, prev, &out); diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h index 203732615fee..3a678986cce2 100644 --- a/samples/bpf/xdp_sample_user.h +++ b/samples/bpf/xdp_sample_user.h @@ -14,9 +14,11 @@ enum stats_mask { SAMPLE_CPUMAP_ENQUEUE_CNT = 1U << 3, SAMPLE_CPUMAP_KTHREAD_CNT = 1U << 4, SAMPLE_EXCEPTION_CNT = 1U << 5, + SAMPLE_DEVMAP_XMIT_CNT = 1U << 6, SAMPLE_REDIRECT_CNT = 1U << 7, SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP, SAMPLE_REDIRECT_ERR_MAP_CNT = SAMPLE_REDIRECT_ERR_CNT | 
_SAMPLE_REDIRECT_MAP, + SAMPLE_DEVMAP_XMIT_CNT_MULTI = 1U << 8, }; /* Exit return codes */ @@ -63,6 +65,17 @@ static inline char *safe_strncpy(char *dst, const char *src, size_t size) return -errno; \ }) +#define sample_init_pre_load(skel) \ + ({ \ + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); \ + sample_setup_maps((struct bpf_map *[]){ \ + skel->maps.rx_cnt, skel->maps.redir_err_cnt, \ + skel->maps.cpumap_enqueue_cnt, \ + skel->maps.cpumap_kthread_cnt, \ + skel->maps.exception_cnt, skel->maps.devmap_xmit_cnt, \ + skel->maps.devmap_xmit_cnt_multi }); \ + }) + #define DEFINE_SAMPLE_INIT(name) \ static int sample_init(struct name *skel, int mask) \ { \ @@ -84,6 +97,10 @@ static inline char *safe_strncpy(char *dst, const char *src, size_t size) __attach_tp(tp_xdp_cpumap_kthread); \ if (mask & SAMPLE_EXCEPTION_CNT) \ __attach_tp(tp_xdp_exception); \ + if (mask & SAMPLE_DEVMAP_XMIT_CNT) \ + __attach_tp(tp_xdp_devmap_xmit); \ + if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) \ + __attach_tp(tp_xdp_devmap_xmit_multi); \ return 0; \ } -- cgit v1.2.3 From 384b6b3bbf0d3b60ca118459a91b7b8ce1dcd6bd Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:00 +0530 Subject: samples: bpf: Add vmlinux.h generation support Also, take this opportunity to depend on in-tree bpftool, so that we can use static linking support in subsequent commits for XDP samples BPF helper object. 
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-13-memxor@gmail.com --- samples/bpf/Makefile | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 036998d11ded..ff1932e16bc5 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -276,6 +276,11 @@ $(LIBBPF): FORCE $(MAKE) -C $(dir $@) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(BPF_SAMPLES_PATH)/../../ O= +BPFTOOLDIR := $(TOOLS_PATH)/bpf/bpftool +BPFTOOL := $(BPFTOOLDIR)/bpftool +$(BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) + $(MAKE) -C $(BPFTOOLDIR) srctree=$(BPF_SAMPLES_PATH)/../../ + $(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE $(call filechk,offsets,__SYSCALL_NRS_H__) @@ -313,6 +318,26 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h -include $(BPF_SAMPLES_PATH)/Makefile.target +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +$(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) +ifeq ($(VMLINUX_H),) + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif + +clean-files += vmlinux.h + # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is # useless for BPF samples. 
-- cgit v1.2.3 From 3f19956010d26906e84baec4cd9c48bd8808de96 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:01 +0530 Subject: samples: bpf: Convert xdp_monitor_kern.o to XDP samples helper We already moved all the functionality it provided in XDP samples helper userspace and kernel BPF object, so just delete the unneeded code. We also add generation of BPF skeleton and compilation using clang -target bpf for files ending with .bpf.c suffix (to denote that they use vmlinux.h). Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-14-memxor@gmail.com --- samples/bpf/Makefile | 42 ++++++- samples/bpf/xdp_monitor.bpf.c | 8 ++ samples/bpf/xdp_monitor_kern.c | 257 ----------------------------------------- 3 files changed, 49 insertions(+), 258 deletions(-) create mode 100644 samples/bpf/xdp_monitor.bpf.c delete mode 100644 samples/bpf/xdp_monitor_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ff1932e16bc5..0d7086a2a393 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -164,7 +164,6 @@ always-y += xdp_redirect_kern.o always-y += xdp_redirect_map_kern.o always-y += xdp_redirect_map_multi_kern.o always-y += xdp_redirect_cpu_kern.o -always-y += xdp_monitor_kern.o always-y += xdp_rxq_info_kern.o always-y += xdp2skb_meta_kern.o always-y += syscall_tp_kern.o @@ -338,6 +337,47 @@ endif clean-files += vmlinux.h +# Get Clang's default includes on this system, as opposed to those seen by +# '-target bpf'. This fixes "missing" files on some architectures/distros, +# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+define get_sys_includes +$(shell $(1) -v -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - $@ + # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is # useless for BPF samples. diff --git a/samples/bpf/xdp_monitor.bpf.c b/samples/bpf/xdp_monitor.bpf.c new file mode 100644 index 000000000000..cfb41e2205f4 --- /dev/null +++ b/samples/bpf/xdp_monitor.bpf.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. + * + * XDP monitor tool, based on tracepoints + */ +#include "xdp_sample.bpf.h" + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c deleted file mode 100644 index 5c955b812c47..000000000000 --- a/samples/bpf/xdp_monitor_kern.c +++ /dev/null @@ -1,257 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. 
- * - * XDP monitor tool, based on tracepoints - */ -#include -#include - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, 2); - /* TODO: have entries for all possible errno's */ -} redirect_err_cnt SEC(".maps"); - -#define XDP_UNKNOWN XDP_REDIRECT + 1 -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, XDP_UNKNOWN + 1); -} exception_cnt SEC(".maps"); - -/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct xdp_redirect_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12 size:4; signed:0; - int ifindex; // offset:16 size:4; signed:1; - int err; // offset:20 size:4; signed:1; - int to_ifindex; // offset:24 size:4; signed:1; - u32 map_id; // offset:28 size:4; signed:0; - int map_index; // offset:32 size:4; signed:1; -}; // offset:36 - -enum { - XDP_REDIRECT_SUCCESS = 0, - XDP_REDIRECT_ERROR = 1 -}; - -static __always_inline -int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) -{ - u32 key = XDP_REDIRECT_ERROR; - int err = ctx->err; - u64 *cnt; - - if (!err) - key = XDP_REDIRECT_SUCCESS; - - cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); - if (!cnt) - return 1; - *cnt += 1; - - return 0; /* Indicate event was filtered (no further processing)*/ - /* - * Returning 1 here would allow e.g. a perf-record tracepoint - * to see and record these events, but it doesn't work well - * in-practice as stopping perf-record also unload this - * bpf_prog. Plus, there is additional overhead of doing so. 
- */ -} - -SEC("tracepoint/xdp/xdp_redirect_err") -int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - - -SEC("tracepoint/xdp/xdp_redirect_map_err") -int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - -/* Likely unloaded when prog starts */ -SEC("tracepoint/xdp/xdp_redirect") -int trace_xdp_redirect(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - -/* Likely unloaded when prog starts */ -SEC("tracepoint/xdp/xdp_redirect_map") -int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - -/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct xdp_exception_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int ifindex; // offset:16; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_exception") -int trace_xdp_exception(struct xdp_exception_ctx *ctx) -{ - u64 *cnt; - u32 key; - - key = ctx->act; - if (key > XDP_REDIRECT) - key = XDP_UNKNOWN; - - cnt = bpf_map_lookup_elem(&exception_cnt, &key); - if (!cnt) - return 1; - *cnt += 1; - - return 0; -} - -/* Common stats data record shared with _user.c */ -struct datarec { - u64 processed; - u64 dropped; - u64 info; - u64 err; -}; -#define MAX_CPUS 64 - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, MAX_CPUS); -} cpumap_enqueue_cnt SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 1); -} cpumap_kthread_cnt SEC(".maps"); - -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct cpumap_enqueue_ctx { - 
u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int cpu; // offset:16; size:4; signed:1; - unsigned int drops; // offset:20; size:4; signed:0; - unsigned int processed; // offset:24; size:4; signed:0; - int to_cpu; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_cpumap_enqueue") -int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) -{ - u32 to_cpu = ctx->to_cpu; - struct datarec *rec; - - if (to_cpu >= MAX_CPUS) - return 1; - - rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); - if (!rec) - return 0; - rec->processed += ctx->processed; - rec->dropped += ctx->drops; - - /* Record bulk events, then userspace can calc average bulk size */ - if (ctx->processed > 0) - rec->info += 1; - - return 0; -} - -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct cpumap_kthread_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int cpu; // offset:16; size:4; signed:1; - unsigned int drops; // offset:20; size:4; signed:0; - unsigned int processed; // offset:24; size:4; signed:0; - int sched; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_cpumap_kthread") -int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) -{ - struct datarec *rec; - u32 key = 0; - - rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); - if (!rec) - return 0; - rec->processed += ctx->processed; - rec->dropped += ctx->drops; - - /* Count times kthread yielded CPU via schedule call */ - if (ctx->sched) - rec->info++; - - return 0; -} - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 1); -} devmap_xmit_cnt SEC(".maps"); - -/* Tracepoint: 
/sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct devmap_xmit_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int from_ifindex; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int to_ifindex; // offset:16; size:4; signed:1; - int drops; // offset:20; size:4; signed:1; - int sent; // offset:24; size:4; signed:1; - int err; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_devmap_xmit") -int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx) -{ - struct datarec *rec; - u32 key = 0; - - rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key); - if (!rec) - return 0; - rec->processed += ctx->sent; - rec->dropped += ctx->drops; - - /* Record bulk events, then userspace can calc average bulk size */ - rec->info += 1; - - /* Record error cases, where no frame were sent */ - if (ctx->err) - rec->err++; - - /* Catch API error of drv ndo_xdp_xmit sent more than count */ - if (ctx->drops < 0) - rec->err++; - - return 1; -} -- cgit v1.2.3 From 6e1051a54e3100df59dae01c24ff4a6d6027e303 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:02 +0530 Subject: samples: bpf: Convert xdp_monitor to XDP samples helper Use the libbpf skeleton facility and other utilities provided by XDP samples helper. A lot of the code in xdp_monitor and xdp_redirect_cpu has been moved to the xdp_sample_user.o helper, so we remove the duplicate functions here that are no longer needed. Thanks to BPF skeleton, we no longer depend on order of tracepoints to uninstall them on startup. Instead, the sample mask is used to install the needed tracepoints. 
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-15-memxor@gmail.com --- samples/bpf/Makefile | 9 +- samples/bpf/Makefile.target | 11 + samples/bpf/xdp_monitor_user.c | 798 ++++------------------------------------- 3 files changed, 83 insertions(+), 735 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0d7086a2a393..479778439f5e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -43,7 +43,6 @@ tprogs-y += xdp_redirect tprogs-y += xdp_redirect_map tprogs-y += xdp_redirect_map_multi tprogs-y += xdp_redirect_cpu -tprogs-y += xdp_monitor tprogs-y += xdp_rxq_info tprogs-y += syscall_tp tprogs-y += cpustat @@ -57,11 +56,14 @@ tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += xdp_monitor + # Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o +XDP_SAMPLE := xdp_sample_user.o fds_example-objs := fds_example.o sockex1-objs := sockex1_user.o @@ -102,7 +104,6 @@ xdp_redirect-objs := xdp_redirect_user.o xdp_redirect_map-objs := xdp_redirect_map_user.o xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o -xdp_monitor-objs := xdp_monitor_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o cpustat-objs := cpustat_user.o @@ -116,6 +117,8 @@ xdp_sample_pkts-objs := xdp_sample_pkts_user.o ibumad-objs := ibumad_user.o hbm-objs := hbm.o $(CGROUP_HELPERS) +xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE) + # Tell kbuild to always build the programs always-y := $(tprogs-y) always-y += sockex1_kern.o @@ -310,6 +313,8 @@ verify_target_bpf: verify_cmds $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) +$(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h + 
$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h diff --git a/samples/bpf/Makefile.target b/samples/bpf/Makefile.target index 7621f55e2947..5a368affa038 100644 --- a/samples/bpf/Makefile.target +++ b/samples/bpf/Makefile.target @@ -73,3 +73,14 @@ quiet_cmd_tprog-cobjs = CC $@ cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $< $(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE $(call if_changed_dep,tprog-cobjs) + +# Override includes for xdp_sample_user.o because $(srctree)/usr/include in +# TPROGS_CFLAGS causes conflicts +XDP_SAMPLE_CFLAGS += -Wall -O2 -lm \ + -I./tools/include \ + -I./tools/include/uapi \ + -I./tools/lib \ + -I./tools/testing/selftests/bpf +$(obj)/xdp_sample_user.o: $(src)/xdp_sample_user.c \ + $(src)/xdp_sample_user.h $(src)/xdp_sample_shared.h + $(CC) $(XDP_SAMPLE_CFLAGS) -c -o $@ $< diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index 49ebc49aefc3..fb9391a5ec62 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -1,15 +1,12 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. - */ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. 
*/ static const char *__doc__= - "XDP monitor tool, based on tracepoints\n" -; +"XDP monitor tool, based on tracepoints\n"; static const char *__doc_err_only__= - " NOTICE: Only tracking XDP redirect errors\n" - " Enable TX success stats via '--stats'\n" - " (which comes with a per packet processing overhead)\n" -; +" NOTICE: Only tracking XDP redirect errors\n" +" Enable redirect success stats via '-s/--stats'\n" +" (which comes with a per packet processing overhead)\n"; #include #include @@ -20,768 +17,103 @@ static const char *__doc_err_only__= #include #include #include - #include #include #include #include - #include #include #include #include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_monitor.skel.h" -enum map_type { - REDIRECT_ERR_CNT, - EXCEPTION_CNT, - CPUMAP_ENQUEUE_CNT, - CPUMAP_KTHREAD_CNT, - DEVMAP_XMIT_CNT, -}; +static int mask = SAMPLE_REDIRECT_ERR_CNT | SAMPLE_CPUMAP_ENQUEUE_CNT | + SAMPLE_CPUMAP_KTHREAD_CNT | SAMPLE_EXCEPTION_CNT | + SAMPLE_DEVMAP_XMIT_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; -static const char *const map_type_strings[] = { - [REDIRECT_ERR_CNT] = "redirect_err_cnt", - [EXCEPTION_CNT] = "exception_cnt", - [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", - [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", - [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt", -}; - -#define NUM_MAP 5 -#define NUM_TP 8 - -static int tp_cnt; -static int map_cnt; -static int verbose = 1; -static bool debug = false; -struct bpf_map *map_data[NUM_MAP] = {}; -struct bpf_link *tp_links[NUM_TP] = {}; -struct bpf_object *obj; +DEFINE_SAMPLE_INIT(xdp_monitor); static const struct option long_options[] = { - {"help", no_argument, NULL, 'h' }, - {"debug", no_argument, NULL, 'D' }, - {"stats", no_argument, NULL, 'S' }, - {"sec", required_argument, NULL, 's' }, - {0, 0, NULL, 0 } -}; - -static void int_exit(int sig) -{ - /* Detach tracepoints */ - while (tp_cnt) - bpf_link__destroy(tp_links[--tp_cnt]); - - bpf_object__close(obj); - exit(0); -} - -/* C standard specifies two 
constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ -#define EXIT_FAIL_MEM 5 - -static void usage(char *argv[]) -{ - int i; - printf("\nDOCUMENTATION:\n%s\n", __doc__); - printf("\n"); - printf(" Usage: %s (options-see-below)\n", - argv[0]); - printf(" Listing options:\n"); - for (i = 0; long_options[i].name != 0; i++) { - printf(" --%-15s", long_options[i].name); - if (long_options[i].flag != NULL) - printf(" flag (internal value:%d)", - *long_options[i].flag); - else - printf("short-option: -%c", - long_options[i].val); - printf("\n"); - } - printf("\n"); -} - -#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ -static __u64 gettime(void) -{ - struct timespec t; - int res; - - res = clock_gettime(CLOCK_MONOTONIC, &t); - if (res < 0) { - fprintf(stderr, "Error with gettimeofday! (%i)\n", res); - exit(EXIT_FAILURE); - } - return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; -} - -enum { - REDIR_SUCCESS = 0, - REDIR_ERROR = 1, -}; -#define REDIR_RES_MAX 2 -static const char *redir_names[REDIR_RES_MAX] = { - [REDIR_SUCCESS] = "Success", - [REDIR_ERROR] = "Error", -}; -static const char *err2str(int err) -{ - if (err < REDIR_RES_MAX) - return redir_names[err]; - return NULL; -} -/* enum xdp_action */ -#define XDP_UNKNOWN XDP_REDIRECT + 1 -#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) -static const char *xdp_action_names[XDP_ACTION_MAX] = { - [XDP_ABORTED] = "XDP_ABORTED", - [XDP_DROP] = "XDP_DROP", - [XDP_PASS] = "XDP_PASS", - [XDP_TX] = "XDP_TX", - [XDP_REDIRECT] = "XDP_REDIRECT", - [XDP_UNKNOWN] = "XDP_UNKNOWN", -}; -static const char *action2str(int action) -{ - if (action < XDP_ACTION_MAX) - return xdp_action_names[action]; - return NULL; -} - -/* Common stats data record shared with _kern.c */ -struct datarec { - __u64 processed; - __u64 dropped; - __u64 info; - __u64 err; -}; -#define MAX_CPUS 64 - -/* Userspace structs for collection of stats from maps */ -struct record { - __u64 timestamp; - struct datarec total; - struct datarec *cpu; + { "help", no_argument, NULL, 'h' 
}, + { "stats", no_argument, NULL, 's' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + {} }; -struct u64rec { - __u64 processed; -}; -struct record_u64 { - /* record for _kern side __u64 values */ - __u64 timestamp; - struct u64rec total; - struct u64rec *cpu; -}; - -struct stats_record { - struct record_u64 xdp_redirect[REDIR_RES_MAX]; - struct record_u64 xdp_exception[XDP_ACTION_MAX]; - struct record xdp_cpumap_kthread; - struct record xdp_cpumap_enqueue[MAX_CPUS]; - struct record xdp_devmap_xmit; -}; - -static bool map_collect_record(int fd, __u32 key, struct record *rec) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct datarec values[nr_cpus]; - __u64 sum_processed = 0; - __u64 sum_dropped = 0; - __u64 sum_info = 0; - __u64 sum_err = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return false; - } - /* Get time as close as possible to reading map contents */ - rec->timestamp = gettime(); - - /* Record and sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - rec->cpu[i].processed = values[i].processed; - sum_processed += values[i].processed; - rec->cpu[i].dropped = values[i].dropped; - sum_dropped += values[i].dropped; - rec->cpu[i].info = values[i].info; - sum_info += values[i].info; - rec->cpu[i].err = values[i].err; - sum_err += values[i].err; - } - rec->total.processed = sum_processed; - rec->total.dropped = sum_dropped; - rec->total.info = sum_info; - rec->total.err = sum_err; - return true; -} - -static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct u64rec values[nr_cpus]; - __u64 sum_total = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - 
"ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return false; - } - /* Get time as close as possible to reading map contents */ - rec->timestamp = gettime(); - - /* Record and sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - rec->cpu[i].processed = values[i].processed; - sum_total += values[i].processed; - } - rec->total.processed = sum_total; - return true; -} - -static double calc_period(struct record *r, struct record *p) -{ - double period_ = 0; - __u64 period = 0; - - period = r->timestamp - p->timestamp; - if (period > 0) - period_ = ((double) period / NANOSEC_PER_SEC); - - return period_; -} - -static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) -{ - double period_ = 0; - __u64 period = 0; - - period = r->timestamp - p->timestamp; - if (period > 0) - period_ = ((double) period / NANOSEC_PER_SEC); - - return period_; -} - -static double calc_pps(struct datarec *r, struct datarec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->processed - p->processed; - pps = packets / period; - } - return pps; -} - -static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->processed - p->processed; - pps = packets / period; - } - return pps; -} - -static double calc_drop(struct datarec *r, struct datarec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->dropped - p->dropped; - pps = packets / period; - } - return pps; -} - -static double calc_info(struct datarec *r, struct datarec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->info - p->info; - pps = packets / period; - } - return pps; -} - -static double calc_err(struct datarec *r, struct datarec *p, double period) -{ - __u64 packets = 0; - double pps = 0; - - if (period > 0) { - packets = r->err - p->err; - pps = packets / period; - } - 
return pps; -} - -static void stats_print(struct stats_record *stats_rec, - struct stats_record *stats_prev, - bool err_only) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - int rec_i = 0, i, to_cpu; - double t = 0, pps = 0; - - /* Header */ - printf("%-15s %-7s %-12s %-12s %-9s\n", - "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); - - /* tracepoint: xdp:xdp_redirect_* */ - if (err_only) - rec_i = REDIR_ERROR; - - for (; rec_i < REDIR_RES_MAX; rec_i++) { - struct record_u64 *rec, *prev; - char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; - char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; - - rec = &stats_rec->xdp_redirect[rec_i]; - prev = &stats_prev->xdp_redirect[rec_i]; - t = calc_period_u64(rec, prev); - - for (i = 0; i < nr_cpus; i++) { - struct u64rec *r = &rec->cpu[i]; - struct u64rec *p = &prev->cpu[i]; - - pps = calc_pps_u64(r, p, t); - if (pps > 0) - printf(fmt1, "XDP_REDIRECT", i, - rec_i ? 0.0: pps, rec_i ? pps : 0.0, - err2str(rec_i)); - } - pps = calc_pps_u64(&rec->total, &prev->total, t); - printf(fmt2, "XDP_REDIRECT", "total", - rec_i ? 0.0: pps, rec_i ? 
pps : 0.0, err2str(rec_i)); - } - - /* tracepoint: xdp:xdp_exception */ - for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { - struct record_u64 *rec, *prev; - char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; - char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; - - rec = &stats_rec->xdp_exception[rec_i]; - prev = &stats_prev->xdp_exception[rec_i]; - t = calc_period_u64(rec, prev); - - for (i = 0; i < nr_cpus; i++) { - struct u64rec *r = &rec->cpu[i]; - struct u64rec *p = &prev->cpu[i]; - - pps = calc_pps_u64(r, p, t); - if (pps > 0) - printf(fmt1, "Exception", i, - 0.0, pps, action2str(rec_i)); - } - pps = calc_pps_u64(&rec->total, &prev->total, t); - if (pps > 0) - printf(fmt2, "Exception", "total", - 0.0, pps, action2str(rec_i)); - } - - /* cpumap enqueue stats */ - for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { - char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; - char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; - struct record *rec, *prev; - char *info_str = ""; - double drop, info; - - rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; - prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop(r, p, t); - info = calc_info(r, p, t); - if (info > 0) { - info_str = "bulk-average"; - info = pps / info; /* calc average bulk size */ - } - if (pps > 0) - printf(fmt1, "cpumap-enqueue", - i, to_cpu, pps, drop, info, info_str); - } - pps = calc_pps(&rec->total, &prev->total, t); - if (pps > 0) { - drop = calc_drop(&rec->total, &prev->total, t); - info = calc_info(&rec->total, &prev->total, t); - if (info > 0) { - info_str = "bulk-average"; - info = pps / info; /* calc average bulk size */ - } - printf(fmt2, "cpumap-enqueue", - "sum", to_cpu, pps, drop, info, info_str); - } - } - - /* cpumap kthread stats */ - { - char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; - 
char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; - struct record *rec, *prev; - double drop, info; - char *i_str = ""; - - rec = &stats_rec->xdp_cpumap_kthread; - prev = &stats_prev->xdp_cpumap_kthread; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop(r, p, t); - info = calc_info(r, p, t); - if (info > 0) - i_str = "sched"; - if (pps > 0 || drop > 0) - printf(fmt1, "cpumap-kthread", - i, pps, drop, info, i_str); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop(&rec->total, &prev->total, t); - info = calc_info(&rec->total, &prev->total, t); - if (info > 0) - i_str = "sched-sum"; - printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); - } - - /* devmap ndo_xdp_xmit stats */ - { - char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n"; - char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n"; - struct record *rec, *prev; - double drop, info, err; - char *i_str = ""; - char *err_str = ""; - - rec = &stats_rec->xdp_devmap_xmit; - prev = &stats_prev->xdp_devmap_xmit; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop(r, p, t); - info = calc_info(r, p, t); - err = calc_err(r, p, t); - if (info > 0) { - i_str = "bulk-average"; - info = (pps+drop) / info; /* calc avg bulk */ - } - if (err > 0) - err_str = "drv-err"; - if (pps > 0 || drop > 0) - printf(fmt1, "devmap-xmit", - i, pps, drop, info, i_str, err_str); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop(&rec->total, &prev->total, t); - info = calc_info(&rec->total, &prev->total, t); - err = calc_err(&rec->total, &prev->total, t); - if (info > 0) { - i_str = "bulk-average"; - info = (pps+drop) / info; /* calc avg bulk */ - } - if (err > 0) - err_str = "drv-err"; - printf(fmt2, 
"devmap-xmit", "total", pps, drop, - info, i_str, err_str); - } - - printf("\n"); -} - -static bool stats_collect(struct stats_record *rec) -{ - int fd; - int i; - - /* TODO: Detect if someone unloaded the perf event_fd's, as - * this can happen by someone running perf-record -e - */ - - fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]); - for (i = 0; i < REDIR_RES_MAX; i++) - map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); - - fd = bpf_map__fd(map_data[EXCEPTION_CNT]); - for (i = 0; i < XDP_ACTION_MAX; i++) { - map_collect_record_u64(fd, i, &rec->xdp_exception[i]); - } - - fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]); - for (i = 0; i < MAX_CPUS; i++) - map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); - - fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]); - map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); - - fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]); - map_collect_record(fd, 0, &rec->xdp_devmap_xmit); - - return true; -} - -static void *alloc_rec_per_cpu(int record_size) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - void *array; - - array = calloc(nr_cpus, record_size); - if (!array) { - fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); - exit(EXIT_FAIL_MEM); - } - return array; -} - -static struct stats_record *alloc_stats_record(void) -{ - struct stats_record *rec; - int rec_sz; - int i; - - /* Alloc main stats_record structure */ - rec = calloc(1, sizeof(*rec)); - if (!rec) { - fprintf(stderr, "Mem alloc error\n"); - exit(EXIT_FAIL_MEM); - } - - /* Alloc stats stored per CPU for each record */ - rec_sz = sizeof(struct u64rec); - for (i = 0; i < REDIR_RES_MAX; i++) - rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); - - for (i = 0; i < XDP_ACTION_MAX; i++) - rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); - - rec_sz = sizeof(struct datarec); - rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); - rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz); - - for (i = 0; i < MAX_CPUS; i++) - 
rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); - - return rec; -} - -static void free_stats_record(struct stats_record *r) -{ - int i; - - for (i = 0; i < REDIR_RES_MAX; i++) - free(r->xdp_redirect[i].cpu); - - for (i = 0; i < XDP_ACTION_MAX; i++) - free(r->xdp_exception[i].cpu); - - free(r->xdp_cpumap_kthread.cpu); - free(r->xdp_devmap_xmit.cpu); - - for (i = 0; i < MAX_CPUS; i++) - free(r->xdp_cpumap_enqueue[i].cpu); - - free(r); -} - -/* Pointer swap trick */ -static inline void swap(struct stats_record **a, struct stats_record **b) -{ - struct stats_record *tmp; - - tmp = *a; - *a = *b; - *b = tmp; -} - -static void stats_poll(int interval, bool err_only) -{ - struct stats_record *rec, *prev; - - rec = alloc_stats_record(); - prev = alloc_stats_record(); - stats_collect(rec); - - if (err_only) - printf("\n%s\n", __doc_err_only__); - - /* Trick to pretty printf with thousands separators use %' */ - setlocale(LC_NUMERIC, "en_US"); - - /* Header */ - if (verbose) - printf("\n%s", __doc__); - - /* TODO Need more advanced stats on error types */ - if (verbose) { - printf(" - Stats map0: %s\n", bpf_map__name(map_data[0])); - printf(" - Stats map1: %s\n", bpf_map__name(map_data[1])); - printf("\n"); - } - fflush(stdout); - - while (1) { - swap(&prev, &rec); - stats_collect(rec); - stats_print(rec, prev, err_only); - fflush(stdout); - sleep(interval); - } - - free_stats_record(rec); - free_stats_record(prev); -} - -static void print_bpf_prog_info(void) -{ - struct bpf_program *prog; - struct bpf_map *map; - int i = 0; - - /* Prog info */ - printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt); - bpf_object__for_each_program(prog, obj) { - printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog)); - i++; - } - - i = 0; - /* Maps info */ - printf("Loaded BPF prog have %d map(s)\n", map_cnt); - bpf_object__for_each_map(map, obj) { - const char *name = bpf_map__name(map); - int fd = bpf_map__fd(map); - - printf(" - map_data[%d] = fd(%d) name:%s\n", i, 
fd, name); - i++; - } - - /* Event info */ - printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt); - for (i = 0; i < tp_cnt; i++) { - int fd = bpf_link__fd(tp_links[i]); - - if (fd != -1) - printf(" - event_fd[%d] = fd(%d)\n", i, fd); - } -} int main(int argc, char **argv) { - struct bpf_program *prog; - int longindex = 0, opt; - int ret = EXIT_FAILURE; - enum map_type type; - char filename[256]; - - /* Default settings: */ + unsigned long interval = 2; + int ret = EXIT_FAIL_OPTION; + struct xdp_monitor *skel; bool errors_only = true; - int interval = 2; + int longindex = 0, opt; + bool error = true; /* Parse commands line args */ - while ((opt = getopt_long(argc, argv, "hDSs:", + while ((opt = getopt_long(argc, argv, "si:vh", long_options, &longindex)) != -1) { switch (opt) { - case 'D': - debug = true; - break; - case 'S': + case 's': errors_only = false; + mask |= SAMPLE_REDIRECT_CNT; break; - case 's': - interval = atoi(optarg); + case 'i': + interval = strtoul(optarg, NULL, 0); + break; + case 'v': + sample_switch_mode(); break; case 'h': + error = false; default: - usage(argv); + sample_usage(argv, long_options, __doc__, mask, error); return ret; } } - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - /* Remove tracepoint program when program is interrupted or killed */ - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - - obj = bpf_object__open_file(filename, NULL); - if (libbpf_get_error(obj)) { - printf("ERROR: opening BPF object file failed\n"); - obj = NULL; - goto cleanup; - } - - /* load BPF program */ - if (bpf_object__load(obj)) { - printf("ERROR: loading BPF object file failed\n"); - goto cleanup; + skel = xdp_monitor__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_monitor__open: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; } - for (type = 0; type < NUM_MAP; type++) { - map_data[type] = - bpf_object__find_map_by_name(obj, map_type_strings[type]); - - if (libbpf_get_error(map_data[type])) 
{ - printf("ERROR: finding a map in obj file failed\n"); - goto cleanup; - } - map_cnt++; + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - bpf_object__for_each_program(prog, obj) { - tp_links[tp_cnt] = bpf_program__attach(prog); - if (libbpf_get_error(tp_links[tp_cnt])) { - printf("ERROR: bpf_program__attach failed\n"); - tp_links[tp_cnt] = NULL; - goto cleanup; - } - tp_cnt++; + ret = xdp_monitor__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_monitor__load: %s\n", strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - if (debug) { - print_bpf_prog_info(); + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - /* Unload/stop tracepoint event by closing bpf_link's */ - if (errors_only) { - /* The bpf_link[i] depend on the order of - * the functions was defined in _kern.c - */ - bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */ - tp_links[2] = NULL; + if (errors_only) + printf("%s", __doc_err_only__); - bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */ - tp_links[3] = NULL; + ret = sample_run(interval, NULL, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } - - stats_poll(interval, errors_only); - - ret = EXIT_SUCCESS; - -cleanup: - /* Detach tracepoints */ - while (tp_cnt) - bpf_link__destroy(tp_links[--tp_cnt]); - - bpf_object__close(obj); - return ret; + ret = EXIT_OK; +end_destroy: + xdp_monitor__destroy(skel); +end: + sample_exit(ret); } -- cgit v1.2.3 From 66fc4ca85d910bdeecf019c3999bc2df7c80b726 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:03 +0530 Subject: samples: bpf: Convert xdp_redirect_kern.o to XDP samples helper We moved 
swap_src_dst_mac to xdp_sample.bpf.h to be shared with other potential users, so drop it while moving code to the new file. Also, consistently use SEC("xdp") naming instead. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-16-memxor@gmail.com --- samples/bpf/Makefile | 5 ++- samples/bpf/xdp_redirect.bpf.c | 49 ++++++++++++++++++++++ samples/bpf/xdp_redirect_kern.c | 90 ----------------------------------------- 3 files changed, 52 insertions(+), 92 deletions(-) create mode 100644 samples/bpf/xdp_redirect.bpf.c delete mode 100644 samples/bpf/xdp_redirect_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 479778439f5e..0b94a6acb348 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -163,7 +163,6 @@ always-y += tcp_clamp_kern.o always-y += tcp_basertt_kern.o always-y += tcp_tos_reflect_kern.o always-y += tcp_dumpstats_kern.o -always-y += xdp_redirect_kern.o always-y += xdp_redirect_map_kern.o always-y += xdp_redirect_map_multi_kern.o always-y += xdp_redirect_cpu_kern.o @@ -356,6 +355,7 @@ endef CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) +$(obj)/xdp_redirect.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_monitor.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/xdp_sample_shared.h @@ -366,9 +366,10 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/x -I$(srctree)/tools/lib $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ -LINKED_SKELS := xdp_monitor.skel.h +LINKED_SKELS := xdp_redirect.skel.h xdp_monitor.skel.h clean-files += $(LINKED_SKELS) +xdp_redirect.skel.h-deps := xdp_redirect.bpf.o xdp_sample.bpf.o xdp_monitor.skel.h-deps := xdp_monitor.bpf.o xdp_sample.bpf.o LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) diff --git a/samples/bpf/xdp_redirect.bpf.c b/samples/bpf/xdp_redirect.bpf.c new file mode 100644 
index 000000000000..7c02bacfe96b --- /dev/null +++ b/samples/bpf/xdp_redirect.bpf.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2016 John Fastabend + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +const volatile int ifindex_out; + +SEC("xdp") +int xdp_redirect_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct ethhdr *eth = data; + struct datarec *rec; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + swap_src_dst_mac(data); + return bpf_redirect(ifindex_out, 0); +} + +/* Redirect require an XDP bpf_prog loaded on the TX device */ +SEC("xdp") +int xdp_redirect_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c deleted file mode 100644 index d26ec3aa215e..000000000000 --- a/samples/bpf/xdp_redirect_kern.c +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2016 John Fastabend - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#define KBUILD_MODNAME "foo" -#include -#include -#include -#include -#include -#include -#include -#include - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, int); - __type(value, int); - __uint(max_entries, 1); -} tx_port SEC(".maps"); - -/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success - * feedback. Redirect TX errors can be caught via a tracepoint. - */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, long); - __uint(max_entries, 1); -} rxcnt SEC(".maps"); - -static void swap_src_dst_mac(void *data) -{ - unsigned short *p = data; - unsigned short dst[3]; - - dst[0] = p[0]; - dst[1] = p[1]; - dst[2] = p[2]; - p[0] = p[3]; - p[1] = p[4]; - p[2] = p[5]; - p[3] = dst[0]; - p[4] = dst[1]; - p[5] = dst[2]; -} - -SEC("xdp_redirect") -int xdp_redirect_prog(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - int rc = XDP_DROP; - int *ifindex, port = 0; - long *value; - u32 key = 0; - u64 nh_off; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - ifindex = bpf_map_lookup_elem(&tx_port, &port); - if (!ifindex) - return rc; - - value = bpf_map_lookup_elem(&rxcnt, &key); - if (value) - *value += 1; - - swap_src_dst_mac(data); - return bpf_redirect(*ifindex, 0); -} - -/* Redirect require an XDP bpf_prog loaded on the TX device */ -SEC("xdp_redirect_dummy") -int xdp_redirect_dummy_prog(struct xdp_md *ctx) -{ - return XDP_PASS; -} - -char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From b926c55d856cbe2593c44e783b31e2cdb0ef6371 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:04 +0530 Subject: samples: 
bpf: Convert xdp_redirect to XDP samples helper Use the libbpf skeleton facility and other utilities provided by XDP samples helper. One important note: The XDP samples helper handles ownership of installed XDP programs on devices, including responding to SIGINT and SIGTERM, so drop the code here and use the helpers we provide going forward for all xdp_redirect* conversions. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-17-memxor@gmail.com --- samples/bpf/Makefile | 5 +- samples/bpf/xdp_redirect_user.c | 270 +++++++++++++++++----------------------- 2 files changed, 116 insertions(+), 159 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0b94a6acb348..d05105227ec5 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -39,7 +39,6 @@ tprogs-y += lwt_len_hist tprogs-y += xdp_tx_iptunnel tprogs-y += test_map_in_map tprogs-y += per_socket_stats_example -tprogs-y += xdp_redirect tprogs-y += xdp_redirect_map tprogs-y += xdp_redirect_map_multi tprogs-y += xdp_redirect_cpu @@ -56,6 +55,7 @@ tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += xdp_redirect tprogs-y += xdp_monitor # Libbpf dependencies @@ -100,7 +100,6 @@ lwt_len_hist-objs := lwt_len_hist_user.o xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o -xdp_redirect-objs := xdp_redirect_user.o xdp_redirect_map-objs := xdp_redirect_map_user.o xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o @@ -117,6 +116,7 @@ xdp_sample_pkts-objs := xdp_sample_pkts_user.o ibumad-objs := ibumad_user.o hbm-objs := hbm.o $(CGROUP_HELPERS) +xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE) xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE) # Tell kbuild to always build the programs @@ -312,6 +312,7 @@ verify_target_bpf: 
verify_cmds $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) +$(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h $(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 93854e135134..7af5b07a7523 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -1,6 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 John Fastabend */ +static const char *__doc__ = +"XDP redirect tool, using bpf_redirect helper\n" +"Usage: xdp_redirect _IN _OUT\n"; + #include #include #include @@ -13,126 +17,73 @@ #include #include #include +#include #include - -#include "bpf_util.h" #include #include +#include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_redirect.skel.h" -static int ifindex_in; -static int ifindex_out; -static bool ifindex_out_xdp_dummy_attached = true; -static __u32 prog_id; -static __u32 dummy_prog_id; - -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int rxcnt_map_fd; - -static void int_exit(int sig) -{ - __u32 curr_prog_id = 0; - - if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on iface IN\n"); - else - printf("program on iface IN changed, not removing\n"); - - if (ifindex_out_xdp_dummy_attached) { - curr_prog_id = 0; - if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id, - xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (dummy_prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on iface OUT\n"); - else - printf("program on iface OUT changed, not removing\n"); - } - exit(0); -} - -static void 
poll_stats(int interval, int ifindex) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus], prev[nr_cpus]; - - memset(prev, 0, sizeof(prev)); - - while (1) { - __u64 sum = 0; - __u32 key = 0; - int i; - - sleep(interval); - assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[i]); - if (sum) - printf("ifindex %i: %10llu pkt/s\n", - ifindex, sum / interval); - memcpy(prev, values, sizeof(values)); - } -} +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_CNT | + SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; -static void usage(const char *prog) -{ - fprintf(stderr, - "usage: %s [OPTS] _IN _OUT\n\n" - "OPTS:\n" - " -S use skb-mode\n" - " -N enforce native mode\n" - " -F force loading prog\n", - prog); -} +DEFINE_SAMPLE_INIT(xdp_redirect); +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"force", no_argument, NULL, 'F' }, + {"stats", no_argument, NULL, 's' }, + {"interval", required_argument, NULL, 'i' }, + {"verbose", no_argument, NULL, 'v' }, + {} +}; int main(int argc, char **argv) { - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - struct bpf_program *prog, *dummy_prog; - int prog_fd, tx_port_map_fd, opt; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - const char *optstr = "FSN"; - struct bpf_object *obj; - char filename[256]; - int dummy_prog_fd; - int ret, key = 0; - - while ((opt = getopt(argc, argv, optstr)) != -1) { + int ifindex_in, ifindex_out, opt; + char str[2 * IF_NAMESIZE + 1]; + char ifname_out[IF_NAMESIZE]; + char ifname_in[IF_NAMESIZE]; + int ret = EXIT_FAIL_OPTION; + unsigned long interval = 2; + struct xdp_redirect *skel; + bool generic = false; + bool force = false; + bool error = true; + + while ((opt = getopt_long(argc, argv, "hSFi:vs", + long_options, NULL)) != -1) { switch (opt) { case 'S': - xdp_flags |= 
XDP_FLAGS_SKB_MODE; - break; - case 'N': - /* default, set below */ + generic = true; + mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI); break; case 'F': - xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + break; + case 'v': + sample_switch_mode(); + break; + case 's': + mask |= SAMPLE_REDIRECT_CNT; break; + case 'h': + error = false; default: - usage(basename(argv[0])); - return 1; + sample_usage(argv, long_options, __doc__, mask, error); + return ret; } } - if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) - xdp_flags |= XDP_FLAGS_DRV_MODE; - - if (optind + 2 != argc) { - printf("usage: %s _IN _OUT\n", argv[0]); - return 1; + if (argc <= optind + 1) { + sample_usage(argv, long_options, __doc__, mask, true); + return ret; } ifindex_in = if_nametoindex(argv[optind]); @@ -143,75 +94,80 @@ int main(int argc, char **argv) if (!ifindex_out) ifindex_out = strtoul(argv[optind + 1], NULL, 0); - printf("input: %d output: %d\n", ifindex_in, ifindex_out); - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return 1; - - prog = bpf_program__next(NULL, obj); - dummy_prog = bpf_program__next(prog, obj); - if (!prog || !dummy_prog) { - printf("finding a prog in obj file failed\n"); - return 1; + if (!ifindex_in || !ifindex_out) { + fprintf(stderr, "Bad interface index or name\n"); + sample_usage(argv, long_options, __doc__, mask, true); + goto end; } - /* bpf_prog_load_xattr gives us the pointer to first prog's fd, - * so we're missing only the fd for dummy prog - */ - dummy_prog_fd = bpf_program__fd(dummy_prog); - if (prog_fd < 0 || dummy_prog_fd < 0) { - printf("bpf_prog_load_xattr: %s\n", strerror(errno)); - return 1; + + skel = xdp_redirect__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_redirect__open: %s\n", strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; } - tx_port_map_fd 
= bpf_object__find_map_fd_by_name(obj, "tx_port"); - rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); - if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) { - printf("bpf_object__find_map_fd_by_name failed\n"); - return 1; + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) { - printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); - return 1; + skel->rodata->from_match[0] = ifindex_in; + skel->rodata->to_match[0] = ifindex_out; + skel->rodata->ifindex_out = ifindex_out; + + ret = xdp_redirect__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_redirect__load: %s\n", strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return ret; + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } - prog_id = info.id; + + ret = EXIT_FAIL_XDP; + if (sample_install_xdp(skel->progs.xdp_redirect_prog, ifindex_in, + generic, force) < 0) + goto end_destroy; /* Loading dummy XDP prog on out-device */ - if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd, - (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { - printf("WARN: link set xdp fd failed on %d\n", ifindex_out); - ifindex_out_xdp_dummy_attached = false; + sample_install_xdp(skel->progs.xdp_redirect_dummy_prog, ifindex_out, + generic, force); + + ret = EXIT_FAIL; + if (!if_indextoname(ifindex_in, ifname_in)) { + fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_in, + strerror(errno)); + goto end_destroy; } - memset(&info, 0, sizeof(info)); - ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", 
strerror(errno)); - return ret; + if (!if_indextoname(ifindex_out, ifname_out)) { + fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_out, + strerror(errno)); + goto end_destroy; } - dummy_prog_id = info.id; - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + safe_strncpy(str, get_driver_name(ifindex_in), sizeof(str)); + printf("Redirecting from %s (ifindex %d; driver %s) to %s (ifindex %d; driver %s)\n", + ifname_in, ifindex_in, str, ifname_out, ifindex_out, get_driver_name(ifindex_out)); + snprintf(str, sizeof(str), "%s->%s", ifname_in, ifname_out); - /* bpf redirect port */ - ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0); - if (ret) { - perror("bpf_update_elem"); - goto out; + ret = sample_run(interval, NULL, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } - - poll_stats(2, ifindex_out); - -out: - return ret; + ret = EXIT_OK; +end_destroy: + xdp_redirect__destroy(skel); +end: + sample_exit(ret); } -- cgit v1.2.3 From 79ccf4529ee67510fdad8ecdfcf37528d353a36c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:05 +0530 Subject: samples: bpf: Convert xdp_redirect_cpu_kern.o to XDP samples helper Similar to xdp_monitor_kern, a lot of these BPF programs have been reimplemented properly consolidating missing features from other XDP samples. Hence, drop the unneeded code and rename to .bpf.c suffix. 
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-18-memxor@gmail.com --- samples/bpf/Makefile | 5 +- samples/bpf/xdp_redirect_cpu.bpf.c | 541 ++++++++++++++++++++++++++ samples/bpf/xdp_redirect_cpu_kern.c | 730 ------------------------------------ 3 files changed, 544 insertions(+), 732 deletions(-) create mode 100644 samples/bpf/xdp_redirect_cpu.bpf.c delete mode 100644 samples/bpf/xdp_redirect_cpu_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index d05105227ec5..231cdbc773a7 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -165,7 +165,6 @@ always-y += tcp_tos_reflect_kern.o always-y += tcp_dumpstats_kern.o always-y += xdp_redirect_map_kern.o always-y += xdp_redirect_map_multi_kern.o -always-y += xdp_redirect_cpu_kern.o always-y += xdp_rxq_info_kern.o always-y += xdp2skb_meta_kern.o always-y += syscall_tp_kern.o @@ -356,6 +355,7 @@ endef CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) +$(obj)/xdp_redirect_cpu.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_redirect.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_monitor.bpf.o: $(obj)/xdp_sample.bpf.o @@ -367,9 +367,10 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/x -I$(srctree)/tools/lib $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ -LINKED_SKELS := xdp_redirect.skel.h xdp_monitor.skel.h +LINKED_SKELS := xdp_redirect_cpu.skel.h xdp_redirect.skel.h xdp_monitor.skel.h clean-files += $(LINKED_SKELS) +xdp_redirect_cpu.skel.h-deps := xdp_redirect_cpu.bpf.o xdp_sample.bpf.o xdp_redirect.skel.h-deps := xdp_redirect.bpf.o xdp_sample.bpf.o xdp_monitor.skel.h-deps := xdp_monitor.bpf.o xdp_sample.bpf.o diff --git a/samples/bpf/xdp_redirect_cpu.bpf.c b/samples/bpf/xdp_redirect_cpu.bpf.c new file mode 100644 index 000000000000..f10fe3cf25f6 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu.bpf.c @@ -0,0 +1,541 @@ +/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP) + * + * 
GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. + */ +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" +#include "hash_func01.h" + +/* Special map type that can XDP_REDIRECT frames to another CPU */ +struct { + __uint(type, BPF_MAP_TYPE_CPUMAP); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct bpf_cpumap_val)); +} cpu_map SEC(".maps"); + +/* Set of maps controlling available CPU, and for iterating through + * selectable redirect CPUs. + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); +} cpus_available SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} cpus_count SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} cpus_iterator SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 1); +} tx_port SEC(".maps"); + +char tx_mac_addr[ETH_ALEN]; + +/* Helper parse functions */ + +static __always_inline +bool parse_eth(struct ethhdr *eth, void *data_end, + u16 *eth_proto, u64 *l3_offset) +{ + u16 eth_type; + u64 offset; + + offset = sizeof(*eth); + if ((void *)eth + offset > data_end) + return false; + + eth_type = eth->h_proto; + + /* Skip non 802.3 Ethertypes */ + if (__builtin_expect(bpf_ntohs(eth_type) < ETH_P_802_3_MIN, 0)) + return false; + + /* Handle VLAN tagged packet */ + if (eth_type == bpf_htons(ETH_P_8021Q) || + eth_type == bpf_htons(ETH_P_8021AD)) { + struct vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + offset += sizeof(*vlan_hdr); + if ((void *)eth + offset > data_end) + return false; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + } + /* Handle double VLAN tagged packet */ + if (eth_type == bpf_htons(ETH_P_8021Q) || + eth_type == 
bpf_htons(ETH_P_8021AD)) { + struct vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + offset += sizeof(*vlan_hdr); + if ((void *)eth + offset > data_end) + return false; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + } + + *eth_proto = bpf_ntohs(eth_type); + *l3_offset = offset; + return true; +} + +static __always_inline +u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + struct udphdr *udph; + u16 dport; + + if (iph + 1 > data_end) + return 0; + if (!(iph->protocol == IPPROTO_UDP)) + return 0; + + udph = (void *)(iph + 1); + if (udph + 1 > data_end) + return 0; + + dport = bpf_ntohs(udph->dest); + return dport; +} + +static __always_inline +int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static __always_inline +int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp") +int xdp_prognum0_no_touch(struct xdp_md *ctx) +{ + u32 key = bpf_get_smp_processor_id(); + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest = 0; + u32 key0 = 0; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key0); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); + return XDP_ABORTED; + } + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp") +int xdp_prognum1_touch_data(struct xdp_md *ctx) 
+{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct ethhdr *eth = data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest = 0; + u32 key0 = 0; + u16 eth_type; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key0); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Validate packet length is minimum Eth header size */ + if (eth + 1 > data_end) + return XDP_ABORTED; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + /* Read packet data, and use it (drop non 802.3 Ethertypes) */ + eth_type = eth->h_proto; + if (bpf_ntohs(eth_type) < ETH_P_802_3_MIN) { + NO_TEAR_INC(rec->dropped); + return XDP_DROP; + } + + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); + return XDP_ABORTED; + } + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp") +int xdp_prognum2_round_robin(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct datarec *rec; + u32 cpu_dest = 0; + u32 key0 = 0; + + u32 *cpu_selected; + u32 *cpu_iterator; + u32 *cpu_max; + u32 cpu_idx; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); + if (!cpu_max) + return XDP_ABORTED; + + cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0); + if (!cpu_iterator) + return XDP_ABORTED; + cpu_idx = *cpu_iterator; + + *cpu_iterator += 1; + if (*cpu_iterator == *cpu_max) + *cpu_iterator = 0; + + cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); + return XDP_ABORTED; + } + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} 
+ +SEC("xdp") +int xdp_prognum3_proto_separate(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 *cpu_lookup; + u32 cpu_idx = 0; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); + return XDP_ABORTED; + } + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp") +int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 *cpu_lookup; + u32 cpu_idx = 0; + u16 dest_port; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset))) + return XDP_PASS; /* 
Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + /* DDoS filter UDP port 9 (pktgen) */ + dest_port = get_dest_port_ipv4_udp(ctx, l3_offset); + if (dest_port == 9) { + NO_TEAR_INC(rec->dropped); + return XDP_DROP; + } + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); + return XDP_ABORTED; + } + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +/* Hashing initval */ +#define INITVAL 15485863 + +static __always_inline +u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + u32 cpu_hash; + + if (iph + 1 > data_end) + return 0; + + cpu_hash = iph->saddr + iph->daddr; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); + + return cpu_hash; +} + +static __always_inline +u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + u32 cpu_hash; + + if (ip6h + 1 > data_end) + return 0; + + cpu_hash = ip6h->saddr.in6_u.u6_addr32[0] + ip6h->daddr.in6_u.u6_addr32[0]; + cpu_hash += ip6h->saddr.in6_u.u6_addr32[1] + ip6h->daddr.in6_u.u6_addr32[1]; + cpu_hash += ip6h->saddr.in6_u.u6_addr32[2] + ip6h->daddr.in6_u.u6_addr32[2]; + cpu_hash += 
ip6h->saddr.in6_u.u6_addr32[3] + ip6h->daddr.in6_u.u6_addr32[3]; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); + + return cpu_hash; +} + +/* Load-Balance traffic based on hashing IP-addrs + L4-proto. The + * hashing scheme is symmetric, meaning swapping IP src/dest still hit + * same CPU. + */ +SEC("xdp") +int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct ethhdr *eth = data; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 key0 = 0; + u32 *cpu_max; + u32 cpu_hash; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); + if (!cpu_max) + return XDP_ABORTED; + + if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Hash for IPv4 and IPv6 */ + switch (eth_proto) { + case ETH_P_IP: + cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_IPV6: + cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ + default: + cpu_hash = 0; + } + + /* Choose CPU based on hash */ + cpu_idx = cpu_hash % *cpu_max; + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= nr_cpus) { + NO_TEAR_INC(rec->issue); + return XDP_ABORTED; + } + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpumap/redirect") +int xdp_redirect_cpu_devmap(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + swap_src_dst_mac(data); + return 
bpf_redirect_map(&tx_port, 0, 0); +} + +SEC("xdp_cpumap/pass") +int xdp_redirect_cpu_pass(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +SEC("xdp_cpumap/drop") +int xdp_redirect_cpu_drop(struct xdp_md *ctx) +{ + return XDP_DROP; +} + +SEC("xdp_devmap/egress") +int xdp_redirect_egress_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + __builtin_memcpy(eth->h_source, (const char *)tx_mac_addr, ETH_ALEN); + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c deleted file mode 100644 index 8255025dea97..000000000000 --- a/samples/bpf/xdp_redirect_cpu_kern.c +++ /dev/null @@ -1,730 +0,0 @@ -/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP) - * - * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "hash_func01.h" - -#define MAX_CPUS NR_CPUS - -/* Special map type that can XDP_REDIRECT frames to another CPU */ -struct { - __uint(type, BPF_MAP_TYPE_CPUMAP); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(struct bpf_cpumap_val)); - __uint(max_entries, MAX_CPUS); -} cpu_map SEC(".maps"); - -/* Common stats data record to keep userspace more simple */ -struct datarec { - __u64 processed; - __u64 dropped; - __u64 issue; - __u64 xdp_pass; - __u64 xdp_drop; - __u64 xdp_redirect; -}; - -/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success - * feedback. Redirect TX errors can be caught via a tracepoint. 
- */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 1); -} rx_cnt SEC(".maps"); - -/* Used by trace point */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 2); - /* TODO: have entries for all possible errno's */ -} redirect_err_cnt SEC(".maps"); - -/* Used by trace point */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, MAX_CPUS); -} cpumap_enqueue_cnt SEC(".maps"); - -/* Used by trace point */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 1); -} cpumap_kthread_cnt SEC(".maps"); - -/* Set of maps controlling available CPU, and for iterating through - * selectable redirect CPUs. - */ -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u32); - __uint(max_entries, MAX_CPUS); -} cpus_available SEC(".maps"); -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, u32); - __uint(max_entries, 1); -} cpus_count SEC(".maps"); -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, u32); - __uint(max_entries, 1); -} cpus_iterator SEC(".maps"); - -/* Used by trace point */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, struct datarec); - __uint(max_entries, 1); -} exception_cnt SEC(".maps"); - -/* Helper parse functions */ - -/* Parse Ethernet layer 2, extract network layer 3 offset and protocol - * - * Returns false on error and non-supported ether-type - */ -struct vlan_hdr { - __be16 h_vlan_TCI; - __be16 h_vlan_encapsulated_proto; -}; - -static __always_inline -bool parse_eth(struct ethhdr *eth, void *data_end, - u16 *eth_proto, u64 *l3_offset) -{ - u16 eth_type; - u64 offset; - - offset = sizeof(*eth); - if ((void *)eth + 
offset > data_end) - return false; - - eth_type = eth->h_proto; - - /* Skip non 802.3 Ethertypes */ - if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN)) - return false; - - /* Handle VLAN tagged packet */ - if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vlan_hdr; - - vlan_hdr = (void *)eth + offset; - offset += sizeof(*vlan_hdr); - if ((void *)eth + offset > data_end) - return false; - eth_type = vlan_hdr->h_vlan_encapsulated_proto; - } - /* Handle double VLAN tagged packet */ - if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vlan_hdr; - - vlan_hdr = (void *)eth + offset; - offset += sizeof(*vlan_hdr); - if ((void *)eth + offset > data_end) - return false; - eth_type = vlan_hdr->h_vlan_encapsulated_proto; - } - - *eth_proto = ntohs(eth_type); - *l3_offset = offset; - return true; -} - -static __always_inline -u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct iphdr *iph = data + nh_off; - struct udphdr *udph; - u16 dport; - - if (iph + 1 > data_end) - return 0; - if (!(iph->protocol == IPPROTO_UDP)) - return 0; - - udph = (void *)(iph + 1); - if (udph + 1 > data_end) - return 0; - - dport = ntohs(udph->dest); - return dport; -} - -static __always_inline -int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct iphdr *iph = data + nh_off; - - if (iph + 1 > data_end) - return 0; - return iph->protocol; -} - -static __always_inline -int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ipv6hdr *ip6h = data + nh_off; - - if (ip6h + 1 > data_end) - return 0; - return ip6h->nexthdr; -} - -SEC("xdp_cpu_map0") -int xdp_prognum0_no_touch(struct xdp_md *ctx) -{ - void *data_end = (void 
*)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct datarec *rec; - u32 *cpu_selected; - u32 cpu_dest; - u32 key = 0; - - /* Only use first entry in cpus_available */ - cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); - if (!cpu_selected) - return XDP_ABORTED; - cpu_dest = *cpu_selected; - - /* Count RX packet in map */ - rec = bpf_map_lookup_elem(&rx_cnt, &key); - if (!rec) - return XDP_ABORTED; - rec->processed++; - - if (cpu_dest >= MAX_CPUS) { - rec->issue++; - return XDP_ABORTED; - } - - return bpf_redirect_map(&cpu_map, cpu_dest, 0); -} - -SEC("xdp_cpu_map1_touch_data") -int xdp_prognum1_touch_data(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - struct datarec *rec; - u32 *cpu_selected; - u32 cpu_dest; - u16 eth_type; - u32 key = 0; - - /* Only use first entry in cpus_available */ - cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); - if (!cpu_selected) - return XDP_ABORTED; - cpu_dest = *cpu_selected; - - /* Validate packet length is minimum Eth header size */ - if (eth + 1 > data_end) - return XDP_ABORTED; - - /* Count RX packet in map */ - rec = bpf_map_lookup_elem(&rx_cnt, &key); - if (!rec) - return XDP_ABORTED; - rec->processed++; - - /* Read packet data, and use it (drop non 802.3 Ethertypes) */ - eth_type = eth->h_proto; - if (ntohs(eth_type) < ETH_P_802_3_MIN) { - rec->dropped++; - return XDP_DROP; - } - - if (cpu_dest >= MAX_CPUS) { - rec->issue++; - return XDP_ABORTED; - } - - return bpf_redirect_map(&cpu_map, cpu_dest, 0); -} - -SEC("xdp_cpu_map2_round_robin") -int xdp_prognum2_round_robin(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - struct datarec *rec; - u32 cpu_dest; - u32 *cpu_lookup; - u32 key0 = 0; - - u32 *cpu_selected; - u32 *cpu_iterator; - u32 *cpu_max; - u32 cpu_idx; - - cpu_max = 
bpf_map_lookup_elem(&cpus_count, &key0); - if (!cpu_max) - return XDP_ABORTED; - - cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0); - if (!cpu_iterator) - return XDP_ABORTED; - cpu_idx = *cpu_iterator; - - *cpu_iterator += 1; - if (*cpu_iterator == *cpu_max) - *cpu_iterator = 0; - - cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); - if (!cpu_selected) - return XDP_ABORTED; - cpu_dest = *cpu_selected; - - /* Count RX packet in map */ - rec = bpf_map_lookup_elem(&rx_cnt, &key0); - if (!rec) - return XDP_ABORTED; - rec->processed++; - - if (cpu_dest >= MAX_CPUS) { - rec->issue++; - return XDP_ABORTED; - } - - return bpf_redirect_map(&cpu_map, cpu_dest, 0); -} - -SEC("xdp_cpu_map3_proto_separate") -int xdp_prognum3_proto_separate(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - u8 ip_proto = IPPROTO_UDP; - struct datarec *rec; - u16 eth_proto = 0; - u64 l3_offset = 0; - u32 cpu_dest = 0; - u32 cpu_idx = 0; - u32 *cpu_lookup; - u32 key = 0; - - /* Count RX packet in map */ - rec = bpf_map_lookup_elem(&rx_cnt, &key); - if (!rec) - return XDP_ABORTED; - rec->processed++; - - if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) - return XDP_PASS; /* Just skip */ - - /* Extract L4 protocol */ - switch (eth_proto) { - case ETH_P_IP: - ip_proto = get_proto_ipv4(ctx, l3_offset); - break; - case ETH_P_IPV6: - ip_proto = get_proto_ipv6(ctx, l3_offset); - break; - case ETH_P_ARP: - cpu_idx = 0; /* ARP packet handled on separate CPU */ - break; - default: - cpu_idx = 0; - } - - /* Choose CPU based on L4 protocol */ - switch (ip_proto) { - case IPPROTO_ICMP: - case IPPROTO_ICMPV6: - cpu_idx = 2; - break; - case IPPROTO_TCP: - cpu_idx = 0; - break; - case IPPROTO_UDP: - cpu_idx = 1; - break; - default: - cpu_idx = 0; - } - - cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); - if (!cpu_lookup) - return XDP_ABORTED; - cpu_dest = *cpu_lookup; - - if (cpu_dest 
>= MAX_CPUS) { - rec->issue++; - return XDP_ABORTED; - } - - return bpf_redirect_map(&cpu_map, cpu_dest, 0); -} - -SEC("xdp_cpu_map4_ddos_filter_pktgen") -int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - u8 ip_proto = IPPROTO_UDP; - struct datarec *rec; - u16 eth_proto = 0; - u64 l3_offset = 0; - u32 cpu_dest = 0; - u32 cpu_idx = 0; - u16 dest_port; - u32 *cpu_lookup; - u32 key = 0; - - /* Count RX packet in map */ - rec = bpf_map_lookup_elem(&rx_cnt, &key); - if (!rec) - return XDP_ABORTED; - rec->processed++; - - if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) - return XDP_PASS; /* Just skip */ - - /* Extract L4 protocol */ - switch (eth_proto) { - case ETH_P_IP: - ip_proto = get_proto_ipv4(ctx, l3_offset); - break; - case ETH_P_IPV6: - ip_proto = get_proto_ipv6(ctx, l3_offset); - break; - case ETH_P_ARP: - cpu_idx = 0; /* ARP packet handled on separate CPU */ - break; - default: - cpu_idx = 0; - } - - /* Choose CPU based on L4 protocol */ - switch (ip_proto) { - case IPPROTO_ICMP: - case IPPROTO_ICMPV6: - cpu_idx = 2; - break; - case IPPROTO_TCP: - cpu_idx = 0; - break; - case IPPROTO_UDP: - cpu_idx = 1; - /* DDoS filter UDP port 9 (pktgen) */ - dest_port = get_dest_port_ipv4_udp(ctx, l3_offset); - if (dest_port == 9) { - if (rec) - rec->dropped++; - return XDP_DROP; - } - break; - default: - cpu_idx = 0; - } - - cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); - if (!cpu_lookup) - return XDP_ABORTED; - cpu_dest = *cpu_lookup; - - if (cpu_dest >= MAX_CPUS) { - rec->issue++; - return XDP_ABORTED; - } - - return bpf_redirect_map(&cpu_map, cpu_dest, 0); -} - -/* Hashing initval */ -#define INITVAL 15485863 - -static __always_inline -u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct iphdr *iph = data + nh_off; - u32 
cpu_hash; - - if (iph + 1 > data_end) - return 0; - - cpu_hash = iph->saddr + iph->daddr; - cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); - - return cpu_hash; -} - -static __always_inline -u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ipv6hdr *ip6h = data + nh_off; - u32 cpu_hash; - - if (ip6h + 1 > data_end) - return 0; - - cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; - cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; - cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; - cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; - cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); - - return cpu_hash; -} - -/* Load-Balance traffic based on hashing IP-addrs + L4-proto. The - * hashing scheme is symmetric, meaning swapping IP src/dest still hit - * same CPU. - */ -SEC("xdp_cpu_map5_lb_hash_ip_pairs") -int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - u8 ip_proto = IPPROTO_UDP; - struct datarec *rec; - u16 eth_proto = 0; - u64 l3_offset = 0; - u32 cpu_dest = 0; - u32 cpu_idx = 0; - u32 *cpu_lookup; - u32 *cpu_max; - u32 cpu_hash; - u32 key = 0; - - /* Count RX packet in map */ - rec = bpf_map_lookup_elem(&rx_cnt, &key); - if (!rec) - return XDP_ABORTED; - rec->processed++; - - cpu_max = bpf_map_lookup_elem(&cpus_count, &key); - if (!cpu_max) - return XDP_ABORTED; - - if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) - return XDP_PASS; /* Just skip */ - - /* Hash for IPv4 and IPv6 */ - switch (eth_proto) { - case ETH_P_IP: - cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); - break; - case ETH_P_IPV6: - cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); - break; - case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ - default: 
- cpu_hash = 0; - } - - /* Choose CPU based on hash */ - cpu_idx = cpu_hash % *cpu_max; - - cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); - if (!cpu_lookup) - return XDP_ABORTED; - cpu_dest = *cpu_lookup; - - if (cpu_dest >= MAX_CPUS) { - rec->issue++; - return XDP_ABORTED; - } - - return bpf_redirect_map(&cpu_map, cpu_dest, 0); -} - -char _license[] SEC("license") = "GPL"; - -/*** Trace point code ***/ - -/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct xdp_redirect_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12 size:4; signed:0; - int ifindex; // offset:16 size:4; signed:1; - int err; // offset:20 size:4; signed:1; - int to_ifindex; // offset:24 size:4; signed:1; - u32 map_id; // offset:28 size:4; signed:0; - int map_index; // offset:32 size:4; signed:1; -}; // offset:36 - -enum { - XDP_REDIRECT_SUCCESS = 0, - XDP_REDIRECT_ERROR = 1 -}; - -static __always_inline -int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) -{ - u32 key = XDP_REDIRECT_ERROR; - struct datarec *rec; - int err = ctx->err; - - if (!err) - key = XDP_REDIRECT_SUCCESS; - - rec = bpf_map_lookup_elem(&redirect_err_cnt, &key); - if (!rec) - return 0; - rec->dropped += 1; - - return 0; /* Indicate event was filtered (no further processing)*/ - /* - * Returning 1 here would allow e.g. a perf-record tracepoint - * to see and record these events, but it doesn't work well - * in-practice as stopping perf-record also unload this - * bpf_prog. Plus, there is additional overhead of doing so. 
- */ -} - -SEC("tracepoint/xdp/xdp_redirect_err") -int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - -SEC("tracepoint/xdp/xdp_redirect_map_err") -int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) -{ - return xdp_redirect_collect_stat(ctx); -} - -/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct xdp_exception_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int ifindex; // offset:16; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_exception") -int trace_xdp_exception(struct xdp_exception_ctx *ctx) -{ - struct datarec *rec; - u32 key = 0; - - rec = bpf_map_lookup_elem(&exception_cnt, &key); - if (!rec) - return 1; - rec->dropped += 1; - - return 0; -} - -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct cpumap_enqueue_ctx { - u64 __pad; // First 8 bytes are not accessible by bpf code - int map_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int cpu; // offset:16; size:4; signed:1; - unsigned int drops; // offset:20; size:4; signed:0; - unsigned int processed; // offset:24; size:4; signed:0; - int to_cpu; // offset:28; size:4; signed:1; -}; - -SEC("tracepoint/xdp/xdp_cpumap_enqueue") -int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) -{ - u32 to_cpu = ctx->to_cpu; - struct datarec *rec; - - if (to_cpu >= MAX_CPUS) - return 1; - - rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); - if (!rec) - return 0; - rec->processed += ctx->processed; - rec->dropped += ctx->drops; - - /* Record bulk events, then userspace can calc average bulk size */ - if (ctx->processed > 0) - rec->issue += 1; - - /* Inception: It's possible to detect overload situations, via - * this tracepoint. 
This can be used for creating a feedback - * loop to XDP, which can take appropriate actions to mitigate - * this overload situation. - */ - return 0; -} - -/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format - * Code in: kernel/include/trace/events/xdp.h - */ -struct cpumap_kthread_ctx { - u64 __pad; // First 8 bytes are not accessible - int map_id; // offset:8; size:4; signed:1; - u32 act; // offset:12; size:4; signed:0; - int cpu; // offset:16; size:4; signed:1; - unsigned int drops; // offset:20; size:4; signed:0; - unsigned int processed; // offset:24; size:4; signed:0; - int sched; // offset:28; size:4; signed:1; - unsigned int xdp_pass; // offset:32; size:4; signed:0; - unsigned int xdp_drop; // offset:36; size:4; signed:0; - unsigned int xdp_redirect; // offset:40; size:4; signed:0; -}; - -SEC("tracepoint/xdp/xdp_cpumap_kthread") -int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) -{ - struct datarec *rec; - u32 key = 0; - - rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); - if (!rec) - return 0; - rec->processed += ctx->processed; - rec->dropped += ctx->drops; - rec->xdp_pass += ctx->xdp_pass; - rec->xdp_drop += ctx->xdp_drop; - rec->xdp_redirect += ctx->xdp_redirect; - - /* Count times kthread yielded CPU via schedule call */ - if (ctx->sched) - rec->issue++; - - return 0; -} -- cgit v1.2.3 From e531a220cc59282a3f371608f1a6fa960416e231 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:06 +0530 Subject: samples: bpf: Convert xdp_redirect_cpu to XDP samples helper Use the libbpf skeleton facility and other utilities provided by XDP samples helper. Similar to xdp_monitor, xdp_redirect_cpu was quite featureful except a few minor omissions (e.g. redirect errno reporting). All of these have been moved to XDP samples helper, hence drop the unneeded code and convert to usage of helpers provided by it. 
One of the important changes here is dropping of mprog-disable option, as we make that the default. Also, we support built-in programs for some common actions on the packet when it reaches kthread (pass, drop, redirect to device). If the user still needs to install a custom program, they can still supply a BPF object, however the program should be suitably tagged with SEC("xdp_cpumap") annotation so that the expected attach type is correct when updating our cpumap map element. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-19-memxor@gmail.com --- samples/bpf/Makefile | 5 +- samples/bpf/xdp_redirect_cpu_user.c | 1105 +++++++++++------------------------ 2 files changed, 343 insertions(+), 767 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 231cdbc773a7..43d3e52a8659 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -41,7 +41,6 @@ tprogs-y += test_map_in_map tprogs-y += per_socket_stats_example tprogs-y += xdp_redirect_map tprogs-y += xdp_redirect_map_multi -tprogs-y += xdp_redirect_cpu tprogs-y += xdp_rxq_info tprogs-y += syscall_tp tprogs-y += cpustat @@ -55,6 +54,7 @@ tprogs-y += xdp_sample_pkts tprogs-y += ibumad tprogs-y += hbm +tprogs-y += xdp_redirect_cpu tprogs-y += xdp_redirect tprogs-y += xdp_monitor @@ -102,7 +102,6 @@ test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o xdp_redirect_map-objs := xdp_redirect_map_user.o xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o -xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o cpustat-objs := cpustat_user.o @@ -116,6 +115,7 @@ xdp_sample_pkts-objs := xdp_sample_pkts_user.o ibumad-objs := ibumad_user.o hbm-objs := hbm.o $(CGROUP_HELPERS) +xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o $(XDP_SAMPLE) xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE) 
xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE) @@ -311,6 +311,7 @@ verify_target_bpf: verify_cmds $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) +$(obj)/xdp_redirect_cpu_user.o: $(obj)/xdp_redirect_cpu.skel.h $(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h $(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 9e225c96b77e..631700aef69c 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -2,7 +2,16 @@ /* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ static const char *__doc__ = - " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\""; +"XDP CPU redirect tool, using BPF_MAP_TYPE_CPUMAP\n" +"Usage: xdp_redirect_cpu -d -c 0 ... -c N\n" +"Valid specification for CPUMAP BPF program:\n" +" --mprog-name/-e pass (use built-in XDP_PASS program)\n" +" --mprog-name/-e drop (use built-in XDP_DROP program)\n" +" --redirect-device/-r (use built-in DEVMAP redirect program)\n" +" Custom CPUMAP BPF program:\n" +" --mprog-filename/-f --mprog-name/-e \n" +" Optionally, also pass --redirect-map/-m and --redirect-device/-r together\n" +" to configure DEVMAP in BPF object \n"; #include #include @@ -18,558 +27,62 @@ static const char *__doc__ = #include #include #include - #include #include - -/* How many xdp_progs are defined in _kern.c */ -#define MAX_PROG 6 - #include #include - #include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_redirect_cpu.skel.h" -static int ifindex = -1; -static char ifname_buf[IF_NAMESIZE]; -static char *ifname; -static __u32 prog_id; - -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int n_cpus; - -enum map_type { - CPU_MAP, - RX_CNT, - REDIRECT_ERR_CNT, - CPUMAP_ENQUEUE_CNT, - CPUMAP_KTHREAD_CNT, - CPUS_AVAILABLE, - CPUS_COUNT, - CPUS_ITERATOR, - EXCEPTION_CNT, -}; +static int map_fd; +static int avail_fd; +static int 
count_fd; -static const char *const map_type_strings[] = { - [CPU_MAP] = "cpu_map", - [RX_CNT] = "rx_cnt", - [REDIRECT_ERR_CNT] = "redirect_err_cnt", - [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", - [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", - [CPUS_AVAILABLE] = "cpus_available", - [CPUS_COUNT] = "cpus_count", - [CPUS_ITERATOR] = "cpus_iterator", - [EXCEPTION_CNT] = "exception_cnt", -}; +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_CPUMAP_ENQUEUE_CNT | SAMPLE_CPUMAP_KTHREAD_CNT | + SAMPLE_EXCEPTION_CNT; -#define NUM_TP 5 -#define NUM_MAP 9 -struct bpf_link *tp_links[NUM_TP] = {}; -static int map_fds[NUM_MAP]; -static int tp_cnt = 0; - -/* Exit return codes */ -#define EXIT_OK 0 -#define EXIT_FAIL 1 -#define EXIT_FAIL_OPTION 2 -#define EXIT_FAIL_XDP 3 -#define EXIT_FAIL_BPF 4 -#define EXIT_FAIL_MEM 5 +DEFINE_SAMPLE_INIT(xdp_redirect_cpu); static const struct option long_options[] = { - {"help", no_argument, NULL, 'h' }, - {"dev", required_argument, NULL, 'd' }, - {"skb-mode", no_argument, NULL, 'S' }, - {"sec", required_argument, NULL, 's' }, - {"progname", required_argument, NULL, 'p' }, - {"qsize", required_argument, NULL, 'q' }, - {"cpu", required_argument, NULL, 'c' }, - {"stress-mode", no_argument, NULL, 'x' }, - {"no-separators", no_argument, NULL, 'z' }, - {"force", no_argument, NULL, 'F' }, - {"mprog-disable", no_argument, NULL, 'n' }, - {"mprog-name", required_argument, NULL, 'e' }, - {"mprog-filename", required_argument, NULL, 'f' }, - {"redirect-device", required_argument, NULL, 'r' }, - {"redirect-map", required_argument, NULL, 'm' }, - {0, 0, NULL, 0 } + { "help", no_argument, NULL, 'h' }, + { "dev", required_argument, NULL, 'd' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "progname", required_argument, NULL, 'p' }, + { "qsize", required_argument, NULL, 'q' }, + { "cpu", required_argument, NULL, 'c' }, + { "stress-mode", no_argument, NULL, 'x' }, + { "force", no_argument, NULL, 'F' }, + { "interval", required_argument, 
NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + { "stats", no_argument, NULL, 's' }, + { "mprog-name", required_argument, NULL, 'e' }, + { "mprog-filename", required_argument, NULL, 'f' }, + { "redirect-device", required_argument, NULL, 'r' }, + { "redirect-map", required_argument, NULL, 'm' }, + {} }; -static void int_exit(int sig) -{ - __u32 curr_prog_id = 0; - - if (ifindex > -1) { - if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(EXIT_FAIL); - } - if (prog_id == curr_prog_id) { - fprintf(stderr, - "Interrupted: Removing XDP program on ifindex:%d device:%s\n", - ifindex, ifname); - bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); - } else if (!curr_prog_id) { - printf("couldn't find a prog id on a given iface\n"); - } else { - printf("program on interface changed, not removing\n"); - } - } - /* Detach tracepoints */ - while (tp_cnt) - bpf_link__destroy(tp_links[--tp_cnt]); - - exit(EXIT_OK); -} - static void print_avail_progs(struct bpf_object *obj) { struct bpf_program *pos; + printf(" Programs to be used for -p/--progname:\n"); bpf_object__for_each_program(pos, obj) { - if (bpf_program__is_xdp(pos)) - printf(" %s\n", bpf_program__section_name(pos)); - } -} - -static void usage(char *argv[], struct bpf_object *obj) -{ - int i; - - printf("\nDOCUMENTATION:\n%s\n", __doc__); - printf("\n"); - printf(" Usage: %s (options-see-below)\n", argv[0]); - printf(" Listing options:\n"); - for (i = 0; long_options[i].name != 0; i++) { - printf(" --%-12s", long_options[i].name); - if (long_options[i].flag != NULL) - printf(" flag (internal value:%d)", - *long_options[i].flag); - else - printf(" short-option: -%c", - long_options[i].val); - printf("\n"); - } - printf("\n Programs to be used for --progname:\n"); - print_avail_progs(obj); - printf("\n"); -} - -/* gettime returns the current time of day in nanoseconds. 
- * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC) - * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE) - */ -#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ -static __u64 gettime(void) -{ - struct timespec t; - int res; - - res = clock_gettime(CLOCK_MONOTONIC, &t); - if (res < 0) { - fprintf(stderr, "Error with gettimeofday! (%i)\n", res); - exit(EXIT_FAIL); - } - return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; -} - -/* Common stats data record shared with _kern.c */ -struct datarec { - __u64 processed; - __u64 dropped; - __u64 issue; - __u64 xdp_pass; - __u64 xdp_drop; - __u64 xdp_redirect; -}; -struct record { - __u64 timestamp; - struct datarec total; - struct datarec *cpu; -}; -struct stats_record { - struct record rx_cnt; - struct record redir_err; - struct record kthread; - struct record exception; - struct record enq[]; -}; - -static bool map_collect_percpu(int fd, __u32 key, struct record *rec) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct datarec values[nr_cpus]; - __u64 sum_xdp_redirect = 0; - __u64 sum_xdp_pass = 0; - __u64 sum_xdp_drop = 0; - __u64 sum_processed = 0; - __u64 sum_dropped = 0; - __u64 sum_issue = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return false; - } - /* Get time as close as possible to reading map contents */ - rec->timestamp = gettime(); - - /* Record and sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - rec->cpu[i].processed = values[i].processed; - sum_processed += values[i].processed; - rec->cpu[i].dropped = values[i].dropped; - sum_dropped += values[i].dropped; - rec->cpu[i].issue = values[i].issue; - sum_issue += values[i].issue; - rec->cpu[i].xdp_pass = values[i].xdp_pass; - sum_xdp_pass += values[i].xdp_pass; - rec->cpu[i].xdp_drop = values[i].xdp_drop; - sum_xdp_drop += values[i].xdp_drop; - rec->cpu[i].xdp_redirect = 
values[i].xdp_redirect; - sum_xdp_redirect += values[i].xdp_redirect; - } - rec->total.processed = sum_processed; - rec->total.dropped = sum_dropped; - rec->total.issue = sum_issue; - rec->total.xdp_pass = sum_xdp_pass; - rec->total.xdp_drop = sum_xdp_drop; - rec->total.xdp_redirect = sum_xdp_redirect; - return true; -} - -static struct datarec *alloc_record_per_cpu(void) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - struct datarec *array; - - array = calloc(nr_cpus, sizeof(struct datarec)); - if (!array) { - fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); - exit(EXIT_FAIL_MEM); - } - return array; -} - -static struct stats_record *alloc_stats_record(void) -{ - struct stats_record *rec; - int i, size; - - size = sizeof(*rec) + n_cpus * sizeof(struct record); - rec = malloc(size); - if (!rec) { - fprintf(stderr, "Mem alloc error\n"); - exit(EXIT_FAIL_MEM); - } - memset(rec, 0, size); - rec->rx_cnt.cpu = alloc_record_per_cpu(); - rec->redir_err.cpu = alloc_record_per_cpu(); - rec->kthread.cpu = alloc_record_per_cpu(); - rec->exception.cpu = alloc_record_per_cpu(); - for (i = 0; i < n_cpus; i++) - rec->enq[i].cpu = alloc_record_per_cpu(); - - return rec; -} - -static void free_stats_record(struct stats_record *r) -{ - int i; - - for (i = 0; i < n_cpus; i++) - free(r->enq[i].cpu); - free(r->exception.cpu); - free(r->kthread.cpu); - free(r->redir_err.cpu); - free(r->rx_cnt.cpu); - free(r); -} - -static double calc_period(struct record *r, struct record *p) -{ - double period_ = 0; - __u64 period = 0; - - period = r->timestamp - p->timestamp; - if (period > 0) - period_ = ((double) period / NANOSEC_PER_SEC); - - return period_; -} - -static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) -{ - __u64 packets = 0; - __u64 pps = 0; - - if (period_ > 0) { - packets = r->processed - p->processed; - pps = packets / period_; - } - return pps; -} - -static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) -{ - 
__u64 packets = 0; - __u64 pps = 0; - - if (period_ > 0) { - packets = r->dropped - p->dropped; - pps = packets / period_; - } - return pps; -} - -static __u64 calc_errs_pps(struct datarec *r, - struct datarec *p, double period_) -{ - __u64 packets = 0; - __u64 pps = 0; - - if (period_ > 0) { - packets = r->issue - p->issue; - pps = packets / period_; - } - return pps; -} - -static void calc_xdp_pps(struct datarec *r, struct datarec *p, - double *xdp_pass, double *xdp_drop, - double *xdp_redirect, double period_) -{ - *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0; - if (period_ > 0) { - *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_; - *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_; - *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_; - } -} - -static void stats_print(struct stats_record *stats_rec, - struct stats_record *stats_prev, - char *prog_name, char *mprog_name, int mprog_fd) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - double pps = 0, drop = 0, err = 0; - bool mprog_enabled = false; - struct record *rec, *prev; - int to_cpu; - double t; - int i; - - if (mprog_fd > 0) - mprog_enabled = true; - - /* Header */ - printf("Running XDP/eBPF prog_name:%s\n", prog_name); - printf("%-15s %-7s %-14s %-11s %-9s\n", - "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info"); - - /* XDP rx_cnt */ - { - char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; - char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n"; - char *errstr = ""; - - rec = &stats_rec->rx_cnt; - prev = &stats_prev->rx_cnt; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - err = calc_errs_pps(r, p, t); - if (err > 0) - errstr = "cpu-dest/err"; - if (pps > 0) - printf(fmt_rx, "XDP-RX", - i, pps, drop, err, errstr); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop_pps(&rec->total, &prev->total, t); - err 
= calc_errs_pps(&rec->total, &prev->total, t); - printf(fm2_rx, "XDP-RX", "total", pps, drop); - } - - /* cpumap enqueue stats */ - for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) { - char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; - char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; - char *errstr = ""; - - rec = &stats_rec->enq[to_cpu]; - prev = &stats_prev->enq[to_cpu]; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - err = calc_errs_pps(r, p, t); - if (err > 0) { - errstr = "bulk-average"; - err = pps / err; /* calc average bulk size */ - } - if (pps > 0) - printf(fmt, "cpumap-enqueue", - i, to_cpu, pps, drop, err, errstr); - } - pps = calc_pps(&rec->total, &prev->total, t); - if (pps > 0) { - drop = calc_drop_pps(&rec->total, &prev->total, t); - err = calc_errs_pps(&rec->total, &prev->total, t); - if (err > 0) { - errstr = "bulk-average"; - err = pps / err; /* calc average bulk size */ - } - printf(fm2, "cpumap-enqueue", - "sum", to_cpu, pps, drop, err, errstr); - } - } - - /* cpumap kthread stats */ - { - char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; - char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n"; - char *e_str = ""; - - rec = &stats_rec->kthread; - prev = &stats_prev->kthread; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - err = calc_errs_pps(r, p, t); - if (err > 0) - e_str = "sched"; - if (pps > 0) - printf(fmt_k, "cpumap_kthread", - i, pps, drop, err, e_str); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop_pps(&rec->total, &prev->total, t); - err = calc_errs_pps(&rec->total, &prev->total, t); - if (err > 0) - e_str = "sched-sum"; - printf(fm2_k, "cpumap_kthread", "total", pps, drop, 
err, e_str); - } - - /* XDP redirect err tracepoints (very unlikely) */ - { - char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; - char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; - - rec = &stats_rec->redir_err; - prev = &stats_prev->redir_err; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - if (pps > 0) - printf(fmt_err, "redirect_err", i, pps, drop); + if (bpf_program__is_xdp(pos)) { + if (!strncmp(bpf_program__name(pos), "xdp_prognum", + sizeof("xdp_prognum") - 1)) + printf(" %s\n", bpf_program__name(pos)); } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop_pps(&rec->total, &prev->total, t); - printf(fm2_err, "redirect_err", "total", pps, drop); } - - /* XDP general exception tracepoints */ - { - char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; - char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; - - rec = &stats_rec->exception; - prev = &stats_prev->exception; - t = calc_period(rec, prev); - for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - pps = calc_pps(r, p, t); - drop = calc_drop_pps(r, p, t); - if (pps > 0) - printf(fmt_err, "xdp_exception", i, pps, drop); - } - pps = calc_pps(&rec->total, &prev->total, t); - drop = calc_drop_pps(&rec->total, &prev->total, t); - printf(fm2_err, "xdp_exception", "total", pps, drop); - } - - /* CPUMAP attached XDP program that runs on remote/destination CPU */ - if (mprog_enabled) { - char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f\n"; - char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f\n"; - double xdp_pass, xdp_drop, xdp_redirect; - - printf("\n2nd remote XDP/eBPF prog_name: %s\n", mprog_name); - printf("%-15s %-7s %-14s %-11s %-9s\n", - "XDP-cpumap", "CPU:to", "xdp-pass", "xdp-drop", "xdp-redir"); - - rec = &stats_rec->kthread; - prev = &stats_prev->kthread; - t = calc_period(rec, prev); 
- for (i = 0; i < nr_cpus; i++) { - struct datarec *r = &rec->cpu[i]; - struct datarec *p = &prev->cpu[i]; - - calc_xdp_pps(r, p, &xdp_pass, &xdp_drop, - &xdp_redirect, t); - if (xdp_pass > 0 || xdp_drop > 0 || xdp_redirect > 0) - printf(fmt_k, "xdp-in-kthread", i, xdp_pass, xdp_drop, - xdp_redirect); - } - calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop, - &xdp_redirect, t); - printf(fm2_k, "xdp-in-kthread", "total", xdp_pass, xdp_drop, xdp_redirect); - } - - printf("\n"); - fflush(stdout); -} - -static void stats_collect(struct stats_record *rec) -{ - int fd, i; - - fd = map_fds[RX_CNT]; - map_collect_percpu(fd, 0, &rec->rx_cnt); - - fd = map_fds[REDIRECT_ERR_CNT]; - map_collect_percpu(fd, 1, &rec->redir_err); - - fd = map_fds[CPUMAP_ENQUEUE_CNT]; - for (i = 0; i < n_cpus; i++) - map_collect_percpu(fd, i, &rec->enq[i]); - - fd = map_fds[CPUMAP_KTHREAD_CNT]; - map_collect_percpu(fd, 0, &rec->kthread); - - fd = map_fds[EXCEPTION_CNT]; - map_collect_percpu(fd, 0, &rec->exception); } - -/* Pointer swap trick */ -static inline void swap(struct stats_record **a, struct stats_record **b) +static void usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error, struct bpf_object *obj) { - struct stats_record *tmp; - - tmp = *a; - *a = *b; - *b = tmp; + sample_usage(argv, long_options, doc, mask, error); + print_avail_progs(obj); } static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, @@ -582,39 +95,41 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, /* Add a CPU entry to cpumap, as this allocate a cpu entry in * the kernel for the cpu. 
*/ - ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0); - if (ret) { - fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); - exit(EXIT_FAIL_BPF); + ret = bpf_map_update_elem(map_fd, &cpu, value, 0); + if (ret < 0) { + fprintf(stderr, "Create CPU entry failed: %s\n", strerror(errno)); + return ret; } /* Inform bpf_prog's that a new CPU is available to select * from via some control maps. */ - ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0); - if (ret) { - fprintf(stderr, "Add to avail CPUs failed\n"); - exit(EXIT_FAIL_BPF); + ret = bpf_map_update_elem(avail_fd, &avail_idx, &cpu, 0); + if (ret < 0) { + fprintf(stderr, "Add to avail CPUs failed: %s\n", strerror(errno)); + return ret; } /* When not replacing/updating existing entry, bump the count */ - ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count); - if (ret) { - fprintf(stderr, "Failed reading curr cpus_count\n"); - exit(EXIT_FAIL_BPF); + ret = bpf_map_lookup_elem(count_fd, &key, &curr_cpus_count); + if (ret < 0) { + fprintf(stderr, "Failed reading curr cpus_count: %s\n", + strerror(errno)); + return ret; } if (new) { curr_cpus_count++; - ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key, + ret = bpf_map_update_elem(count_fd, &key, &curr_cpus_count, 0); - if (ret) { - fprintf(stderr, "Failed write curr cpus_count\n"); - exit(EXIT_FAIL_BPF); + if (ret < 0) { + fprintf(stderr, "Failed write curr cpus_count: %s\n", + strerror(errno)); + return ret; } } - /* map_fd[7] = cpus_iterator */ - printf("%s CPU:%u as idx:%u qsize:%d prog_fd: %d (cpus_count:%u)\n", - new ? "Add-new":"Replace", cpu, avail_idx, + + printf("%s CPU: %u as idx: %u qsize: %d cpumap_prog_fd: %d (cpus_count: %u)\n", + new ? "Add new" : "Replace", cpu, avail_idx, value->qsize, value->bpf_prog.fd, curr_cpus_count); return 0; @@ -623,24 +138,29 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, /* CPUs are zero-indexed. 
Thus, add a special sentinel default value * in map cpus_available to mark CPU index'es not configured */ -static void mark_cpus_unavailable(void) +static int mark_cpus_unavailable(void) { - __u32 invalid_cpu = n_cpus; - int ret, i; + int ret, i, n_cpus = libbpf_num_possible_cpus(); + __u32 invalid_cpu; for (i = 0; i < n_cpus; i++) { - ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i, + ret = bpf_map_update_elem(avail_fd, &i, &invalid_cpu, 0); - if (ret) { - fprintf(stderr, "Failed marking CPU unavailable\n"); - exit(EXIT_FAIL_BPF); + if (ret < 0) { + fprintf(stderr, "Failed marking CPU unavailable: %s\n", + strerror(errno)); + return ret; } } + + return 0; } /* Stress cpumap management code by concurrently changing underlying cpumap */ -static void stress_cpumap(struct bpf_cpumap_val *value) +static void stress_cpumap(void *ctx) { + struct bpf_cpumap_val *value = ctx; + /* Changing qsize will cause kernel to free and alloc a new * bpf_cpu_map_entry, with an associated/complicated tear-down * procedure. 
@@ -653,144 +173,163 @@ static void stress_cpumap(struct bpf_cpumap_val *value) create_cpu_entry(1, value, 0, false); } -static void stats_poll(int interval, bool use_separators, char *prog_name, - char *mprog_name, struct bpf_cpumap_val *value, - bool stress_mode) -{ - struct stats_record *record, *prev; - int mprog_fd; - - record = alloc_stats_record(); - prev = alloc_stats_record(); - stats_collect(record); - - /* Trick to pretty printf with thousands separators use %' */ - if (use_separators) - setlocale(LC_NUMERIC, "en_US"); - - while (1) { - swap(&prev, &record); - mprog_fd = value->bpf_prog.fd; - stats_collect(record); - stats_print(record, prev, prog_name, mprog_name, mprog_fd); - sleep(interval); - if (stress_mode) - stress_cpumap(value); - } - - free_stats_record(record); - free_stats_record(prev); -} - -static int init_tracepoints(struct bpf_object *obj) +static int set_cpumap_prog(struct xdp_redirect_cpu *skel, + const char *redir_interface, const char *redir_map, + const char *mprog_filename, const char *mprog_name) { - struct bpf_program *prog; - - bpf_object__for_each_program(prog, obj) { - if (bpf_program__is_tracepoint(prog) != true) - continue; - - tp_links[tp_cnt] = bpf_program__attach(prog); - if (libbpf_get_error(tp_links[tp_cnt])) { - tp_links[tp_cnt] = NULL; - return -EINVAL; + if (mprog_filename) { + struct bpf_program *prog; + struct bpf_object *obj; + int ret; + + if (!mprog_name) { + fprintf(stderr, "BPF program not specified for file %s\n", + mprog_filename); + goto end; + } + if ((redir_interface && !redir_map) || (!redir_interface && redir_map)) { + fprintf(stderr, "--redirect-%s specified but --redirect-%s not specified\n", + redir_interface ? "device" : "map", redir_interface ? 
"map" : "device"); + goto end; } - tp_cnt++; - } - - return 0; -} - -static int init_map_fds(struct bpf_object *obj) -{ - enum map_type type; - - for (type = 0; type < NUM_MAP; type++) { - map_fds[type] = - bpf_object__find_map_fd_by_name(obj, - map_type_strings[type]); - - if (map_fds[type] < 0) - return -ENOENT; - } - - return 0; -} -static int load_cpumap_prog(char *file_name, char *prog_name, - char *redir_interface, char *redir_map) -{ - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - .expected_attach_type = BPF_XDP_CPUMAP, - .file = file_name, - }; - struct bpf_program *prog; - struct bpf_object *obj; - int fd; + /* Custom BPF program */ + obj = bpf_object__open_file(mprog_filename, NULL); + if (!obj) { + ret = -errno; + fprintf(stderr, "Failed to bpf_prog_load_xattr: %s\n", + strerror(errno)); + return ret; + } - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &fd)) - return -1; + ret = bpf_object__load(obj); + if (ret < 0) { + ret = -errno; + fprintf(stderr, "Failed to bpf_object__load: %s\n", + strerror(errno)); + return ret; + } - if (fd < 0) { - fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", - strerror(errno)); - return fd; - } + if (redir_map) { + int err, redir_map_fd, ifindex_out, key = 0; - if (redir_interface && redir_map) { - int err, map_fd, ifindex_out, key = 0; + redir_map_fd = bpf_object__find_map_fd_by_name(obj, redir_map); + if (redir_map_fd < 0) { + fprintf(stderr, "Failed to bpf_object__find_map_fd_by_name: %s\n", + strerror(errno)); + return redir_map_fd; + } - map_fd = bpf_object__find_map_fd_by_name(obj, redir_map); - if (map_fd < 0) - return map_fd; + ifindex_out = if_nametoindex(redir_interface); + if (!ifindex_out) + ifindex_out = strtoul(redir_interface, NULL, 0); + if (!ifindex_out) { + fprintf(stderr, "Bad interface name or index\n"); + return -EINVAL; + } - ifindex_out = if_nametoindex(redir_interface); - if (!ifindex_out) - return -1; + err = bpf_map_update_elem(redir_map_fd, &key, &ifindex_out, 
0); + if (err < 0) + return err; + } - err = bpf_map_update_elem(map_fd, &key, &ifindex_out, 0); - if (err < 0) - return err; - } + prog = bpf_object__find_program_by_name(obj, mprog_name); + if (!prog) { + ret = -errno; + fprintf(stderr, "Failed to bpf_object__find_program_by_name: %s\n", + strerror(errno)); + return ret; + } - prog = bpf_object__find_program_by_title(obj, prog_name); - if (!prog) { - fprintf(stderr, "bpf_object__find_program_by_title failed\n"); - return EXIT_FAIL; + return bpf_program__fd(prog); + } else { + if (mprog_name) { + if (redir_interface || redir_map) { + fprintf(stderr, "Need to specify --mprog-filename/-f\n"); + goto end; + } + if (!strcmp(mprog_name, "pass") || !strcmp(mprog_name, "drop")) { + /* Use built-in pass/drop programs */ + return *mprog_name == 'p' ? bpf_program__fd(skel->progs.xdp_redirect_cpu_pass) + : bpf_program__fd(skel->progs.xdp_redirect_cpu_drop); + } else { + fprintf(stderr, "Unknown name \"%s\" for built-in BPF program\n", + mprog_name); + goto end; + } + } else { + if (redir_map) { + fprintf(stderr, "Need to specify --mprog-filename, --mprog-name and" + " --redirect-device with --redirect-map\n"); + goto end; + } + if (redir_interface) { + /* Use built-in devmap redirect */ + struct bpf_devmap_val val = {}; + int ifindex_out, err; + __u32 key = 0; + + if (!redir_interface) + return 0; + + ifindex_out = if_nametoindex(redir_interface); + if (!ifindex_out) + ifindex_out = strtoul(redir_interface, NULL, 0); + if (!ifindex_out) { + fprintf(stderr, "Bad interface name or index\n"); + return -EINVAL; + } + + if (get_mac_addr(ifindex_out, skel->bss->tx_mac_addr) < 0) { + printf("Get interface %d mac failed\n", ifindex_out); + return -EINVAL; + } + + val.ifindex = ifindex_out; + val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_redirect_egress_prog); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.tx_port), &key, &val, 0); + if (err < 0) + return -errno; + + return 
bpf_program__fd(skel->progs.xdp_redirect_cpu_devmap); + } + } } - return bpf_program__fd(prog); + /* Disabled */ + return 0; +end: + fprintf(stderr, "Invalid options for CPUMAP BPF program\n"); + return -EINVAL; } int main(int argc, char **argv) { - char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs"; - char *mprog_filename = "xdp_redirect_kern.o"; - char *redir_interface = NULL, *redir_map = NULL; - char *mprog_name = "xdp_redirect_dummy"; - bool mprog_disable = false; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_UNSPEC, - }; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); + const char *redir_interface = NULL, *redir_map = NULL; + const char *mprog_filename = NULL, *mprog_name = NULL; + struct xdp_redirect_cpu *skel; + struct bpf_map_info info = {}; + char ifname_buf[IF_NAMESIZE]; struct bpf_cpumap_val value; - bool use_separators = true; + __u32 infosz = sizeof(info); + int ret = EXIT_FAIL_OPTION; + unsigned long interval = 2; bool stress_mode = false; struct bpf_program *prog; - struct bpf_object *obj; - int err = EXIT_FAIL; - char filename[256]; + const char *prog_name; + bool generic = false; + bool force = false; int added_cpus = 0; + bool error = true; int longindex = 0; - int interval = 2; int add_cpu = -1; - int opt, prog_fd; - int *cpu, i; + int ifindex = -1; + int *cpu, i, opt; + char *ifname; __u32 qsize; + int n_cpus; - n_cpus = get_nprocs_conf(); + n_cpus = libbpf_num_possible_cpus(); /* Notice: Choosing the queue size is very important when CPU is * configured with power-saving states. 
@@ -810,73 +349,87 @@ int main(int argc, char **argv) */ qsize = 2048; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return err; - - if (prog_fd < 0) { - fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", + skel = xdp_redirect_cpu__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_redirect_cpu__open: %s\n", strerror(errno)); - return err; + ret = EXIT_FAIL_BPF; + goto end; + } + + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - if (init_tracepoints(obj) < 0) { - fprintf(stderr, "ERR: bpf_program__attach failed\n"); - return err; + if (bpf_map__set_max_entries(skel->maps.cpu_map, n_cpus) < 0) { + fprintf(stderr, "Failed to set max entries for cpu_map map: %s", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - if (init_map_fds(obj) < 0) { - fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); - return err; + if (bpf_map__set_max_entries(skel->maps.cpus_available, n_cpus) < 0) { + fprintf(stderr, "Failed to set max entries for cpus_available map: %s", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - mark_cpus_unavailable(); - cpu = malloc(n_cpus * sizeof(int)); + cpu = calloc(n_cpus, sizeof(int)); if (!cpu) { - fprintf(stderr, "failed to allocate cpu array\n"); - return err; + fprintf(stderr, "Failed to allocate cpu array\n"); + goto end_destroy; } - memset(cpu, 0, n_cpus * sizeof(int)); - /* Parse commands line args */ - while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:n", + prog = skel->progs.xdp_prognum5_lb_hash_ip_pairs; + while ((opt = getopt_long(argc, argv, "d:si:Sxp:f:e:r:m:c:q:Fvh", long_options, &longindex)) != -1) { switch (opt) { case 'd': if (strlen(optarg) >= IF_NAMESIZE) { - fprintf(stderr, "ERR: --dev name too long\n"); - goto error; + fprintf(stderr, 
"-d/--dev name too long\n"); + goto end_cpu; } ifname = (char *)&ifname_buf; - strncpy(ifname, optarg, IF_NAMESIZE); + safe_strncpy(ifname, optarg, sizeof(ifname)); ifindex = if_nametoindex(ifname); - if (ifindex == 0) { - fprintf(stderr, - "ERR: --dev name unknown err(%d):%s\n", + if (!ifindex) + ifindex = strtoul(optarg, NULL, 0); + if (!ifindex) { + fprintf(stderr, "Bad interface index or name (%d): %s\n", errno, strerror(errno)); - goto error; + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; } break; case 's': - interval = atoi(optarg); + mask |= SAMPLE_REDIRECT_MAP_CNT; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); break; case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; + generic = true; break; case 'x': stress_mode = true; break; - case 'z': - use_separators = false; - break; case 'p': /* Selecting eBPF prog to load */ prog_name = optarg; - break; - case 'n': - mprog_disable = true; + prog = bpf_object__find_program_by_name(skel->obj, + prog_name); + if (!prog) { + fprintf(stderr, + "Failed to find program %s specified by" + " option -p/--progname\n", + prog_name); + print_avail_progs(skel->obj); + goto end_cpu; + } break; case 'f': mprog_filename = optarg; @@ -886,6 +439,7 @@ int main(int argc, char **argv) break; case 'r': redir_interface = optarg; + mask |= SAMPLE_DEVMAP_XMIT_CNT_MULTI; break; case 'm': redir_map = optarg; @@ -897,91 +451,112 @@ int main(int argc, char **argv) fprintf(stderr, "--cpu nr too large for cpumap err(%d):%s\n", errno, strerror(errno)); - goto error; + goto end_cpu; } cpu[added_cpus++] = add_cpu; break; case 'q': - qsize = atoi(optarg); + qsize = strtoul(optarg, NULL, 0); break; case 'F': - xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; + break; + case 'v': + sample_switch_mode(); break; case 'h': - error: + error = false; default: - free(cpu); - usage(argv, obj); - return EXIT_FAIL_OPTION; + usage(argv, long_options, __doc__, mask, error, skel->obj); + goto end_cpu; } } - if 
(!(xdp_flags & XDP_FLAGS_SKB_MODE)) - xdp_flags |= XDP_FLAGS_DRV_MODE; - - /* Required option */ + ret = EXIT_FAIL_OPTION; if (ifindex == -1) { - fprintf(stderr, "ERR: required option --dev missing\n"); - usage(argv, obj); - err = EXIT_FAIL_OPTION; - goto out; + fprintf(stderr, "Required option --dev missing\n"); + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; } - /* Required option */ + if (add_cpu == -1) { - fprintf(stderr, "ERR: required option --cpu missing\n"); - fprintf(stderr, " Specify multiple --cpu option to add more\n"); - usage(argv, obj); - err = EXIT_FAIL_OPTION; - goto out; + fprintf(stderr, "Required option --cpu missing\n" + "Specify multiple --cpu option to add more\n"); + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; } - value.bpf_prog.fd = 0; - if (!mprog_disable) - value.bpf_prog.fd = load_cpumap_prog(mprog_filename, mprog_name, - redir_interface, redir_map); - if (value.bpf_prog.fd < 0) { - err = value.bpf_prog.fd; - goto out; + skel->rodata->from_match[0] = ifindex; + if (redir_interface) + skel->rodata->to_match[0] = if_nametoindex(redir_interface); + + ret = xdp_redirect_cpu__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_redirect_cpu__load: %s\n", + strerror(errno)); + goto end_cpu; } - value.qsize = qsize; - for (i = 0; i < added_cpus; i++) - create_cpu_entry(cpu[i], &value, i, true); + ret = bpf_obj_get_info_by_fd(bpf_map__fd(skel->maps.cpu_map), &info, &infosz); + if (ret < 0) { + fprintf(stderr, "Failed bpf_obj_get_info_by_fd for cpumap: %s\n", + strerror(errno)); + goto end_cpu; + } - /* Remove XDP program when program is interrupted or killed */ - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + skel->bss->cpumap_map_id = info.id; - prog = bpf_object__find_program_by_title(obj, prog_name); - if (!prog) { - fprintf(stderr, "bpf_object__find_program_by_title failed\n"); - goto out; + map_fd = bpf_map__fd(skel->maps.cpu_map); + avail_fd = 
bpf_map__fd(skel->maps.cpus_available); + count_fd = bpf_map__fd(skel->maps.cpus_count); + + ret = mark_cpus_unavailable(); + if (ret < 0) { + fprintf(stderr, "Unable to mark CPUs as unavailable\n"); + goto end_cpu; } - prog_fd = bpf_program__fd(prog); - if (prog_fd < 0) { - fprintf(stderr, "bpf_program__fd failed\n"); - goto out; + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_cpu; } - if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { - fprintf(stderr, "link set xdp fd failed\n"); - err = EXIT_FAIL_XDP; - goto out; + value.bpf_prog.fd = set_cpumap_prog(skel, redir_interface, redir_map, + mprog_filename, mprog_name); + if (value.bpf_prog.fd < 0) { + fprintf(stderr, "Failed to set CPUMAP BPF program: %s\n", + strerror(-value.bpf_prog.fd)); + usage(argv, long_options, __doc__, mask, true, skel->obj); + ret = EXIT_FAIL_BPF; + goto end_cpu; } + value.qsize = qsize; - err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (err) { - printf("can't get prog info - %s\n", strerror(errno)); - goto out; + for (i = 0; i < added_cpus; i++) { + if (create_cpu_entry(cpu[i], &value, i, true) < 0) { + fprintf(stderr, "Cannot proceed, exiting\n"); + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_cpu; + } } - prog_id = info.id; - stats_poll(interval, use_separators, prog_name, mprog_name, - &value, stress_mode); + ret = EXIT_FAIL_XDP; + if (sample_install_xdp(prog, ifindex, generic, force) < 0) + goto end_cpu; - err = EXIT_OK; -out: + ret = sample_run(interval, stress_mode ? 
stress_cpumap : NULL, &value); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_cpu; + } + ret = EXIT_OK; +end_cpu: free(cpu); - return err; +end_destroy: + xdp_redirect_cpu__destroy(skel); +end: + sample_exit(ret); } -- cgit v1.2.3 From 54af769db92a47be8a9d23a4434dbd343b36f216 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:07 +0530 Subject: samples: bpf: Convert xdp_redirect_map_kern.o to XDP samples helper Also update it to use consistent SEC("xdp") and SEC("xdp_devmap") naming, and use global variable instead of BPF map for copying the mac address. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-20-memxor@gmail.com --- samples/bpf/Makefile | 6 +- samples/bpf/xdp_redirect_map.bpf.c | 95 ++++++++++++++++++++++++ samples/bpf/xdp_redirect_map_kern.c | 142 ------------------------------------ 3 files changed, 99 insertions(+), 144 deletions(-) create mode 100644 samples/bpf/xdp_redirect_map.bpf.c delete mode 100644 samples/bpf/xdp_redirect_map_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 43d3e52a8659..8faef4bcead4 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -163,7 +163,6 @@ always-y += tcp_clamp_kern.o always-y += tcp_basertt_kern.o always-y += tcp_tos_reflect_kern.o always-y += tcp_dumpstats_kern.o -always-y += xdp_redirect_map_kern.o always-y += xdp_redirect_map_multi_kern.o always-y += xdp_rxq_info_kern.o always-y += xdp2skb_meta_kern.o @@ -357,6 +356,7 @@ endef CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) $(obj)/xdp_redirect_cpu.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_redirect_map.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_redirect.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_monitor.bpf.o: $(obj)/xdp_sample.bpf.o @@ -368,10 +368,12 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/x 
-I$(srctree)/tools/lib $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ -LINKED_SKELS := xdp_redirect_cpu.skel.h xdp_redirect.skel.h xdp_monitor.skel.h +LINKED_SKELS := xdp_redirect_cpu.skel.h xdp_redirect_map.skel.h \ + xdp_redirect.skel.h xdp_monitor.skel.h clean-files += $(LINKED_SKELS) xdp_redirect_cpu.skel.h-deps := xdp_redirect_cpu.bpf.o xdp_sample.bpf.o +xdp_redirect_map.skel.h-deps := xdp_redirect_map.bpf.o xdp_sample.bpf.o xdp_redirect.skel.h-deps := xdp_redirect.bpf.o xdp_sample.bpf.o xdp_monitor.skel.h-deps := xdp_monitor.bpf.o xdp_sample.bpf.o diff --git a/samples/bpf/xdp_redirect_map.bpf.c b/samples/bpf/xdp_redirect_map.bpf.c new file mode 100644 index 000000000000..59efd656e1b2 --- /dev/null +++ b/samples/bpf/xdp_redirect_map.bpf.c @@ -0,0 +1,95 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define KBUILD_MODNAME "foo" + +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +/* The 2nd xdp prog on egress does not support skb mode, so we define two + * maps, tx_port_general and tx_port_native. 
+ */ +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 1); +} tx_port_general SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 1); +} tx_port_native SEC(".maps"); + +/* store egress interface mac address */ +const volatile char tx_mac_addr[ETH_ALEN]; + +static __always_inline int xdp_redirect_map(struct xdp_md *ctx, void *redirect_map) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = bpf_get_smp_processor_id(); + struct ethhdr *eth = data; + struct datarec *rec; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + swap_src_dst_mac(data); + return bpf_redirect_map(redirect_map, 0, 0); +} + +SEC("xdp") +int xdp_redirect_map_general(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &tx_port_general); +} + +SEC("xdp") +int xdp_redirect_map_native(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &tx_port_native); +} + +SEC("xdp_devmap/egress") +int xdp_redirect_map_egress(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + __builtin_memcpy(eth->h_source, (const char *)tx_mac_addr, ETH_ALEN); + + return XDP_PASS; +} + +/* Redirect require an XDP bpf_prog loaded on the TX device */ +SEC("xdp") +int xdp_redirect_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c deleted file mode 100644 index a92b8e567bdd..000000000000 --- 
a/samples/bpf/xdp_redirect_map_kern.c +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#define KBUILD_MODNAME "foo" -#include -#include -#include -#include -#include -#include -#include -#include - -/* The 2nd xdp prog on egress does not support skb mode, so we define two - * maps, tx_port_general and tx_port_native. - */ -struct { - __uint(type, BPF_MAP_TYPE_DEVMAP); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); - __uint(max_entries, 100); -} tx_port_general SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_DEVMAP); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(struct bpf_devmap_val)); - __uint(max_entries, 100); -} tx_port_native SEC(".maps"); - -/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success - * feedback. Redirect TX errors can be caught via a tracepoint. 
- */ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, long); - __uint(max_entries, 1); -} rxcnt SEC(".maps"); - -/* map to store egress interface mac address */ -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, __be64); - __uint(max_entries, 1); -} tx_mac SEC(".maps"); - -static void swap_src_dst_mac(void *data) -{ - unsigned short *p = data; - unsigned short dst[3]; - - dst[0] = p[0]; - dst[1] = p[1]; - dst[2] = p[2]; - p[0] = p[3]; - p[1] = p[4]; - p[2] = p[5]; - p[3] = dst[0]; - p[4] = dst[1]; - p[5] = dst[2]; -} - -static __always_inline int xdp_redirect_map(struct xdp_md *ctx, void *redirect_map) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - int rc = XDP_DROP; - long *value; - u32 key = 0; - u64 nh_off; - int vport; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - /* constant virtual port */ - vport = 0; - - /* count packet in global counter */ - value = bpf_map_lookup_elem(&rxcnt, &key); - if (value) - *value += 1; - - swap_src_dst_mac(data); - - /* send packet out physical port */ - return bpf_redirect_map(redirect_map, vport, 0); -} - -SEC("xdp_redirect_general") -int xdp_redirect_map_general(struct xdp_md *ctx) -{ - return xdp_redirect_map(ctx, &tx_port_general); -} - -SEC("xdp_redirect_native") -int xdp_redirect_map_native(struct xdp_md *ctx) -{ - return xdp_redirect_map(ctx, &tx_port_native); -} - -SEC("xdp_devmap/map_prog") -int xdp_redirect_map_egress(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - __be64 *mac; - u32 key = 0; - u64 nh_off; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return XDP_DROP; - - mac = bpf_map_lookup_elem(&tx_mac, &key); - if (mac) - __builtin_memcpy(eth->h_source, mac, ETH_ALEN); - - return XDP_PASS; -} - -/* Redirect require an XDP bpf_prog 
loaded on the TX device */ -SEC("xdp_redirect_dummy") -int xdp_redirect_dummy_prog(struct xdp_md *ctx) -{ - return XDP_PASS; -} - -char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From bbe65865aa05fdbd20e37bbd3b2c95a0e9e24416 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:08 +0530 Subject: samples: bpf: Convert xdp_redirect_map to XDP samples helper Use the libbpf skeleton facility and other utilities provided by XDP samples helper. Since get_mac_addr is already provided by XDP samples helper, we drop it. Also convert to XDP samples helper similar to prior samples to minimize duplication of code. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-21-memxor@gmail.com --- samples/bpf/Makefile | 5 +- samples/bpf/xdp_redirect_map_user.c | 385 +++++++++++++++--------------------- 2 files changed, 161 insertions(+), 229 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 8faef4bcead4..6decc8f9bcc2 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -39,7 +39,6 @@ tprogs-y += lwt_len_hist tprogs-y += xdp_tx_iptunnel tprogs-y += test_map_in_map tprogs-y += per_socket_stats_example -tprogs-y += xdp_redirect_map tprogs-y += xdp_redirect_map_multi tprogs-y += xdp_rxq_info tprogs-y += syscall_tp @@ -55,6 +54,7 @@ tprogs-y += ibumad tprogs-y += hbm tprogs-y += xdp_redirect_cpu +tprogs-y += xdp_redirect_map tprogs-y += xdp_redirect tprogs-y += xdp_monitor @@ -100,7 +100,6 @@ lwt_len_hist-objs := lwt_len_hist_user.o xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o -xdp_redirect_map-objs := xdp_redirect_map_user.o xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o @@ -116,6 +115,7 @@ ibumad-objs := ibumad_user.o hbm-objs := hbm.o 
$(CGROUP_HELPERS) xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o $(XDP_SAMPLE) +xdp_redirect_map-objs := xdp_redirect_map_user.o $(XDP_SAMPLE) xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE) xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE) @@ -311,6 +311,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/xdp_redirect_cpu_user.o: $(obj)/xdp_redirect_cpu.skel.h +$(obj)/xdp_redirect_map_user.o: $(obj)/xdp_redirect_map.skel.h $(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h $(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index 0e8192688dfc..b6e4fc849577 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -1,6 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io */ +static const char *__doc__ = +"XDP redirect tool, using BPF_MAP_TYPE_DEVMAP\n" +"Usage: xdp_redirect_map _IN _OUT\n"; + #include #include #include @@ -13,165 +17,83 @@ #include #include #include -#include -#include -#include -#include -#include - -#include "bpf_util.h" +#include #include #include +#include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_redirect_map.skel.h" -static int ifindex_in; -static int ifindex_out; -static bool ifindex_out_xdp_dummy_attached = true; -static bool xdp_devmap_attached; -static __u32 prog_id; -static __u32 dummy_prog_id; - -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int rxcnt_map_fd; - -static void int_exit(int sig) -{ - __u32 curr_prog_id = 0; - - if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on iface IN\n"); - else - printf("program on iface IN changed, not removing\n"); 
- - if (ifindex_out_xdp_dummy_attached) { - curr_prog_id = 0; - if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id, - xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (dummy_prog_id == curr_prog_id) - bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on iface OUT\n"); - else - printf("program on iface OUT changed, not removing\n"); - } - exit(0); -} - -static void poll_stats(int interval, int ifindex) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus], prev[nr_cpus]; - - memset(prev, 0, sizeof(prev)); - - while (1) { - __u64 sum = 0; - __u32 key = 0; - int i; - - sleep(interval); - assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[i]); - if (sum) - printf("ifindex %i: %10llu pkt/s\n", - ifindex, sum / interval); - memcpy(prev, values, sizeof(values)); - } -} - -static int get_mac_addr(unsigned int ifindex_out, void *mac_addr) -{ - char ifname[IF_NAMESIZE]; - struct ifreq ifr; - int fd, ret = -1; - - fd = socket(AF_INET, SOCK_DGRAM, 0); - if (fd < 0) - return ret; - - if (!if_indextoname(ifindex_out, ifname)) - goto err_out; - - strcpy(ifr.ifr_name, ifname); - - if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) - goto err_out; - - memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char)); - ret = 0; +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT_MULTI; -err_out: - close(fd); - return ret; -} +DEFINE_SAMPLE_INIT(xdp_redirect_map); -static void usage(const char *prog) -{ - fprintf(stderr, - "usage: %s [OPTS] _IN _OUT\n\n" - "OPTS:\n" - " -S use skb-mode\n" - " -N enforce native mode\n" - " -F force loading prog\n" - " -X load xdp program on egress\n", - prog); -} +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "force", no_argument, NULL, 'F' }, + { 
"load-egress", no_argument, NULL, 'X' }, + { "stats", no_argument, NULL, 's' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + {} +}; int main(int argc, char **argv) { - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_UNSPEC, - }; - struct bpf_program *prog, *dummy_prog, *devmap_prog; - int prog_fd, dummy_prog_fd, devmap_prog_fd = 0; - int tx_port_map_fd, tx_mac_map_fd; - struct bpf_devmap_val devmap_val; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - const char *optstr = "FSNX"; - struct bpf_object *obj; - int ret, opt, key = 0; - char filename[256]; - - while ((opt = getopt(argc, argv, optstr)) != -1) { + struct bpf_devmap_val devmap_val = {}; + bool xdp_devmap_attached = false; + struct xdp_redirect_map *skel; + char str[2 * IF_NAMESIZE + 1]; + char ifname_out[IF_NAMESIZE]; + struct bpf_map *tx_port_map; + char ifname_in[IF_NAMESIZE]; + int ifindex_in, ifindex_out; + unsigned long interval = 2; + int ret = EXIT_FAIL_OPTION; + struct bpf_program *prog; + bool generic = false; + bool force = false; + bool tried = false; + bool error = true; + int opt, key = 0; + + while ((opt = getopt_long(argc, argv, "hSFXi:vs", + long_options, NULL)) != -1) { switch (opt) { case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; - break; - case 'N': - /* default, set below */ + generic = true; + /* devmap_xmit tracepoint not available */ + mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI); break; case 'F': - xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; break; case 'X': xdp_devmap_attached = true; break; + case 'i': + interval = strtoul(optarg, NULL, 0); + break; + case 'v': + sample_switch_mode(); + break; + case 's': + mask |= SAMPLE_REDIRECT_MAP_CNT; + break; + case 'h': + error = false; default: - usage(basename(argv[0])); - return 1; + sample_usage(argv, long_options, __doc__, mask, error); + return ret; } } - if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) { - xdp_flags |= 
XDP_FLAGS_DRV_MODE; - } else if (xdp_devmap_attached) { - printf("Load xdp program on egress with SKB mode not supported yet\n"); - return 1; - } - - if (optind == argc) { - printf("usage: %s _IN _OUT\n", argv[0]); - return 1; + if (argc <= optind + 1) { + sample_usage(argv, long_options, __doc__, mask, true); + goto end; } ifindex_in = if_nametoindex(argv[optind]); @@ -182,107 +104,116 @@ int main(int argc, char **argv) if (!ifindex_out) ifindex_out = strtoul(argv[optind + 1], NULL, 0); - printf("input: %d output: %d\n", ifindex_in, ifindex_out); - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - return 1; - - if (xdp_flags & XDP_FLAGS_SKB_MODE) { - prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_general"); - tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port_general"); - } else { - prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_native"); - tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port_native"); - } - dummy_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_dummy_prog"); - if (!prog || dummy_prog < 0 || tx_port_map_fd < 0) { - printf("finding prog/dummy_prog/tx_port_map in obj file failed\n"); - goto out; - } - prog_fd = bpf_program__fd(prog); - dummy_prog_fd = bpf_program__fd(dummy_prog); - if (prog_fd < 0 || dummy_prog_fd < 0 || tx_port_map_fd < 0) { - printf("bpf_prog_load_xattr: %s\n", strerror(errno)); - return 1; - } - - tx_mac_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_mac"); - rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); - if (tx_mac_map_fd < 0 || rxcnt_map_fd < 0) { - printf("bpf_object__find_map_fd_by_name failed\n"); - return 1; + if (!ifindex_in || !ifindex_out) { + fprintf(stderr, "Bad interface index or name\n"); + sample_usage(argv, long_options, __doc__, mask, true); + goto end; } - if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) { - 
printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); - return 1; + skel = xdp_redirect_map__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_redirect_map__open: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; } - ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return ret; + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - prog_id = info.id; - - /* Loading dummy XDP prog on out-device */ - if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd, - (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { - printf("WARN: link set xdp fd failed on %d\n", ifindex_out); - ifindex_out_xdp_dummy_attached = false; - } - - memset(&info, 0, sizeof(info)); - ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return ret; - } - dummy_prog_id = info.id; /* Load 2nd xdp prog on egress. 
*/ if (xdp_devmap_attached) { - unsigned char mac_addr[6]; - - devmap_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_egress"); - if (!devmap_prog) { - printf("finding devmap_prog in obj file failed\n"); - goto out; - } - devmap_prog_fd = bpf_program__fd(devmap_prog); - if (devmap_prog_fd < 0) { - printf("finding devmap_prog fd failed\n"); - goto out; - } - - if (get_mac_addr(ifindex_out, mac_addr) < 0) { - printf("get interface %d mac failed\n", ifindex_out); - goto out; + ret = get_mac_addr(ifindex_out, skel->rodata->tx_mac_addr); + if (ret < 0) { + fprintf(stderr, "Failed to get interface %d mac address: %s\n", + ifindex_out, strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } + } - ret = bpf_map_update_elem(tx_mac_map_fd, &key, mac_addr, 0); - if (ret) { - perror("bpf_update_elem tx_mac_map_fd"); - goto out; + skel->rodata->from_match[0] = ifindex_in; + skel->rodata->to_match[0] = ifindex_out; + + ret = xdp_redirect_map__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_redirect_map__load: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + + prog = skel->progs.xdp_redirect_map_native; + tx_port_map = skel->maps.tx_port_native; +restart: + if (sample_install_xdp(prog, ifindex_in, generic, force) < 0) { + /* First try with struct bpf_devmap_val as value for generic + * mode, then fallback to sizeof(int) for older kernels. 
+ */ + fprintf(stderr, + "Trying fallback to sizeof(int) as value_size for devmap in generic mode\n"); + if (generic && !tried) { + prog = skel->progs.xdp_redirect_map_general; + tx_port_map = skel->maps.tx_port_general; + tried = true; + goto restart; } + ret = EXIT_FAIL_XDP; + goto end_destroy; } - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + /* Loading dummy XDP prog on out-device */ + sample_install_xdp(skel->progs.xdp_redirect_dummy_prog, ifindex_out, generic, force); devmap_val.ifindex = ifindex_out; - devmap_val.bpf_prog.fd = devmap_prog_fd; - ret = bpf_map_update_elem(tx_port_map_fd, &key, &devmap_val, 0); - if (ret) { - perror("bpf_update_elem"); - goto out; - } - - poll_stats(2, ifindex_out); - -out: - return 0; + if (xdp_devmap_attached) + devmap_val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_redirect_map_egress); + ret = bpf_map_update_elem(bpf_map__fd(tx_port_map), &key, &devmap_val, 0); + if (ret < 0) { + fprintf(stderr, "Failed to update devmap value: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + ret = EXIT_FAIL; + if (!if_indextoname(ifindex_in, ifname_in)) { + fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_in, + strerror(errno)); + goto end_destroy; + } + + if (!if_indextoname(ifindex_out, ifname_out)) { + fprintf(stderr, "Failed to if_indextoname for %d: %s\n", ifindex_out, + strerror(errno)); + goto end_destroy; + } + + safe_strncpy(str, get_driver_name(ifindex_in), sizeof(str)); + printf("Redirecting from %s (ifindex %d; driver %s) to %s (ifindex %d; driver %s)\n", + ifname_in, ifindex_in, str, ifname_out, ifindex_out, get_driver_name(ifindex_out)); + snprintf(str, sizeof(str), "%s->%s", ifname_in, ifname_out); + + ret = sample_run(interval, NULL, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + ret = EXIT_OK; +end_destroy: + xdp_redirect_map__destroy(skel); +end: + sample_exit(ret); } -- cgit 
v1.2.3 From a29b3ca17ee69e3e5182f1ed29be6b6ec306c149 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:09 +0530 Subject: samples: bpf: Convert xdp_redirect_map_multi_kern.o to XDP samples helper One of the notable changes is using a BPF_MAP_TYPE_HASH instead of array map to store mac addresses of devices, as the resizing behavior was based on max_ifindex, which unnecessarily maximized the capacity of map beyond what was needed. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-22-memxor@gmail.com --- samples/bpf/Makefile | 7 +-- samples/bpf/xdp_redirect_map_multi.bpf.c | 82 ++++++++++++++++++++++++++++ samples/bpf/xdp_redirect_map_multi_kern.c | 88 ------------------------------- 3 files changed, 86 insertions(+), 91 deletions(-) create mode 100644 samples/bpf/xdp_redirect_map_multi.bpf.c delete mode 100644 samples/bpf/xdp_redirect_map_multi_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 6decc8f9bcc2..2b3d9e39c4f3 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -163,7 +163,6 @@ always-y += tcp_clamp_kern.o always-y += tcp_basertt_kern.o always-y += tcp_tos_reflect_kern.o always-y += tcp_dumpstats_kern.o -always-y += xdp_redirect_map_multi_kern.o always-y += xdp_rxq_info_kern.o always-y += xdp2skb_meta_kern.o always-y += syscall_tp_kern.o @@ -357,6 +356,7 @@ endef CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG)) $(obj)/xdp_redirect_cpu.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_redirect_map_multi.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_redirect_map.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_redirect.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_monitor.bpf.o: $(obj)/xdp_sample.bpf.o @@ -369,11 +369,12 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/x -I$(srctree)/tools/lib $(CLANG_SYS_INCLUDES) \ -c $(filter %.bpf.c,$^) -o $@ -LINKED_SKELS := xdp_redirect_cpu.skel.h
xdp_redirect_map.skel.h \ - xdp_redirect.skel.h xdp_monitor.skel.h +LINKED_SKELS := xdp_redirect_cpu.skel.h xdp_redirect_map_multi.skel.h \ + xdp_redirect_map.skel.h xdp_redirect.skel.h xdp_monitor.skel.h clean-files += $(LINKED_SKELS) xdp_redirect_cpu.skel.h-deps := xdp_redirect_cpu.bpf.o xdp_sample.bpf.o +xdp_redirect_map_multi.skel.h-deps := xdp_redirect_map_multi.bpf.o xdp_sample.bpf.o xdp_redirect_map.skel.h-deps := xdp_redirect_map.bpf.o xdp_sample.bpf.o xdp_redirect.skel.h-deps := xdp_redirect.bpf.o xdp_sample.bpf.o xdp_monitor.skel.h-deps := xdp_monitor.bpf.o xdp_sample.bpf.o diff --git a/samples/bpf/xdp_redirect_map_multi.bpf.c b/samples/bpf/xdp_redirect_map_multi.bpf.c new file mode 100644 index 000000000000..8f59d430cb64 --- /dev/null +++ b/samples/bpf/xdp_redirect_map_multi.bpf.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KBUILD_MODNAME "foo" + +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 32); +} forward_map_general SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(struct bpf_devmap_val)); + __uint(max_entries, 32); +} forward_map_native SEC(".maps"); + +/* map to store egress interfaces mac addresses */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, __be64); + __uint(max_entries, 32); +} mac_map SEC(".maps"); + +static int xdp_redirect_map(struct xdp_md *ctx, void *forward_map) +{ + u32 key = bpf_get_smp_processor_id(); + struct datarec *rec; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_PASS; + NO_TEAR_INC(rec->processed); + + return bpf_redirect_map(forward_map, 0, + BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS); +} + +SEC("xdp") +int 
xdp_redirect_map_general(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &forward_map_general); +} + +SEC("xdp") +int xdp_redirect_map_native(struct xdp_md *ctx) +{ + return xdp_redirect_map(ctx, &forward_map_native); +} + +SEC("xdp_devmap/egress") +int xdp_devmap_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + u32 key = ctx->egress_ifindex; + struct ethhdr *eth = data; + __be64 *mac; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return XDP_DROP; + + mac = bpf_map_lookup_elem(&mac_map, &key); + if (mac) + __builtin_memcpy(eth->h_source, mac, ETH_ALEN); + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_map_multi_kern.c b/samples/bpf/xdp_redirect_map_multi_kern.c deleted file mode 100644 index 71aa23d1cb2b..000000000000 --- a/samples/bpf/xdp_redirect_map_multi_kern.c +++ /dev/null @@ -1,88 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define KBUILD_MODNAME "foo" -#include -#include -#include -#include -#include -#include - -struct { - __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); - __uint(max_entries, 32); -} forward_map_general SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(struct bpf_devmap_val)); - __uint(max_entries, 32); -} forward_map_native SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, long); - __uint(max_entries, 1); -} rxcnt SEC(".maps"); - -/* map to store egress interfaces mac addresses, set the - * max_entries to 1 and extend it in user sapce prog. 
- */ -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __type(key, u32); - __type(value, __be64); - __uint(max_entries, 1); -} mac_map SEC(".maps"); - -static int xdp_redirect_map(struct xdp_md *ctx, void *forward_map) -{ - long *value; - u32 key = 0; - - /* count packet in global counter */ - value = bpf_map_lookup_elem(&rxcnt, &key); - if (value) - *value += 1; - - return bpf_redirect_map(forward_map, key, - BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS); -} - -SEC("xdp_redirect_general") -int xdp_redirect_map_general(struct xdp_md *ctx) -{ - return xdp_redirect_map(ctx, &forward_map_general); -} - -SEC("xdp_redirect_native") -int xdp_redirect_map_native(struct xdp_md *ctx) -{ - return xdp_redirect_map(ctx, &forward_map_native); -} - -SEC("xdp_devmap/map_prog") -int xdp_devmap_prog(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - u32 key = ctx->egress_ifindex; - struct ethhdr *eth = data; - __be64 *mac; - u64 nh_off; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return XDP_DROP; - - mac = bpf_map_lookup_elem(&mac_map, &key); - if (mac) - __builtin_memcpy(eth->h_source, mac, ETH_ALEN); - - return XDP_PASS; -} - -char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From 594a116b2aa1985dbb5318c2be39b64b74ebff84 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 21 Aug 2021 05:50:10 +0530 Subject: samples: bpf: Convert xdp_redirect_map_multi to XDP samples helper Use the libbpf skeleton facility and other utilities provided by XDP samples helper. Also adapt to change of type of mac address map, so that no resizing is required. Add a new flag for sample mask that skips printing the from_device->to_device heading for each line, as xdp_redirect_map_multi may have two devices but the flow of data may be bidirectional, so the output would be confusing.
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210821002010.845777-23-memxor@gmail.com --- samples/bpf/Makefile | 5 +- samples/bpf/xdp_redirect_map_multi_user.c | 345 ++++++++++++------------------ samples/bpf/xdp_sample_user.c | 2 +- samples/bpf/xdp_sample_user.h | 21 +- 4 files changed, 153 insertions(+), 220 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 2b3d9e39c4f3..4dc20be5fb96 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -39,7 +39,6 @@ tprogs-y += lwt_len_hist tprogs-y += xdp_tx_iptunnel tprogs-y += test_map_in_map tprogs-y += per_socket_stats_example -tprogs-y += xdp_redirect_map_multi tprogs-y += xdp_rxq_info tprogs-y += syscall_tp tprogs-y += cpustat @@ -54,6 +53,7 @@ tprogs-y += ibumad tprogs-y += hbm tprogs-y += xdp_redirect_cpu +tprogs-y += xdp_redirect_map_multi tprogs-y += xdp_redirect_map tprogs-y += xdp_redirect tprogs-y += xdp_monitor @@ -100,7 +100,6 @@ lwt_len_hist-objs := lwt_len_hist_user.o xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o test_map_in_map-objs := test_map_in_map_user.o per_socket_stats_example-objs := cookie_uid_helper_example.o -xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o xdp_rxq_info-objs := xdp_rxq_info_user.o syscall_tp-objs := syscall_tp_user.o cpustat-objs := cpustat_user.o @@ -114,6 +113,7 @@ xdp_sample_pkts-objs := xdp_sample_pkts_user.o ibumad-objs := ibumad_user.o hbm-objs := hbm.o $(CGROUP_HELPERS) +xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o $(XDP_SAMPLE) xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o $(XDP_SAMPLE) xdp_redirect_map-objs := xdp_redirect_map_user.o $(XDP_SAMPLE) xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE) @@ -310,6 +310,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/xdp_redirect_cpu_user.o: $(obj)/xdp_redirect_cpu.skel.h +$(obj)/xdp_redirect_map_multi_user.o: 
$(obj)/xdp_redirect_map_multi.skel.h $(obj)/xdp_redirect_map_user.o: $(obj)/xdp_redirect_map.skel.h $(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h $(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h diff --git a/samples/bpf/xdp_redirect_map_multi_user.c b/samples/bpf/xdp_redirect_map_multi_user.c index 84cdbbed20b7..315314716121 100644 --- a/samples/bpf/xdp_redirect_map_multi_user.c +++ b/samples/bpf/xdp_redirect_map_multi_user.c @@ -1,7 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 +static const char *__doc__ = +"XDP multi redirect tool, using BPF_MAP_TYPE_DEVMAP and BPF_F_BROADCAST flag for bpf_redirect_map\n" +"Usage: xdp_redirect_map_multi ... \n"; + #include #include #include +#include #include #include #include @@ -15,106 +20,54 @@ #include #include #include - -#include "bpf_util.h" +#include #include #include +#include "bpf_util.h" +#include "xdp_sample_user.h" +#include "xdp_redirect_map_multi.skel.h" #define MAX_IFACE_NUM 32 - -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static int ifaces[MAX_IFACE_NUM] = {}; -static int rxcnt_map_fd; - -static void int_exit(int sig) -{ - __u32 prog_id = 0; - int i; - - for (i = 0; ifaces[i] > 0; i++) { - if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(1); - } - if (prog_id) - bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags); - } - - exit(0); -} - -static void poll_stats(int interval) -{ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus], prev[nr_cpus]; - - memset(prev, 0, sizeof(prev)); - - while (1) { - __u64 sum = 0; - __u32 key = 0; - int i; - sleep(interval); - assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[i]); - if (sum) - printf("Forwarding %10llu pkt/s\n", sum / interval); - memcpy(prev, values, sizeof(values)); - } -} - -static int get_mac_addr(unsigned int ifindex, void *mac_addr) -{ - char ifname[IF_NAMESIZE]; - struct ifreq ifr; - int fd, 
ret = -1; - - fd = socket(AF_INET, SOCK_DGRAM, 0); - if (fd < 0) - return ret; - - if (!if_indextoname(ifindex, ifname)) - goto err_out; - - strcpy(ifr.ifr_name, ifname); +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_EXCEPTION_CNT | SAMPLE_DEVMAP_XMIT_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_SKIP_HEADING; - if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) - goto err_out; +DEFINE_SAMPLE_INIT(xdp_redirect_map_multi); - memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char)); - ret = 0; +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "force", no_argument, NULL, 'F' }, + { "load-egress", no_argument, NULL, 'X' }, + { "stats", no_argument, NULL, 's' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + {} +}; -err_out: - close(fd); - return ret; -} - -static int update_mac_map(struct bpf_object *obj) +static int update_mac_map(struct bpf_map *map) { - int i, ret = -1, mac_map_fd; + int mac_map_fd = bpf_map__fd(map); unsigned char mac_addr[6]; unsigned int ifindex; - - mac_map_fd = bpf_object__find_map_fd_by_name(obj, "mac_map"); - if (mac_map_fd < 0) { - printf("find mac map fd failed\n"); - return ret; - } + int i, ret = -1; for (i = 0; ifaces[i] > 0; i++) { ifindex = ifaces[i]; ret = get_mac_addr(ifindex, mac_addr); if (ret < 0) { - printf("get interface %d mac failed\n", ifindex); + fprintf(stderr, "get interface %d mac failed\n", + ifindex); return ret; } ret = bpf_map_update_elem(mac_map_fd, &ifindex, mac_addr, 0); - if (ret) { - perror("bpf_update_elem mac_map_fd"); + if (ret < 0) { + fprintf(stderr, "Failed to update mac address for ifindex %d\n", + ifindex); return ret; } } @@ -122,181 +75,159 @@ static int update_mac_map(struct bpf_object *obj) return 0; } -static void usage(const char *prog) -{ - fprintf(stderr, - "usage: %s [OPTS] ...\n" - "OPTS:\n" - " -S use skb-mode\n" - " -N enforce native mode\n" - " -F 
force loading prog\n" - " -X load xdp program on egress\n", - prog); -} - int main(int argc, char **argv) { - int i, ret, opt, forward_map_fd, max_ifindex = 0; - struct bpf_program *ingress_prog, *egress_prog; - int ingress_prog_fd, egress_prog_fd = 0; - struct bpf_devmap_val devmap_val; - bool attach_egress_prog = false; + struct bpf_devmap_val devmap_val = {}; + struct xdp_redirect_map_multi *skel; + struct bpf_program *ingress_prog; + bool xdp_devmap_attached = false; + struct bpf_map *forward_map; + int ret = EXIT_FAIL_OPTION; + unsigned long interval = 2; char ifname[IF_NAMESIZE]; - struct bpf_map *mac_map; - struct bpf_object *obj; unsigned int ifindex; - char filename[256]; - - while ((opt = getopt(argc, argv, "SNFX")) != -1) { + bool generic = false; + bool force = false; + bool tried = false; + bool error = true; + int i, opt; + + while ((opt = getopt_long(argc, argv, "hSFXi:vs", + long_options, NULL)) != -1) { switch (opt) { case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; - break; - case 'N': - /* default, set below */ + generic = true; + /* devmap_xmit tracepoint not available */ + mask &= ~(SAMPLE_DEVMAP_XMIT_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI); break; case 'F': - xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; break; case 'X': - attach_egress_prog = true; + xdp_devmap_attached = true; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + break; + case 'v': + sample_switch_mode(); break; + case 's': + mask |= SAMPLE_REDIRECT_MAP_CNT; + break; + case 'h': + error = false; default: - usage(basename(argv[0])); - return 1; + sample_usage(argv, long_options, __doc__, mask, error); + return ret; } } - if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) { - xdp_flags |= XDP_FLAGS_DRV_MODE; - } else if (attach_egress_prog) { - printf("Load xdp program on egress with SKB mode not supported yet\n"); - return 1; + if (argc <= optind + 1) { + sample_usage(argv, long_options, __doc__, mask, error); + return ret; } - if (optind == argc) { - printf("usage: %s 
...\n", argv[0]); - return 1; + skel = xdp_redirect_map_multi__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_redirect_map_multi__open: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end; } - printf("Get interfaces"); + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + ret = EXIT_FAIL_OPTION; for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) { ifaces[i] = if_nametoindex(argv[optind + i]); if (!ifaces[i]) ifaces[i] = strtoul(argv[optind + i], NULL, 0); if (!if_indextoname(ifaces[i], ifname)) { - perror("Invalid interface name or i"); - return 1; + fprintf(stderr, "Bad interface index or name\n"); + sample_usage(argv, long_options, __doc__, mask, true); + goto end_destroy; } - /* Find the largest index number */ - if (ifaces[i] > max_ifindex) - max_ifindex = ifaces[i]; - - printf(" %d", ifaces[i]); - } - printf("\n"); - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - obj = bpf_object__open(filename); - if (libbpf_get_error(obj)) { - printf("ERROR: opening BPF object file failed\n"); - obj = NULL; - goto err_out; + skel->rodata->from_match[i] = ifaces[i]; + skel->rodata->to_match[i] = ifaces[i]; } - /* Reset the map size to max ifindex + 1 */ - if (attach_egress_prog) { - mac_map = bpf_object__find_map_by_name(obj, "mac_map"); - ret = bpf_map__resize(mac_map, max_ifindex + 1); - if (ret < 0) { - printf("ERROR: reset mac map size failed\n"); - goto err_out; - } + ret = xdp_redirect_map_multi__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_redirect_map_multi__load: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } - /* load BPF program */ - if (bpf_object__load(obj)) { - printf("ERROR: loading BPF object file failed\n"); - goto err_out; - } - - if (xdp_flags & XDP_FLAGS_SKB_MODE) { - ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_general"); - 
forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_general"); - } else { - ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_native"); - forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_native"); - } - if (!ingress_prog || forward_map_fd < 0) { - printf("finding ingress_prog/forward_map in obj file failed\n"); - goto err_out; - } - - ingress_prog_fd = bpf_program__fd(ingress_prog); - if (ingress_prog_fd < 0) { - printf("find ingress_prog fd failed\n"); - goto err_out; - } - - rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); - if (rxcnt_map_fd < 0) { - printf("bpf_object__find_map_fd_by_name failed\n"); - goto err_out; - } - - if (attach_egress_prog) { + if (xdp_devmap_attached) { /* Update mac_map with all egress interfaces' mac addr */ - if (update_mac_map(obj) < 0) { - printf("Error: update mac map failed"); - goto err_out; + if (update_mac_map(skel->maps.mac_map) < 0) { + fprintf(stderr, "Updating mac address failed\n"); + ret = EXIT_FAIL; + goto end_destroy; } + } - /* Find egress prog fd */ - egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog"); - if (!egress_prog) { - printf("finding egress_prog in obj file failed\n"); - goto err_out; - } - egress_prog_fd = bpf_program__fd(egress_prog); - if (egress_prog_fd < 0) { - printf("find egress_prog fd failed\n"); - goto err_out; - } + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } - /* Remove attached program when program is interrupted or killed */ - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); + ingress_prog = skel->progs.xdp_redirect_map_native; + forward_map = skel->maps.forward_map_native; - /* Init forward multicast groups */ for (i = 0; ifaces[i] > 0; i++) { ifindex = ifaces[i]; + ret = EXIT_FAIL_XDP; +restart: /* bind prog_fd to each interface */ - ret = bpf_set_link_xdp_fd(ifindex, ingress_prog_fd, 
xdp_flags); - if (ret) { - printf("Set xdp fd failed on %d\n", ifindex); - goto err_out; + if (sample_install_xdp(ingress_prog, ifindex, generic, force) < 0) { + if (generic && !tried) { + fprintf(stderr, + "Trying fallback to sizeof(int) as value_size for devmap in generic mode\n"); + ingress_prog = skel->progs.xdp_redirect_map_general; + forward_map = skel->maps.forward_map_general; + tried = true; + goto restart; + } + goto end_destroy; } /* Add all the interfaces to forward group and attach - * egress devmap programe if exist + * egress devmap program if exist */ devmap_val.ifindex = ifindex; - devmap_val.bpf_prog.fd = egress_prog_fd; - ret = bpf_map_update_elem(forward_map_fd, &ifindex, &devmap_val, 0); - if (ret) { - perror("bpf_map_update_elem forward_map"); - goto err_out; + if (xdp_devmap_attached) + devmap_val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_devmap_prog); + ret = bpf_map_update_elem(bpf_map__fd(forward_map), &ifindex, &devmap_val, 0); + if (ret < 0) { + fprintf(stderr, "Failed to update devmap value: %s\n", + strerror(errno)); + ret = EXIT_FAIL_BPF; + goto end_destroy; } } - poll_stats(2); - - return 0; - -err_out: - return 1; + ret = sample_run(interval, NULL, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + ret = EXIT_OK; +end_destroy: + xdp_redirect_map_multi__destroy(skel); +end: + sample_exit(ret); } diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c index eb484c15492d..b32d82178199 100644 --- a/samples/bpf/xdp_sample_user.c +++ b/samples/bpf/xdp_sample_user.c @@ -1510,7 +1510,7 @@ static int sample_timer_cb(int timerfd, struct stats_record **rec, if (ret < 0) return ret; - if (sample_xdp_cnt == 2) { + if (sample_xdp_cnt == 2 && !(sample_mask & SAMPLE_SKIP_HEADING)) { char fi[IFNAMSIZ]; char to[IFNAMSIZ]; const char *f, *t; diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h index 
3a678986cce2..d97465ff8c62 100644 --- a/samples/bpf/xdp_sample_user.h +++ b/samples/bpf/xdp_sample_user.h @@ -8,17 +8,18 @@ #include "xdp_sample_shared.h" enum stats_mask { - _SAMPLE_REDIRECT_MAP = 1U << 0, - SAMPLE_RX_CNT = 1U << 1, - SAMPLE_REDIRECT_ERR_CNT = 1U << 2, - SAMPLE_CPUMAP_ENQUEUE_CNT = 1U << 3, - SAMPLE_CPUMAP_KTHREAD_CNT = 1U << 4, - SAMPLE_EXCEPTION_CNT = 1U << 5, - SAMPLE_DEVMAP_XMIT_CNT = 1U << 6, - SAMPLE_REDIRECT_CNT = 1U << 7, - SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP, - SAMPLE_REDIRECT_ERR_MAP_CNT = SAMPLE_REDIRECT_ERR_CNT | _SAMPLE_REDIRECT_MAP, + _SAMPLE_REDIRECT_MAP = 1U << 0, + SAMPLE_RX_CNT = 1U << 1, + SAMPLE_REDIRECT_ERR_CNT = 1U << 2, + SAMPLE_CPUMAP_ENQUEUE_CNT = 1U << 3, + SAMPLE_CPUMAP_KTHREAD_CNT = 1U << 4, + SAMPLE_EXCEPTION_CNT = 1U << 5, + SAMPLE_DEVMAP_XMIT_CNT = 1U << 6, + SAMPLE_REDIRECT_CNT = 1U << 7, + SAMPLE_REDIRECT_MAP_CNT = SAMPLE_REDIRECT_CNT | _SAMPLE_REDIRECT_MAP, + SAMPLE_REDIRECT_ERR_MAP_CNT = SAMPLE_REDIRECT_ERR_CNT | _SAMPLE_REDIRECT_MAP, SAMPLE_DEVMAP_XMIT_CNT_MULTI = 1U << 8, + SAMPLE_SKIP_HEADING = 1U << 9, }; /* Exit return codes */ -- cgit v1.2.3 From 1b07d00a15d6a96d1a36b6a284c4fd5f2e2fa383 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:46 -0700 Subject: bpf: Add BTF_ID_LIST_GLOBAL_SINGLE macro Same as BTF_ID_LIST_SINGLE macro except defines a global ID. 
Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/a867a97517df42fd3953eeb5454402b57e74538f.1629772842.git.dxu@dxuuu.xyz --- include/linux/btf_ids.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index bed4b9964581..6d1395030616 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -82,6 +82,9 @@ __BTF_ID_LIST(name, globl) #define BTF_ID_LIST_SINGLE(name, prefix, typename) \ BTF_ID_LIST(name) \ BTF_ID(prefix, typename) +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \ + BTF_ID_LIST_GLOBAL(name) \ + BTF_ID(prefix, typename) /* * The BTF_ID_UNUSED macro defines 4 zero bytes. -- cgit v1.2.3 From 33c5cb36015ac1034b50b823fae367e908d05147 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:47 -0700 Subject: bpf: Consolidate task_struct BTF_ID declarations No need to have it defined 5 times. Once is enough. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/6dcefa5bed26fe1226f26683f36819bb53ec19a2.1629772842.git.dxu@dxuuu.xyz --- include/linux/btf_ids.h | 2 ++ kernel/bpf/bpf_task_storage.c | 6 ++---- kernel/bpf/stackmap.c | 4 +--- kernel/bpf/task_iter.c | 11 +++++------ kernel/trace/bpf_trace.c | 4 ++-- 5 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 6d1395030616..93d881ab0d48 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -188,4 +188,6 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif +extern u32 btf_task_struct_ids[]; + #endif diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 3ce75758d394..ebfa8bc90892 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -317,15 +317,13 @@ const struct bpf_map_ops task_storage_map_ops = { .map_owner_storage_ptr = task_storage_ptr, }; -BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, 
task_struct) - const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg2_btf_id = &btf_task_struct_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -336,5 +334,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg2_btf_id = &btf_task_struct_ids[0], }; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6fbc2abe9c91..e8eefdf8cf3e 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -530,14 +530,12 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return res; } -BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) - const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &bpf_get_task_stack_btf_ids[0], + .arg1_btf_id = &btf_task_struct_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index b68cb5d6d6eb..b48750bfba5a 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -525,7 +525,6 @@ static const struct seq_operations task_vma_seq_ops = { }; BTF_ID_LIST(btf_task_file_ids) -BTF_ID(struct, task_struct) BTF_ID(struct, file) BTF_ID(struct, vm_area_struct) @@ -591,19 +590,19 @@ static int __init task_iter_init(void) { int ret; - task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; - task_file_reg_info.ctx_arg_info[0].btf_id = 
btf_task_file_ids[0]; - task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0]; ret = bpf_iter_reg_target(&task_file_reg_info); if (ret) return ret; - task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; - task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2]; + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cbc73c08c4a4..50d055fc2327 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -714,13 +714,13 @@ BPF_CALL_0(bpf_get_current_task_btf) return (unsigned long) current; } -BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) +BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) static const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID, - .ret_btf_id = &bpf_get_current_btf_ids[0], + .ret_btf_id = &btf_task_struct_ids[0], }; BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) -- cgit v1.2.3 From a396eda5517ac958fb4eb7358f4708eb829058c4 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:48 -0700 Subject: bpf: Extend bpf_base_func_proto helpers with bpf_get_current_task_btf() bpf_get_current_task() is already supported so it's natural to also include the _btf() variant for btf-powered helpers. This is required for non-tracing progs to use bpf_task_pt_regs() in the next commit. 
Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/f99870ed5f834c9803d73b3476f8272b1bb987c0.1629772842.git.dxu@dxuuu.xyz --- kernel/bpf/helpers.c | 3 +++ kernel/trace/bpf_trace.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4e8540716187..609674f409ed 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1322,6 +1322,7 @@ out: } const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; @@ -1407,6 +1408,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 50d055fc2327..4e54f3dc209f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -716,7 +716,7 @@ BPF_CALL_0(bpf_get_current_task_btf) BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) -static const struct bpf_func_proto bpf_get_current_task_btf_proto = { +const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID, -- cgit v1.2.3 From dd6e10fbd9fb86a571d925602c8a24bb4d09a2a7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:49 -0700 Subject: bpf: Add bpf_task_pt_regs() helper The motivation behind this helper is to access userspace pt_regs in a kprobe handler. uprobe's ctx is the userspace pt_regs. kprobe's ctx is the kernelspace pt_regs. 
bpf_task_pt_regs() allows accessing userspace pt_regs in a kprobe handler. The final case (kernelspace pt_regs in uprobe) is pretty rare (usermode helper) so I think that can be solved later if necessary. More concretely, this helper is useful in doing BPF-based DWARF stack unwinding. Currently the kernel can only do framepointer based stack unwinds for userspace code. This is because the DWARF state machines are too fragile to be computed in kernelspace [0]. The idea behind DWARF-based stack unwinds w/ BPF is to copy a chunk of the userspace stack (while in prog context) and send it up to userspace for unwinding (probably with libunwind) [1]. This would effectively enable profiling applications with -fomit-frame-pointer using kprobes and uprobes. [0]: https://lkml.org/lkml/2012/2/10/356 [1]: https://github.com/danobi/bpf-dwarf-walk Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/e2718ced2d51ef4268590ab8562962438ab82815.1629772842.git.dxu@dxuuu.xyz --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/helpers.c | 3 +++ kernel/trace/bpf_trace.c | 19 +++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++++ 4 files changed, 36 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 609674f409ed..c227b7d4f56c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1327,6 +1327,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -1424,6 +1425,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e54f3dc209f..580e14ee7ff9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -723,6 +723,23 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = { .ret_btf_id = &btf_task_struct_ids[0], }; +BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) +{ + return (unsigned long) task_pt_regs(task); +} + +BTF_ID_LIST(bpf_task_pt_regs_ids) +BTF_ID(struct, pt_regs) + +const struct bpf_func_proto bpf_task_pt_regs_proto = { + .func = bpf_task_pt_regs, + .gpl_only = true, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_task_struct_ids[0], + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_task_pt_regs_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1032,6 +1049,8 @@ bpf_tracing_func_proto(enum bpf_func_id 
func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 576d47bb1a926fe8162253e0bca28e9bede8cf48 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:50 -0700 Subject: bpf: selftests: Add bpf_task_pt_regs() selftest This test retrieves the uprobe's pt_regs in two different ways and compares the contents in an arch-agnostic way. 
Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/5581eb8800f6625ec8813fe21e9dce1fbdef4937.1629772842.git.dxu@dxuuu.xyz --- .../selftests/bpf/prog_tests/task_pt_regs.c | 47 ++++++++++++++++++++++ .../selftests/bpf/progs/test_task_pt_regs.c | 29 +++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/task_pt_regs.c create mode 100644 tools/testing/selftests/bpf/progs/test_task_pt_regs.c diff --git a/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c b/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c new file mode 100644 index 000000000000..53f0e0fa1a53 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/task_pt_regs.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include "test_task_pt_regs.skel.h" + +void test_task_pt_regs(void) +{ + struct test_task_pt_regs *skel; + struct bpf_link *uprobe_link; + size_t uprobe_offset; + ssize_t base_addr; + bool match; + + base_addr = get_base_addr(); + if (!ASSERT_GT(base_addr, 0, "get_base_addr")) + return; + uprobe_offset = get_uprobe_offset(&get_base_addr, base_addr); + + skel = test_task_pt_regs__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + if (!ASSERT_OK_PTR(skel->bss, "check_bss")) + goto cleanup; + + uprobe_link = bpf_program__attach_uprobe(skel->progs.handle_uprobe, + false /* retprobe */, + 0 /* self pid */, + "/proc/self/exe", + uprobe_offset); + if (!ASSERT_OK_PTR(uprobe_link, "attach_uprobe")) + goto cleanup; + skel->links.handle_uprobe = uprobe_link; + + /* trigger & validate uprobe */ + get_base_addr(); + + if (!ASSERT_EQ(skel->bss->uprobe_res, 1, "check_uprobe_res")) + goto cleanup; + + match = !memcmp(&skel->bss->current_regs, &skel->bss->ctx_regs, + sizeof(skel->bss->current_regs)); + ASSERT_TRUE(match, "check_regs_match"); + +cleanup: + test_task_pt_regs__destroy(skel); +} diff --git 
a/tools/testing/selftests/bpf/progs/test_task_pt_regs.c b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c new file mode 100644 index 000000000000..6c059f1cfa1b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +struct pt_regs current_regs = {}; +struct pt_regs ctx_regs = {}; +int uprobe_res = 0; + +SEC("uprobe/trigger_func") +int handle_uprobe(struct pt_regs *ctx) +{ + struct task_struct *current; + struct pt_regs *regs; + + current = bpf_get_current_task_btf(); + regs = (struct pt_regs *) bpf_task_pt_regs(current); + __builtin_memcpy(&current_regs, regs, sizeof(*regs)); + __builtin_memcpy(&ctx_regs, ctx, sizeof(*ctx)); + + /* Prove that uprobe was run */ + uprobe_res = 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From d18b09bf67bb821807de202a1b8d239a946118e7 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:07 +0200 Subject: selftests: xsk: Remove color mode Remove color mode since it does not add any value and having less code means less maintenance which is a good thing. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-2-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/test_xsk.sh | 10 +++------- tools/testing/selftests/bpf/xsk_prereqs.sh | 27 ++++++--------------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 46633a3bfb0b..cd7bf32e6a17 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -63,14 +63,11 @@ # ---------------- # Must run with CAP_NET_ADMIN capability.
# -# Run (full color-coded output): -# sudo ./test_xsk.sh -c +# Run: +# sudo ./test_xsk.sh # # If running from kselftests: -# sudo make colorconsole=1 run_tests -# -# Run (full output without color-coding): -# sudo ./test_xsk.sh +# sudo make run_tests # # Run with verbose output: # sudo ./test_xsk.sh -v @@ -83,7 +80,6 @@ while getopts "cvD" flag do case "${flag}" in - c) colorconsole=1;; v) verbose=1;; D) dump_pkts=1;; esac diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index dac1c5f78752..8fe022a4dbfa 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -8,11 +8,6 @@ ksft_xfail=2 ksft_xpass=3 ksft_skip=4 -GREEN='\033[0;92m' -YELLOW='\033[0;93m' -RED='\033[0;31m' -NC='\033[0m' -STACK_LIM=131072 SPECFILE=veth.spec XSKOBJ=xdpxceiver NUMPKTS=10000 @@ -50,22 +45,12 @@ validate_veth_spec_file() test_status() { statusval=$1 - if [ -n "${colorconsole+set}" ]; then - if [ $statusval -eq 2 ]; then - echo -e "${YELLOW}$2${NC}: [ ${RED}FAIL${NC} ]" - elif [ $statusval -eq 1 ]; then - echo -e "${YELLOW}$2${NC}: [ ${RED}SKIPPED${NC} ]" - elif [ $statusval -eq 0 ]; then - echo -e "${YELLOW}$2${NC}: [ ${GREEN}PASS${NC} ]" - fi - else - if [ $statusval -eq 2 ]; then - echo -e "$2: [ FAIL ]" - elif [ $statusval -eq 1 ]; then - echo -e "$2: [ SKIPPED ]" - elif [ $statusval -eq 0 ]; then - echo -e "$2: [ PASS ]" - fi + if [ $statusval -eq 2 ]; then + echo -e "$2: [ FAIL ]" + elif [ $statusval -eq 1 ]; then + echo -e "$2: [ SKIPPED ]" + elif [ $statusval -eq 0 ]; then + echo -e "$2: [ PASS ]" fi } -- cgit v1.2.3 From 25c0a30541e4a7ddb4b45c2c923f799c76c95ef5 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:08 +0200 Subject: selftests: xsk: Remove the num_tx_packets option Remove the number of tx packet option as this should be decided by the test itself. Also change the number of packets to be sent to 4096 speeding up the execution. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-3-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 33 +++++++++--------------------- tools/testing/selftests/bpf/xdpxceiver.h | 4 ++-- tools/testing/selftests/bpf/xsk_prereqs.sh | 3 +-- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 1135fb980814..1b0efe566278 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -333,20 +333,19 @@ static struct option long_options[] = { {"queue", optional_argument, 0, 'q'}, {"dump-pkts", optional_argument, 0, 'D'}, {"verbose", no_argument, 0, 'v'}, - {"tx-pkt-count", optional_argument, 0, 'C'}, {0, 0, 0, 0} }; static void usage(const char *prog) { const char *str = - " Usage: %s [OPTIONS]\n" - " Options:\n" - " -i, --interface Use interface\n" - " -q, --queue=n Use queue n (default 0)\n" - " -D, --dump-pkts Dump packets L2 - L5\n" - " -v, --verbose Verbose output\n" - " -C, --tx-pkt-count=n Number of packets to send\n"; + " Usage: %s [OPTIONS]\n" + " Options:\n" + " -i, --interface Use interface\n" + " -q, --queue=n Use queue n (default 0)\n" + " -D, --dump-pkts Dump packets L2 - L5\n" + " -v, --verbose Verbose output\n"; + ksft_print_msg(str, prog); } @@ -392,7 +391,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index); + c = getopt_long(argc, argv, "i:Dv", long_options, &option_index); if (c == -1) break; @@ -415,9 +414,6 @@ static void parse_command_line(int argc, char **argv) case 'D': debug_pkt_dump = 1; break; - case 'C': - opt_pkt_count = atoi(optarg); - break; case 'v': opt_verbose = 1; break; @@ -427,11 +423,6 @@ static void parse_command_line(int argc, char **argv) } } - if (!opt_pkt_count) { - print_verbose("No tx-pkt-count 
specified, using default %u\n", DEFAULT_PKT_CNT); - opt_pkt_count = DEFAULT_PKT_CNT; - } - if (!validate_interfaces()) { usage(basename(argv[0])); ksft_exit_xfail(); @@ -554,9 +545,6 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) static int get_batch_size(int pkt_cnt) { - if (!opt_pkt_count) - return BATCH_SIZE; - if (pkt_cnt + BATCH_SIZE <= opt_pkt_count) return BATCH_SIZE; @@ -586,7 +574,7 @@ static void tx_only_all(struct ifobject *ifobject) fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); fds[0].events = POLLOUT; - while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { + while (pkt_cnt < opt_pkt_count) { int batch_size = get_batch_size(pkt_cnt); if (test_type == TEST_TYPE_POLL) { @@ -602,8 +590,7 @@ static void tx_only_all(struct ifobject *ifobject) pkt_cnt += batch_size; } - if (opt_pkt_count) - complete_tx_only_all(ifobject); + complete_tx_only_all(ifobject); } static void worker_pkt_dump(void) diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 6c428b276ab6..4ce5a18b32e7 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -39,7 +39,7 @@ #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 #define POLL_TMOUT 1000 -#define DEFAULT_PKT_CNT 10000 +#define DEFAULT_PKT_CNT (4 * 1024) #define RX_FULL_RXQSIZE 32 #define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) @@ -79,7 +79,7 @@ static u32 num_frames; static bool second_step; static int test_type; -static int opt_pkt_count; +static u32 opt_pkt_count = DEFAULT_PKT_CNT; static u8 opt_verbose; static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; diff --git a/tools/testing/selftests/bpf/xsk_prereqs.sh b/tools/testing/selftests/bpf/xsk_prereqs.sh index 8fe022a4dbfa..bf29d2549bee 100755 --- a/tools/testing/selftests/bpf/xsk_prereqs.sh +++ b/tools/testing/selftests/bpf/xsk_prereqs.sh @@ -10,7 +10,6 @@ ksft_skip=4 SPECFILE=veth.spec XSKOBJ=xdpxceiver -NUMPKTS=10000 validate_root_exec() { @@ -92,5 +91,5 @@ validate_ip_utility() execxdpxceiver() { - ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} -C ${NUMPKTS} ${VERBOSE_ARG} ${DUMP_PKTS_ARG} + ./${XSKOBJ} -i ${VETH0} -i ${VETH1},${NS1} ${VERBOSE_ARG} ${DUMP_PKTS_ARG} } -- cgit v1.2.3 From 13a6ebd9084a398b93e3e06ca59254df9aa95336 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:09 +0200 Subject: selftests: xsk: Remove unused variables Remove unused variables and typedefs. The *_npkts variables are incremented but never used. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-4-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 3 --- tools/testing/selftests/bpf/xdpxceiver.h | 8 -------- 2 files changed, 11 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 1b0efe566278..4d8ee636fc24 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -70,7 +70,6 @@ #include #include #include -typedef __u16 __sum16; #include #include #include @@ -454,7 +453,6 @@ static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) if (rcvd) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->tx_npkts += rcvd; } } @@ -512,7 +510,6 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->rx_npkts += rcvd; } static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 4ce5a18b32e7..02b7d0d6f45d 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -44,10 +44,6 @@ #define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) -typedef __u32 u32; -typedef __u16 u16; -typedef __u8 u8; - enum TEST_MODES { TEST_MODE_UNCONFIGURED = -1, TEST_MODE_SKB, @@ -104,10 +100,6 @@ struct xsk_socket_info { struct xsk_ring_prod tx; struct xsk_umem_info *umem; struct xsk_socket *xsk; - unsigned long rx_npkts; - unsigned long tx_npkts; - unsigned long prev_rx_npkts; - unsigned long prev_tx_npkts; u32 outstanding_tx; }; -- cgit v1.2.3 From 083be682d97695979e1bdfac1d4274234555f77b Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:10 +0200 Subject: selftests: xsk: Return correct error codes Return the correct error codes so they can be printed correctly. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-5-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 4d8ee636fc24..f221bc5dae17 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -270,7 +270,7 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx) ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, &cfg); if (ret) - exit_with_error(ret); + exit_with_error(-ret); umem->buffer = buffer; @@ -284,7 +284,7 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem) ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) - exit_with_error(ret); + exit_with_error(-ret); for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) *xsk_ring_prod__fill_addr(&umem->fq, idx++) = i * XSK_UMEM__DEFAULT_FRAME_SIZE; xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); @@ -467,7 +467,7 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) if 
(xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { ret = poll(fds, 1, POLL_TMOUT); if (ret < 0) - exit_with_error(ret); + exit_with_error(-ret); } return; } @@ -475,11 +475,11 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); while (ret != rcvd) { if (ret < 0) - exit_with_error(ret); + exit_with_error(-ret); if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { ret = poll(fds, 1, POLL_TMOUT); if (ret < 0) - exit_with_error(ret); + exit_with_error(-ret); } ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } -- cgit v1.2.3 From 1314c3537f661002a65999784c0f3f42d7de87f6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:11 +0200 Subject: selftests: xsk: Simplify the retry code Simplify the retry code and make it more efficient by waiting first, instead of trying immediately which always fails due to the asynchronous nature of xsk socket close. Also decrease the wait time to significantly lower the run-time of the test suite. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-6-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 23 ++++++++++------------- tools/testing/selftests/bpf/xdpxceiver.h | 2 +- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index f221bc5dae17..b7d193a96083 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -745,24 +745,19 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs) if (bufs == MAP_FAILED) exit_with_error(errno); - xsk_configure_umem(ifobject, bufs, 0); - ifobject->umem = ifobject->umem_arr[0]; - ret = xsk_configure_socket(ifobject, 0); - - /* Retry Create Socket if it fails as xsk_socket__create() - * is asynchronous - */ - while (ret && ctr < SOCK_RECONF_CTR) { + while (ctr++ < SOCK_RECONF_CTR) { xsk_configure_umem(ifobject, bufs, 0); ifobject->umem = ifobject->umem_arr[0]; ret = xsk_configure_socket(ifobject, 0); + if (!ret) + break; + + /* Retry Create Socket if it fails as xsk_socket__create() is asynchronous */ usleep(USLEEP_MAX); - ctr++; + if (ctr >= SOCK_RECONF_CTR) + exit_with_error(-ret); } - if (ctr >= SOCK_RECONF_CTR) - exit_with_error(ret); - ifobject->umem = ifobject->umem_arr[0]; ifobject->xsk = ifobject->xsk_arr[0]; @@ -1125,8 +1120,10 @@ int main(int argc, char **argv) ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); for (i = 0; i < TEST_MODE_MAX; i++) { - for (j = 0; j < TEST_TYPE_MAX; j++) + for (j = 0; j < TEST_TYPE_MAX; j++) { run_pkt_test(i, j); + usleep(USLEEP_MAX); + } } cleanup: diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 02b7d0d6f45d..1c94230c351a 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -35,7 +35,7 @@ #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct 
iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) #define EOT (-1) -#define USLEEP_MAX 200000 +#define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 #define POLL_TMOUT 1000 -- cgit v1.2.3 From 9c5ce931b16ea83fa01e5e4ca95b5172f1cde01a Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:12 +0200 Subject: selftests: xsk: Remove end-of-test packet Get rid of the end-of-test packet and just count the number of packets received and quit when the expected number has been received. Simplifies the code. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-7-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 42 +++++++++----------------------- tools/testing/selftests/bpf/xdpxceiver.h | 2 -- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index b7d193a96083..b0fee71355bf 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -600,7 +600,7 @@ static void worker_pkt_dump(void) void *ptr; fprintf(stdout, "---------------------------------------\n"); - for (int iter = 0; iter < num_frames - 1; iter++) { + for (int iter = 0; iter < num_frames; iter++) { ptr = pkt_buf[iter]->payload; ethhdr = ptr; iphdr = ptr + sizeof(*ethhdr); @@ -627,11 +627,6 @@ static void worker_pkt_dump(void) /*extract L5 frame */ payload = *((uint32_t *)(ptr + PKT_HDR_SIZE)); - if (payload == EOT) { - print_verbose("End-of-transmission frame received\n"); - fprintf(stdout, "---------------------------------------\n"); - break; - } fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); fprintf(stdout, "---------------------------------------\n"); } @@ -694,28 +689,24 @@ static void worker_pkt_validate(void) /*do not increment pktcounter if !(tos=0x9 and ipv4) */ if (iphdr->version == IP_PKT_VER && iphdr->tos ==
IP_PKT_TOS) { payloadseqnum = *((uint32_t *)(pkt_node_rx_q->pkt_frame + PKT_HDR_SIZE)); - if (debug_pkt_dump && payloadseqnum != EOT) { + if (debug_pkt_dump) { pkt_obj = malloc(sizeof(*pkt_obj)); pkt_obj->payload = malloc(PKT_SIZE); memcpy(pkt_obj->payload, pkt_node_rx_q->pkt_frame, PKT_SIZE); pkt_buf[payloadseqnum] = pkt_obj; } - if (payloadseqnum == EOT) { - print_verbose("End-of-transmission frame received: PASS\n"); - sigvar = 1; - break; - } - - if (prev_pkt + 1 != payloadseqnum) { + if (pkt_counter % num_frames != payloadseqnum) { ksft_test_result_fail - ("ERROR: [%s] prev_pkt [%d], payloadseqnum [%d]\n", - __func__, prev_pkt, payloadseqnum); + ("ERROR: [%s] expected counter [%d], payloadseqnum [%d]\n", + __func__, pkt_counter, payloadseqnum); ksft_exit_xfail(); } - prev_pkt = payloadseqnum; - pkt_counter++; + if (++pkt_counter == opt_pkt_count) { + sigvar = 1; + break; + } } else { ksft_print_msg("Invalid frame received: "); ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, @@ -800,11 +791,7 @@ static void *worker_testapp_validate_tx(void *arg) thread_common_ops(ifobject, bufs); for (int i = 0; i < num_frames; i++) { - /*send EOT frame */ - if (i == (num_frames - 1)) - data.seqnum = -1; - else - data.seqnum = i; + data.seqnum = i; gen_udp_hdr(&data, ifobject, udp_hdr); gen_ip_hdr(ifobject, ip_hdr); gen_udp_csum(udp_hdr, ip_hdr); @@ -812,8 +799,7 @@ static void *worker_testapp_validate_tx(void *arg) gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); } - print_verbose("Sending %d packets on interface %s\n", - (opt_pkt_count - 1), ifobject->ifname); + print_verbose("Sending %d packets on interface %s\n", opt_pkt_count, ifobject->ifname); tx_only_all(ifobject); testapp_cleanup_xsk_res(ifobject); @@ -888,7 +874,7 @@ static void testapp_validate(void) if (debug_pkt_dump && test_type != TEST_TYPE_STATS) { worker_pkt_dump(); - for (int iter = 0; iter < num_frames - 1; iter++) { + for (int iter = 0; iter < num_frames; iter++) { 
free(pkt_buf[iter]->payload); free(pkt_buf[iter]); } @@ -905,7 +891,6 @@ static void testapp_teardown(void) for (i = 0; i < MAX_TEARDOWN_ITER; i++) { pkt_counter = 0; - prev_pkt = -1; sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); @@ -933,7 +918,6 @@ static void testapp_bidi(void) { for (int i = 0; i < MAX_BIDI_ITER; i++) { pkt_counter = 0; - prev_pkt = -1; sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); @@ -967,7 +951,6 @@ static void testapp_bpf_res(void) for (i = 0; i < MAX_BPF_ITER; i++) { pkt_counter = 0; - prev_pkt = -1; sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); @@ -1043,7 +1026,6 @@ static void run_pkt_test(int mode, int type) xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; pkt_counter = 0; second_step = 0; - prev_pkt = -1; sigvar = 0; stat_test_type = -1; rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 1c94230c351a..a4371d9e2798 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -34,7 +34,6 @@ #define IP_PKT_TOS 0x9 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) -#define EOT (-1) #define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 @@ -82,7 +81,6 @@ static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; -static long prev_pkt = -1; static int sigvar; static int stat_test_type; static u32 rxqsize; -- cgit v1.2.3 From d40ba9d33ae8ed937234fd12b7303a997406bd87 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:13 +0200 Subject: selftests: xsk: Disassociate umem size with packets sent Disassociate the number of packets sent with the number of buffers in the umem. 
This is so we can loop over the umem to test more things. Set the size of the umem to be a multiple of 2M, which is a requirement for the huge pages that are needed in unaligned mode.
PROT_WRITE, mmap_flags, -1, 0); if (bufs == MAP_FAILED) exit_with_error(errno); while (ctr++ < SOCK_RECONF_CTR) { - xsk_configure_umem(ifobject, bufs, 0); + xsk_configure_umem(ifobject, bufs, umem_sz, 0); ifobject->umem = ifobject->umem_arr[0]; ret = xsk_configure_socket(ifobject, 0); if (!ret) @@ -753,7 +753,7 @@ static void thread_common_ops(struct ifobject *ifobject, void *bufs) ifobject->xsk = ifobject->xsk_arr[0]; if (test_type == TEST_TYPE_BPF_RES) { - xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1); + xsk_configure_umem(ifobject, (u8 *)bufs + umem_sz, umem_sz, 1); ifobject->umem = ifobject->umem_arr[1]; ret = xsk_configure_socket(ifobject, 1); } @@ -1094,8 +1094,6 @@ int main(int argc, char **argv) parse_command_line(argc, argv); - num_frames = ++opt_pkt_count; - init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index a4371d9e2798..131bd998e374 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -70,7 +70,7 @@ enum STAT_TEST_TYPES { static int configured_mode = TEST_MODE_UNCONFIGURED; static u8 debug_pkt_dump; -static u32 num_frames; +static u32 num_frames = DEFAULT_PKT_CNT / 4; static bool second_step; static int test_type; -- cgit v1.2.3 From 9da2ea4fe8d10060a417a71f808df7e825660867 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:14 +0200 Subject: selftests: xsk: Rename worker_* functions that are not thread entry points Rename worker_* functions that are not thread entry points to something else. This was confusing. Now only thread entry points are worker_something. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-9-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index ebed88c13509..17956fdeb49e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -589,7 +589,7 @@ static void tx_only_all(struct ifobject *ifobject) complete_tx_only_all(ifobject); } -static void worker_pkt_dump(void) +static void pkt_dump(void) { struct ethhdr *ethhdr; struct iphdr *iphdr; @@ -631,7 +631,7 @@ static void worker_pkt_dump(void) } } -static void worker_stats_validate(struct ifobject *ifobject) +static void stats_validate(struct ifobject *ifobject) { struct xdp_statistics stats; socklen_t optlen; @@ -673,7 +673,7 @@ static void worker_stats_validate(struct ifobject *ifobject) } } -static void worker_pkt_validate(void) +static void pkt_validate(void) { u32 payloadseqnum = -2; struct iphdr *iphdr; @@ -833,9 +833,9 @@ static void *worker_testapp_validate_rx(void *arg) while (1) { if (test_type != TEST_TYPE_STATS) { rx_pkt(ifobject->xsk, fds); - worker_pkt_validate(); + pkt_validate(); } else { - worker_stats_validate(ifobject); + stats_validate(ifobject); } if (sigvar) break; @@ -873,7 +873,7 @@ static void testapp_validate(void) pthread_join(t0, NULL); if (debug_pkt_dump && test_type != TEST_TYPE_STATS) { - worker_pkt_dump(); + pkt_dump(); for (int iter = 0; iter < num_frames; iter++) { free(pkt_buf[iter]->payload); free(pkt_buf[iter]); -- cgit v1.2.3 From 0d41f59f458a4f08353f7aba64bbc6388a858265 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:15 +0200 Subject: selftests: xsk: Simplify packet validation in xsk tests Simplify packet validation in the xsk selftests by performing it at once for every packet. 
The current code performed this per batch and did this on copied packet data. Make it simpler and faster by validating it at once and on the umem packet data, thus skipping the copy and the memory allocation for the temporary buffer.
fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); + /*extract L5 frame */ + payload = *((uint32_t *)(pkt + PKT_HDR_SIZE)); + + fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); + fprintf(stdout, "---------------------------------------\n"); +} + +static void pkt_validate(void *pkt) +{ + struct iphdr *iphdr = (struct iphdr *)(pkt + sizeof(struct ethhdr)); + + /*do not increment pktcounter if !(tos=0x9 and ipv4) */ + if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { + u32 payloadseqnum = *((uint32_t *)(pkt + PKT_HDR_SIZE)); + + if (debug_pkt_dump && test_type != TEST_TYPE_STATS) + pkt_dump(pkt, PKT_SIZE); + + if (pkt_counter % num_frames != payloadseqnum) { + ksft_test_result_fail + ("ERROR: [%s] expected seqnum [%d], got seqnum [%d]\n", + __func__, pkt_counter, payloadseqnum); + ksft_exit_xfail(); + } + + if (++pkt_counter == opt_pkt_count) + sigvar = 1; + } else { + ksft_print_msg("Invalid frame received: "); + ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, + iphdr->tos); + } +} + static void kick_tx(struct xsk_socket_info *xsk) { int ret; @@ -491,18 +555,7 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) orig = xsk_umem__extract_addr(addr); addr = xsk_umem__add_offset_to_addr(addr); - pkt_node_rx = malloc(sizeof(struct pkt) + PKT_SIZE); - if (!pkt_node_rx) - exit_with_error(errno); - - pkt_node_rx->pkt_frame = malloc(PKT_SIZE); - if (!pkt_node_rx->pkt_frame) - exit_with_error(errno); - - memcpy(pkt_node_rx->pkt_frame, xsk_umem__get_data(xsk->umem->buffer, addr), - PKT_SIZE); - - TAILQ_INSERT_HEAD(&head, pkt_node_rx, pkt_nodes); + pkt_validate(xsk_umem__get_data(xsk->umem->buffer, addr)); *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; } @@ -589,48 +642,6 @@ static void tx_only_all(struct ifobject *ifobject) complete_tx_only_all(ifobject); } -static void pkt_dump(void) -{ - struct ethhdr *ethhdr; - struct iphdr *iphdr; - struct udphdr *udphdr; - char s[128]; - int 
payload; - void *ptr; - - fprintf(stdout, "---------------------------------------\n"); - for (int iter = 0; iter < num_frames; iter++) { - ptr = pkt_buf[iter]->payload; - ethhdr = ptr; - iphdr = ptr + sizeof(*ethhdr); - udphdr = ptr + sizeof(*ethhdr) + sizeof(*iphdr); - - /*extract L2 frame */ - fprintf(stdout, "DEBUG>> L2: dst mac: "); - for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ethhdr->h_dest[i]); - - fprintf(stdout, "\nDEBUG>> L2: src mac: "); - for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ethhdr->h_source[i]); - - /*extract L3 frame */ - fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); - fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", - inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); - fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", - inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); - /*extract L4 frame */ - fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); - fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); - /*extract L5 frame */ - payload = *((uint32_t *)(ptr + PKT_HDR_SIZE)); - - fprintf(stdout, "DEBUG>> L5: payload: %d\n", payload); - fprintf(stdout, "---------------------------------------\n"); - } -} - static void stats_validate(struct ifobject *ifobject) { struct xdp_statistics stats; @@ -673,52 +684,6 @@ static void stats_validate(struct ifobject *ifobject) } } -static void pkt_validate(void) -{ - u32 payloadseqnum = -2; - struct iphdr *iphdr; - - while (1) { - pkt_node_rx_q = TAILQ_LAST(&head, head_s); - if (!pkt_node_rx_q) - break; - - iphdr = (struct iphdr *)(pkt_node_rx_q->pkt_frame + sizeof(struct ethhdr)); - - /*do not increment pktcounter if !(tos=0x9 and ipv4) */ - if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { - payloadseqnum = *((uint32_t *)(pkt_node_rx_q->pkt_frame + PKT_HDR_SIZE)); - if (debug_pkt_dump) { - pkt_obj = malloc(sizeof(*pkt_obj)); - pkt_obj->payload = malloc(PKT_SIZE); - memcpy(pkt_obj->payload, 
pkt_node_rx_q->pkt_frame, PKT_SIZE); - pkt_buf[payloadseqnum] = pkt_obj; - } - - if (pkt_counter % num_frames != payloadseqnum) { - ksft_test_result_fail - ("ERROR: [%s] expected counter [%d], payloadseqnum [%d]\n", - __func__, pkt_counter, payloadseqnum); - ksft_exit_xfail(); - } - - if (++pkt_counter == opt_pkt_count) { - sigvar = 1; - break; - } - } else { - ksft_print_msg("Invalid frame received: "); - ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, - iphdr->tos); - } - - TAILQ_REMOVE(&head, pkt_node_rx_q, pkt_nodes); - free(pkt_node_rx_q->pkt_frame); - free(pkt_node_rx_q); - pkt_node_rx_q = NULL; - } -} - static void thread_common_ops(struct ifobject *ifobject, void *bufs) { u64 umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; @@ -818,13 +783,6 @@ static void *worker_testapp_validate_rx(void *arg) if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) xsk_populate_fill_ring(ifobject->umem); - TAILQ_INIT(&head); - if (debug_pkt_dump) { - pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); - if (!pkt_buf) - exit_with_error(errno); - } - fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); fds[0].events = POLLIN; @@ -833,7 +791,6 @@ static void *worker_testapp_validate_rx(void *arg) while (1) { if (test_type != TEST_TYPE_STATS) { rx_pkt(ifobject->xsk, fds); - pkt_validate(); } else { stats_validate(ifobject); } @@ -872,15 +829,6 @@ static void testapp_validate(void) pthread_join(t1, NULL); pthread_join(t0, NULL); - if (debug_pkt_dump && test_type != TEST_TYPE_STATS) { - pkt_dump(); - for (int iter = 0; iter < num_frames; iter++) { - free(pkt_buf[iter]->payload); - free(pkt_buf[iter]); - } - free(pkt_buf); - } - if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS)) print_ksft_result(); } diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 131bd998e374..0fb657b505ae 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ 
b/tools/testing/selftests/bpf/xdpxceiver.h @@ -139,18 +139,4 @@ static struct ifobject *ifdict_tx; pthread_barrier_t barr; pthread_t t0, t1; -TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); -struct head_s *head_p; -struct pkt { - char *pkt_frame; - - TAILQ_ENTRY(pkt) pkt_nodes; -} *pkt_node_rx, *pkt_node_rx_q; - -struct pkt_frame { - char *payload; -} *pkt_obj; - -struct pkt_frame **pkt_buf; - #endif /* XDPXCEIVER_H */ -- cgit v1.2.3 From b04fdc4ce31fe5ae411737ec6705fcdfc493d6c8 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:16 +0200 Subject: selftests: xsk: Validate tx stats on tx thread Validate the tx stats on the Tx thread instead of the Rx thread. Depending on your settings, you might not be allowed to query the statistics of a socket you do not own, so better to do this on the correct thread to start with. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-11-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 55 ++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index fe3d281a0575..8ff24472ef1e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -642,23 +642,22 @@ static void tx_only_all(struct ifobject *ifobject) complete_tx_only_all(ifobject); } -static void stats_validate(struct ifobject *ifobject) +static bool rx_stats_are_valid(struct ifobject *ifobject) { + u32 xsk_stat = 0, expected_stat = opt_pkt_count; + struct xsk_socket *xsk = ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); struct xdp_statistics stats; socklen_t optlen; int err; - struct xsk_socket *xsk = stat_test_type == STAT_TEST_TX_INVALID ? 
- ifdict[!ifobject->ifdict_index]->xsk->xsk : - ifobject->xsk->xsk; - int fd = xsk_socket__fd(xsk); - unsigned long xsk_stat = 0, expected_stat = opt_pkt_count; - - sigvar = 0; optlen = sizeof(stats); err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); - if (err) - return; + if (err) { + ksft_test_result_fail("ERROR: [%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return true; + } if (optlen == sizeof(struct xdp_statistics)) { switch (stat_test_type) { @@ -666,8 +665,7 @@ static void stats_validate(struct ifobject *ifobject) xsk_stat = stats.rx_dropped; break; case STAT_TEST_TX_INVALID: - xsk_stat = stats.tx_invalid_descs; - break; + return true; case STAT_TEST_RX_FULL: xsk_stat = stats.rx_ring_full; expected_stat -= RX_FULL_RXQSIZE; @@ -680,8 +678,33 @@ static void stats_validate(struct ifobject *ifobject) } if (xsk_stat == expected_stat) - sigvar = 1; + return true; + } + + return false; +} + +static void tx_stats_validate(struct ifobject *ifobject) +{ + struct xsk_socket *xsk = ifobject->xsk->xsk; + int fd = xsk_socket__fd(xsk); + struct xdp_statistics stats; + socklen_t optlen; + int err; + + optlen = sizeof(stats); + err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); + if (err) { + ksft_test_result_fail("ERROR: [%s] getsockopt(XDP_STATISTICS) error %u %s\n", + __func__, -err, strerror(-err)); + return; } + + if (stats.tx_invalid_descs == opt_pkt_count) + return; + + ksft_test_result_fail("ERROR: [%s] tx_invalid_descs incorrect. 
Got [%u] expected [%u]\n", + __func__, stats.tx_invalid_descs, opt_pkt_count); } static void thread_common_ops(struct ifobject *ifobject, void *bufs) @@ -767,6 +790,9 @@ static void *worker_testapp_validate_tx(void *arg) print_verbose("Sending %d packets on interface %s\n", opt_pkt_count, ifobject->ifname); tx_only_all(ifobject); + if (stat_test_type == STAT_TEST_TX_INVALID) + tx_stats_validate(ifobject); + testapp_cleanup_xsk_res(ifobject); pthread_exit(NULL); } @@ -792,7 +818,8 @@ static void *worker_testapp_validate_rx(void *arg) if (test_type != TEST_TYPE_STATS) { rx_pkt(ifobject->xsk, fds); } else { - stats_validate(ifobject); + if (rx_stats_are_valid(ifobject)) + break; } if (sigvar) break; -- cgit v1.2.3 From ab7c95abb5f9d05470ede8e75a105c81c2dbf2c1 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:17 +0200 Subject: selftests: xsk: Decrease sending speed Decrease sending speed to avoid potentially overflowing some buffers in the skb case that leads to dropped packets we cannot control (and thus the tests may generate false negatives). Decrease batch size and introduce a usleep in the transmit thread to not overflow the receiver. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-12-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 1 + tools/testing/selftests/bpf/xdpxceiver.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 8ff24472ef1e..bc7d6bbbb867 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -637,6 +637,7 @@ static void tx_only_all(struct ifobject *ifobject) tx_only(ifobject->xsk, &frame_nb, batch_size); pkt_cnt += batch_size; + usleep(10); } complete_tx_only_all(ifobject); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 0fb657b505ae..1c5457e9f1d6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -36,7 +36,7 @@ #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) #define USLEEP_MAX 10000 #define SOCK_RECONF_CTR 10 -#define BATCH_SIZE 64 +#define BATCH_SIZE 8 #define POLL_TMOUT 1000 #define DEFAULT_PKT_CNT (4 * 1024) #define RX_FULL_RXQSIZE 32 -- cgit v1.2.3 From 1034b03e54ac80f093619ead6c3b77d0c3086a2b Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:18 +0200 Subject: selftests: xsk: Simplify cleanup of ifobjects Simpify the cleanup of ifobjects right before the program exits by introducing functions for creating and destroying these objects. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-13-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 72 ++++++++++++++++++-------------- tools/testing/selftests/bpf/xdpxceiver.h | 1 - 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index bc7d6bbbb867..5e586a696742 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -1039,62 +1039,70 @@ static void run_pkt_test(int mode, int type) } } +static struct ifobject *ifobject_create(void) +{ + struct ifobject *ifobj; + + ifobj = calloc(1, sizeof(struct ifobject)); + if (!ifobj) + return NULL; + + ifobj->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *)); + if (!ifobj->xsk_arr) + goto out_xsk_arr; + + ifobj->umem_arr = calloc(2, sizeof(struct xsk_umem_info *)); + if (!ifobj->umem_arr) + goto out_umem_arr; + + return ifobj; + +out_umem_arr: + free(ifobj->xsk_arr); +out_xsk_arr: + free(ifobj); + return NULL; +} + +static void ifobject_delete(struct ifobject *ifobj) +{ + free(ifobj->umem_arr); + free(ifobj->xsk_arr); + free(ifobj); +} + int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; - bool failure = false; int i, j; if (setrlimit(RLIMIT_MEMLOCK, &_rlim)) exit_with_error(errno); - for (int i = 0; i < MAX_INTERFACES; i++) { - ifdict[i] = malloc(sizeof(struct ifobject)); + for (i = 0; i < MAX_INTERFACES; i++) { + ifdict[i] = ifobject_create(); if (!ifdict[i]) - exit_with_error(errno); - - ifdict[i]->ifdict_index = i; - ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *)); - if (!ifdict[i]->xsk_arr) { - failure = true; - goto cleanup; - } - ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *)); - if (!ifdict[i]->umem_arr) { - failure = true; - goto cleanup; - } + exit_with_error(ENOMEM); } setlocale(LC_ALL, ""); 
parse_command_line(argc, argv); - init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); - init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); + init_iface(ifdict[tx], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); + init_iface(ifdict[rx], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); - for (i = 0; i < TEST_MODE_MAX; i++) { + for (i = 0; i < TEST_MODE_MAX; i++) for (j = 0; j < TEST_TYPE_MAX; j++) { run_pkt_test(i, j); usleep(USLEEP_MAX); } - } -cleanup: - for (int i = 0; i < MAX_INTERFACES; i++) { - if (ifdict[i]->ns_fd != -1) - close(ifdict[i]->ns_fd); - free(ifdict[i]->xsk_arr); - free(ifdict[i]->umem_arr); - free(ifdict[i]); - } - - if (failure) - exit_with_error(errno); + for (i = 0; i < MAX_INTERFACES; i++) + ifobject_delete(ifdict[i]); ksft_exit_pass(); - return 0; } diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 1c5457e9f1d6..316c3565a99e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -122,7 +122,6 @@ struct ifobject { void *(*func_ptr)(void *arg); struct flow_vector fv; int ns_fd; - int ifdict_index; u32 dst_ip; u32 src_ip; u16 src_port; -- cgit v1.2.3 From 960b6e0153fb383bd634313fbd41bd4813dd73fc Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:19 +0200 Subject: selftests: xsk: Generate packet directly in umem Generate the packet directly in the umem instead of in a temporary buffer that is copied out. Simplifies the code and improves performance. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-14-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 70 +++++++++++++++----------------- tools/testing/selftests/bpf/xdpxceiver.h | 5 --- 2 files changed, 32 insertions(+), 43 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 5e586a696742..433c5c7b1928 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -125,7 +125,7 @@ static void __exit_with_error(int error, const char *file, const char *func, int test_type == TEST_TYPE_STATS ? "Stats" : "",\ test_type == TEST_TYPE_BPF_RES ? "BPF RES" : "")) -static void *memset32_htonl(void *dest, u32 val, u32 size) +static void memset32_htonl(void *dest, u32 val, u32 size) { u32 *ptr = (u32 *)dest; int i; @@ -134,11 +134,6 @@ static void *memset32_htonl(void *dest, u32 val, u32 size) for (i = 0; i < (size & (~0x3)); i += 4) ptr[i >> 2] = val; - - for (; i < size; i++) - ((char *)dest)[i] = ((char *)&val)[i & 3]; - - return dest; } /* @@ -229,13 +224,13 @@ static void gen_ip_hdr(struct ifobject *ifobject, struct iphdr *ip_hdr) ip_hdr->check = 0; } -static void gen_udp_hdr(struct generic_data *data, struct ifobject *ifobject, +static void gen_udp_hdr(u32 payload, void *pkt, struct ifobject *ifobject, struct udphdr *udp_hdr) { udp_hdr->source = htons(ifobject->src_port); udp_hdr->dest = htons(ifobject->dst_port); udp_hdr->len = htons(UDP_PKT_SIZE); - memset32_htonl(pkt_data + PKT_HDR_SIZE, htonl(data->seqnum), UDP_PKT_DATA_SIZE); + memset32_htonl(pkt + PKT_HDR_SIZE, payload, UDP_PKT_DATA_SIZE); } static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) @@ -245,11 +240,6 @@ static void gen_udp_csum(struct udphdr *udp_hdr, struct iphdr *ip_hdr) udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, IPPROTO_UDP, (u16 *)udp_hdr); } -static void gen_eth_frame(struct 
xsk_umem_info *umem, u64 addr) -{ - memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE); -} - static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size, int idx) { struct xsk_umem_config cfg = { @@ -427,6 +417,20 @@ static void parse_command_line(int argc, char **argv) } } +static void pkt_generate(struct ifobject *ifobject, u32 pkt_nb, u64 addr) +{ + void *data = xsk_umem__get_data(ifobject->umem->buffer, addr); + struct udphdr *udp_hdr = + (struct udphdr *)(data + sizeof(struct ethhdr) + sizeof(struct iphdr)); + struct iphdr *ip_hdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + struct ethhdr *eth_hdr = (struct ethhdr *)data; + + gen_udp_hdr(pkt_nb, data, ifobject, udp_hdr); + gen_ip_hdr(ifobject, ip_hdr); + gen_udp_csum(udp_hdr, ip_hdr); + gen_eth_hdr(ifobject, eth_hdr); +} + static void pkt_dump(void *pkt, u32 len) { char s[INET_ADDRSTRLEN]; @@ -464,22 +468,23 @@ static void pkt_dump(void *pkt, u32 len) fprintf(stdout, "---------------------------------------\n"); } -static void pkt_validate(void *pkt) +static void pkt_validate(void *buffer, u64 addr) { - struct iphdr *iphdr = (struct iphdr *)(pkt + sizeof(struct ethhdr)); + void *data = xsk_umem__get_data(buffer, addr); + struct iphdr *iphdr = (struct iphdr *)(data + sizeof(struct ethhdr)); - /*do not increment pktcounter if !(tos=0x9 and ipv4) */ if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { - u32 payloadseqnum = *((uint32_t *)(pkt + PKT_HDR_SIZE)); + u32 seqnum = ntohl(*((u32 *)(data + PKT_HDR_SIZE))); + u32 expected_seqnum = pkt_counter % num_frames; if (debug_pkt_dump && test_type != TEST_TYPE_STATS) - pkt_dump(pkt, PKT_SIZE); + pkt_dump(data, PKT_SIZE); - if (pkt_counter % num_frames != payloadseqnum) { + if (expected_seqnum != seqnum) { ksft_test_result_fail ("ERROR: [%s] expected seqnum [%d], got seqnum [%d]\n", - __func__, pkt_counter, payloadseqnum); - ksft_exit_xfail(); + __func__, expected_seqnum, seqnum); + sigvar = 1; } if (++pkt_counter == 
opt_pkt_count) @@ -488,6 +493,7 @@ static void pkt_validate(void *pkt) ksft_print_msg("Invalid frame received: "); ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, iphdr->tos); + sigvar = 1; } } @@ -555,7 +561,7 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) orig = xsk_umem__extract_addr(addr); addr = xsk_umem__add_offset_to_addr(addr); - pkt_validate(xsk_umem__get_data(xsk->umem->buffer, addr)); + pkt_validate(xsk->umem->buffer, addr); *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; } @@ -564,8 +570,9 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_cons__release(&xsk->rx, rcvd); } -static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) +static void tx_only(struct ifobject *ifobject, u32 *frameptr, int batch_size) { + struct xsk_socket_info *xsk = ifobject->xsk; u32 idx = 0; unsigned int i; bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; @@ -579,6 +586,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; tx_desc->len = len; + pkt_generate(ifobject, *frameptr + i, tx_desc->addr); } xsk_ring_prod__submit(&xsk->tx, batch_size); @@ -635,7 +643,7 @@ static void tx_only_all(struct ifobject *ifobject) continue; } - tx_only(ifobject->xsk, &frame_nb, batch_size); + tx_only(ifobject, &frame_nb, batch_size); pkt_cnt += batch_size; usleep(10); } @@ -768,26 +776,12 @@ static void testapp_cleanup_xsk_res(struct ifobject *ifobj) static void *worker_testapp_validate_tx(void *arg) { - struct udphdr *udp_hdr = - (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); - struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + sizeof(struct ethhdr)); - struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; struct ifobject *ifobject = (struct ifobject *)arg; - struct generic_data data; void *bufs = NULL; if (!second_step) 
thread_common_ops(ifobject, bufs); - for (int i = 0; i < num_frames; i++) { - data.seqnum = i; - gen_udp_hdr(&data, ifobject, udp_hdr); - gen_ip_hdr(ifobject, ip_hdr); - gen_udp_csum(udp_hdr, ip_hdr); - gen_eth_hdr(ifobject, eth_hdr); - gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); - } - print_verbose("Sending %d packets on interface %s\n", opt_pkt_count, ifobject->ifname); tx_only_all(ifobject); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 316c3565a99e..7670df7e7746 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -79,7 +79,6 @@ static u8 opt_verbose; static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; -static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; static u32 pkt_counter; static int sigvar; static int stat_test_type; @@ -108,10 +107,6 @@ struct flow_vector { } vector; }; -struct generic_data { - u32 seqnum; -}; - struct ifobject { char ifname[MAX_INTERFACE_NAME_CHARS]; char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; -- cgit v1.2.3 From 29f128b38b346a16dc6749b66f20fca29430d271 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:20 +0200 Subject: selftests: xsk: Generate packets from specification Generate packets from a specification instead of something hard coded. The idea is that a test generates one or more packet specifications and provides it/them to both Tx and Rx. The Tx thread will generate from this specification and Rx will validate that it receives what is in the specification. The specification can be the same on both ends, meaning that everything that was sent should be received, or different which means that Rx will only receive part of the sent packets. Currently, the packet specification is the same for both Rx and Tx and the same for each test. This will change in later work as features and tests are added. 
The data path functions are also renamed to better reflect what actions they are performing after introducing this feature. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-15-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 271 +++++++++++++++++-------------- tools/testing/selftests/bpf/xdpxceiver.h | 16 +- 2 files changed, 166 insertions(+), 121 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 433c5c7b1928..5ca853cf27a1 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -417,18 +417,59 @@ static void parse_command_line(int argc, char **argv) } } -static void pkt_generate(struct ifobject *ifobject, u32 pkt_nb, u64 addr) +static struct pkt *pkt_stream_get_pkt(struct pkt_stream *pkt_stream, u32 pkt_nb) { - void *data = xsk_umem__get_data(ifobject->umem->buffer, addr); - struct udphdr *udp_hdr = - (struct udphdr *)(data + sizeof(struct ethhdr) + sizeof(struct iphdr)); - struct iphdr *ip_hdr = (struct iphdr *)(data + sizeof(struct ethhdr)); - struct ethhdr *eth_hdr = (struct ethhdr *)data; + if (pkt_nb >= pkt_stream->nb_pkts) + return NULL; + + return &pkt_stream->pkts[pkt_nb]; +} + +static struct pkt_stream *pkt_stream_generate(u32 nb_pkts, u32 pkt_len) +{ + struct pkt_stream *pkt_stream; + u32 i; + + pkt_stream = malloc(sizeof(*pkt_stream)); + if (!pkt_stream) + exit_with_error(ENOMEM); + + pkt_stream->pkts = calloc(nb_pkts, sizeof(*pkt_stream->pkts)); + if (!pkt_stream->pkts) + exit_with_error(ENOMEM); + + pkt_stream->nb_pkts = nb_pkts; + for (i = 0; i < nb_pkts; i++) { + pkt_stream->pkts[i].addr = (i % num_frames) * XSK_UMEM__DEFAULT_FRAME_SIZE; + pkt_stream->pkts[i].len = pkt_len; + pkt_stream->pkts[i].payload = i; + } + + return pkt_stream; +} + +static struct pkt *pkt_generate(struct ifobject *ifobject, u32 pkt_nb) +{ + struct pkt *pkt = 
pkt_stream_get_pkt(ifobject->pkt_stream, pkt_nb); + struct udphdr *udp_hdr; + struct ethhdr *eth_hdr; + struct iphdr *ip_hdr; + void *data; + + if (!pkt) + return NULL; + + data = xsk_umem__get_data(ifobject->umem->buffer, pkt->addr); + udp_hdr = (struct udphdr *)(data + sizeof(struct ethhdr) + sizeof(struct iphdr)); + ip_hdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + eth_hdr = (struct ethhdr *)data; gen_udp_hdr(pkt_nb, data, ifobject, udp_hdr); gen_ip_hdr(ifobject, ip_hdr); gen_udp_csum(udp_hdr, ip_hdr); gen_eth_hdr(ifobject, eth_hdr); + + return pkt; } static void pkt_dump(void *pkt, u32 len) @@ -468,33 +509,43 @@ static void pkt_dump(void *pkt, u32 len) fprintf(stdout, "---------------------------------------\n"); } -static void pkt_validate(void *buffer, u64 addr) +static bool is_pkt_valid(struct pkt *pkt, void *buffer, const struct xdp_desc *desc) { - void *data = xsk_umem__get_data(buffer, addr); + void *data = xsk_umem__get_data(buffer, desc->addr); struct iphdr *iphdr = (struct iphdr *)(data + sizeof(struct ethhdr)); + if (!pkt) { + ksft_test_result_fail("ERROR: [%s] too many packets received\n", __func__); + return false; + } + if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { u32 seqnum = ntohl(*((u32 *)(data + PKT_HDR_SIZE))); - u32 expected_seqnum = pkt_counter % num_frames; if (debug_pkt_dump && test_type != TEST_TYPE_STATS) pkt_dump(data, PKT_SIZE); - if (expected_seqnum != seqnum) { + if (pkt->len != desc->len) { ksft_test_result_fail - ("ERROR: [%s] expected seqnum [%d], got seqnum [%d]\n", - __func__, expected_seqnum, seqnum); - sigvar = 1; + ("ERROR: [%s] expected length [%d], got length [%d]\n", + __func__, pkt->len, desc->len); + return false; } - if (++pkt_counter == opt_pkt_count) - sigvar = 1; + if (pkt->payload != seqnum) { + ksft_test_result_fail + ("ERROR: [%s] expected seqnum [%d], got seqnum [%d]\n", + __func__, pkt->payload, seqnum); + return false; + } } else { ksft_print_msg("Invalid frame received: "); 
ksft_print_msg("[IP_PKT_VER: %02X], [IP_PKT_TOS: %02X]\n", iphdr->version, iphdr->tos); - sigvar = 1; + return false; } + + return true; } static void kick_tx(struct xsk_socket_info *xsk) @@ -507,7 +558,7 @@ static void kick_tx(struct xsk_socket_info *xsk) exit_with_error(errno); } -static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) +static void complete_pkts(struct xsk_socket_info *xsk, int batch_size) { unsigned int rcvd; u32 idx; @@ -525,116 +576,105 @@ static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) } } -static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) +static void receive_pkts(struct pkt_stream *pkt_stream, struct xsk_socket_info *xsk, + struct pollfd *fds) { - unsigned int rcvd, i; - u32 idx_rx = 0, idx_fq = 0; + u32 idx_rx = 0, idx_fq = 0, rcvd, i, pkt_count = 0; + struct pkt *pkt; int ret; - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); - if (!rcvd) { - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { - ret = poll(fds, 1, POLL_TMOUT); - if (ret < 0) - exit_with_error(-ret); + pkt = pkt_stream_get_pkt(pkt_stream, pkt_count++); + while (pkt) { + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(-ret); + } + continue; } - return; - } - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { - ret = poll(fds, 1, POLL_TMOUT); + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { + ret = poll(fds, 1, POLL_TMOUT); + if (ret < 0) + exit_with_error(-ret); + } + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); - } - for (i = 0; i 
< rcvd; i++) { - u64 addr, orig; + for (i = 0; i < rcvd; i++) { + const struct xdp_desc *desc = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); + u64 addr = desc->addr, orig; - addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; - xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++); - orig = xsk_umem__extract_addr(addr); + orig = xsk_umem__extract_addr(addr); + addr = xsk_umem__add_offset_to_addr(addr); + if (!is_pkt_valid(pkt, xsk->umem->buffer, desc)) + return; - addr = xsk_umem__add_offset_to_addr(addr); - pkt_validate(xsk->umem->buffer, addr); + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; + pkt = pkt_stream_get_pkt(pkt_stream, pkt_count++); + } - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); } - - xsk_ring_prod__submit(&xsk->umem->fq, rcvd); - xsk_ring_cons__release(&xsk->rx, rcvd); } -static void tx_only(struct ifobject *ifobject, u32 *frameptr, int batch_size) +static u32 __send_pkts(struct ifobject *ifobject, u32 pkt_nb) { struct xsk_socket_info *xsk = ifobject->xsk; - u32 idx = 0; - unsigned int i; - bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; - u32 len = tx_invalid_test ? 
XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE; + u32 i, idx; - while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < batch_size) - complete_tx_only(xsk, batch_size); + while (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) < BATCH_SIZE) + complete_pkts(xsk, BATCH_SIZE); - for (i = 0; i < batch_size; i++) { + for (i = 0; i < BATCH_SIZE; i++) { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); + struct pkt *pkt = pkt_generate(ifobject, pkt_nb); - tx_desc->addr = (*frameptr + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; - tx_desc->len = len; - pkt_generate(ifobject, *frameptr + i, tx_desc->addr); - } + if (!pkt) + break; - xsk_ring_prod__submit(&xsk->tx, batch_size); - if (!tx_invalid_test) { - xsk->outstanding_tx += batch_size; - } else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { - kick_tx(xsk); + tx_desc->addr = pkt->addr; + tx_desc->len = pkt->len; + pkt_nb++; } - *frameptr += batch_size; - *frameptr %= num_frames; - complete_tx_only(xsk, batch_size); -} -static int get_batch_size(int pkt_cnt) -{ - if (pkt_cnt + BATCH_SIZE <= opt_pkt_count) - return BATCH_SIZE; + xsk_ring_prod__submit(&xsk->tx, i); + if (stat_test_type != STAT_TEST_TX_INVALID) + xsk->outstanding_tx += i; + else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + complete_pkts(xsk, i); - return opt_pkt_count - pkt_cnt; + return i; } -static void complete_tx_only_all(struct ifobject *ifobject) +static void wait_for_tx_completion(struct xsk_socket_info *xsk) { - bool pending; - - do { - pending = false; - if (ifobject->xsk->outstanding_tx) { - complete_tx_only(ifobject->xsk, BATCH_SIZE); - pending = !!ifobject->xsk->outstanding_tx; - } - } while (pending); + while (xsk->outstanding_tx) + complete_pkts(xsk, BATCH_SIZE); } -static void tx_only_all(struct ifobject *ifobject) +static void send_pkts(struct ifobject *ifobject) { struct pollfd fds[MAX_SOCKS] = { }; - u32 frame_nb = 0; - int pkt_cnt = 0; - int ret; + u32 pkt_cnt = 0; fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); 
fds[0].events = POLLOUT; - while (pkt_cnt < opt_pkt_count) { - int batch_size = get_batch_size(pkt_cnt); + while (pkt_cnt < ifobject->pkt_stream->nb_pkts) { + u32 sent; if (test_type == TEST_TYPE_POLL) { + int ret; + ret = poll(fds, 1, POLL_TMOUT); if (ret <= 0) continue; @@ -643,17 +683,17 @@ static void tx_only_all(struct ifobject *ifobject) continue; } - tx_only(ifobject, &frame_nb, batch_size); - pkt_cnt += batch_size; + sent = __send_pkts(ifobject, pkt_cnt); + pkt_cnt += sent; usleep(10); } - complete_tx_only_all(ifobject); + wait_for_tx_completion(ifobject->xsk); } static bool rx_stats_are_valid(struct ifobject *ifobject) { - u32 xsk_stat = 0, expected_stat = opt_pkt_count; + u32 xsk_stat = 0, expected_stat = ifobject->pkt_stream->nb_pkts; struct xsk_socket *xsk = ifobject->xsk->xsk; int fd = xsk_socket__fd(xsk); struct xdp_statistics stats; @@ -709,11 +749,11 @@ static void tx_stats_validate(struct ifobject *ifobject) return; } - if (stats.tx_invalid_descs == opt_pkt_count) + if (stats.tx_invalid_descs == ifobject->pkt_stream->nb_pkts) return; ksft_test_result_fail("ERROR: [%s] tx_invalid_descs incorrect. 
Got [%u] expected [%u]\n", - __func__, stats.tx_invalid_descs, opt_pkt_count); + __func__, stats.tx_invalid_descs, ifobject->pkt_stream->nb_pkts); } static void thread_common_ops(struct ifobject *ifobject, void *bufs) @@ -782,8 +822,9 @@ static void *worker_testapp_validate_tx(void *arg) if (!second_step) thread_common_ops(ifobject, bufs); - print_verbose("Sending %d packets on interface %s\n", opt_pkt_count, ifobject->ifname); - tx_only_all(ifobject); + print_verbose("Sending %d packets on interface %s\n", ifobject->pkt_stream->nb_pkts, + ifobject->ifname); + send_pkts(ifobject); if (stat_test_type == STAT_TEST_TX_INVALID) tx_stats_validate(ifobject); @@ -809,19 +850,11 @@ static void *worker_testapp_validate_rx(void *arg) pthread_barrier_wait(&barr); - while (1) { - if (test_type != TEST_TYPE_STATS) { - rx_pkt(ifobject->xsk, fds); - } else { - if (rx_stats_are_valid(ifobject)) - break; - } - if (sigvar) - break; - } - - print_verbose("Received %d packets on interface %s\n", - pkt_counter, ifobject->ifname); + if (test_type == TEST_TYPE_STATS) + while (!rx_stats_are_valid(ifobject)) + continue; + else + receive_pkts(ifobject->pkt_stream, ifobject->xsk, fds); if (test_type == TEST_TYPE_TEARDOWN) print_verbose("Destroying socket\n"); @@ -834,10 +867,18 @@ static void testapp_validate(void) { bool bidi = test_type == TEST_TYPE_BIDI; bool bpf = test_type == TEST_TYPE_BPF_RES; + struct pkt_stream *pkt_stream; if (pthread_barrier_init(&barr, NULL, 2)) exit_with_error(errno); + if (stat_test_type == STAT_TEST_TX_INVALID) + pkt_stream = pkt_stream_generate(DEFAULT_PKT_CNT, XSK_UMEM__INVALID_FRAME_SIZE); + else + pkt_stream = pkt_stream_generate(DEFAULT_PKT_CNT, PKT_SIZE); + ifdict_tx->pkt_stream = pkt_stream; + ifdict_rx->pkt_stream = pkt_stream; + /*Spawn RX thread */ pthread_create(&t0, NULL, ifdict_rx->func_ptr, ifdict_rx); @@ -860,8 +901,6 @@ static void testapp_teardown(void) int i; for (i = 0; i < MAX_TEARDOWN_ITER; i++) { - pkt_counter = 0; - sigvar = 0; 
print_verbose("Creating socket\n"); testapp_validate(); } @@ -887,8 +926,6 @@ static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2) static void testapp_bidi(void) { for (int i = 0; i < MAX_BIDI_ITER; i++) { - pkt_counter = 0; - sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); if (!second_step) { @@ -920,8 +957,6 @@ static void testapp_bpf_res(void) int i; for (i = 0; i < MAX_BPF_ITER; i++) { - pkt_counter = 0; - sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); if (!second_step) @@ -949,6 +984,8 @@ static void testapp_stats(void) case STAT_TEST_RX_FULL: rxqsize = RX_FULL_RXQSIZE; break; + case STAT_TEST_TX_INVALID: + continue; default: break; } @@ -994,9 +1031,7 @@ static void run_pkt_test(int mode, int type) /* reset defaults after potential previous test */ xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; - pkt_counter = 0; second_step = 0; - sigvar = 0; stat_test_type = -1; rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 7670df7e7746..3e5394295ac1 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -40,6 +40,7 @@ #define POLL_TMOUT 1000 #define DEFAULT_PKT_CNT (4 * 1024) #define RX_FULL_RXQSIZE 32 +#define XSK_UMEM__INVALID_FRAME_SIZE (XSK_UMEM__DEFAULT_FRAME_SIZE + 1) #define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) @@ -74,13 +75,10 @@ static u32 num_frames = DEFAULT_PKT_CNT / 4; static bool second_step; static int test_type; -static u32 opt_pkt_count = DEFAULT_PKT_CNT; static u8 opt_verbose; static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; -static u32 pkt_counter; -static int sigvar; static int stat_test_type; static u32 rxqsize; static u32 frame_headroom; @@ -107,6 +105,17 @@ struct flow_vector { } vector; }; +struct pkt { + u64 addr; + u32 len; + u32 payload; +}; + +struct pkt_stream { + u32 nb_pkts; + struct pkt *pkts; +}; + struct ifobject { char ifname[MAX_INTERFACE_NAME_CHARS]; char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; @@ -116,6 +125,7 @@ struct ifobject { struct xsk_umem_info *umem; void *(*func_ptr)(void *arg); struct flow_vector fv; + struct pkt_stream *pkt_stream; int ns_fd; u32 dst_ip; u32 src_ip; -- cgit v1.2.3 From 279bdf6b79d5f6a4decbf2699092b55c8c782eec Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:21 +0200 Subject: selftests: xsk: Make enums lower case Make enums lower case as that is the standard. Also drop the unnecessary TEST_MODE_UNCONFIGURED mode. 
Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-16-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 11 +++-------- tools/testing/selftests/bpf/xdpxceiver.h | 9 ++++----- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 5ca853cf27a1..0c7b40d5f4b6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -105,14 +105,9 @@ static const u16 UDP_PORT2 = 2121; static void __exit_with_error(int error, const char *file, const char *func, int line) { - if (configured_mode == TEST_MODE_UNCONFIGURED) { - ksft_exit_fail_msg - ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); - } else { - ksft_test_result_fail - ("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, strerror(error)); - ksft_exit_xfail(); - } + ksft_test_result_fail("[%s:%s:%i]: ERROR: %d/\"%s\"\n", file, func, line, error, + strerror(error)); + ksft_exit_xfail(); } #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 3e5394295ac1..582af3505c15 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -44,14 +44,13 @@ #define print_verbose(x...) 
do { if (opt_verbose) ksft_print_msg(x); } while (0) -enum TEST_MODES { - TEST_MODE_UNCONFIGURED = -1, +enum test_mode { TEST_MODE_SKB, TEST_MODE_DRV, TEST_MODE_MAX }; -enum TEST_TYPES { +enum test_type { TEST_TYPE_NOPOLL, TEST_TYPE_POLL, TEST_TYPE_TEARDOWN, @@ -61,7 +60,7 @@ enum TEST_TYPES { TEST_TYPE_MAX }; -enum STAT_TEST_TYPES { +enum stat_test_type { STAT_TEST_RX_DROPPED, STAT_TEST_TX_INVALID, STAT_TEST_RX_FULL, @@ -69,7 +68,7 @@ enum STAT_TEST_TYPES { STAT_TEST_TYPE_MAX }; -static int configured_mode = TEST_MODE_UNCONFIGURED; +static int configured_mode; static u8 debug_pkt_dump; static u32 num_frames = DEFAULT_PKT_CNT / 4; static bool second_step; -- cgit v1.2.3 From 33a6bef8cf92017ff48e3bd597d7d60652f37b6d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 25 Aug 2021 11:37:22 +0200 Subject: selftests: xsk: Preface options with opt Preface all options with opt_ and make them booleans. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825093722.10219-17-magnus.karlsson@gmail.com --- tools/testing/selftests/bpf/xdpxceiver.c | 6 +++--- tools/testing/selftests/bpf/xdpxceiver.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 0c7b40d5f4b6..f53ce2683f8d 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -395,10 +395,10 @@ static void parse_command_line(int argc, char **argv) interface_index++; break; case 'D': - debug_pkt_dump = 1; + opt_pkt_dump = true; break; case 'v': - opt_verbose = 1; + opt_verbose = true; break; default: usage(basename(argv[0])); @@ -517,7 +517,7 @@ static bool is_pkt_valid(struct pkt *pkt, void *buffer, const struct xdp_desc *d if (iphdr->version == IP_PKT_VER && iphdr->tos == IP_PKT_TOS) { u32 seqnum = ntohl(*((u32 *)(data + PKT_HDR_SIZE))); - if (debug_pkt_dump && test_type != TEST_TYPE_STATS) + if (opt_pkt_dump && 
test_type != TEST_TYPE_STATS) pkt_dump(data, PKT_SIZE); if (pkt->len != desc->len) { diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 582af3505c15..7e49b9fbe25e 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -69,12 +69,12 @@ enum stat_test_type { }; static int configured_mode; -static u8 debug_pkt_dump; +static bool opt_pkt_dump; static u32 num_frames = DEFAULT_PKT_CNT / 4; static bool second_step; static int test_type; -static u8 opt_verbose; +static bool opt_verbose; static u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static u32 xdp_bind_flags = XDP_USE_NEED_WAKEUP | XDP_COPY; -- cgit v1.2.3 From eb18b49ea758ec052ac2a12c6bb204e1e877ec31 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 24 Aug 2021 10:30:07 -0700 Subject: bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt This patch allows the bpf-tcp-cc to call bpf_setsockopt. One use case is to allow a bpf-tcp-cc switching to another cc during init(). For example, when the tcp flow is not ecn ready, the bpf_dctcp can switch to another cc by calling setsockopt(TCP_CONGESTION). During setsockopt(TCP_CONGESTION), the new tcp-cc's init() will be called and this could cause a recursion but it is stopped by the current trampoline's logic (in the prog->active counter). While retiring a bpf-tcp-cc (e.g. in tcp_v[46]_destroy_sock()), the tcp stack calls bpf-tcp-cc's release(). To avoid the retiring bpf-tcp-cc making further changes to the sk, bpf_setsockopt is not available to the bpf-tcp-cc's release(). This will avoid release() making setsockopt() call that will potentially allocate new resources. Although the bpf-tcp-cc already has a more powerful way to read tcp_sock from the PTR_TO_BTF_ID, it is usually expected that bpf_getsockopt and bpf_setsockopt are available together. Thus, bpf_getsockopt() is also added to all tcp_congestion_ops except release(). 
When the old bpf-tcp-cc calls setsockopt(TCP_CONGESTION) to switch to a new cc, the old bpf-tcp-cc will be released by bpf_struct_ops_put(). Thus, this patch also puts the bpf_struct_ops_map after an RCU grace period because the trampoline's image cannot be freed while the old bpf-tcp-cc is still running. bpf-tcp-cc can only access icsk_ca_priv as SCALAR. All of the kernel's tcp-cc implementations also access icsk_ca_priv as SCALAR. The size of icsk_ca_priv has already been raised a few times to avoid extra kmalloc and memory referencing. The only exception is the kernel's tcp_cdg.c, which stores a kmalloc()-ed pointer in icsk_ca_priv. To avoid the old bpf-tcp-cc accidentally overwriting tcp_cdg's pointer value stored in icsk_ca_priv after switching, and without over-complicating the bpf verifier for this one exception in tcp_cdg, this patch does not allow switching to tcp_cdg. If there is a need, bpf_tcp_cdg can be implemented using bpf_sk_storage as the extended storage. The bpf_sk_setsockopt proto has only recently been added and is used in bpf-sockopt and bpf-iter-tcp, so the tcp_cdg limitation is imposed in the same proto instead of adding a new proto specifically for bpf-tcp-cc.
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210824173007.3976921-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 22 +++++++++++++++++++++- net/core/filter.c | 6 ++++++ net/ipv4/bpf_tcp_ca.c | 41 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 70f6fd4fa305..d6731c32864e 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -28,6 +28,7 @@ struct bpf_struct_ops_value { struct bpf_struct_ops_map { struct bpf_map map; + struct rcu_head rcu; const struct bpf_struct_ops *st_ops; /* protect map_update */ struct mutex lock; @@ -622,6 +623,14 @@ bool bpf_struct_ops_get(const void *kdata) return refcount_inc_not_zero(&kvalue->refcnt); } +static void bpf_struct_ops_put_rcu(struct rcu_head *head) +{ + struct bpf_struct_ops_map *st_map; + + st_map = container_of(head, struct bpf_struct_ops_map, rcu); + bpf_map_put(&st_map->map); +} + void bpf_struct_ops_put(const void *kdata) { struct bpf_struct_ops_value *kvalue; @@ -632,6 +641,17 @@ void bpf_struct_ops_put(const void *kdata) st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); - bpf_map_put(&st_map->map); + /* The struct_ops's function may switch to another struct_ops. + * + * For example, bpf_tcp_cc_x->init() may switch to + * another tcp_cc_y by calling + * setsockopt(TCP_CONGESTION, "tcp_cc_y"). + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called + * and its map->refcnt may reach 0 which then free its + * trampoline image while tcp_cc_x is still running. + * + * Thus, a rcu grace period is needed here. 
+ */ + call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); } } diff --git a/net/core/filter.c b/net/core/filter.c index cfbd01167eb5..2e32cee2c469 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5051,6 +5051,12 @@ err_clear: BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { + if (level == SOL_TCP && optname == TCP_CONGESTION) { + if (optlen >= sizeof("cdg") - 1 && + !strncmp("cdg", optval, optlen)) + return -ENOTSUPP; + } + return _bpf_setsockopt(sk, level, optname, optval, optlen); } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 9e41eff4a685..0dcee9df1326 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -10,6 +10,9 @@ #include #include +/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, init), offsetof(struct tcp_congestion_ops, release), @@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 prog_ops_moff(const struct bpf_prog *prog) +{ + const struct btf_member *m; + const struct btf_type *t; + u32 midx; + + midx = prog->expected_attach_type; + t = bpf_tcp_congestion_ops.type; + m = &btf_type_member(t)[midx]; + + return btf_member_bit_offset(t, m) / 8; +} + static const struct bpf_func_proto * bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + /* Does not allow release() to call setsockopt. + * release() is called when the current bpf-tcp-cc + * is retiring. It is not allowed to call + * setsockopt() to make further changes which + * may potentially allocate new resources. 
+ */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + /* Since get/setsockopt is usually expected to + * be available together, disable getsockopt for + * release also to avoid usage surprise. + * The bpf-tcp-cc already has a more powerful way + * to read tcp_sock from the PTR_TO_BTF_ID. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_getsockopt_proto; + return NULL; default: return bpf_base_func_proto(func_id); } @@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata) tcp_unregister_congestion_control(kdata); } -/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ -extern struct bpf_struct_ops bpf_tcp_congestion_ops; - struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, -- cgit v1.2.3 From 700dcf0f447691f35abc7121f234457f90fcfb1c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 24 Aug 2021 10:30:13 -0700 Subject: bpf: selftests: Add sk_state to bpf_tcp_helpers.h Add sk_state define to bpf_tcp_helpers.h. Rename the existing global variable "sk_state" in the kfunc_call test to "sk_state_res". 
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210824173013.3977316-1-kafai@fb.com --- tools/testing/selftests/bpf/bpf_tcp_helpers.h | 1 + tools/testing/selftests/bpf/prog_tests/kfunc_call.c | 2 +- tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index c9f9bdad60c7..b1ede6f0b821 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -31,6 +31,7 @@ enum sk_pacing { struct sock { struct sock_common __sk_common; +#define sk_state __sk_common.skc_state unsigned long sk_pacing_rate; __u32 sk_pacing_status; /* see enum sk_pacing */ } __attribute__((preserve_access_index)); diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 30a7b9b837bf..9611f2bc50df 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -44,7 +44,7 @@ static void test_subprog(void) ASSERT_OK(err, "bpf_prog_test_run(test1)"); ASSERT_EQ(retval, 10, "test1-retval"); ASSERT_NEQ(skel->data->active_res, -1, "active_res"); - ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state"); + ASSERT_EQ(skel->data->sk_state_res, BPF_TCP_CLOSE, "sk_state_res"); kfunc_call_test_subprog__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c index b2dcb7d9cb03..5fbd9e232d44 100644 --- a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c @@ -9,7 +9,7 @@ extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, __u32 c, __u64 d) __ksym; extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; int active_res = 
-1; -int sk_state = -1; +int sk_state_res = -1; int __noinline f1(struct __sk_buff *skb) { @@ -28,7 +28,7 @@ int __noinline f1(struct __sk_buff *skb) if (active) active_res = *active; - sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state; + sk_state_res = bpf_kfunc_call_test3((struct sock *)sk)->sk_state; return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4); } -- cgit v1.2.3 From 3d7789831df9bda0941bda41cface6687b7c3e04 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 24 Aug 2021 10:30:19 -0700 Subject: bpf: selftests: Add connect_to_fd_opts to network_helpers The next test requires to setsockopt(TCP_CONGESTION) before connect(), so a new arg is needed for the connect_to_fd() to specify the cc's name. This patch adds a new "struct network_helper_opts" for the future option needs. It starts with the "cc" and "timeout_ms" option. A new helper connect_to_fd_opts() is added to take the new "const struct network_helper_opts *opts" as an arg. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210824173019.3977910-1-kafai@fb.com --- tools/testing/selftests/bpf/network_helpers.c | 23 +++++++++++++++++++++-- tools/testing/selftests/bpf/network_helpers.h | 6 ++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index d6857683397f..7e9f6375757a 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -218,13 +218,18 @@ static int connect_fd_to_addr(int fd, return 0; } -int connect_to_fd(int server_fd, int timeout_ms) +static const struct network_helper_opts default_opts; + +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) { struct sockaddr_storage addr; struct sockaddr_in *addr_in; socklen_t addrlen, optlen; int fd, type; + if (!opts) + opts = &default_opts; + optlen = sizeof(type); if 
(getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) { log_err("getsockopt(SOL_TYPE)"); @@ -244,7 +249,12 @@ int connect_to_fd(int server_fd, int timeout_ms) return -1; } - if (settimeo(fd, timeout_ms)) + if (settimeo(fd, opts->timeout_ms)) + goto error_close; + + if (opts->cc && opts->cc[0] && + setsockopt(fd, SOL_TCP, TCP_CONGESTION, opts->cc, + strlen(opts->cc) + 1)) goto error_close; if (connect_fd_to_addr(fd, &addr, addrlen)) @@ -257,6 +267,15 @@ error_close: return -1; } +int connect_to_fd(int server_fd, int timeout_ms) +{ + struct network_helper_opts opts = { + .timeout_ms = timeout_ms, + }; + + return connect_to_fd_opts(server_fd, &opts); +} + int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms) { struct sockaddr_storage addr; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index c59a8f6d770b..da7e132657d5 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -17,6 +17,11 @@ typedef __u16 __sum16; #define VIP_NUM 5 #define MAGIC_BYTES 123 +struct network_helper_opts { + const char *cc; + int timeout_ms; +}; + /* ipv4 test vector */ struct ipv4_packet { struct ethhdr eth; @@ -41,6 +46,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str, unsigned int nr_listens); void free_fds(int *fds, unsigned int nr_close_fds); int connect_to_fd(int server_fd, int timeout_ms); +int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms); -- cgit v1.2.3 From 574ee209286755ae57449196bfa11a90d2d724e5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 24 Aug 2021 10:30:26 -0700 Subject: bpf: selftests: Add dctcp fallback test This patch makes the bpf_dctcp test to fallback to cubic by using setsockopt(TCP_CONGESTION) when the tcp 
flow is not ecn ready. It also checks setsockopt() is not available to release(). The settimeo() from the network_helpers.h is used, so the local one is removed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210824173026.3979130-1-kafai@fb.com --- .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 106 ++++++++++++++++----- tools/testing/selftests/bpf/progs/bpf_dctcp.c | 25 +++++ .../selftests/bpf/progs/bpf_dctcp_release.c | 26 +++++ 3 files changed, 134 insertions(+), 23 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp_release.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index efe1e979affb..94e03df69d71 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -4,37 +4,22 @@ #include #include #include +#include "network_helpers.h" #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" #include "bpf_tcp_nogpl.skel.h" +#include "bpf_dctcp_release.skel.h" #define min(a, b) ((a) < (b) ? 
(a) : (b)) +#ifndef ENOTSUPP +#define ENOTSUPP 524 +#endif + static const unsigned int total_bytes = 10 * 1024 * 1024; -static const struct timeval timeo_sec = { .tv_sec = 10 }; -static const size_t timeo_optlen = sizeof(timeo_sec); static int expected_stg = 0xeB9F; static int stop, duration; -static int settimeo(int fd) -{ - int err; - - err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, - timeo_optlen); - if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n", - errno)) - return -1; - - err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec, - timeo_optlen); - if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n", - errno)) - return -1; - - return 0; -} - static int settcpca(int fd, const char *tcp_ca) { int err; @@ -61,7 +46,7 @@ static void *server(void *arg) goto done; } - if (settimeo(fd)) { + if (settimeo(fd, 0)) { err = -errno; goto done; } @@ -114,7 +99,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) } if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) || - settimeo(lfd) || settimeo(fd)) + settimeo(lfd, 0) || settimeo(fd, 0)) goto done; /* bind, listen and start server thread to accept */ @@ -267,6 +252,77 @@ static void test_invalid_license(void) libbpf_set_print(old_print_fn); } +static void test_dctcp_fallback(void) +{ + int err, lfd = -1, cli_fd = -1, srv_fd = -1; + struct network_helper_opts opts = { + .cc = "cubic", + }; + struct bpf_dctcp *dctcp_skel; + struct bpf_link *link = NULL; + char srv_cc[16]; + socklen_t cc_len = sizeof(srv_cc); + + dctcp_skel = bpf_dctcp__open(); + if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) + return; + strcpy(dctcp_skel->rodata->fallback, "cubic"); + if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load")) + goto done; + + link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp); + if (!ASSERT_OK_PTR(link, "dctcp link")) + goto done; + + lfd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(lfd, 0, "lfd") || + !ASSERT_OK(settcpca(lfd, 
"bpf_dctcp"), "lfd=>bpf_dctcp")) + goto done; + + cli_fd = connect_to_fd_opts(lfd, &opts); + if (!ASSERT_GE(cli_fd, 0, "cli_fd")) + goto done; + + srv_fd = accept(lfd, NULL, 0); + if (!ASSERT_GE(srv_fd, 0, "srv_fd")) + goto done; + ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res"); + ASSERT_EQ(dctcp_skel->bss->tcp_cdg_res, -ENOTSUPP, "tcp_cdg_res"); + + err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len); + if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)")) + goto done; + ASSERT_STREQ(srv_cc, "cubic", "srv_fd cc"); + +done: + bpf_link__destroy(link); + bpf_dctcp__destroy(dctcp_skel); + if (lfd != -1) + close(lfd); + if (srv_fd != -1) + close(srv_fd); + if (cli_fd != -1) + close(cli_fd); +} + +static void test_rel_setsockopt(void) +{ + struct bpf_dctcp_release *rel_skel; + libbpf_print_fn_t old_print_fn; + + err_str = "unknown func bpf_setsockopt"; + found = false; + + old_print_fn = libbpf_set_print(libbpf_debug_print); + rel_skel = bpf_dctcp_release__open_and_load(); + libbpf_set_print(old_print_fn); + + ASSERT_ERR_PTR(rel_skel, "rel_skel"); + ASSERT_TRUE(found, "expected_err_msg"); + + bpf_dctcp_release__destroy(rel_skel); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) @@ -275,4 +331,8 @@ void test_bpf_tcp_ca(void) test_cubic(); if (test__start_subtest("invalid_license")) test_invalid_license(); + if (test__start_subtest("dctcp_fallback")) + test_dctcp_fallback(); + if (test__start_subtest("rel_setsockopt")) + test_rel_setsockopt(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index fd42247da8b4..9573be6122be 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -17,6 +17,11 @@ char _license[] SEC("license") = "GPL"; +volatile const char fallback[TCP_CA_NAME_MAX]; +const char bpf_dctcp[] = "bpf_dctcp"; +const char tcp_cdg[] = "cdg"; +char cc_res[TCP_CA_NAME_MAX]; +int tcp_cdg_res = 0; int 
stg_result = 0; struct { @@ -57,6 +62,26 @@ void BPF_PROG(dctcp_init, struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); int *stg; + if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) { + /* Switch to fallback */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)fallback, sizeof(fallback)); + /* Switch back to myself which the bpf trampoline + * stopped calling dctcp_init recursively. + */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)bpf_dctcp, sizeof(bpf_dctcp)); + /* Switch back to fallback */ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)fallback, sizeof(fallback)); + /* Expecting -ENOTSUPP for tcp_cdg_res */ + tcp_cdg_res = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)tcp_cdg, sizeof(tcp_cdg)); + bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)cc_res, sizeof(cc_res)); + return; + } + ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->loss_cwnd = 0; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c new file mode 100644 index 000000000000..d836f7c372f0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp_release.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "GPL"; +const char cubic[] = "cubic"; + +void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk) +{ + bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + (void *)cubic, sizeof(cubic)); +} + +SEC(".struct_ops") +struct tcp_congestion_ops dctcp_rel = { + .release = (void *)dctcp_nouse_release, + .name = "bpf_dctcp_rel", +}; -- cgit v1.2.3 From eb529c5b10b9401a0f2d1f469e82c6a0ba98082c Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 25 Aug 2021 18:48:31 -0700 Subject: bpf: Fix bpf-next builds without CONFIG_BPF_EVENTS This commit fixes linker errors along the 
lines of: s390-linux-ld: task_iter.c:(.init.text+0xa4): undefined reference to `btf_task_struct_ids'` Fix by defining btf_task_struct_ids unconditionally in kernel/bpf/btf.c since there exists code that unconditionally uses btf_task_struct_ids. Reported-by: kernel test robot Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/05d94748d9f4b3eecedc4fddd6875418a396e23c.1629942444.git.dxu@dxuuu.xyz --- include/linux/btf_ids.h | 1 + kernel/bpf/btf.c | 2 ++ kernel/trace/bpf_trace.c | 2 -- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93d881ab0d48..47d9abfbdb55 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -151,6 +151,7 @@ extern struct btf_id_set name; #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c395024610ed..dfe61df4f974 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6213,3 +6213,5 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; + +BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 580e14ee7ff9..8e2eb950aa82 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -714,8 +714,6 @@ BPF_CALL_0(bpf_get_current_task_btf) return (unsigned long) current; } -BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) - const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, -- cgit v1.2.3 From 
3599bc5101b36d9ff88da17fe65b001aeadd9c62 Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Wed, 25 Aug 2021 11:47:45 -0700 Subject: selftests/bpf: Reduce more flakiness in sockmap_listen This patch adds similar retry logic to more places where read() is used, to reduce flakiness in slow CI environments. Signed-off-by: Yucong Sun Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210825184745.2680830-1-fallentree@fb.com --- .../testing/selftests/bpf/prog_tests/sockmap_listen.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 6a5df28f9a3d..5c5979046523 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -949,6 +949,7 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, int err, n; u32 key; char b; + int retries = 100; zero_verdict_count(verd_mapfd); @@ -1001,10 +1002,15 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd, goto close_peer1; if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); - +again: n = read(c0, &b, 1); - if (n < 0) + if (n < 0) { + if (errno == EAGAIN && retries--) { + usleep(1000); + goto again; + } FAIL_ERRNO("%s: read", log_prefix); + } if (n == 0) FAIL("%s: incomplete read", log_prefix); @@ -1926,6 +1932,7 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, int sfd[2]; u32 key; char b; + int retries = 100; zero_verdict_count(verd_mapfd); @@ -1956,9 +1963,15 @@ static void unix_inet_redir_to_connected(int family, int type, int sock_mapfd, if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); +again: n = read(mode == REDIR_INGRESS ?
p0 : c0, &b, 1); - if (n < 0) + if (n < 0) { + if (errno == EAGAIN && retries--) { + usleep(1000); + goto again; + } FAIL_ERRNO("%s: read", log_prefix); + } if (n == 0) FAIL("%s: incomplete read", log_prefix); -- cgit v1.2.3 From 48b2e71c2e53263ebbb6798bbf208e191937e691 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 26 Aug 2021 17:39:10 +0530 Subject: samples: bpf: Fix uninitialized variable in xdp_redirect_cpu While at it, also improve help output when CPU number is greater than possible. Fixes: e531a220cc59 ("samples: bpf: Convert xdp_redirect_cpu to XDP samples helper") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210826120910.454081-1-memxor@gmail.com --- samples/bpf/xdp_redirect_cpu_user.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 631700aef69c..6e25fba64c72 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -141,7 +141,7 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, static int mark_cpus_unavailable(void) { int ret, i, n_cpus = libbpf_num_possible_cpus(); - __u32 invalid_cpu; + __u32 invalid_cpu = n_cpus; for (i = 0; i < n_cpus; i++) { ret = bpf_map_update_elem(avail_fd, &i, @@ -449,8 +449,9 @@ int main(int argc, char **argv) add_cpu = strtoul(optarg, NULL, 0); if (add_cpu >= n_cpus) { fprintf(stderr, - "--cpu nr too large for cpumap err(%d):%s\n", + "--cpu nr too large for cpumap err (%d):%s\n", errno, strerror(errno)); + usage(argv, long_options, __doc__, mask, true, skel->obj); goto end_cpu; } cpu[added_cpus++] = add_cpu; -- cgit v1.2.3 From 47bb27a20d6ea22cd092c1fc2bb4fcecac374838 Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Fri, 27 Aug 2021 00:41:40 -0700 Subject: selftests/bpf: Fix potential unreleased lock This lock is not released if the program returns at the patched branch.
Signed-off-by: Chengfeng Ye Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210827074140.118671-1-cyeaa@connect.ust.hk --- tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index ec281b0363b8..86f97681ad89 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -195,8 +195,10 @@ static void run_test(int cgroup_fd) pthread_mutex_lock(&server_started_mtx); if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread, - (void *)&server_fd))) + (void *)&server_fd))) { + pthread_mutex_unlock(&server_started_mtx); goto close_server_fd; + } pthread_cond_wait(&server_started, &server_started_mtx); pthread_mutex_unlock(&server_started_mtx); -- cgit v1.2.3 From fca35b11e18a9d854cda6b18ed39a78011f4b082 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 27 Aug 2021 16:49:05 +0530 Subject: MAINTAINERS: Remove self from powerpc BPF JIT Stepping down as I haven't had a chance to look into the powerpc BPF JIT compilers for a while. Signed-off-by: Sandipan Das Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210827111905.396145-1-sandipan@linux.ibm.com --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e23f2d8bd7aa..bc88786ea13a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3409,7 +3409,6 @@ F: drivers/net/ethernet/netronome/nfp/bpf/ BPF JIT for POWERPC (32-BIT AND 64-BIT) M: Naveen N. Rao -M: Sandipan Das L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Maintained -- cgit v1.2.3