From 2c860a43dd77f969bb959336a2f743d7103a8f63 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 14 Aug 2021 10:57:15 +0900 Subject: bpf: af_unix: Implement BPF iterator for UNIX domain socket. This patch implements the BPF iterator for the UNIX domain socket. Currently, the batch optimisation introduced for the TCP iterator in the commit 04c7820b776f ("bpf: tcp: Bpf iter batching and lock_sock") is not used for the UNIX domain socket. It will require replacing the big lock for the hash table with small locks for each hash list not to block other processes. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210814015718.42704-2-kuniyu@amazon.co.jp --- net/unix/af_unix.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1c2224f05b51..bad8f19174e3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -113,6 +113,7 @@ #include #include #include +#include #include "scm.h" @@ -3143,6 +3144,64 @@ static const struct seq_operations unix_seq_ops = { .stop = unix_seq_stop, .show = unix_seq_show, }; + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__unix { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct unix_sock *, unix_sk); + uid_t uid __aligned(8); +}; + +static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) +{ + struct bpf_iter__unix ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.unix_sk = unix_sk; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + 
meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return unix_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)unix_prog_seq_show(prog, &meta, v, 0); + } + + unix_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_unix_seq_ops = { + .start = unix_seq_start, + .next = unix_seq_next, + .stop = bpf_iter_unix_seq_stop, + .show = bpf_iter_unix_seq_show, +}; +#endif #endif static const struct net_proto_family unix_family_ops = { @@ -3183,6 +3242,35 @@ static struct pernet_operations unix_net_ops = { .exit = unix_net_exit, }; +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) + +static const struct bpf_iter_seq_info unix_seq_info = { + .seq_ops = &bpf_iter_unix_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct seq_net_private), +}; + +static struct bpf_iter_reg unix_reg_info = { + .target = "unix", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__unix, unix_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &unix_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; + if (bpf_iter_reg_target(&unix_reg_info)) + pr_warn("Warning: could not register bpf iterator unix\n"); +} +#endif + static int __init af_unix_init(void) { int rc = -1; @@ -3198,6 +3286,11 @@ static int __init af_unix_init(void) sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); unix_bpf_build_proto(); + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif + out: return rc; 
} -- cgit v1.2.3 From fb7dd8bca0139fd73d3f4a6cd257b11731317ded Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:54 -0700 Subject: bpf: Refactor BPF_PROG_RUN into a function Turn BPF_PROG_RUN into a proper always-inlined function. No functional or performance changes are intended, but it makes it much easier to understand how BPF programs actually get executed. It's more obvious what types and callbacks are expected. Also extra () around input parameters can be dropped, as well as `__` variable prefixes intended to avoid naming collisions, which makes the code simpler to read and write. This refactoring also highlighted one extra issue. BPF_PROG_RUN is both a macro and an enum value (BPF_PROG_RUN == BPF_PROG_TEST_RUN). Turning BPF_PROG_RUN into a function causes a naming-conflict compilation error. So rename BPF_PROG_RUN into lower-case bpf_prog_run(), similar to bpf_prog_run_xdp(), bpf_prog_run_pin_on_cpu(), etc. All existing callers of BPF_PROG_RUN, the macro, are switched to bpf_prog_run() explicitly. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-2-andrii@kernel.org --- net/bpf/test_run.c | 6 +++--- net/core/filter.c | 4 ++-- net/core/ptp_classifier.c | 2 +- net/netfilter/xt_bpf.c | 2 +- net/sched/act_bpf.c | 4 ++-- net/sched/cls_bpf.c | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 4b855af267b1..2eb0e55ef54d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -116,7 +116,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else - *retval = BPF_PROG_RUN(prog, ctx); + *retval = bpf_prog_run(prog, ctx); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); @@ -327,7 +327,7 @@ __bpf_prog_test_run_raw_tp(void *data) struct bpf_raw_tp_test_run_info *info = data; rcu_read_lock(); - info->retval = BPF_PROG_RUN(info->prog, info->ctx); + info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); } @@ -989,7 +989,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; - retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); bpf_test_timer_leave(&t); diff --git a/net/core/filter.c b/net/core/filter.c index 3aca07c44fad..5cf38e8886f1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to BPF_PROG_RUN. 
It returns 0 if the packet should + * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -10115,7 +10115,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); - action = BPF_PROG_RUN(prog, &reuse_kern); + action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index e33fde06d528..dd4cf01d1e0a 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -103,7 +103,7 @@ static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return BPF_PROG_RUN(ptp_insns, skb); + return bpf_prog_run(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 13cf3f9b5938..849ac552a154 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; - return BPF_PROG_RUN(info->filter, skb); + return bpf_prog_run(info->filter, skb); } static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 040807aa15b9..5c36013339e1 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -47,11 +47,11 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, if (at_ingress) { __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); } if (skb_sk_is_prefetched(skb) && filter_res != 
TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 3b472bafdc9d..df19a847829e 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -96,11 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); } if (prog->exts_integrated) { -- cgit v1.2.3 From 77462de14a43f4d98dbd8de0f5743a4e02450b1d Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Mon, 16 Aug 2021 19:03:20 +0000 Subject: af_unix: Add read_sock for stream socket types To support sockmap for af_unix stream type, implement read_sock, which is similar to the read_sock for unix dgram sockets. Signed-off-by: Jiang Wang Signed-off-by: Andrii Nakryiko Reviewed-by: Cong Wang Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210816190327.2739291-2-jiang.wang@bytedance.com --- net/unix/af_unix.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bad8f19174e3..4455b62317d4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -679,6 +679,8 @@ static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -732,6 +734,7 @@ 
static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, + .read_sock = unix_stream_read_sock, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -2491,6 +2494,15 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } #endif +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + return -ENOTCONN; + + return unix_read_sock(sk, desc, recv_actor); +} + static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { -- cgit v1.2.3 From 94531cfcbe79c3598acf96806627b2137ca32eb9 Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Mon, 16 Aug 2021 19:03:21 +0000 Subject: af_unix: Add unix_stream_proto for sockmap Previously, sockmap for the AF_UNIX protocol only supported the dgram type. This patch adds unix stream type support, which is similar to unix_dgram_proto. To support sockmap, dgram and stream cannot share the same unix_proto anymore, because they have different implementations, such as unhash for stream type (which will remove closed or disconnected sockets from the map), so rename unix_proto to unix_dgram_proto and add a new unix_stream_proto. Also implement stream-related sockmap functions, and add the dgram keyword to the names of the dgram-specific functions. 
Signed-off-by: Jiang Wang Signed-off-by: Andrii Nakryiko Reviewed-by: Cong Wang Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210816190327.2739291-3-jiang.wang@bytedance.com --- net/core/sock_map.c | 1 + net/unix/af_unix.c | 83 +++++++++++++++++++++++++++++++++++++++-------- net/unix/unix_bpf.c | 93 ++++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 142 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ae5fa4338d9c..e252b8ec2b85 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1494,6 +1494,7 @@ void sock_map_unhash(struct sock *sk) rcu_read_unlock(); saved_unhash(sk); } +EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_close(struct sock *sk, long timeout) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4455b62317d4..443c49081636 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -798,17 +798,35 @@ static void unix_close(struct sock *sk, long timeout) */ } -struct proto unix_proto = { - .name = "UNIX", +static void unix_unhash(struct sock *sk) +{ + /* Nothing to do here, unix socket does not need a ->unhash(). + * This is merely for sockmap. 
+ */ +} + +struct proto unix_dgram_proto = { + .name = "UNIX-DGRAM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct unix_sock), + .close = unix_close, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = unix_dgram_bpf_update_proto, +#endif +}; + +struct proto unix_stream_proto = { + .name = "UNIX-STREAM", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, + .unhash = unix_unhash, #ifdef CONFIG_BPF_SYSCALL - .psock_update_sk_prot = unix_bpf_update_proto, + .psock_update_sk_prot = unix_stream_bpf_update_proto, #endif }; -static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { struct sock *sk = NULL; struct unix_sock *u; @@ -817,7 +835,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); + if (type == SOCK_STREAM) + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); + else /*dgram and seqpacket */ + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); + if (!sk) goto out; @@ -879,7 +901,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern, sock->type) ? 
0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -1293,7 +1315,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL, 0); + newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); if (newsk == NULL) goto out; @@ -2323,8 +2345,10 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si struct sock *sk = sock->sk; #ifdef CONFIG_BPF_SYSCALL - if (sk->sk_prot != &unix_proto) - return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_dgram_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, NULL); #endif return __unix_dgram_recvmsg(sk, msg, size, flags); @@ -2728,6 +2752,20 @@ static int unix_stream_read_actor(struct sk_buff *skb, return ret ?: chunk; } +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sk->sk_socket, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state, true); +} + static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -2739,6 +2777,14 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, .flags = flags }; +#ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_stream_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, NULL); +#endif return unix_stream_read_generic(&state, true); } @@ -2799,7 +2845,9 @@ static int unix_shutdown(struct socket *sock, int mode) (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; + const struct proto *prot = READ_ONCE(other->sk_prot); + 
prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) @@ -2808,10 +2856,12 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - if (peer_mode == SHUTDOWN_MASK) + if (peer_mode == SHUTDOWN_MASK) { sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); - else if (peer_mode & RCV_SHUTDOWN) + other->sk_state = TCP_CLOSE; + } else if (peer_mode & RCV_SHUTDOWN) { sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); + } } if (other) sock_put(other); @@ -3289,7 +3339,13 @@ static int __init af_unix_init(void) BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - rc = proto_register(&unix_proto, 1); + rc = proto_register(&unix_dgram_proto, 1); + if (rc != 0) { + pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); + goto out; + } + + rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; @@ -3310,7 +3366,8 @@ out: static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); - proto_unregister(&unix_proto); + proto_unregister(&unix_dgram_proto); + proto_unregister(&unix_stream_proto); unregister_pernet_subsys(&unix_net_ops); } diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 20f53575b5c9..b927e2baae50 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -38,9 +38,18 @@ static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } -static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, - int *addr_len) +static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + if (sk->sk_type == SOCK_DGRAM) + return __unix_dgram_recvmsg(sk, msg, len, flags); + else + return __unix_stream_recvmsg(sk, msg, len, flags); +} + +static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, 
int flags, + int *addr_len) { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; @@ -48,14 +57,14 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, psock = sk_psock_get(sk); if (unlikely(!psock)) - return __unix_dgram_recvmsg(sk, msg, len, flags); + return __unix_recvmsg(sk, msg, len, flags); mutex_lock(&u->iolock); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { mutex_unlock(&u->iolock); sk_psock_put(sk, psock); - return __unix_dgram_recvmsg(sk, msg, len, flags); + return __unix_recvmsg(sk, msg, len, flags); } msg_bytes_ready: @@ -71,7 +80,7 @@ msg_bytes_ready: goto msg_bytes_ready; mutex_unlock(&u->iolock); sk_psock_put(sk, psock); - return __unix_dgram_recvmsg(sk, msg, len, flags); + return __unix_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } @@ -80,30 +89,55 @@ msg_bytes_ready: return copied; } -static struct proto *unix_prot_saved __read_mostly; -static DEFINE_SPINLOCK(unix_prot_lock); -static struct proto unix_bpf_prot; +static struct proto *unix_dgram_prot_saved __read_mostly; +static DEFINE_SPINLOCK(unix_dgram_prot_lock); +static struct proto unix_dgram_bpf_prot; + +static struct proto *unix_stream_prot_saved __read_mostly; +static DEFINE_SPINLOCK(unix_stream_prot_lock); +static struct proto unix_stream_bpf_prot; -static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base) +static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; - prot->recvmsg = unix_dgram_bpf_recvmsg; + prot->recvmsg = unix_bpf_recvmsg; +} + +static void unix_stream_bpf_rebuild_protos(struct proto *prot, + const struct proto *base) +{ + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = unix_bpf_recvmsg; + prot->unhash = sock_map_unhash; +} + +static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops) +{ + if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) { + 
spin_lock_bh(&unix_dgram_prot_lock); + if (likely(ops != unix_dgram_prot_saved)) { + unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops); + smp_store_release(&unix_dgram_prot_saved, ops); + } + spin_unlock_bh(&unix_dgram_prot_lock); + } } -static void unix_bpf_check_needs_rebuild(struct proto *ops) +static void unix_stream_bpf_check_needs_rebuild(struct proto *ops) { - if (unlikely(ops != smp_load_acquire(&unix_prot_saved))) { - spin_lock_bh(&unix_prot_lock); - if (likely(ops != unix_prot_saved)) { - unix_bpf_rebuild_protos(&unix_bpf_prot, ops); - smp_store_release(&unix_prot_saved, ops); + if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) { + spin_lock_bh(&unix_stream_prot_lock); + if (likely(ops != unix_stream_prot_saved)) { + unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops); + smp_store_release(&unix_stream_prot_saved, ops); } - spin_unlock_bh(&unix_prot_lock); + spin_unlock_bh(&unix_stream_prot_lock); } } -int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) +int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { if (sk->sk_type != SOCK_DGRAM) return -EOPNOTSUPP; @@ -114,12 +148,27 @@ int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) return 0; } - unix_bpf_check_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &unix_bpf_prot); + unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); + WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot); + return 0; +} + +int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) +{ + if (restore) { + sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } + + unix_stream_bpf_check_needs_rebuild(psock->sk_proto); + WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot); return 0; } void __init unix_bpf_build_proto(void) { - unix_bpf_rebuild_protos(&unix_bpf_prot, &unix_proto); + unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, 
&unix_dgram_proto); + unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto); + } -- cgit v1.2.3 From 6cf1770d63dd2d0d0d4048e7b3ee360336c072d9 Mon Sep 17 00:00:00 2001 From: Xu Liu Date: Wed, 18 Aug 2021 18:58:19 +0800 Subject: bpf: Allow bpf_get_netns_cookie in BPF_PROG_TYPE_SOCK_OPS We'd like to be able to identify netns from sockops hooks to accelerate local process communication from different netns. Signed-off-by: Xu Liu Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818105820.91894-2-liuxu623@gmail.com --- net/core/filter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 5cf38e8886f1..59b8f5050180 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4676,6 +4676,18 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; +BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? 
ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { + .func = bpf_get_netns_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -7491,6 +7503,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; -- cgit v1.2.3 From d359902d5c357b280e7a0862bb8a1ba56b3fc197 Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Sat, 21 Aug 2021 18:07:36 +0000 Subject: af_unix: Fix NULL pointer bug in unix_shutdown Commit 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") introduced a bug for the af_unix SEQPACKET type. unix_shutdown() calls prot->unhash(), which is NULL for SEQPACKET, and the kernel will panic. On ARM32, it shows the following messages (it likely affects x86 too). Fix the bug by first checking whether prot->unhash is NULL. 
Kernel log: <--- cut here --- Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = 2fba1ffb *pgd=00000000 Internal error: Oops: 80000005 [#1] PREEMPT SMP THUMB2 Modules linked in: CPU: 1 PID: 1999 Comm: falkon Tainted: G W 5.14.0-rc5-01175-g94531cfcbe79-dirty #9240 Hardware name: NVIDIA Tegra SoC (Flattened Device Tree) PC is at 0x0 LR is at unix_shutdown+0x81/0x1a8 pc : [<00000000>] lr : [] psr: 600f0013 sp : e45aff70 ip : e463a3c0 fp : beb54f04 r10: 00000125 r9 : e45ae000 r8 : c4a56664 r7 : 00000001 r6 : c4a56464 r5 : 00000001 r4 : c4a56400 r3 : 00000000 r2 : c5a6b180 r1 : 00000000 r0 : c4a56400 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 50c5387d Table: 05aa804a DAC: 00000051 Register r0 information: slab PING start c4a56400 pointer offset 0 Register r1 information: NULL pointer Register r2 information: slab task_struct start c5a6b180 pointer offset 0 Register r3 information: NULL pointer Register r4 information: slab PING start c4a56400 pointer offset 0 Register r5 information: non-paged memory Register r6 information: slab PING start c4a56400 pointer offset 100 Register r7 information: non-paged memory Register r8 information: slab PING start c4a56400 pointer offset 612 Register r9 information: non-slab/vmalloc memory Register r10 information: non-paged memory Register r11 information: non-paged memory Register r12 information: slab filp start e463a3c0 pointer offset 0 Process falkon (pid: 1999, stack limit = 0x9ec48895) Stack: (0xe45aff70 to 0xe45b0000) ff60: e45ae000 c5f26a00 00000000 00000125 ff80: c0100264 c07f7fa3 beb54f04 fffffff7 00000001 e6f3fc0e b5e5e9ec beb54ec4 ffa0: b5da0ccc c010024b b5e5e9ec beb54ec4 0000000f 00000000 00000000 beb54ebc ffc0: b5e5e9ec beb54ec4 b5da0ccc 00000125 beb54f58 00785238 beb5529c beb54f04 ffe0: b5da1e24 beb54eac b301385c b62b6ee8 600f0030 0000000f 00000000 00000000 [] (unix_shutdown) from [] (__sys_shutdown+0x2f/0x50) [] (__sys_shutdown) from [] 
(__sys_trace_return+0x1/0x16) Exception stack(0xe45affa8 to 0xe45afff0) Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") Reported-by: Dmitry Osipenko Signed-off-by: Jiang Wang Signed-off-by: Daniel Borkmann Tested-by: Dmitry Osipenko Acked-by: Kuniyuki Iwashima Link: https://lore.kernel.org/bpf/20210821180738.1151155-1-jiang.wang@bytedance.com --- net/unix/af_unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 443c49081636..15c1e4e4012d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2847,7 +2847,8 @@ static int unix_shutdown(struct socket *sock, int mode) int peer_mode = 0; const struct proto *prot = READ_ONCE(other->sk_prot); - prot->unhash(other); + if (prot->unhash) + prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) -- cgit v1.2.3 From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 19 Aug 2021 02:24:20 -0700 Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type. Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots. As a result struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE. bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info. 
To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com --- net/ipv4/af_inet.c | 6 +++--- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 6 +++--- net/ipv6/udp.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0e4d758c2585..1d816a5fd3eb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -452,7 +452,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET4_BIND, &flags); + CGROUP_INET4_BIND, &flags); if (err) return err; @@ -781,7 +781,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETPEERNAME, + CGROUP_INET4_GETPEERNAME, NULL); } else { __be32 addr = inet->inet_rcv_saddr; @@ -790,7 +790,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETSOCKNAME, + CGROUP_INET4_GETSOCKNAME, NULL); } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1a742b710e54..8851c9463b4b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1143,7 +1143,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) diff --git 
a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d92c90d97763..b5878bb8e419 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -455,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET6_BIND, &flags); + CGROUP_INET6_BIND, &flags); if (err) return err; @@ -532,7 +532,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (np->sndflow) sin->sin6_flowinfo = np->flow_label; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETPEERNAME, + CGROUP_INET6_GETPEERNAME, NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) @@ -541,7 +541,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETSOCKNAME, + CGROUP_INET6_GETSOCKNAME, NULL); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c5e15e94bb00..ea53847b5b7e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1475,7 +1475,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) -- cgit v1.2.3 From fab60e29fcc6d60396da20d63d45fd0d305ba4e4 Mon Sep 17 00:00:00 2001 From: Xu Liu Date: Fri, 20 Aug 2021 15:17:11 +0800 Subject: bpf: Allow bpf_get_netns_cookie in BPF_PROG_TYPE_SK_MSG We'd like to be able to identify netns from sk_msg hooks to accelerate local process communication from different netns. 
Signed-off-by: Xu Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820071712.52852-2-liuxu623@gmail.com --- net/core/filter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 59b8f5050180..cfbd01167eb5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4688,6 +4688,18 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; +BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { + .func = bpf_get_netns_cookie_sk_msg, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -7551,6 +7563,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sk_msg_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; -- cgit v1.2.3 From eb18b49ea758ec052ac2a12c6bb204e1e877ec31 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 24 Aug 2021 10:30:07 -0700 Subject: bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt This patch allows the bpf-tcp-cc to call bpf_setsockopt. One use case is to allow a bpf-tcp-cc switching to another cc during init(). For example, when the tcp flow is not ecn ready, the bpf_dctcp can switch to another cc by calling setsockopt(TCP_CONGESTION). During setsockopt(TCP_CONGESTION), the new tcp-cc's init() will be called and this could cause a recursion but it is stopped by the current trampoline's logic (in the prog->active counter). 
While retiring a bpf-tcp-cc (e.g. in tcp_v[46]_destroy_sock()), the tcp stack calls bpf-tcp-cc's release(). To avoid the retiring bpf-tcp-cc making further changes to the sk, bpf_setsockopt is not available to the bpf-tcp-cc's release(). This will avoid release() making a setsockopt() call that will potentially allocate new resources. Although the bpf-tcp-cc already has a more powerful way to read tcp_sock from the PTR_TO_BTF_ID, it is usually expected that bpf_getsockopt and bpf_setsockopt are available together. Thus, bpf_getsockopt() is also added to all tcp_congestion_ops except release(). When the old bpf-tcp-cc is calling setsockopt(TCP_CONGESTION) to switch to a new cc, the old bpf-tcp-cc will be released by bpf_struct_ops_put(). Thus, this patch also puts the bpf_struct_ops_map after an RCU grace period because the trampoline's image cannot be freed while the old bpf-tcp-cc is still running. bpf-tcp-cc can only access icsk_ca_priv as SCALAR. All of the kernel's tcp-cc implementations also access icsk_ca_priv as SCALAR. The size of icsk_ca_priv has already been raised a few times to avoid extra kmalloc and memory referencing. The only exception is the kernel's tcp_cdg.c, which stores a kmalloc()-ed pointer in icsk_ca_priv. To avoid the old bpf-tcp-cc accidentally overriding this tcp_cdg's pointer value stored in icsk_ca_priv after switching and without over-complicating the bpf's verifier for this one exception in tcp_cdg, this patch does not allow switching to tcp_cdg. If there is a need, bpf_tcp_cdg can be implemented and can then use bpf_sk_storage as the extended storage. The bpf_sk_setsockopt proto has only recently been added and used in bpf-sockopt and bpf-iter-tcp, so impose the tcp_cdg limitation in the same proto instead of adding a new proto specifically for bpf-tcp-cc.
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210824173007.3976921-1-kafai@fb.com --- net/core/filter.c | 6 ++++++ net/ipv4/bpf_tcp_ca.c | 41 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index cfbd01167eb5..2e32cee2c469 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5051,6 +5051,12 @@ err_clear: BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { + if (level == SOL_TCP && optname == TCP_CONGESTION) { + if (optlen >= sizeof("cdg") - 1 && + !strncmp("cdg", optval, optlen)) + return -ENOTSUPP; + } + return _bpf_setsockopt(sk, level, optname, optval, optlen); } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 9e41eff4a685..0dcee9df1326 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -10,6 +10,9 @@ #include #include +/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. 
*/ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, init), offsetof(struct tcp_congestion_ops, release), @@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 prog_ops_moff(const struct bpf_prog *prog) +{ + const struct btf_member *m; + const struct btf_type *t; + u32 midx; + + midx = prog->expected_attach_type; + t = bpf_tcp_congestion_ops.type; + m = &btf_type_member(t)[midx]; + + return btf_member_bit_offset(t, m) / 8; +} + static const struct bpf_func_proto * bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + /* Does not allow release() to call setsockopt. + * release() is called when the current bpf-tcp-cc + * is retiring. It is not allowed to call + * setsockopt() to make further changes which + * may potentially allocate new resources. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + /* Since get/setsockopt is usually expected to + * be available together, disable getsockopt for + * release also to avoid usage surprise. + * The bpf-tcp-cc already has a more powerful way + * to read tcp_sock from the PTR_TO_BTF_ID. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_getsockopt_proto; + return NULL; default: return bpf_base_func_proto(func_id); } @@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata) tcp_unregister_congestion_control(kdata); } -/* Avoid sparse warning. It is only used in bpf_struct_ops.c. 
*/ -extern struct bpf_struct_ops bpf_tcp_congestion_ops; - struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, -- cgit v1.2.3