diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 1 | ||||
-rw-r--r-- | kernel/bpf/inode.c | 3 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 733 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 257 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 14 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 9 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 20 |
7 files changed, 975 insertions, 62 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 227db99b0f19..5e49b614d0e6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1526,6 +1526,7 @@ static struct pernet_operations audit_net_ops __net_initdata = { .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), + .async = true, }; /* Initialize audit support at boot time. */ diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 81e2f6995adb..bf6da59ae0d0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -178,6 +178,9 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { + /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future + * extensions. + */ if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a927e89dad6e..69c5bccabd22 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -38,6 +38,7 @@ #include <linux/skbuff.h> #include <linux/workqueue.h> #include <linux/list.h> +#include <linux/mm.h> #include <net/strparser.h> #include <net/tcp.h> @@ -47,6 +48,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; @@ -62,8 +64,7 @@ struct smap_psock_map_entry { struct smap_psock { struct rcu_head rcu; - /* refcnt is used inside sk_callback_lock */ - u32 refcnt; + refcount_t refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -74,7 +75,16 @@ struct smap_psock { int save_off; struct sk_buff *save_skb; + /* datapath variables for tx_msg ULP */ + struct sock *sk_redir; + int apply_bytes; + int cork_bytes; + int sg_size; + int eval; + struct sk_msg_buff *cork; + struct strparser strp; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; @@ -92,6 +102,11 @@ struct smap_psock { void (*save_write_space)(struct sock *sk); }; +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); @@ -116,27 +131,41 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; + + if (psock->bpf_tx_msg) { + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + } + sk->sk_prot = &tcp_bpf_proto; rcu_read_unlock(); return 0; } +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); + static void bpf_tcp_release(struct sock *sk) { struct smap_psock *psock; rcu_read_lock(); psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; - if (likely(psock)) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; } + + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; +out: rcu_read_unlock(); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -175,6 +204,7 @@ enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, + __SK_NONE, }; static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { @@ -186,10 +216,621 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { .release = bpf_tcp_release, }; +static int memcopy_from_iter(struct sock *sk, + struct sk_msg_buff *md, + struct iov_iter *from, int bytes) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_curr, rc = -ENOSPC; + + do { + int copy; + char *to; + + if (md->sg_copybreak >= sg[i].length) { + md->sg_copybreak = 0; + + if (++i == MAX_SKB_FRAGS) + i = 0; + + if (i == md->sg_end) + break; + } + + copy = sg[i].length - md->sg_copybreak; + to = sg_virt(&sg[i]) + md->sg_copybreak; + md->sg_copybreak += copy; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(to, copy, from); + else + rc = copy_from_iter(to, copy, from); + + if (rc != copy) { + rc = -EFAULT; + goto out; + } + + bytes -= copy; + if (!bytes) + break; + + md->sg_copybreak = 0; + if (++i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +out: + md->sg_curr = i; + return rc; +} + +static int bpf_tcp_push(struct sock *sk, int apply_bytes, + struct sk_msg_buff *md, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sg; + int offset, ret = 0; + struct page *p; + size_t size; + + while (1) { + sg = md->sg_data + md->sg_start; + size = (apply && apply_bytes < sg->length) ? + apply_bytes : sg->length; + offset = sg->offset; + + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + if (apply) + apply_bytes -= ret; + size -= ret; + offset += ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + goto retry; + } + + sg->length = size; + sg->offset = offset; + return ret; + } + + if (apply) + apply_bytes -= ret; + sg->offset += ret; + sg->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + + if (!sg->length) { + put_page(p); + md->sg_start++; + if (md->sg_start == MAX_SKB_FRAGS) + md->sg_start = 0; + memset(sg, 0, sizeof(*sg)); + + if (md->sg_start == md->sg_end) + break; + } + + if (apply && !apply_bytes) + break; + } + return 0; +} + +static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data + md->sg_start; + + if (md->sg_copy[md->sg_start]) { + md->data = md->data_end = 0; + } else { + md->data = sg_virt(sg); + md->data_end = md->data + sg->length; + } +} + +static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start; + + do { + int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + if (!bytes) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +} + +static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start, free; + + while (bytes && sg[i].length) { + free = sg[i].length; + if (bytes < free) { + sg[i].length -= bytes; + sg[i].offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + bytes -= sg[i].length; + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } +} + +static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = start, free = 0; + + while (sg[i].length) { + free += sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } + + return free; +} + +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +{ + int free = free_sg(sk, md->sg_start, md); + + md->sg_start = md->sg_end; + return free; +} + +static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) +{ + return free_sg(sk, md->sg_curr, md); +} + +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) +{ + return ((_rc == SK_PASS) ? + (md->map ? __SK_REDIRECT : __SK_PASS) : + __SK_DROP); +} + +static unsigned int smap_do_tx_msg(struct sock *sk, + struct smap_psock *psock, + struct sk_msg_buff *md) +{ + struct bpf_prog *prog; + unsigned int rc, _rc; + + preempt_disable(); + rcu_read_lock(); + + /* If the policy was removed mid-send then default to 'accept' */ + prog = READ_ONCE(psock->bpf_tx_msg); + if (unlikely(!prog)) { + _rc = SK_PASS; + goto verdict; + } + + bpf_compute_data_pointers_sg(md); + rc = (*prog->bpf_func)(md, prog->insnsi); + psock->apply_bytes = md->apply_bytes; + + /* Moving return codes from UAPI namespace into internal namespace */ + _rc = bpf_map_msg_verdict(rc, md); + + /* The psock has a refcount on the sock but not on the map and because + * we need to drop rcu read lock here its possible the map could be + * removed between here and when we need it to execute the sock + * redirect. So do the map lookup now for future use. + */ + if (_rc == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = do_msg_redirect_map(md); + if (!psock->sk_redir) { + _rc = __SK_DROP; + goto verdict; + } + sock_hold(psock->sk_redir); + } +verdict: + rcu_read_unlock(); + preempt_enable(); + + return _rc; +} + +static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, + struct sk_msg_buff *md, + int flags) +{ + struct smap_psock *psock; + struct scatterlist *sg; + int i, err, free = 0; + + sg = md->sg_data; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out_rcu; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto out_rcu; + + rcu_read_unlock(); + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + smap_release_sock(psock, sk); + if (unlikely(err)) + goto out; + return 0; +out_rcu: + rcu_read_unlock(); +out: + i = md->sg_start; + while (sg[i].length) { + free += sg[i].length; + put_page(sg_page(&sg[i])); + sg[i].length = 0; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } + return free; +} + +static inline void bpf_md_init(struct smap_psock *psock) +{ + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } +} + +static void apply_bytes_dec(struct smap_psock *psock, int i) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < i) + psock->apply_bytes = 0; + else + psock->apply_bytes -= i; + } +} + +static int bpf_exec_tx_verdict(struct smap_psock *psock, + struct sk_msg_buff *m, + struct sock *sk, + int *copied, int flags) +{ + bool cork = false, enospc = (m->sg_start == m->sg_end); + struct sock *redir; + int err = 0; + int send; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = smap_do_tx_msg(sk, psock, m); + + if (m->cork_bytes && + m->cork_bytes > psock->sg_size && !enospc) { + psock->cork_bytes = m->cork_bytes - psock->sg_size; + if (!psock->cork) { + psock->cork = kcalloc(1, + sizeof(struct sk_msg_buff), + GFP_ATOMIC | __GFP_NOWARN); + + if (!psock->cork) { + err = -ENOMEM; + goto out_err; + } + } + memcpy(psock->cork, m, sizeof(*m)); + goto out_err; + } + + send = psock->sg_size; + if (psock->apply_bytes && psock->apply_bytes < send) + send = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = bpf_tcp_push(sk, send, m, flags, true); + if (unlikely(err)) { + *copied -= free_start_sg(sk, m); + break; + } + + apply_bytes_dec(psock, send); + psock->sg_size -= send; + break; + case __SK_REDIRECT: + redir = psock->sk_redir; + apply_bytes_dec(psock, send); + + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + + return_mem_sg(sk, send, m); + release_sock(sk); + + err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); + lock_sock(sk); + + if (cork) { + free_start_sg(sk, m); + kfree(m); + m = NULL; + } + if (unlikely(err)) + *copied -= err; + else + psock->sg_size -= send; + break; + case __SK_DROP: + default: + free_bytes_sg(sk, send, m); + apply_bytes_dec(psock, send); + *copied -= send; + psock->sg_size -= send; + err = -EACCES; + break; + } + + if (likely(!err)) { + bpf_md_init(psock); + if (m && + m->sg_data[m->sg_start].page_link && + m->sg_data[m->sg_start].length) + goto more_data; + } + +out_err: + return err; +} + +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + struct sk_msg_buff md = {0}; + unsigned int sg_copy = 0; + struct smap_psock *psock; + int copied = 0, err = 0; + struct scatterlist *sg; + long timeo; + + /* Its possible a sock event or user removed the psock _but_ the ops + * have not been reprogrammed yet so we get here. In this case fallback + * to tcp_sendmsg. Note this only works because we _only_ ever allow + * a single ULP there is no hierarchy here. + */ + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + /* Increment the psock refcnt to ensure its not released while sending a + * message. Required because sk lookup and bpf programs are used in + * separate rcu critical sections. Its OK if we lose the map entry + * but we can't lose the sock reference. + */ + if (!refcount_inc_not_zero(&psock->refcnt)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + sg = md.sg_data; + sg_init_table(sg, MAX_SKB_FRAGS); + rcu_read_unlock(); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + while (msg_data_left(msg)) { + struct sk_msg_buff *m; + bool enospc = false; + int copy; + + if (sk->sk_err) { + err = sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + m = psock->cork_bytes ? psock->cork : &md; + m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; + err = sk_alloc_sg(sk, copy, m->sg_data, + m->sg_start, &m->sg_end, &sg_copy, + m->sg_end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = sg_copy; + } + + err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); + if (err < 0) { + free_curr_sg(sk, m); + goto out_err; + } + + psock->sg_size += copy; + copied += copy; + sg_copy = 0; + + /* When bytes are being corked skip running BPF program and + * applying verdict unless there is no more buffer space. In + * the ENOSPC case simply run BPF prorgram with currently + * accumulated data. We don't have much choice at this point + * we could try extending the page frags or chaining complex + * frags but even in these cases _eventually_ we will hit an + * OOM scenario. More complex recovery schemes may be + * implemented in the future, but BPF programs must handle + * the case where apply_cork requests are not honored. The + * canonical method to verify this is to check data length. + */ + if (psock->cork_bytes) { + if (copy > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= copy; + + if (psock->cork_bytes && !enospc) + goto out_cork; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); +out_cork: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +} + +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct sk_msg_buff md = {0}, *m = NULL; + int err = 0, copied = 0; + struct smap_psock *psock; + struct scatterlist *sg; + bool enospc = false; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto accept; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto accept; + rcu_read_unlock(); + + lock_sock(sk); + + if (psock->cork_bytes) + m = psock->cork; + else + m = &md; + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(m->sg_end == m->sg_start && + m->sg_data[m->sg_end].length)) + goto out_err; + + psock->sg_size += size; + sg = &m->sg_data[m->sg_end]; + sg_set_page(sg, page, size, offset); + get_page(page); + m->sg_copy[m->sg_end] = true; + sk_mem_charge(sk, size); + m->sg_end++; + copied = size; + + if (m->sg_end == MAX_SKB_FRAGS) + m->sg_end = 0; + + if (m->sg_end == m->sg_start) + enospc = true; + + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + + if (psock->cork_bytes && !enospc) + goto out_err; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); +out_err: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +accept: + rcu_read_unlock(); + return tcp_sendpage(sk, page, offset, size, flags); +} + +static void bpf_tcp_msg_add(struct smap_psock *psock, + struct sock *sk, + struct bpf_prog *tx_msg) +{ + struct bpf_prog *orig_tx_msg; + + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); + if (orig_tx_msg) + bpf_prog_put(orig_tx_msg); +} + static int bpf_tcp_ulp_register(void) { tcp_bpf_proto = tcp_prot; tcp_bpf_proto.close = bpf_tcp_close; + /* Once BPF TX ULP is registered it is never unregistered. It + * will be in the ULP list for the lifetime of the system. Doing + * duplicate registers is not a problem. + */ return tcp_register_ulp(&bpf_tcp_ulp_ops); } @@ -373,15 +1014,13 @@ static void smap_destroy_psock(struct rcu_head *rcu) static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - psock->refcnt--; - if (psock->refcnt) - return; - - tcp_cleanup_ulp(sock); - smap_stop_sock(psock, sock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); + if (refcount_dec_and_test(&psock->refcnt)) { + tcp_cleanup_ulp(sock); + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); + } } static int smap_parse_func_strparser(struct strparser *strp, @@ -415,7 +1054,6 @@ static int smap_parse_func_strparser(struct strparser *strp, return rc; } - static int smap_read_sock_done(struct strparser *strp, int err) { return err; @@ -485,12 +1123,22 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_parse); if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); + if (psock->bpf_tx_msg) + bpf_prog_put(psock->bpf_tx_msg); + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + } list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); } + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sock); kfree(psock); } @@ -506,12 +1154,13 @@ static struct smap_psock *smap_init_psock(struct sock *sock, if (!psock) return ERR_PTR(-ENOMEM); + psock->eval = __SK_NONE; psock->sock = sock; skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); - psock->refcnt = 1; + refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -714,10 +1363,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct smap_psock_map_entry *e = NULL; - struct bpf_prog *verdict, *parse; + struct bpf_prog *verdict, *parse, *tx_msg; struct sock *osock, *sock; struct smap_psock *psock; u32 i = *(u32 *)key; + bool new = false; int err; if (unlikely(flags > BPF_EXIST)) @@ -740,6 +1390,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, */ verdict = READ_ONCE(stab->bpf_verdict); parse = READ_ONCE(stab->bpf_parse); + tx_msg = READ_ONCE(stab->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -758,6 +1409,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } + if (tx_msg) { + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + if (IS_ERR(tx_msg)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(tx_msg); + } + } + write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); @@ -772,7 +1434,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - psock->refcnt++; + if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { + err = -EBUSY; + goto out_progs; + } + if (!refcount_inc_not_zero(&psock->refcnt)) { + err = -EAGAIN; + goto out_progs; + } } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { @@ -780,11 +1449,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_progs; - set_bit(SMAP_TX_RUNNING, &psock->state); + new = true; } e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); @@ -797,6 +1463,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. */ + if (tx_msg) + bpf_tcp_msg_add(psock, sock, tx_msg); + if (new) { + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_free; + } + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) @@ -818,8 +1492,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - if (osock != sock && parse) - smap_stop_sock(opsock, osock); smap_list_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); @@ -832,6 +1504,8 @@ out_progs: bpf_prog_put(verdict); if (parse) bpf_prog_put(parse); + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; @@ -846,6 +1520,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return -EINVAL; switch (type) { + case BPF_SK_MSG_VERDICT: + orig = xchg(&stab->bpf_tx_msg, prog); + break; case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); break; @@ -907,6 +1584,10 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file) orig = xchg(&stab->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); + + orig = xchg(&stab->bpf_tx_msg, NULL); + if (orig) + bpf_prog_put(orig); } const struct bpf_map_ops sock_map_ops = { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b0ecf43f5894..57eeb1234b67 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,16 +9,19 @@ #include <linux/filter.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> +#include <linux/elf.h> +#include <linux/pagemap.h> #include "percpu_freelist.h" -#define STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ + BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; - u64 ip[]; + u64 data[]; }; struct bpf_stack_map { @@ -29,6 +32,17 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +static inline bool stack_map_use_build_id(struct bpf_map *map) +{ + return (map->map_flags & BPF_F_STACK_BUILD_ID); +} + +static inline int stack_map_data_size(struct bpf_map *map) +{ + return stack_map_use_build_id(map) ? + sizeof(struct bpf_stack_build_id) : sizeof(u64); +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; @@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - value_size < 8 || value_size % 8 || - value_size / 8 > sysctl_perf_event_max_stack) + value_size < 8 || value_size % 8) + return ERR_PTR(-EINVAL); + + BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); + if (attr->map_flags & BPF_F_STACK_BUILD_ID) { + if (value_size % sizeof(struct bpf_stack_build_id) || + value_size / sizeof(struct bpf_stack_build_id) + > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -114,13 +136,184 @@ free_smap: return ERR_PTR(err); } +#define BPF_BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. + */ +static inline int stack_map_parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BPF_BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + BPF_BUILD_ID_SIZE); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int stack_map_get_build_id_32(void *page_addr, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int stack_map_get_build_id_64(void *page_addr, + unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +static int stack_map_get_build_id(struct vm_area_struct *vma, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = page_address(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = stack_map_get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = stack_map_get_build_id_64(page_addr, build_id); +out: + put_page(page); + return ret; +} + +static void stack_map_get_build_id_offset(struct bpf_map *map, + struct stack_map_bucket *bucket, + u64 *ips, u32 trace_nr, bool user) +{ + int i; + struct vm_area_struct *vma; + struct bpf_stack_build_id *id_offs; + + bucket->nr = trace_nr; + id_offs = (struct bpf_stack_build_id *)bucket->data; + + /* + * We cannot do up_read() in nmi context, so build_id lookup is + * only supported for non-nmi events. If at some point, it is + * possible to run find_vma() without taking the semaphore, we + * would like to allow build_id lookup in nmi context. + * + * Same fallback is used for kernel stack (!user) on a stackmap + * with build_id. + */ + if (!user || !current || !current->mm || in_nmi() || + down_read_trylock(¤t->mm->mmap_sem) == 0) { + /* cannot access current->mm, fall back to ips */ + for (i = 0; i < trace_nr; i++) { + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + } + return; + } + + for (i = 0; i < trace_nr; i++) { + vma = find_vma(current->mm, ips[i]); + if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + /* per entry fall back to ips */ + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + continue; + } + id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] + - vma->vm_start; + id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + } + up_read(¤t->mm->mmap_sem); +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / 8; + u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, bool user = flags & BPF_F_USER_STACK; bool kernel = !user; u64 *ips; + bool hash_matches; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) @@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); - if (bucket && bucket->hash == hash) { - if (flags & BPF_F_FAST_STACK_CMP) + hash_matches = bucket && bucket->hash == hash; + /* fast cmp */ + if (hash_matches && flags & BPF_F_FAST_STACK_CMP) + return id; + + if (stack_map_use_build_id(map)) { + /* for build_id+offset, pop a bucket before slow cmp */ + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + stack_map_get_build_id_offset(map, new_bucket, ips, + trace_nr, user); + trace_len = trace_nr * sizeof(struct bpf_stack_build_id); + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, new_bucket->data, trace_len) == 0) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; - if (bucket->nr == trace_nr && - memcmp(bucket->ip, ips, trace_len) == 0) + } + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); + return -EEXIST; + } + } else { + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, ips, trace_len) == 0) return id; + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + memcpy(new_bucket->data, ips, trace_len); } - /* this call stack is not in the map, try to add it */ - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; - - new_bucket = (struct stack_map_bucket *) - pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; - - memcpy(new_bucket->ip, ips, trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; @@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) if (!bucket) return -ENOENT; - trace_len = bucket->nr * sizeof(u64); - memcpy(value, bucket->ip, trace_len); + trace_len = bucket->nr * stack_map_data_size(map); + memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 43f95d190eea..dd172ee16716 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1315,7 +1315,8 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +static int sockmap_get_from_fd(const union bpf_attr *attr, + int type, bool attach) { struct bpf_prog *prog = NULL; int ufd = attr->target_fd; @@ -1329,8 +1330,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return PTR_ERR(map); if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_SK_SKB); + prog = bpf_prog_get_type(attr->attach_bpf_fd, type); if (IS_ERR(prog)) { fdput(f); return PTR_ERR(prog); @@ -1382,9 +1382,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, true); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); default: return -EINVAL; } @@ -1437,9 +1439,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c6eff108aa99..e9f7c20691c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -508,10 +508,6 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -#define CALLEE_SAVED_REGS 5 -static const int callee_saved[CALLEE_SAVED_REGS] = { - BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 -}; static void __mark_reg_not_init(struct bpf_reg_state *reg); @@ -1252,6 +1248,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; @@ -2075,7 +2072,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_map) goto error; break; default: @@ -2113,6 +2111,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_redirect_map: + case BPF_FUNC_msg_redirect_map: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 01e6b3a38871..7f9691c86b6e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -738,8 +738,7 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, - sample_period); + const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -750,8 +749,13 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): - bpf_ctx_record_field_size(info, size_sp); - if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + case bpf_ctx_range(struct bpf_perf_event_data, addr): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: @@ -778,6 +782,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; + case offsetof(struct bpf_perf_event_data, addr): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, addr, 8, + target_size)); + break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, |