diff options
-rw-r--r-- | include/uapi/linux/bpf.h | 75 | ||||
-rw-r--r-- | net/core/filter.c | 114 | ||||
-rw-r--r-- | tools/include/uapi/linux/bpf.h | 75 |
3 files changed, 264 insertions, 0 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6b1f6058cccf..4c24daa43bac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3847,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + + * Check ctx packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing an *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, why it is not considered a + * failure. Other BPF-helpers are needed for performing the + * planned size change, why the responsability for catch a negative + * packet size belong in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TX length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. On input *mtu_len* must be a valid + * pointer and be initialized (to zero), else verifier will reject + * BPF program. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4012,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5045,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/net/core/filter.c b/net/core/filter.c index e7a9b1667dd6..439f43f00483 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5637,6 +5637,116 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .arg4_type = ARG_ANYTHING, }; +static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, + u32 ifindex) +{ + struct net *netns = dev_net(dev_curr); + + /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ + if (ifindex == 0) + return dev_curr; + + return dev_get_by_index_rcu(netns, ifindex); +} + +BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + struct net_device *dev = skb->dev; + int skb_len, dev_len; + int mtu; + + if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) + return -EINVAL; + + if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + dev_len = mtu + dev->hard_header_len; + skb_len = skb->len + len_diff; /* minus result pass check */ + if (skb_len <= dev_len) { + ret = BPF_MTU_CHK_RET_SUCCESS; + goto out; + } + /* At this point, skb->len exceed MTU, but as it include length of all + * segments, it can still be below MTU. The SKB can possibly get + * re-segmented in transmit path (see validate_xmit_skb). Thus, user + * must choose if segs are to be MTU checked. + */ + if (skb_is_gso(skb)) { + ret = BPF_MTU_CHK_RET_SUCCESS; + + if (flags & BPF_MTU_CHK_SEGS && + !skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } +out: + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, + u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) +{ + struct net_device *dev = xdp->rxq->dev; + int xdp_len = xdp->data_end - xdp->data; + int ret = BPF_MTU_CHK_RET_SUCCESS; + int mtu, dev_len; + + /* XDP variant doesn't support multi-buffer segment check (yet) */ + if (unlikely(flags)) + return -EINVAL; + + dev = __dev_via_ifindex(dev, ifindex); + if (unlikely(!dev)) + return -ENODEV; + + mtu = READ_ONCE(dev->mtu); + + /* Add L2-header as dev MTU is L3 size */ + dev_len = mtu + dev->hard_header_len; + + xdp_len += len_diff; /* minus result pass check */ + if (xdp_len > dev_len) + ret = BPF_MTU_CHK_RET_FRAG_NEEDED; + + /* BPF verifier guarantees valid pointer */ + *mtu_len = mtu; + + return ret; +} + +static const struct bpf_func_proto bpf_skb_check_mtu_proto = { + .func = bpf_skb_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { + .func = bpf_xdp_check_mtu, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_INT, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { @@ -7222,6 +7332,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -7291,6 +7403,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; + case BPF_FUNC_check_mtu: + return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b1f6058cccf..4c24daa43bac 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3847,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + + * Check ctx packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing an *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, why it is not considered a + * failure. Other BPF-helpers are needed for performing the + * planned size change, why the responsability for catch a negative + * packet size belong in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TX length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. On input *mtu_len* must be a valid + * pointer and be initialized (to zero), else verifier will reject + * BPF program. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4012,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5045,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ |