author     Alexei Starovoitov <ast@kernel.org>    2020-05-14 21:21:57 -0700
committer  Alexei Starovoitov <ast@kernel.org>    2020-05-14 21:50:03 -0700
commit     5cc5924d8315a53e03e7dbfa7a3067cde7a81ded (patch)
tree       0c842b02f62a8b7e2e7bd6f0a91053e652b45f7a
parent     d00f26b623333f2419f4c3b95ff11c8b1bb96f56 (diff)
parent     7ae2e00e8fc23f10169079fadd388317d81012be (diff)
Merge branch 'xdp-grow-tail'
Jesper Dangaard Brouer says:
====================
V4:
- Fixup checkpatch.pl issues
- Collected more ACKs
V3:
- Fix issue on virtio_net patch spotted by Jason Wang
- Adjust name for variable in mlx5 patch
- Collected more ACKs
V2:
- Fix bug in mlx5 for XDP_PASS case
- Collected nitpicks and ACKs from mailing list
V1:
- Fix bug in dpaa2
XDP has evolved to support several frame sizes, but xdp_buff was not
updated with this information. The side-effect is that the hard end of
the XDP frame data area is unknown, which has limited the BPF helper
bpf_xdp_adjust_tail to only shrinking the packet. This patchset
addresses this and adds the ability to extend/grow the packet tail.
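As a minimal sketch (not part of this series), an XDP program can now
pass a positive delta to bpf_xdp_adjust_tail() to grow the packet; per
the helper change below, the grown area is zeroed and the call fails
with -EINVAL if it would cross the reserved tailroom:

    /* Sketch: grow the packet tail by 16 bytes. */
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("xdp")
    int xdp_tail_grow(struct xdp_md *ctx)
    {
            /* Positive delta grows the tail (new with this series);
             * the helper returns -EINVAL if the grow would pass
             * xdp_data_hard_end().
             */
            if (bpf_xdp_adjust_tail(ctx, 16))
                    return XDP_DROP;
            return XDP_PASS;
    }

    char _license[] SEC("license") = "GPL";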
The purpose of the patchset is ALSO to reserve a memory area at the end
of the frame that can be used for storing extra information,
specifically for extending XDP with multi-buffer support. One proposal
is to use the same layout as skb_shared_info, which is why this area is
currently 320 bytes.
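To make the reservation concrete, the series defines the BPF-visible
hard end in include/net/xdp.h as

    #define xdp_data_hard_end(xdp)                          \
            ((xdp)->data_hard_start + (xdp)->frame_sz -     \
             SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

where SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) is 320 bytes on
x86_64. For a driver backing each frame with a full 4K page and
reserving XDP_PACKET_HEADROOM (256 bytes), the maximum data area is
thus 4096 - 256 - 320 = 3520 bytes, the max_grow value checked in the
updated selftest below.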
When converting an xdp_frame to an SKB (veth and cpumap), the full
tailroom area can now be used and the SKB truesize is now correct. For
most drivers this results in a much larger tailroom in the SKB "head"
data area, which the network stack can take advantage of when
coalescing SKBs. Thus, a good driver test is to use xdp_redirect_cpu
from samples/bpf/ and do some TCP stream testing.
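For reference, the cpumap side of that conversion is condensed below
(from the kernel/bpf/cpumap.c hunk in this series); this is a sketch
of kernel context, not standalone code:

    /* cpu_map_build_skb(), condensed: the memory behind the
     * xdp_frame already reserves tailroom for skb_shared_info,
     * so build the SKB around it and get an honest truesize.
     */
    hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
    frame_size = xdpf->frame_sz;
    pkt_data_start = xdpf->data - hard_start_headroom;
    skb = build_skb_around(skb, pkt_data_start, frame_size);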
Use-cases for tail grow/extend:
(1) IPsec / XFRM needs a tail extend[1][2].
(2) DNS-cache responses in XDP.
(3) HAProxy ALOHA would need it to convert to XDP.
(4) Append tail info, e.g. a timestamp, and collect it via tcpdump
    (a sketch follows the reference links).
[1] http://vger.kernel.org/netconf2019_files/xfrm_xdp.pdf
[2] http://vger.kernel.org/netconf2019.html
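A sketch of use-case (4), assuming the fixed 74-byte IPv6 test packet
used by the selftests (the offsets are hypothetical, chosen so the
verifier's packet-bounds checks hold):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("xdp")
    int xdp_append_tstamp(struct xdp_md *ctx)
    {
            void *data, *data_end;
            __u64 *tstamp;

            /* Grow by 8 bytes; skip packet if no tailroom is left */
            if (bpf_xdp_adjust_tail(ctx, 8))
                    return XDP_PASS;

            /* The helper invalidates packet pointers; reload them */
            data = (void *)(long)ctx->data;
            data_end = (void *)(long)ctx->data_end;

            if (data + 74 + sizeof(*tstamp) > data_end)
                    return XDP_PASS;
            tstamp = data + 74; /* old tail of the 74-byte packet */
            *tstamp = bpf_ktime_get_ns();
            return XDP_PASS;
    }

    char _license[] SEC("license") = "GPL";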
An example of how to access the tail area of an XDP packet is shown in
the XDP-tutorial experiment[3].
[3] https://github.com/xdp-project/xdp-tutorial/blob/master/experiment01-tailgrow/
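For completeness, a hypothetical user-space harness in the style of
the updated selftest (using the era's libbpf calls that appear in the
selftest diff below; error handling trimmed):

    #include <stdio.h>
    #include <linux/types.h>
    #include <bpf/bpf.h>
    #include <bpf/libbpf.h>

    int main(void)
    {
            const char *file = "./test_xdp_adjust_tail_grow.o";
            char in[74] = {0}, out[4096];
            __u32 size, retval, duration;
            struct bpf_object *obj;
            int err, prog_fd;

            err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
            if (err)
                    return 1;

            /* A 74-byte input makes the test program grow the tail
             * by 40 bytes, so expect retval XDP_TX and size == 114.
             */
            err = bpf_prog_test_run(prog_fd, 1, in, sizeof(in), out,
                                    &size, &retval, &duration);
            printf("err=%d retval=%u size=%u\n", err, retval, size);
            bpf_object__close(obj);
            return 0;
    }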
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
43 files changed, 451 insertions, 115 deletions
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 2818965427e9..85b87ed02dd5 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1606,6 +1606,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, "%s qid %d\n", __func__, rx_ring->qid); res_budget = budget; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = ENA_PAGE_SIZE; do { xdp_verdict = XDP_PASS; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 7df67bf09b93..680099afcccf 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -151,8 +151,9 @@ * The buffer size we share with the device is defined to be ENA_PAGE_SIZE */ -#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ - VLAN_HLEN - XDP_PACKET_HEADROOM) +#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \ + VLAN_HLEN - XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \ ((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues)) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index c6f6f2033880..5e3b4a3b69ea 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -138,6 +138,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, xdp_set_data_meta_invalid(&xdp); xdp.data_end = *data_ptr + *len; xdp.rxq = &rxr->xdp_rxq; + xdp.frame_sz = PAGE_SIZE; /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */ orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index b4b33368698f..2ba0ce115e63 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -552,6 +552,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = RCV_FRAG_LEN + XDP_PACKET_HEADROOM; orig_data = xdp.data; rcu_read_lock(); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 0f3e842a4fd6..8c8d95aa1dfd 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -331,6 +331,9 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, xdp_set_data_meta_invalid(&xdp); xdp.rxq = &ch->xdp_rxq; + xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE - + (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM); + xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); /* xdp.data pointer may have changed */ @@ -366,7 +369,11 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, dma_unmap_page(priv->net_dev->dev.parent, addr, DPAA2_ETH_RX_BUF_SIZE, DMA_BIDIRECTIONAL); ch->buf_count--; + + /* Allow redirect use of full headroom */ xdp.data_hard_start = vaddr; + xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE; + err = xdp_do_redirect(priv->net_dev, &xdp, xdp_prog); if (unlikely(err)) ch->stats.xdp_drop++; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index b8496037ef7f..a3772beffe02 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ 
-1507,6 +1507,22 @@ static inline unsigned int i40e_rx_offset(struct i40e_ring *rx_ring) return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0; } +static unsigned int i40e_rx_frame_truesize(struct i40e_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = i40e_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = i40e_rx_offset(rx_ring) ? + SKB_DATA_ALIGN(size + i40e_rx_offset(rx_ring)) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + /** * i40e_alloc_mapped_page - recycle or make a new page * @rx_ring: ring to use @@ -2246,13 +2262,11 @@ static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring, struct i40e_rx_buffer *rx_buffer, unsigned int size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2; + unsigned int truesize = i40e_rx_frame_truesize(rx_ring, size); +#if (PAGE_SIZE < 8192) rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = SKB_DATA_ALIGN(i40e_rx_offset(rx_ring) + size); - rx_buffer->page_offset += truesize; #endif } @@ -2335,6 +2349,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) bool failure = false; struct xdp_buff xdp; +#if (PAGE_SIZE < 8192) + xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); +#endif xdp.rxq = &rx_ring->xdp_rxq; while (likely(total_rx_packets < (unsigned int)budget)) { @@ -2389,7 +2406,10 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) xdp.data_hard_start = xdp.data - i40e_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, size); +#endif skb = i40e_run_xdp(rx_ring, &xdp); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 0b7d29192b2c..2b9184aead5f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -531,12 +531,14 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *bi; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index f67e8362958c..69b21b436f9a 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -423,6 +423,22 @@ static unsigned int ice_rx_offset(struct ice_ring *rx_ring) return 0; } +static unsigned int ice_rx_frame_truesize(struct ice_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ice_rx_offset(rx_ring) ? 
+ SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + /** * ice_run_xdp - Executes an XDP program on initialized xdp_buff * @rx_ring: Rx ring @@ -991,6 +1007,10 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) bool failure; xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, 0); +#endif /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { @@ -1038,6 +1058,10 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring); xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size); +#endif rcu_read_lock(); xdp_prog = READ_ONCE(rx_ring->xdp_prog); @@ -1051,16 +1075,8 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) if (!xdp_res) goto construct_skb; if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) { - unsigned int truesize; - -#if (PAGE_SIZE < 8192) - truesize = ice_rx_pg_size(rx_ring) / 2; -#else - truesize = SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + - size); -#endif xdp_xmit |= xdp_res; - ice_rx_buf_adjust_pg_offset(rx_buf, truesize); + ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz); } else { rx_buf->pagecnt_bias++; } diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 8279db15e870..23e5515d4527 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -840,11 +840,13 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = ICE_DESC_UNUSED(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_xmit = 0; bool failure = false; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 718931d951bc..eab5934b04f5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2244,19 +2244,30 @@ xdp_out: return ERR_PTR(-result); } +static unsigned int ixgbe_rx_frame_truesize(struct ixgbe_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ixgbe_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ring_uses_build_skb(rx_ring) ? + SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + static void ixgbe_rx_buffer_flip(struct ixgbe_ring *rx_ring, struct ixgbe_rx_buffer *rx_buffer, unsigned int size) { + unsigned int truesize = ixgbe_rx_frame_truesize(rx_ring, size); #if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2; - rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = ring_uses_build_skb(rx_ring) ? 
- SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) : - SKB_DATA_ALIGN(size); - rx_buffer->page_offset += truesize; #endif } @@ -2290,6 +2301,11 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0); +#endif + while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; struct ixgbe_rx_buffer *rx_buffer; @@ -2323,7 +2339,10 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, size); +#endif skb = ixgbe_run_xdp(adapter, rx_ring, &xdp); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 74b540ebb3dc..a656ee9a1fae 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -431,12 +431,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector, unsigned int total_rx_bytes = 0, total_rx_packets = 0; struct ixgbe_adapter *adapter = q_vector->adapter; u16 cleaned_count = ixgbe_desc_unused(rx_ring); + struct xdp_umem *umem = rx_ring->xsk_umem; unsigned int xdp_res, xdp_xmit = 0; bool failure = false; struct sk_buff *skb; struct xdp_buff xdp; xdp.rxq = &rx_ring->xdp_rxq; + xdp.frame_sz = xsk_umem_xdp_frame_sz(umem); while (likely(total_rx_packets < budget)) { union ixgbe_adv_rx_desc *rx_desc; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 4622c4ea2e46..a39e2cb384dd 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1095,19 +1095,31 @@ xdp_out: return ERR_PTR(-result); } +static unsigned int ixgbevf_rx_frame_truesize(struct ixgbevf_ring *rx_ring, + unsigned int size) +{ + unsigned int truesize; + +#if (PAGE_SIZE < 8192) + truesize = ixgbevf_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */ +#else + truesize = ring_uses_build_skb(rx_ring) ? + SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : + SKB_DATA_ALIGN(size); +#endif + return truesize; +} + static void ixgbevf_rx_buffer_flip(struct ixgbevf_ring *rx_ring, struct ixgbevf_rx_buffer *rx_buffer, unsigned int size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2; + unsigned int truesize = ixgbevf_rx_frame_truesize(rx_ring, size); +#if (PAGE_SIZE < 8192) rx_buffer->page_offset ^= truesize; #else - unsigned int truesize = ring_uses_build_skb(rx_ring) ? 
- SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) : - SKB_DATA_ALIGN(size); - rx_buffer->page_offset += truesize; #endif } @@ -1125,6 +1137,11 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, xdp.rxq = &rx_ring->xdp_rxq; + /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */ +#if (PAGE_SIZE < 8192) + xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0); +#endif + while (likely(total_rx_packets < budget)) { struct ixgbevf_rx_buffer *rx_buffer; union ixgbe_adv_rx_desc *rx_desc; @@ -1157,7 +1174,10 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector, xdp.data_hard_start = xdp.data - ixgbevf_rx_offset(rx_ring); xdp.data_end = xdp.data + size; - +#if (PAGE_SIZE > 4096) + /* At larger PAGE_SIZE, frame_sz depend on len size */ + xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size); +#endif skb = ixgbevf_run_xdp(adapter, rx_ring, &xdp); } diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index e0e9e56830c0..41d2a0eac5fa 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2148,12 +2148,17 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, struct bpf_prog *prog, struct xdp_buff *xdp, struct mvneta_stats *stats) { - unsigned int len; + unsigned int len, sync; + struct page *page; u32 ret, act; len = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction; act = bpf_prog_run_xdp(prog, xdp); + /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */ + sync = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction; + sync = max(sync, len); + switch (act) { case XDP_PASS: stats->xdp_pass++; @@ -2164,9 +2169,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, err = xdp_do_redirect(pp->dev, xdp, prog); if (unlikely(err)) { ret = MVNETA_XDP_DROPPED; - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, - true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); } else { ret = MVNETA_XDP_REDIR; stats->xdp_redirect++; @@ -2175,10 +2179,10 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, } case XDP_TX: ret = mvneta_xdp_xmit_back(pp, xdp); - if (ret != MVNETA_XDP_TX) - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, - true); + if (ret != MVNETA_XDP_TX) { + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); + } break; default: bpf_warn_invalid_xdp_action(act); @@ -2187,8 +2191,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, trace_xdp_exception(pp->dev, prog, act); /* fall through */ case XDP_DROP: - page_pool_put_page(rxq->page_pool, - virt_to_head_page(xdp->data), len, true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(rxq->page_pool, page, sync, true); ret = MVNETA_XDP_DROPPED; stats->xdp_drop++; break; @@ -2320,6 +2324,7 @@ static int mvneta_rx_swbm(struct napi_struct *napi, rcu_read_lock(); xdp_prog = READ_ONCE(pp->xdp_prog); xdp_buf.rxq = &rxq->xdp_rxq; + xdp_buf.frame_sz = PAGE_SIZE; /* Fairness NAPI loop */ while (rx_proc < budget && rx_proc < rx_todo) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 43dcbd8214c6..5bd3cd37d50f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -51,7 +51,8 @@ #include "en_port.h" #define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \ - 
XDP_PACKET_HEADROOM)) + XDP_PACKET_HEADROOM - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))) int mlx4_en_setup_tc(struct net_device *dev, u8 up) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 787139219813..8a10285b0e10 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -683,6 +683,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud rcu_read_lock(); xdp_prog = rcu_dereference(ring->xdp_prog); xdp.rxq = &ring->xdp_rxq; + xdp.frame_sz = priv->frag_info[0].frag_stride; doorbell_pending = 0; /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3bd64c63865b..26911b15f8fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -625,6 +625,7 @@ struct mlx5e_rq { struct { u16 umem_headroom; u16 headroom; + u32 frame0_sz; u8 map_dir; /* dma map direction */ } buff; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index c4a7fb4ecd14..761c8979bd41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -137,6 +137,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, if (xsk) xdp.handle = di->xsk.handle; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = rq->buff.frame0_sz; act = bpf_prog_run_xdp(prog, &xdp); if (xsk) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 0a9dfc31de3e..0e4ca08ddca9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -462,6 +462,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk)); + rq->buff.frame0_sz = (1 << rq->mpwqe.log_stride_sz); + err = mlx5e_create_rq_umr_mkey(mdev, rq); if (err) goto err_rq_wq_destroy; @@ -485,6 +487,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags; rq->wqe.info = rqp->frags_info; + rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride; + rq->wqe.frags = kvzalloc_node(array_size(sizeof(*rq->wqe.frags), (wq_sz << rq->wqe.info.log_num_frags)), @@ -522,6 +526,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, } if (xsk) { + rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem); + err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames); if (unlikely(err)) { mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 779600bebcca..821f94beda7a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1070,6 +1070,7 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, if (consumed) return NULL; /* page/packet was consumed by XDP */ + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt); if (unlikely(!skb)) return NULL; @@ -1371,6 +1372,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, return NULL; /* page/packet was consumed by XDP */ } + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); skb 
= mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 9bfb3b077bc1..0e0cc3d58bdc 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1741,10 +1741,15 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring, struct nfp_net_rx_buf *rxbuf, unsigned int dma_off, unsigned int pkt_len, bool *completed) { + unsigned int dma_map_sz = dp->fl_bufsz - NFP_NET_RX_BUF_NON_DATA; struct nfp_net_tx_buf *txbuf; struct nfp_net_tx_desc *txd; int wr_idx; + /* Reject if xdp_adjust_tail grow packet beyond DMA area */ + if (pkt_len + dma_off > dma_map_sz) + return false; + if (unlikely(nfp_net_tx_full(tx_ring, 1))) { if (!*completed) { nfp_net_xdp_complete(tx_ring); @@ -1817,6 +1822,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) rcu_read_lock(); xdp_prog = READ_ONCE(dp->xdp_prog); true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz; + xdp.frame_sz = PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM; xdp.rxq = &rx_ring->xdp_rxq; tx_ring = r_vec->xdp_ring; diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index c6c20776b474..7598ebe0962a 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1066,6 +1066,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; xdp.rxq = &rxq->xdp_rxq; + xdp.frame_sz = rxq->rx_buf_seg_size; /* PAGE_SIZE when XDP enabled */ /* Queues always have a full reset currently, so for the time * being until there's atomic program replace just mark read diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index f50d9a9b76be..b2d154258b07 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1476,7 +1476,7 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq) if (rxq->rx_buf_size + size > PAGE_SIZE) rxq->rx_buf_size = PAGE_SIZE - size; - /* Segment size to spilt a page in multiple equal parts , + /* Segment size to split a page in multiple equal parts, * unless XDP is used in which case we'd use the entire page. 
*/ if (!edev->xdp_prog) { diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 260352d97d9d..68c47a8c71df 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -308,6 +308,7 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + rx_buf->len; xdp.rxq = &rx_queue->xdp_rxq_info; + xdp.frame_sz = efx->rx_page_buf_step; xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp); rcu_read_unlock(); diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index a5a0fb60193a..e1f4be4b3d69 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -884,23 +884,28 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, struct xdp_buff *xdp) { struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX]; - unsigned int len = xdp->data_end - xdp->data; + unsigned int sync, len = xdp->data_end - xdp->data; u32 ret = NETSEC_XDP_PASS; + struct page *page; int err; u32 act; act = bpf_prog_run_xdp(prog, xdp); + /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */ + sync = xdp->data_end - xdp->data_hard_start - NETSEC_RXBUF_HEADROOM; + sync = max(sync, len); + switch (act) { case XDP_PASS: ret = NETSEC_XDP_PASS; break; case XDP_TX: ret = netsec_xdp_xmit_back(priv, xdp); - if (ret != NETSEC_XDP_TX) - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, - true); + if (ret != NETSEC_XDP_TX) { + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); + } break; case XDP_REDIRECT: err = xdp_do_redirect(priv->ndev, xdp, prog); @@ -908,9 +913,8 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, ret = NETSEC_XDP_REDIR; } else { ret = NETSEC_XDP_CONSUMED; - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, - true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); } break; default: @@ -921,8 +925,8 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, /* fall through -- handle aborts by dropping packet */ case XDP_DROP: ret = NETSEC_XDP_CONSUMED; - page_pool_put_page(dring->page_pool, - virt_to_head_page(xdp->data), len, true); + page = virt_to_head_page(xdp->data); + page_pool_put_page(dring->page_pool, page, sync, true); break; } @@ -936,10 +940,14 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct netsec_rx_pkt_info rx_info; enum dma_data_direction dma_dir; struct bpf_prog *xdp_prog; + struct xdp_buff xdp; u16 xdp_xmit = 0; u32 xdp_act = 0; int done = 0; + xdp.rxq = &dring->xdp_rxq; + xdp.frame_sz = PAGE_SIZE; + rcu_read_lock(); xdp_prog = READ_ONCE(priv->xdp_prog); dma_dir = page_pool_get_dma_dir(dring->page_pool); @@ -953,7 +961,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) struct sk_buff *skb = NULL; u16 pkt_len, desc_len; dma_addr_t dma_handle; - struct xdp_buff xdp; void *buf_addr; if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD)) { @@ -1002,7 +1009,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) xdp.data = desc->addr + NETSEC_RXBUF_HEADROOM; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + pkt_len; - xdp.rxq = &dring->xdp_rxq; if (xdp_prog) { xdp_result = netsec_run_xdp(priv, xdp_prog, &xdp); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 09f98fa2fb4e..ce0645ada6e7 
100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -406,6 +406,7 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp.data_hard_start = pa; xdp.rxq = &priv->xdp_rxq[ch]; + xdp.frame_sz = PAGE_SIZE; port = priv->emac_port + cpsw->data.dual_emac; ret = cpsw_run_xdp(priv, ch, &xdp, page, port); diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index dce49311d3d3..1247d35d42ef 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -348,6 +348,7 @@ static void cpsw_rx_handler(void *token, int len, int status) xdp.data_hard_start = pa; xdp.rxq = &priv->xdp_rxq[ch]; + xdp.frame_sz = PAGE_SIZE; ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port); if (ret != CPSW_XDP_PASS) diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index b86611041db6..1e0c024b0a93 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -49,6 +49,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp_set_data_meta_invalid(xdp); xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; + xdp->frame_sz = PAGE_SIZE; xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 5de57fc3ec60..6267f706e8ee 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -795,7 +795,7 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, if (xbuf) { unsigned int hdroom = xdp->data - xdp->data_hard_start; unsigned int xlen = xdp->data_end - xdp->data; - unsigned int frag_size = netvsc_xdp_fraglen(hdroom + xlen); + unsigned int frag_size = xdp->frame_sz; skb = build_skb(xbuf, frag_size); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 44889eba1dbc..c54f967e2c66 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1671,6 +1671,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; xdp.rxq = &tfile->xdp_rxq; + xdp.frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { @@ -2411,6 +2412,7 @@ static int tun_xdp_one(struct tun_struct *tun, } xdp_set_data_meta_invalid(xdp); xdp->rxq = &tfile->xdp_rxq; + xdp->frame_sz = buflen; act = bpf_prog_run_xdp(xdp_prog, xdp); err = tun_xdp_act(tun, xdp_prog, xdp, act); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index aece0e5eec8c..b586d2fa5551 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -405,10 +405,6 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len, { struct sk_buff *skb; - if (!buflen) { - buflen = SKB_DATA_ALIGN(headroom + len) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - } skb = build_skb(head, buflen); if (!skb) return NULL; @@ -564,13 +560,15 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, struct veth_stats *stats) { void *hard_start = frame->data - frame->headroom; - void *head = hard_start - sizeof(struct xdp_frame); int len = frame->len, delta = 0; struct xdp_frame orig_frame; struct bpf_prog *xdp_prog; unsigned int headroom; struct sk_buff *skb; + /* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */ + hard_start -= sizeof(struct xdp_frame); + rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); if (likely(xdp_prog)) { @@ -581,6 +579,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, xdp.data = frame->data; xdp.data_end = 
frame->data + frame->len; xdp.data_meta = frame->data - frame->metasize; + xdp.frame_sz = frame->frame_sz; xdp.rxq = &rq->xdp_rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -592,7 +591,6 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, break; case XDP_TX: orig_frame = *frame; - xdp.data_hard_start = head; xdp.rxq->mem = frame->mem; if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); @@ -605,7 +603,6 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp.data_hard_start = head; xdp.rxq->mem = frame->mem; if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { frame = &orig_frame; @@ -629,7 +626,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, rcu_read_unlock(); headroom = sizeof(struct xdp_frame) + frame->headroom - delta; - skb = veth_build_skb(head, headroom, len, 0); + skb = veth_build_skb(hard_start, headroom, len, frame->frame_sz); if (!skb) { xdp_return_frame(frame); stats->rx_drops++; @@ -695,9 +692,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, goto drop; } - nskb = veth_build_skb(head, - VETH_XDP_HEADROOM + mac_len, skb->len, - PAGE_SIZE); + nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len, + skb->len, PAGE_SIZE); if (!nskb) { page_frag_free(head); goto drop; @@ -715,6 +711,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, xdp.data_end = xdp.data + pktlen; xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp.frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start; + xdp.frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data = xdp.data; orig_data_end = xdp.data_end; @@ -758,6 +759,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, } rcu_read_unlock(); + /* check if bpf_xdp_adjust_head was used */ delta = orig_data - xdp.data; off = mac_len + delta; if (off > 0) @@ -765,9 +767,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, else if (off < 0) __skb_pull(skb, -off); skb->mac_header -= delta; + + /* check if bpf_xdp_adjust_tail was used */ off = xdp.data_end - orig_data_end; if (off != 0) - __skb_put(skb, off); + __skb_put(skb, off); /* positive on grow, negative on shrink */ skb->protocol = eth_type_trans(skb, rq->dev); metalen = xdp.data - xdp.data_meta; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 11f722460513..9e1b5d748586 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -689,6 +689,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_end = xdp.data + len; xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = buflen; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -797,10 +798,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, int offset = buf - page_address(page); struct sk_buff *head_skb, *curr_skb; struct bpf_prog *xdp_prog; - unsigned int truesize; + unsigned int truesize = mergeable_ctx_to_truesize(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx); - int err; unsigned int metasize = 0; + unsigned int frame_sz; + int err; head_skb = NULL; stats->bytes += len - vi->hdr_len; @@ -821,6 +823,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, if (unlikely(hdr->hdr.gso_type)) goto err_xdp; + /* Buffers with headroom use PAGE_SIZE as alloc size, + * see add_recvbuf_mergeable() + get_mergeable_buf_len() + */ + frame_sz = 
headroom ? PAGE_SIZE : truesize; + /* This happens when rx buffer size is underestimated * or headroom is not enough because of the buffer * was refilled before XDP is set. This should only @@ -834,6 +841,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, page, offset, VIRTIO_XDP_HEADROOM, &len); + frame_sz = PAGE_SIZE; + if (!xdp_page) goto err_xdp; offset = VIRTIO_XDP_HEADROOM; @@ -850,6 +859,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, xdp.data_end = xdp.data + (len - vi->hdr_len); xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; + xdp.frame_sz = frame_sz - vi->hdr_len; act = bpf_prog_run_xdp(xdp_prog, &xdp); stats->xdp_packets++; @@ -924,7 +934,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } rcu_read_unlock(); - truesize = mergeable_ctx_to_truesize(ctx); if (unlikely(len > truesize)) { pr_debug("%s: rx error: len %u exceeds truesize %lu\n", dev->name, len, (unsigned long)ctx); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 2927f02cc7e1..516519dcc8ff 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -747,6 +747,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, xdp->data = buf + pad; xdp->data_end = xdp->data + len; hdr->buflen = buflen; + xdp->frame_sz = buflen; --net->refcnt_bias; alloc_frag->offset += buflen; diff --git a/include/net/xdp.h b/include/net/xdp.h index 3cc6d5d84aa4..3094fccf5a88 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -6,6 +6,8 @@ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ +#include <linux/skbuff.h> /* skb_shared_info */ + /** * DOC: XDP RX-queue information * @@ -70,13 +72,25 @@ struct xdp_buff { void *data_hard_start; unsigned long handle; struct xdp_rxq_info *rxq; + u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; +/* Reserve memory area at end-of data area. + * + * This macro reserves tailroom in the XDP buffer by limiting the + * XDP/BPF data access to data_hard_end. Notice same area (and size) + * is used for XDP_PASS, when constructing the SKB via build_skb(). + */ +#define xdp_data_hard_end(xdp) \ + ((xdp)->data_hard_start + (xdp)->frame_sz - \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + struct xdp_frame { void *data; u16 len; u16 headroom; - u16 metasize; + u32 metasize:8; + u32 frame_sz:24; /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. 
*/ @@ -91,6 +105,10 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +/* Avoids inlining WARN macro in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line); +#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) + struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); /* Convert xdp_buff to xdp_frame */ @@ -111,6 +129,12 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return NULL; + /* Catch if driver didn't reserve tailroom for skb_shared_info */ + if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { + XDP_WARN("Driver BUG: missing reserved tailroom"); + return NULL; + } + /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; @@ -118,6 +142,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; + xdp_frame->frame_sz = xdp->frame_sz; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 67191ccaab85..abd72de25fa4 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -236,6 +236,12 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, else return address + offset; } + +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return umem->chunk_size_nohr + umem->headroom; +} + #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -366,6 +372,11 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, return 0; } +static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) +{ + return 0; +} + static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 32cbf36c7729..b9b8a0f63b91 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2015,8 +2015,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3fe0b006d2d2..a71790dab12d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - /* build_skb need to place skb_shared_info after SKB end, and - * also want to know the memory "truesize". Thus, need to - * know the memory frame size backing xdp_buff. - * - * XDP was designed to have PAGE_SIZE frames, but this - * assumption is not longer true with ixgbe and i40e. It - * would be preferred to set frame_size to 2048 or 4096 - * depending on the driver. - * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_frame); - * - * Instead, with info avail, skb_shared_info in placed after - * packet len. 
This, unfortunately fakes the truesize. - * Another disadvantage of this approach, the skb_shared_info - * is not at a fixed memory location, with mixed length - * packets, which is bad for cache-line hotness. + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_size = xdpf->frame_sz; pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb_around(skb, pkt_data_start, frame_size); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 29dbdd4c29f6..30ba7d38941d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -470,25 +470,34 @@ out: int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 headroom = XDP_PACKET_HEADROOM; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + u32 max_data_sz; void *data; int ret; if (kattr->test.ctx_in || kattr->test.ctx_out) return -EINVAL; - data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); + /* XDP have extra tailroom as (most) drivers use full page */ + max_data_sz = 4096 - headroom - tailroom; + if (size > max_data_sz) + return -EINVAL; + + data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) return PTR_ERR(data); xdp.data_hard_start = data; - xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data = data + headroom; xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; + xdp.frame_sz = headroom + max_data_sz + tailroom; rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; @@ -496,8 +505,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; - if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || - xdp.data_end != xdp.data + size) + if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: diff --git a/net/core/dev.c b/net/core/dev.c index 4c91de39890a..f937a3ff668d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4617,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; @@ -4640,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb_reset_network_header(skb); } - /* check if bpf_xdp_adjust_tail was used. it can only "shrink" - * pckt. 
- */ - off = orig_data_end - xdp->data_end; + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); - skb->len -= off; - + skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ diff --git a/net/core/filter.c b/net/core/filter.c index 5815902bb617..a85eb538d4d6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3411,15 +3411,26 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) return -EINVAL; + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); + return -EINVAL; + } + if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + xdp->data_end = data_end; return 0; diff --git a/net/core/xdp.c b/net/core/xdp.c index 4c7ea85486af..490b8f5fa8ee 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> +#include <linux/bug.h> #include <net/page_pool.h> #include <net/xdp.h> @@ -496,3 +497,10 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c index 6c8ca1c93f9b..d5c98f2cb12f 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c @@ -2,13 +2,13 @@ #include <test_progs.h> #include <network_helpers.h> -void test_xdp_adjust_tail(void) +void test_xdp_adjust_tail_shrink(void) { - const char *file = "./test_adjust_tail.o"; + const char *file = "./test_xdp_adjust_tail_shrink.o"; + __u32 duration, retval, size, expect_sz; struct bpf_object *obj; - char buf[128]; - __u32 duration, retval, size; int err, prog_fd; + char buf[128]; err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); if (CHECK_FAIL(err)) @@ -21,10 +21,121 @@ void test_xdp_adjust_tail(void) "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + expect_sz = sizeof(pkt_v6) - 20; /* Test shrink with 20 bytes */ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || retval != XDP_TX || size != 54, - "ipv6", "err %d errno %d retval %d size %d\n", + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + struct bpf_object *obj; + char buf[4096]; /* avoid segfault: 
large buf to hold grow results */ + __u32 duration, retval, size, expect_sz; + int err, prog_fd; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_DROP, + "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); + + expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */ + err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */, + buf, &size, &retval, &duration); + CHECK(err || retval != XDP_TX || size != expect_sz, + "ipv6", "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, retval, size, expect_sz); + + bpf_object__close(obj); +} + +void test_xdp_adjust_tail_grow2(void) +{ + const char *file = "./test_xdp_adjust_tail_grow.o"; + char buf[4096]; /* avoid segfault: large buf to hold grow results */ + int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/; + struct bpf_object *obj; + int err, cnt, i; + int max_grow; + + struct bpf_prog_test_run_attr tattr = { + .repeat = 1, + .data_in = &buf, + .data_out = &buf, + .data_size_in = 0, /* Per test */ + .data_size_out = 0, /* Per test */ + }; + + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd); + if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno)) + return; + + /* Test case-64 */ + memset(buf, 1, sizeof(buf)); + tattr.data_size_in = 64; /* Determine test case via pkt size */ + tattr.data_size_out = 128; /* Limit copy_size */ + /* Kernel side alloc packet memory area that is zero init */ + err = bpf_prog_test_run_xattr(&tattr); + + CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */ + || tattr.retval != XDP_TX + || tattr.data_size_out != 192, /* Expected grow size */ + "case-64", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Extra checks for data contents */ + CHECK_ATTR(tattr.data_size_out != 192 + || buf[0] != 1 || buf[63] != 1 /* 0-63 memset to 1 */ + || buf[64] != 0 || buf[127] != 0 /* 64-127 memset to 0 */ + || buf[128] != 1 || buf[191] != 1, /*128-191 memset to 1 */ + "case-64-data", + "err %d errno %d retval %d size %d\n", + err, errno, tattr.retval, tattr.data_size_out); + + /* Test case-128 */ + memset(buf, 2, sizeof(buf)); + tattr.data_size_in = 128; /* Determine test case via pkt size */ + tattr.data_size_out = sizeof(buf); /* Copy everything */ + err = bpf_prog_test_run_xattr(&tattr); + + max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */ + CHECK_ATTR(err + || tattr.retval != XDP_TX + || tattr.data_size_out != max_grow,/* Expect max grow size */ + "case-128", + "err %d errno %d retval %d size %d expect-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, max_grow); + + /* Extra checks for data content: Count grow size, will contain zeros */ + for (i = 0, cnt = 0; i < sizeof(buf); i++) { + if (buf[i] == 0) + cnt++; + } + CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */ + || tattr.data_size_out != max_grow, /* Total grow size */ + "case-128-data", + "err %d errno %d retval %d size %d grow-size %d\n", + err, errno, tattr.retval, tattr.data_size_out, cnt); + bpf_object__close(obj); } + +void test_xdp_adjust_tail(void) +{ + if (test__start_subtest("xdp_adjust_tail_shrink")) + test_xdp_adjust_tail_shrink(); + if (test__start_subtest("xdp_adjust_tail_grow")) + test_xdp_adjust_tail_grow(); + if (test__start_subtest("xdp_adjust_tail_grow2")) + 
test_xdp_adjust_tail_grow2(); +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c new file mode 100644 index 000000000000..3d66599eee2e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("xdp_adjust_tail_grow") +int _xdp_adjust_tail_grow(struct xdp_md *xdp) +{ + void *data_end = (void *)(long)xdp->data_end; + void *data = (void *)(long)xdp->data; + unsigned int data_len; + int offset = 0; + + /* Data length determine test case */ + data_len = data_end - data; + + if (data_len == 54) { /* sizeof(pkt_v4) */ + offset = 4096; /* test too large offset */ + } else if (data_len == 74) { /* sizeof(pkt_v6) */ + offset = 40; + } else if (data_len == 64) { + offset = 128; + } else if (data_len == 128) { + offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */ + } else { + return XDP_ABORTED; /* No matching test */ + } + + if (bpf_xdp_adjust_tail(xdp, offset)) + return XDP_DROP; + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c index b7fc85769bdc..22065a9cfb25 100644 --- a/tools/testing/selftests/bpf/progs/test_adjust_tail.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Copyright (c) 2018 Facebook +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -11,15 +11,15 @@ int _version SEC("version") = 1; -SEC("xdp_adjust_tail") -int _xdp_adjust_tail(struct xdp_md *xdp) +SEC("xdp_adjust_tail_shrink") +int _xdp_adjust_tail_shrink(struct xdp_md *xdp) { void *data_end = (void *)(long)xdp->data_end; void *data = (void *)(long)xdp->data; int offset = 0; - if (data_end - data == 54) - offset = 256; + if (data_end - data == 54) /* sizeof(pkt_v4) */ + offset = 256; /* shrink too much */ else offset = 20; if (bpf_xdp_adjust_tail(xdp, 0 - offset)) |