Diffstat (limited to 'net/core')
 net/core/Makefile          |    1
 net/core/datagram.c        |   72
 net/core/dev.c             | 1018
 net/core/devlink.c         |  100
 net/core/drop_monitor.c    |   23
 net/core/ethtool.c         |   98
 net/core/fib_rules.c       |   78
 net/core/filter.c          |  848
 net/core/flow.c            |   66
 net/core/flow_dissector.c  |  206
 net/core/gen_estimator.c   |  294
 net/core/gen_stats.c       |   20
 net/core/lwt_bpf.c         |  396
 net/core/lwtunnel.c        |   52
 net/core/neighbour.c       |   18
 net/core/net-sysfs.c       |   65
 net/core/net_namespace.c   |   97
 net/core/netpoll.c         |    6
 net/core/pktgen.c          |   40
 net/core/rtnetlink.c       |  344
 net/core/secure_seq.c      |   11
 net/core/skbuff.c          |  220
 net/core/sock.c            |  126
 net/core/sock_reuseport.c  |    1
 net/core/stream.c          |   29
 net/core/sysctl_net_core.c |    5
 26 files changed, 2992 insertions(+), 1242 deletions(-)
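The net/core/datagram.c hunk below adds a destructor callback to __skb_try_recv_datagram() and __skb_recv_datagram(), invoked under the receive-queue lock right after the skb is unlinked. A minimal sketch of such a callback — the name and the accounting inside it are illustrative assumptions, not taken from this patch:

#include <net/sock.h>
#include <linux/skbuff.h>

/* Hypothetical dequeue destructor: runs with sk_receive_queue.lock held
 * (interrupts disabled), so it must stay short and must not sleep.
 */
static void example_dequeue_destructor(struct sock *sk, struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_uncharge(sk, skb->truesize);
}

/* It is passed as the new third argument, e.g.:
 *      skb = __skb_try_recv_datagram(sk, flags, example_dequeue_destructor,
 *                                    &peeked, &off, &err, &last);
 */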
diff --git a/net/core/Makefile b/net/core/Makefile index d6508c2ddca5..f6761b6e3b29 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/datagram.c b/net/core/datagram.c index b7de71f8d5d3..9482037a5c8c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -165,6 +165,7 @@ done: * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket * @flags: MSG_ flags + * @destructor: invoked under the receive lock on successful dequeue * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts @@ -197,6 +198,8 @@ done: * the standard around please. */ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb), int *peeked, int *off, int *err, struct sk_buff **last) { @@ -211,6 +214,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, if (error) goto no_packet; + *peeked = 0; do { /* Again only user level code calls this function, so nothing * interrupt level will suddenly eat the receive_queue. @@ -224,26 +228,28 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, spin_lock_irqsave(&queue->lock, cpu_flags); skb_queue_walk(queue, skb) { *last = skb; - *peeked = skb->peeked; if (flags & MSG_PEEK) { if (_off >= skb->len && (skb->len || _off || skb->peeked)) { _off -= skb->len; continue; } - - skb = skb_set_peeked(skb); - error = PTR_ERR(skb); - if (IS_ERR(skb)) { - spin_unlock_irqrestore(&queue->lock, - cpu_flags); - goto no_packet; + if (!skb->len) { + skb = skb_set_peeked(skb); + if (IS_ERR(skb)) { + error = PTR_ERR(skb); + spin_unlock_irqrestore(&queue->lock, + cpu_flags); + goto no_packet; + } } - + *peeked = 1; atomic_inc(&skb->users); - } else + } else { __skb_unlink(skb, queue); - + if (destructor) + destructor(sk, skb); + } spin_unlock_irqrestore(&queue->lock, cpu_flags); *off = _off; return skb; @@ -262,6 +268,8 @@ no_packet: EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb), int *peeked, int *off, int *err) { struct sk_buff *skb, *last; @@ -270,8 +278,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, flags, peeked, off, err, - &last); + skb = __skb_try_recv_datagram(sk, flags, destructor, peeked, + off, err, &last); if (skb) return skb; @@ -290,7 +298,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int peeked, off = 0; return __skb_recv_datagram(sk, flags | (noblock ? 
MSG_DONTWAIT : 0), - &peeked, &off, err); + NULL, &peeked, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); @@ -323,6 +331,27 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) } EXPORT_SYMBOL(__skb_free_datagram_locked); +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, + unsigned int flags) +{ + int err = 0; + + if (flags & MSG_PEEK) { + err = -ENOENT; + spin_lock_bh(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + err = 0; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + + atomic_inc(&sk->sk_drops); + return err; +} +EXPORT_SYMBOL(__sk_queue_drop_skb); + /** * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket @@ -346,23 +375,10 @@ EXPORT_SYMBOL(__skb_free_datagram_locked); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { - int err = 0; - - if (flags & MSG_PEEK) { - err = -ENOENT; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - atomic_dec(&skb->users); - err = 0; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - } + int err = __sk_queue_drop_skb(sk, skb, flags); kfree_skb(skb); - atomic_inc(&sk->sk_drops); sk_mem_reclaim_partial(sk); - return err; } EXPORT_SYMBOL(skb_kill_datagram); diff --git a/net/core/dev.c b/net/core/dev.c index ea6312057a71..6372117f653f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -139,7 +139,6 @@ #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_ingress.h> -#include <linux/sctp.h> #include <linux/crash_dump.h> #include "net-sysfs.h" @@ -1766,19 +1765,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_orphan_frags(skb, GFP_ATOMIC) || - unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } + int ret = ____dev_forward_skb(dev, skb); - skb_scrub_packet(skb, true); - skb->priority = 0; - skb->protocol = eth_type_trans(skb, dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + if (likely(!ret)) { + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } - return 0; + return ret; } EXPORT_SYMBOL_GPL(__dev_forward_skb); @@ -1949,37 +1943,80 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) } } +int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) +{ + if (dev->num_tc) { + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + int i; + + for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { + if ((txq - tc->offset) < tc->count) + return i; + } + + return -1; + } + + return 0; +} + #ifdef CONFIG_XPS static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) -static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, - int cpu, u16 index) +static bool remove_xps_queue(struct xps_dev_maps *dev_maps, + int tci, u16 index) { struct xps_map *map = NULL; int pos; if (dev_maps) - map = xmap_dereference(dev_maps->cpu_map[cpu]); + map = xmap_dereference(dev_maps->cpu_map[tci]); + if (!map) + return false; - for (pos = 0; map && pos < map->len; pos++) { - if (map->queues[pos] == index) { - if (map->len > 1) { - map->queues[pos] = map->queues[--map->len]; - } else { - RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); - kfree_rcu(map, rcu); - map = NULL; - } + for (pos = map->len; 
pos--;) { + if (map->queues[pos] != index) + continue; + + if (map->len > 1) { + map->queues[pos] = map->queues[--map->len]; break; } + + RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL); + kfree_rcu(map, rcu); + return false; } - return map; + return true; } -static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +static bool remove_xps_queue_cpu(struct net_device *dev, + struct xps_dev_maps *dev_maps, + int cpu, u16 offset, u16 count) +{ + int num_tc = dev->num_tc ? : 1; + bool active = false; + int tci; + + for (tci = cpu * num_tc; num_tc--; tci++) { + int i, j; + + for (i = count, j = offset; i--; j++) { + if (!remove_xps_queue(dev_maps, cpu, j)) + break; + } + + active |= i < 0; + } + + return active; +} + +static void netif_reset_xps_queues(struct net_device *dev, u16 offset, + u16 count) { struct xps_dev_maps *dev_maps; int cpu, i; @@ -1991,21 +2028,16 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) if (!dev_maps) goto out_no_maps; - for_each_possible_cpu(cpu) { - for (i = index; i < dev->num_tx_queues; i++) { - if (!remove_xps_queue(dev_maps, cpu, i)) - break; - } - if (i == dev->num_tx_queues) - active = true; - } + for_each_possible_cpu(cpu) + active |= remove_xps_queue_cpu(dev, dev_maps, cpu, + offset, count); if (!active) { RCU_INIT_POINTER(dev->xps_maps, NULL); kfree_rcu(dev_maps, rcu); } - for (i = index; i < dev->num_tx_queues; i++) + for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), NUMA_NO_NODE); @@ -2013,6 +2045,11 @@ out_no_maps: mutex_unlock(&xps_map_mutex); } +static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) +{ + netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); +} + static struct xps_map *expand_xps_map(struct xps_map *map, int cpu, u16 index) { @@ -2052,20 +2089,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + int i, cpu, tci, numa_node_id = -2; + int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); - int cpu, numa_node_id = -2; bool active = false; + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + + maps_sz = XPS_DEV_MAPS_SIZE(num_tc); + if (maps_sz < L1_CACHE_BYTES) + maps_sz = L1_CACHE_BYTES; + mutex_lock(&xps_map_mutex); dev_maps = xmap_dereference(dev->xps_maps); /* allocate memory for queue storage */ - for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mask)) - continue; - + for_each_cpu_and(cpu, cpu_online_mask, mask) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { @@ -2073,25 +2118,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, return -ENOMEM; } - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : + tci = cpu * num_tc + tc; + map = dev_maps ? 
xmap_dereference(dev_maps->cpu_map[tci]) : NULL; map = expand_xps_map(map, cpu, index); if (!map) goto error; - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; for_each_possible_cpu(cpu) { + /* copy maps belonging to foreign traffic classes */ + for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + } + + /* We need to explicitly update tci as prevous loop + * could break out early if dev_maps is NULL. + */ + tci = cpu * num_tc + tc; + if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { /* add queue to CPU maps */ int pos = 0; - map = xmap_dereference(new_dev_maps->cpu_map[cpu]); + map = xmap_dereference(new_dev_maps->cpu_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; @@ -2105,26 +2163,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->cpu_map[cpu]); - RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); } + /* copy maps belonging to foreign traffic classes */ + for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->cpu_map[tci]); + RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map); + } } rcu_assign_pointer(dev->xps_maps, new_dev_maps); /* Cleanup old maps */ - if (dev_maps) { - for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = xmap_dereference(dev_maps->cpu_map[cpu]); + if (!dev_maps) + goto out_no_old_maps; + + for_each_possible_cpu(cpu) { + for (i = num_tc, tci = cpu * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = xmap_dereference(dev_maps->cpu_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } - - kfree_rcu(dev_maps, rcu); } + kfree_rcu(dev_maps, rcu); + +out_no_old_maps: dev_maps = new_dev_maps; active = true; @@ -2139,11 +2207,12 @@ out_no_new_maps: /* removes queue from unused CPUs */ for_each_possible_cpu(cpu) { - if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) - continue; - - if (remove_xps_queue(dev_maps, cpu, index)) - active = true; + for (i = tc, tci = cpu * num_tc; i--; tci++) + active |= remove_xps_queue(dev_maps, tci, index); + if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu)) + active |= remove_xps_queue(dev_maps, tci, index); + for (i = num_tc - tc, tci++; --i; tci++) + active |= remove_xps_queue(dev_maps, tci, index); } /* free map if not active */ @@ -2159,11 +2228,14 @@ out_no_maps: error: /* remove any maps that we added */ for_each_possible_cpu(cpu) { - new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); - map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : - NULL; - if (new_map && new_map != map) - kfree(new_map); + for (i = num_tc, tci = cpu * num_tc; i--; tci++) { + new_map = xmap_dereference(new_dev_maps->cpu_map[tci]); + map = dev_maps ? 
+ xmap_dereference(dev_maps->cpu_map[tci]) : + NULL; + if (new_map && new_map != map) + kfree(new_map); + } } mutex_unlock(&xps_map_mutex); @@ -2174,6 +2246,44 @@ error: EXPORT_SYMBOL(netif_set_xps_queue); #endif +void netdev_reset_tc(struct net_device *dev) +{ +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + dev->num_tc = 0; + memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); + memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); +} +EXPORT_SYMBOL(netdev_reset_tc); + +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) +{ + if (tc >= dev->num_tc) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues(dev, offset, count); +#endif + dev->tc_to_txq[tc].count = count; + dev->tc_to_txq[tc].offset = offset; + return 0; +} +EXPORT_SYMBOL(netdev_set_tc_queue); + +int netdev_set_num_tc(struct net_device *dev, u8 num_tc) +{ + if (num_tc > TC_MAX_QUEUE) + return -EINVAL; + +#ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, 0); +#endif + dev->num_tc = num_tc; + return 0; +} +EXPORT_SYMBOL(netdev_set_num_tc); + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. @@ -2484,7 +2594,7 @@ int skb_checksum_help(struct sk_buff *skb) goto out; } - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: @@ -2492,141 +2602,6 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -/* skb_csum_offload_check - Driver helper function to determine if a device - * with limited checksum offload capabilities is able to offload the checksum - * for a given packet. - * - * Arguments: - * skb - sk_buff for the packet in question - * spec - contains the description of what device can offload - * csum_encapped - returns true if the checksum being offloaded is - * encpasulated. That is it is checksum for the transport header - * in the inner headers. - * checksum_help - when set indicates that helper function should - * call skb_checksum_help if offload checks fail - * - * Returns: - * true: Packet has passed the checksum checks and should be offloadable to - * the device (a driver may still need to check for additional - * restrictions of its device) - * false: Checksum is not offloadable. If checksum_help was set then - * skb_checksum_help was called to resolve checksum for non-GSO - * packets and when IP protocol is not SCTP - */ -bool __skb_csum_offload_chk(struct sk_buff *skb, - const struct skb_csum_offl_spec *spec, - bool *csum_encapped, - bool csum_help) -{ - struct iphdr *iph; - struct ipv6hdr *ipv6; - void *nhdr; - int protocol; - u8 ip_proto; - - if (skb->protocol == htons(ETH_P_8021Q) || - skb->protocol == htons(ETH_P_8021AD)) { - if (!spec->vlan_okay) - goto need_help; - } - - /* We check whether the checksum refers to a transport layer checksum in - * the outermost header or an encapsulated transport layer checksum that - * corresponds to the inner headers of the skb. If the checksum is for - * something else in the packet we need help. 
- */ - if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { - /* Non-encapsulated checksum */ - protocol = eproto_to_ipproto(vlan_get_protocol(skb)); - nhdr = skb_network_header(skb); - *csum_encapped = false; - if (spec->no_not_encapped) - goto need_help; - } else if (skb->encapsulation && spec->encap_okay && - skb_checksum_start_offset(skb) == - skb_inner_transport_offset(skb)) { - /* Encapsulated checksum */ - *csum_encapped = true; - switch (skb->inner_protocol_type) { - case ENCAP_TYPE_ETHER: - protocol = eproto_to_ipproto(skb->inner_protocol); - break; - case ENCAP_TYPE_IPPROTO: - protocol = skb->inner_protocol; - break; - } - nhdr = skb_inner_network_header(skb); - } else { - goto need_help; - } - - switch (protocol) { - case IPPROTO_IP: - if (!spec->ipv4_okay) - goto need_help; - iph = nhdr; - ip_proto = iph->protocol; - if (iph->ihl != 5 && !spec->ip_options_okay) - goto need_help; - break; - case IPPROTO_IPV6: - if (!spec->ipv6_okay) - goto need_help; - if (spec->no_encapped_ipv6 && *csum_encapped) - goto need_help; - ipv6 = nhdr; - nhdr += sizeof(*ipv6); - ip_proto = ipv6->nexthdr; - break; - default: - goto need_help; - } - -ip_proto_again: - switch (ip_proto) { - case IPPROTO_TCP: - if (!spec->tcp_okay || - skb->csum_offset != offsetof(struct tcphdr, check)) - goto need_help; - break; - case IPPROTO_UDP: - if (!spec->udp_okay || - skb->csum_offset != offsetof(struct udphdr, check)) - goto need_help; - break; - case IPPROTO_SCTP: - if (!spec->sctp_okay || - skb->csum_offset != offsetof(struct sctphdr, checksum)) - goto cant_help; - break; - case NEXTHDR_HOP: - case NEXTHDR_ROUTING: - case NEXTHDR_DEST: { - u8 *opthdr = nhdr; - - if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) - goto need_help; - - ip_proto = opthdr[0]; - nhdr += (opthdr[1] + 1) << 3; - - goto ip_proto_again; - } - default: - goto need_help; - } - - /* Passed the tests for offloading checksum */ - return true; - -need_help: - if (csum_help && !skb_shinfo(skb)->gso_size) - skb_checksum_help(skb); -cant_help: - return false; -} -EXPORT_SYMBOL(__skb_csum_offload_chk); - __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; @@ -3035,6 +3010,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d } return head; } +EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { @@ -3220,8 +3196,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[skb->sender_cpu - 1]); + unsigned int tci = skb->sender_cpu - 1; + + if (dev->num_tc) { + tci *= dev->num_tc; + tci += netdev_get_prio_tc_map(dev, skb->priority); + } + + map = rcu_dereference(dev_maps->cpu_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; @@ -3355,16 +3337,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); -#ifdef CONFIG_NET_SWITCHDEV - /* Don't forward if offload device already forwarded */ - if (skb->offload_fwd_mark && - skb->offload_fwd_mark == dev->offload_fwd_mark) { - consume_skb(skb); - rc = NET_XMIT_SUCCESS; - goto out; - } -#endif - txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -3475,6 +3447,8 @@ EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); +struct static_key rfs_needed __read_mostly; +EXPORT_SYMBOL(rfs_needed); static struct 
rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -3855,7 +3829,7 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); -static void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -3914,8 +3888,7 @@ static void net_tx_action(struct softirq_action *h) } } -#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ - (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; @@ -4066,12 +4039,17 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, { #ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { + int ingress_retval; + if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - return nf_hook_ingress(skb); + rcu_read_lock(); + ingress_retval = nf_hook_ingress(skb); + rcu_read_unlock(); + return ingress_retval; } #endif /* CONFIG_NETFILTER_INGRESS */ return 0; @@ -4308,32 +4286,53 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); -/* Network device is going away, flush any packets still pending - * Called with irqs disabled. - */ -static void flush_backlog(void *arg) +DEFINE_PER_CPU(struct work_struct, flush_works); + +/* Network device is going away, flush any packets still pending */ +static void flush_backlog(struct work_struct *work) { - struct net_device *dev = arg; - struct softnet_data *sd = this_cpu_ptr(&softnet_data); struct sk_buff *skb, *tmp; + struct softnet_data *sd; + + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); + local_irq_disable(); rps_lock(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev == dev) { + if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); kfree_skb(skb); input_queue_head_incr(sd); } } rps_unlock(sd); + local_irq_enable(); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev == dev) { + if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); input_queue_head_incr(sd); } } + local_bh_enable(); +} + +static void flush_all_backlogs(void) +{ + unsigned int cpu; + + get_online_cpus(); + + for_each_online_cpu(cpu) + queue_work_on(cpu, system_highpri_wq, + per_cpu_ptr(&flush_works, cpu)); + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(&flush_works, cpu)); + + put_online_cpus(); } static int napi_gro_complete(struct sk_buff *skb) @@ -4480,7 +4479,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) + if (skb->csum_bad) goto normal; gro_list_prepare(napi, skb); @@ -4493,9 +4492,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); NAPI_GRO_CB(skb)->same_flow = 0; - NAPI_GRO_CB(skb)->flush = 0; + NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->recursion_counter = 0; NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; @@ -4821,8 +4821,9 @@ static bool 
sd_has_rps_ipi_waiting(struct softnet_data *sd) static int process_backlog(struct napi_struct *napi, int quota) { - int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + bool again = true; + int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. @@ -4833,23 +4834,20 @@ static int process_backlog(struct napi_struct *napi, int quota) } napi->weight = weight_p; - local_irq_disable(); - while (1) { + while (again) { struct sk_buff *skb; while ((skb = __skb_dequeue(&sd->process_queue))) { rcu_read_lock(); - local_irq_enable(); __netif_receive_skb(skb); rcu_read_unlock(); - local_irq_disable(); input_queue_head_incr(sd); - if (++work >= quota) { - local_irq_enable(); + if (++work >= quota) return work; - } + } + local_irq_disable(); rps_lock(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* @@ -4861,16 +4859,14 @@ static int process_backlog(struct napi_struct *napi, int quota) * and we dont need an smp_mb() memory barrier. */ napi->state = 0; - rps_unlock(sd); - - break; + again = false; + } else { + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); } - - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); rps_unlock(sd); + local_irq_enable(); } - local_irq_enable(); return work; } @@ -4904,26 +4900,36 @@ void __napi_schedule_irqoff(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule_irqoff); -void __napi_complete(struct napi_struct *n) +bool __napi_complete(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + /* Some drivers call us directly, instead of calling + * napi_complete_done(). + */ + if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) + return false; + list_del_init(&n->poll_list); smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); + return true; } EXPORT_SYMBOL(__napi_complete); -void napi_complete_done(struct napi_struct *n, int work_done) +bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags; /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu + * 1) Don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu. + * 2) If we are busy polling, do nothing here, we have + * the guarantee we will be called later. */ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) - return; + if (unlikely(n->state & (NAPIF_STATE_NPSVC | + NAPIF_STATE_IN_BUSY_POLL))) + return false; if (n->gro_list) { unsigned long timeout = 0; @@ -4945,6 +4951,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) __napi_complete(n); local_irq_restore(flags); } + return true; } EXPORT_SYMBOL(napi_complete_done); @@ -4962,13 +4969,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) } #if defined(CONFIG_NET_RX_BUSY_POLL) + #define BUSY_POLL_BUDGET 8 + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) +{ + int rc; + + clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); + + local_bh_disable(); + + /* All we really want here is to re-enable device interrupts. + * Ideally, a new ndo_busy_poll_stop() could avoid another round. + */ + rc = napi->poll(napi, BUSY_POLL_BUDGET); + netpoll_poll_unlock(have_poll_lock); + if (rc == BUSY_POLL_BUDGET) + __napi_schedule(napi); + local_bh_enable(); + if (local_softirq_pending()) + do_softirq(); +} + bool sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? 
sk_busy_loop_end_time(sk) : 0; + int (*napi_poll)(struct napi_struct *napi, int budget); int (*busy_poll)(struct napi_struct *dev); + void *have_poll_lock = NULL; struct napi_struct *napi; - int rc = false; + int rc; + +restart: + rc = false; + napi_poll = NULL; rcu_read_lock(); @@ -4979,24 +5014,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock) /* Note: ndo_busy_poll method is optional in linux-4.5 */ busy_poll = napi->dev->netdev_ops->ndo_busy_poll; - do { + preempt_disable(); + for (;;) { rc = 0; local_bh_disable(); if (busy_poll) { rc = busy_poll(napi); - } else if (napi_schedule_prep(napi)) { - void *have = netpoll_poll_lock(napi); - - if (test_bit(NAPI_STATE_SCHED, &napi->state)) { - rc = napi->poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); - if (rc == BUSY_POLL_BUDGET) { - napi_complete_done(napi, rc); - napi_schedule(napi); - } - } - netpoll_poll_unlock(have); + goto count; + } + if (!napi_poll) { + unsigned long val = READ_ONCE(napi->state); + + /* If multiple threads are competing for this napi, + * we avoid dirtying napi->state as much as we can. + */ + if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | + NAPIF_STATE_IN_BUSY_POLL)) + goto count; + if (cmpxchg(&napi->state, val, + val | NAPIF_STATE_IN_BUSY_POLL | + NAPIF_STATE_SCHED) != val) + goto count; + have_poll_lock = netpoll_poll_lock(napi); + napi_poll = napi->poll; } + rc = napi_poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); +count: if (rc > 0) __NET_ADD_STATS(sock_net(sk), LINUX_MIB_BUSYPOLLRXPACKETS, rc); @@ -5005,10 +5049,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (rc == LL_FLUSH_FAILED) break; /* permanent failure */ - cpu_relax(); - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - !need_resched() && !busy_loop_timeout(end_time)); + if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || + busy_loop_timeout(end_time)) + break; + if (unlikely(need_resched())) { + if (napi_poll) + busy_poll_stop(napi, have_poll_lock); + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + rc = !skb_queue_empty(&sk->sk_receive_queue); + if (rc || busy_loop_timeout(end_time)) + return rc; + goto restart; + } + cpu_relax(); + } + if (napi_poll) + busy_poll_stop(napi, have_poll_lock); + preempt_enable(); rc = !skb_queue_empty(&sk->sk_receive_queue); out: rcu_read_unlock(); @@ -5018,7 +5078,7 @@ EXPORT_SYMBOL(sk_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ -void napi_hash_add(struct napi_struct *napi) +static void napi_hash_add(struct napi_struct *napi) { if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) @@ -5038,7 +5098,6 @@ void napi_hash_add(struct napi_struct *napi) spin_unlock(&napi_hash_lock); } -EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi @@ -5086,7 +5145,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; #ifdef CONFIG_NETPOLL - spin_lock_init(&napi->poll_lock); napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); @@ -5187,7 +5245,7 @@ out_unlock: return work; } -static void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; @@ -5204,7 +5262,7 @@ static void net_rx_action(struct softirq_action *h) if 
(list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - return; + goto out; break; } @@ -5222,7 +5280,6 @@ static void net_rx_action(struct softirq_action *h) } } - __kfree_skb_flush(); local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); @@ -5232,6 +5289,8 @@ static void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); +out: + __kfree_skb_flush(); } struct netdev_adjacent { @@ -5262,6 +5321,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, return NULL; } +static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) +{ + struct net_device *dev = data; + + return upper_dev == dev; +} + /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device @@ -5276,11 +5342,30 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); + return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev); /** + * netdev_has_upper_dev_all - Check if device is linked to an upper device + * @dev: device + * @upper_dev: upper device to check + * + * Find out if a device is linked to specified upper device and return true + * in case it is. Note that this checks the entire upper device chain. + * The caller must hold rcu lock. + */ + +bool netdev_has_upper_dev_all_rcu(struct net_device *dev, + struct net_device *upper_dev) +{ + return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, + upper_dev); +} +EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); + +/** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * @@ -5291,7 +5376,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->all_adj_list.upper); + return !list_empty(&dev->adj_list.upper); } /** @@ -5318,6 +5403,20 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); +/** + * netdev_has_any_lower_dev - Check if device is linked to some device + * @dev: device + * + * Find out if a device is linked to a lower device and return true in case + * it is. The caller must hold the RTNL lock. + */ +static bool netdev_has_any_lower_dev(struct net_device *dev) +{ + ASSERT_RTNL(); + + return !list_empty(&dev->adj_list.lower); +} + void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; @@ -5354,16 +5453,8 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); -/** - * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next device from the dev's upper list, starting from iter - * position. The caller must hold RCU read lock. 
- */ -struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter) +static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *upper; @@ -5371,14 +5462,41 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); - if (&upper->list == &dev->all_adj_list.upper) + if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } -EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); + +int netdev_walk_all_upper_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *udev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.upper, + udev = netdev_next_upper_dev_rcu(dev, &iter); + udev; + udev = netdev_next_upper_dev_rcu(dev, &iter)) { + /* first is the upper device itself */ + ret = fn(udev, data); + if (ret) + return ret; + + /* then look at all of its upper devices */ + ret = netdev_walk_all_upper_dev_rcu(udev, fn, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); /** * netdev_lower_get_next_private - Get the next ->private from the @@ -5461,51 +5579,90 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) } EXPORT_SYMBOL(netdev_lower_get_next); -/** - * netdev_all_lower_get_next - Get the next device from all lower neighbour list - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next netdev_adjacent from the dev's all lower neighbour - * list, starting from iter position. The caller must hold RTNL lock or - * its own locking that guarantees that the neighbour all lower - * list will remain unchanged. - */ -struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter) +static struct net_device *netdev_next_lower_dev(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; - lower = list_entry(*iter, struct netdev_adjacent, list); + lower = list_entry((*iter)->next, struct netdev_adjacent, list); - if (&lower->list == &dev->all_adj_list.lower) + if (&lower->list == &dev->adj_list.lower) return NULL; - *iter = lower->list.next; + *iter = &lower->list; return lower->dev; } -EXPORT_SYMBOL(netdev_all_lower_get_next); -/** - * netdev_all_lower_get_next_rcu - Get the next device from all - * lower neighbour list, RCU variant - * @dev: device - * @iter: list_head ** of the current position - * - * Gets the next netdev_adjacent from the dev's all lower neighbour - * list, starting from iter position. The caller must hold RCU read lock. 
- */ -struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, - struct list_head **iter) +int netdev_walk_all_lower_dev(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.lower, + ldev = netdev_next_lower_dev(dev, &iter); + ldev; + ldev = netdev_next_lower_dev(dev, &iter)) { + /* first is the lower device itself */ + ret = fn(ldev, data); + if (ret) + return ret; + + /* then look at all of its lower devices */ + ret = netdev_walk_all_lower_dev(ldev, fn, data); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); + +static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; - lower = list_first_or_null_rcu(&dev->all_adj_list.lower, - struct netdev_adjacent, list); + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + if (&lower->list == &dev->adj_list.lower) + return NULL; + + *iter = &lower->list; + + return lower->dev; +} + +int netdev_walk_all_lower_dev_rcu(struct net_device *dev, + int (*fn)(struct net_device *dev, + void *data), + void *data) +{ + struct net_device *ldev; + struct list_head *iter; + int ret; + + for (iter = &dev->adj_list.lower, + ldev = netdev_next_lower_dev_rcu(dev, &iter); + ldev; + ldev = netdev_next_lower_dev_rcu(dev, &iter)) { + /* first is the lower device itself */ + ret = fn(ldev, data); + if (ret) + return ret; + + /* then look at all of its lower devices */ + ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data); + if (ret) + return ret; + } - return lower ? lower->dev : NULL; + return 0; } -EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); +EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the @@ -5587,7 +5744,10 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { - adj->ref_nr++; + adj->ref_nr += 1; + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", + dev->name, adj_dev->name, adj->ref_nr); + return 0; } @@ -5601,8 +5761,8 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->private = private; dev_hold(adj_dev); - pr_debug("dev_hold for %s, because of link added from %s to %s\n", - adj_dev->name, dev->name, adj_dev->name); + pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", + dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); @@ -5636,22 +5796,28 @@ free_adj: static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, + u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; + pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", + dev->name, adj_dev->name, ref_nr); + adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { - pr_err("tried to remove device %s from %s\n", + pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); - BUG(); + WARN_ON(1); + return; } - if (adj->ref_nr > 1) { - pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, - adj->ref_nr-1); - adj->ref_nr--; + if (adj->ref_nr > ref_nr) { + pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", + dev->name, adj_dev->name, ref_nr, + adj->ref_nr - ref_nr); + adj->ref_nr -= ref_nr; return; } @@ -5662,7 
+5828,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); - pr_debug("dev_put for %s, because link removed from %s to %s\n", + pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); dev_put(adj_dev); kfree_rcu(adj, rcu); @@ -5676,73 +5842,45 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev, { int ret; - ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, - master); + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, + private, master); if (ret) return ret; - ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, - false); + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, + private, false); if (ret) { - __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } -static int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *upper_dev) -{ - return __netdev_adjacent_dev_link_lists(dev, upper_dev, - &dev->all_adj_list.upper, - &upper_dev->all_adj_list.lower, - NULL, false); -} - static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, + u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { - __netdev_adjacent_dev_remove(dev, upper_dev, up_list); - __netdev_adjacent_dev_remove(upper_dev, dev, down_list); -} - -static void __netdev_adjacent_dev_unlink(struct net_device *dev, - struct net_device *upper_dev) -{ - __netdev_adjacent_dev_unlink_lists(dev, upper_dev, - &dev->all_adj_list.upper, - &upper_dev->all_adj_list.lower); + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { - int ret = __netdev_adjacent_dev_link(dev, upper_dev); - - if (ret) - return ret; - - ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, - &dev->adj_list.upper, - &upper_dev->adj_list.lower, - private, master); - if (ret) { - __netdev_adjacent_dev_unlink(dev, upper_dev); - return ret; - } - - return 0; + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { - __netdev_adjacent_dev_unlink(dev, upper_dev); - __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } @@ -5752,7 +5890,6 @@ static int __netdev_upper_dev_link(struct net_device *dev, void *upper_priv, void *upper_info) { struct netdev_notifier_changeupper_info changeupper_info; - struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; ASSERT_RTNL(); @@ -5761,10 +5898,10 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. 
*/ - if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) + if (netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; - if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) + if (netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) @@ -5786,80 +5923,15 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - /* Now that we linked these devs, make all the upper_dev's - * all_adj_list.upper visible to every dev's all_adj_list.lower an - * versa, and don't forget the devices itself. All of these - * links are non-neighbours. - */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { - pr_debug("Interlinking %s with %s, non-neighbour\n", - i->dev->name, j->dev->name); - ret = __netdev_adjacent_dev_link(i->dev, j->dev); - if (ret) - goto rollback_mesh; - } - } - - /* add dev to every upper_dev's upper device */ - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { - pr_debug("linking %s's upper device %s with %s\n", - upper_dev->name, i->dev->name, dev->name); - ret = __netdev_adjacent_dev_link(dev, i->dev); - if (ret) - goto rollback_upper_mesh; - } - - /* add upper_dev to every dev's lower device */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - pr_debug("linking %s's lower device %s with %s\n", dev->name, - i->dev->name, upper_dev->name); - ret = __netdev_adjacent_dev_link(i->dev, upper_dev); - if (ret) - goto rollback_lower_mesh; - } - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) - goto rollback_lower_mesh; + goto rollback; return 0; -rollback_lower_mesh: - to_i = i; - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - if (i == to_i) - break; - __netdev_adjacent_dev_unlink(i->dev, upper_dev); - } - - i = NULL; - -rollback_upper_mesh: - to_i = i; - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { - if (i == to_i) - break; - __netdev_adjacent_dev_unlink(dev, i->dev); - } - - i = j = NULL; - -rollback_mesh: - to_i = i; - to_j = j; - list_for_each_entry(i, &dev->all_adj_list.lower, list) { - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { - if (i == to_i && j == to_j) - break; - __netdev_adjacent_dev_unlink(i->dev, j->dev); - } - if (i == to_i) - break; - } - +rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; @@ -5916,7 +5988,6 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_notifier_changeupper_info changeupper_info; - struct netdev_adjacent *i, *j; ASSERT_RTNL(); changeupper_info.upper_dev = upper_dev; @@ -5928,23 +5999,6 @@ void netdev_upper_dev_unlink(struct net_device *dev, __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - /* Here is the tricky part. We must remove all dev's lower - * devices from all upper_dev's upper devices and vice - * versa, to maintain the graph relationship. 
- */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) - list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(i->dev, j->dev); - - /* remove also the devices itself from lower/upper device - * list - */ - list_for_each_entry(i, &dev->all_adj_list.lower, list) - __netdev_adjacent_dev_unlink(i->dev, upper_dev); - - list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(dev, i->dev); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, &changeupper_info.info); } @@ -6482,9 +6536,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) if (new_mtu == dev->mtu) return 0; - /* MTU must be positive. */ - if (new_mtu < 0) + /* MTU must be positive, and in range */ + if (new_mtu < 0 || new_mtu < dev->min_mtu) { + net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n", + dev->name, new_mtu, dev->min_mtu); return -EINVAL; + } + + if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { + net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n", + dev->name, new_mtu, dev->max_mtu); + return -EINVAL; + } if (!netif_device_present(dev)) return -ENODEV; @@ -6631,26 +6694,42 @@ EXPORT_SYMBOL(dev_change_proto_down); * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @fd: new program fd or negative value to clear + * @flags: xdp-related flags * * Set or clear a bpf program for a device */ -int dev_change_xdp_fd(struct net_device *dev, int fd) +int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - struct netdev_xdp xdp = {}; + struct netdev_xdp xdp; int err; + ASSERT_RTNL(); + if (!ops->ndo_xdp) return -EOPNOTSUPP; if (fd >= 0) { + if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + + err = ops->ndo_xdp(dev, &xdp); + if (err < 0) + return err; + if (xdp.prog_attached) + return -EBUSY; + } + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } + memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_SETUP_PROG; xdp.prog = prog; + err = ops->ndo_xdp(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog); @@ -6723,8 +6802,8 @@ static void rollback_registered_many(struct list_head *head) unlist_netdevice(dev); dev->reg_state = NETREG_UNREGISTERING; - on_each_cpu(flush_backlog, dev, 1); } + flush_all_backlogs(); synchronize_net(); @@ -6759,6 +6838,7 @@ static void rollback_registered_many(struct list_head *head) /* Notifier chain MUST detach us all upper devices. 
*/ WARN_ON(netdev_has_any_upper_dev(dev)); + WARN_ON(netdev_has_any_lower_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); @@ -7637,16 +7717,17 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); - INIT_LIST_HEAD(&dev->all_adj_list.upper); - INIT_LIST_HEAD(&dev->all_adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); +#ifdef CONFIG_NET_SCHED + hash_init(dev->qdisc_hash); +#endif dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; - dev->tx_queue_len = 1; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; } dev->num_tx_queues = txqs; @@ -7927,18 +8008,13 @@ out: } EXPORT_SYMBOL_GPL(dev_change_net_namespace); -static int dev_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *ocpu) +static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; - unsigned int cpu, oldcpu = (unsigned long)ocpu; + unsigned int cpu; struct softnet_data *sd, *oldsd; - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; - local_irq_disable(); cpu = smp_processor_id(); sd = &per_cpu(softnet_data, cpu); @@ -7988,10 +8064,9 @@ static int dev_cpu_callback(struct notifier_block *nfb, input_queue_head_incr(oldsd); } - return NOTIFY_OK; + return 0; } - /** * netdev_increment_features - increment feature set by one * @all: current feature set @@ -8286,8 +8361,11 @@ static int __init net_dev_init(void) */ for_each_possible_cpu(i) { + struct work_struct *flush = per_cpu_ptr(&flush_works, i); struct softnet_data *sd = &per_cpu(softnet_data, i); + INIT_WORK(flush, flush_backlog); + skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); INIT_LIST_HEAD(&sd->poll_list); @@ -8322,7 +8400,9 @@ static int __init net_dev_init(void) open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); - hotcpu_notifier(dev_cpu_callback, 0); + rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", + NULL, dev_cpu_dead); + WARN_ON(rc < 0); dst_subsys_init(); rc = 0; out: diff --git a/net/core/devlink.c b/net/core/devlink.c index 1b5063088f1a..2b5bf9efa720 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -341,15 +341,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); } -static struct genl_family devlink_nl_family = { - .id = GENL_ID_GENERATE, - .name = DEVLINK_GENL_NAME, - .version = DEVLINK_GENL_VERSION, - .maxattr = DEVLINK_ATTR_MAX, - .netnsok = true, - .pre_doit = devlink_nl_pre_doit, - .post_doit = devlink_nl_post_doit, -}; +static struct genl_family devlink_nl_family; enum devlink_multicast_groups { DEVLINK_MCGRP_CONFIG, @@ -608,6 +600,8 @@ static int devlink_port_type_set(struct devlink *devlink, if (devlink->ops && devlink->ops->port_type_set) { if (port_type == DEVLINK_PORT_TYPE_NOTSET) return -EINVAL; + if (port_type == devlink_port->type) + return 0; err = devlink->ops->port_type_set(devlink_port, port_type); if (err) return err; @@ -1400,26 +1394,45 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, - u32 seq, int flags, u16 mode) + u32 seq, int flags) { + const struct devlink_ops *ops = devlink->ops; void *hdr; + int err 
= 0; + u16 mode; + u8 inline_mode; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; + err = devlink_nl_put_handle(msg, devlink); + if (err) + goto out; - if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode)) - goto nla_put_failure; + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + goto out; + err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); + if (err) + goto out; + + if (ops->eswitch_inline_mode_get) { + err = ops->eswitch_inline_mode_get(devlink, &inline_mode); + if (err) + goto out; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, + inline_mode); + if (err) + goto out; + } genlmsg_end(msg, hdr); return 0; -nla_put_failure: +out: genlmsg_cancel(msg, hdr); - return -EMSGSIZE; + return err; } static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, @@ -1428,22 +1441,17 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; struct sk_buff *msg; - u16 mode; int err; if (!ops || !ops->eswitch_mode_get) return -EOPNOTSUPP; - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - return err; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, - info->snd_portid, info->snd_seq, 0, mode); + info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); @@ -1459,15 +1467,32 @@ static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; u16 mode; + u8 inline_mode; + int err = 0; - if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) - return -EINVAL; + if (!ops) + return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { + if (!ops->eswitch_mode_set) + return -EOPNOTSUPP; + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); + err = ops->eswitch_mode_set(devlink, mode); + if (err) + return err; + } - if (ops && ops->eswitch_mode_set) - return ops->eswitch_mode_set(devlink, mode); - return -EOPNOTSUPP; + if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) { + if (!ops->eswitch_inline_mode_set) + return -EOPNOTSUPP; + inline_mode = nla_get_u8( + info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]); + err = ops->eswitch_inline_mode_set(devlink, inline_mode); + if (err) + return err; + } + + return 0; } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { @@ -1484,6 +1509,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, + [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -1618,6 +1644,20 @@ static const struct genl_ops devlink_nl_ops[] = { }, }; +static struct genl_family devlink_nl_family __ro_after_init = { + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .netnsok = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, + .module = THIS_MODULE, + .ops = devlink_nl_ops, + .n_ops = ARRAY_SIZE(devlink_nl_ops), + .mcgrps = devlink_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), +}; + /** * devlink_alloc - Allocate new devlink instance resources * 
@@ -1840,9 +1880,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister); static int __init devlink_module_init(void) { - return genl_register_family_with_ops_groups(&devlink_nl_family, - devlink_nl_ops, - devlink_nl_mcgrps); + return genl_register_family(&devlink_nl_family); } static void __exit devlink_module_exit(void) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index d6b3b579560d..8e0c0635ee97 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -59,12 +59,7 @@ struct dm_hw_stat_delta { unsigned long last_drop_val; }; -static struct genl_family net_drop_monitor_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = "NET_DM", - .version = 2, -}; +static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); @@ -105,7 +100,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) return skb; } -static struct genl_multicast_group dropmon_mcgrps[] = { +static const struct genl_multicast_group dropmon_mcgrps[] = { { .name = "events", }, }; @@ -351,6 +346,17 @@ static const struct genl_ops dropmon_ops[] = { }, }; +static struct genl_family net_drop_monitor_family __ro_after_init = { + .hdrsize = 0, + .name = "NET_DM", + .version = 2, + .module = THIS_MODULE, + .ops = dropmon_ops, + .n_ops = ARRAY_SIZE(dropmon_ops), + .mcgrps = dropmon_mcgrps, + .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), +}; + static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; @@ -367,8 +373,7 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family_with_ops_groups(&net_drop_monitor_family, - dropmon_ops, dropmon_mcgrps); + rc = genl_register_family(&net_drop_monitor_family); if (rc) { pr_err("Could not create drop monitor netlink family\n"); return rc; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 977489820eb9..e23766c7e3ba 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -119,6 +119,12 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", }; +static const char +phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -227,6 +233,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); + if (sset == ETH_SS_PHY_TUNABLES) + return ARRAY_SIZE(phy_tunable_strings); + if (sset == ETH_SS_PHY_STATS) { if (dev->phydev) return phy_get_sset_count(dev->phydev); @@ -253,6 +262,8 @@ static void __ethtool_get_strings(struct net_device *dev, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); + else if (stringset == ETH_SS_PHY_TUNABLES) + memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); else if (stringset == ETH_SS_PHY_STATS) { struct phy_device *phydev = dev->phydev; @@ -2422,6 +2433,85 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) }; } +static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_PHY_DOWNSHIFT: + if (tuna->len != sizeof(u8) || + tuna->type_id != ETHTOOL_TUNABLE_U8) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int get_phy_tunable(struct net_device *dev, void 
__user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + struct phy_device *phydev = dev->phydev; + void *data; + + if (!(phydev && phydev->drv && phydev->drv->get_tunable)) + return -EOPNOTSUPP; + + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_phy_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + mutex_lock(&phydev->lock); + ret = phydev->drv->get_tunable(phydev, &tuna, data); + mutex_unlock(&phydev->lock); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int set_phy_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + struct phy_device *phydev = dev->phydev; + void *data; + + if (!(phydev && phydev->drv && phydev->drv->set_tunable)) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_phy_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + mutex_lock(&phydev->lock); + ret = phydev->drv->set_tunable(phydev, &tuna, data); + mutex_unlock(&phydev->lock); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -2479,6 +2569,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: case ETHTOOL_GTUNABLE: + case ETHTOOL_PHY_GTUNABLE: + case ETHTOOL_GLINKSETTINGS: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -2684,6 +2776,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_SLINKSETTINGS: rc = ethtool_set_link_ksettings(dev, useraddr); break; + case ETHTOOL_PHY_GTUNABLE: + rc = get_phy_tunable(dev, useraddr); + break; + case ETHTOOL_PHY_STUNABLE: + rc = set_phy_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index be4629c344a6..b6791d94841d 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -18,6 +18,11 @@ #include <net/fib_rules.h> #include <net/ip_tunnels.h> +static const struct fib_kuid_range fib_kuid_range_unset = { + KUIDT_INIT(0), + KUIDT_INIT(~0), +}; + int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { @@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->table = table; r->flags = flags; r->fr_net = ops->fro_net; + r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; @@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_unregister); +static int uid_range_set(struct fib_kuid_range *range) +{ + return uid_valid(range->start) && uid_valid(range->end); +} + +static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) +{ + struct fib_rule_uid_range *in; + struct fib_kuid_range out; + + in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); + + out.start = make_kuid(current_user_ns(), in->start); + out.end = make_kuid(current_user_ns(), in->end); + + return out; +} + +static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) +{ + struct fib_rule_uid_range out = { + from_kuid_munged(current_user_ns(), range->start), + 
from_kuid_munged(current_user_ns(), range->end) + }; + + return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) @@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; + if (uid_lt(fl->flowi_uid, rule->uid_range.start) || + uid_gt(fl->flowi_uid, rule->uid_range.end)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, if (r->l3mdev != rule->l3mdev) continue; + if (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end)) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (rule->l3mdev && rule->table) goto errout_free; + if (tb[FRA_UID_RANGE]) { + if (current_user_ns() != net->user_ns) { + err = -EPERM; + goto errout_free; + } + + rule->uid_range = nla_get_kuid_range(tb); + + if (!uid_range_set(&rule->uid_range) || + !uid_lte(rule->uid_range.start, rule->uid_range.end)) + goto errout_free; + } else { + rule->uid_range = fib_kuid_range_unset; + } + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *tmp; struct nlattr *tb[FRA_MAX+1]; + struct fib_kuid_range range; int err = -EINVAL; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) @@ -516,6 +574,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) goto errout; + if (tb[FRA_UID_RANGE]) { + range = nla_get_kuid_range(tb); + if (!uid_range_set(&range)) + goto errout; + } else { + range = fib_kuid_range_unset; + } + list_for_each_entry(rule, &ops->rules_list, list) { if (frh->action && (frh->action != rule->action)) continue; @@ -552,6 +618,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) continue; + if (uid_range_set(&range) && + (!uid_eq(rule->uid_range.start, range.start) || + !uid_eq(rule->uid_range.end, range.end))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -619,7 +690,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ - + nla_total_size_64bit(8); /* FRA_TUN_ID */ + + nla_total_size_64bit(8) /* FRA_TUN_ID */ + + nla_total_size(sizeof(struct fib_kuid_range)); if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -679,7 +751,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && - nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev))) + nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || + (uid_range_set(&rule->uid_range) && + nla_put_uid_range(skb, &rule->uid_range))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/core/filter.c b/net/core/filter.c index cb06aceb512a..b1461708a977 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -30,6 +30,7 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> +#include 
<linux/if_arp.h> #include <linux/gfp.h> #include <net/ip.h> #include <net/protocol.h> @@ -78,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; + err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); + if (err) + return err; + err = security_sock_rcv_skb(sk, skb); if (err) return err; @@ -94,14 +99,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) { - return skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff(skb); } -static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -120,9 +124,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -145,7 +148,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_0(__get_raw_cpu_id) { return raw_smp_processor_id(); } @@ -233,9 +236,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ @@ -1350,17 +1352,26 @@ struct bpf_scratchpad { static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static inline int __bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return skb_ensure_writable(skb, write_len); +} + static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err; + int err = __bpf_try_make_writable(skb, write_len); - err = skb_ensure_writable(skb, write_len); bpf_compute_data_end(skb); - return err; } +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) @@ -1373,12 +1384,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; - void *from = (void *) (long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) @@ -1413,12 +1421,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 
bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) { - const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; - unsigned int offset = (unsigned int) r2; - void *to = (void *)(unsigned long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(offset > 0xffff)) @@ -1446,10 +1451,31 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) @@ -1491,12 +1517,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | @@ -1544,12 +1569,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u64 diff_size = from_size + to_size; - __be32 *from = (__be32 *) (long) r1; - __be32 *to = (__be32 *) (long) r3; + u32 diff_size = from_size + to_size; int i, j = 0; /* This is quite flexible, some examples: @@ -1575,6 +1599,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_STACK, .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, @@ -1583,11 +1608,44 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. 
+ */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); } +static inline int __bpf_rx_skb_no_mac(struct net_device *dev, + struct sk_buff *skb) +{ + int ret = ____dev_forward_skb(dev, skb); + + if (likely(!ret)) { + skb->dev = dev; + ret = netif_rx(skb); + } + + return ret; +} + static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; @@ -1607,10 +1665,55 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } -static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + /* skb->mac_len is not set on normal egress */ + unsigned int mlen = skb->network_header - skb->mac_header; + + __skb_pull(skb, mlen); + + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + skb_pop_mac_header(skb); + skb_reset_mac_len(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header)) { + kfree_skb(skb); + return -ERANGE; + } + + bpf_push_mac_rcsum(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + if (dev_is_mac_header_xmit(dev)) + return __bpf_redirect_common(skb, dev, flags); + else + return __bpf_redirect_no_mac(skb, dev, flags); +} + +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; struct net_device *dev; + struct sk_buff *clone; + int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; @@ -1619,14 +1722,22 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!dev)) return -EINVAL; - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - bpf_push_mac_rcsum(skb); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } - return flags & BPF_F_INGRESS ? 
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -1645,7 +1756,7 @@ struct redirect_info { static DEFINE_PER_CPU(struct redirect_info, redirect_info); -static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -1670,10 +1781,7 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - bpf_push_mac_rcsum(skb); - - return ri->flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + return __bpf_redirect(skb, dev, ri->flags); } static const struct bpf_func_proto bpf_redirect_proto = { @@ -1684,9 +1792,9 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { - return task_get_classid((struct sk_buff *) (unsigned long) r1); + return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { @@ -1696,9 +1804,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { - return dst_tclassid((struct sk_buff *) (unsigned long) r1); + return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1708,14 +1816,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ - return skb_get_hash((struct sk_buff *) (unsigned long) r1); + return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { @@ -1725,10 +1833,25 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. 
+ */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 vlan_proto = (__force __be16) r2; int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && @@ -1753,9 +1876,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); -static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; int ret; bpf_push_mac_rcsum(skb); @@ -1930,10 +2052,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) return -ENOTSUPP; } -static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 proto = (__force __be16) r2; int ret; if (unlikely(flags)) @@ -1970,14 +2091,11 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 pkt_type = r2; - /* We only allow a restricted subset to be changed for now. */ - if (unlikely(skb->pkt_type > PACKET_OTHERHOST || - pkt_type > PACKET_OTHERHOST)) + if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || + !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; @@ -1992,19 +2110,163 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = { .arg2_type = ARG_ANYTHING, }; -bool bpf_helper_changes_skb_data(void *func) +static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_skb_change_proto) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + u32 min_len = skb_network_offset(skb); + + if (skb_transport_header_was_set(skb)) + min_len = skb_transport_offset(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) + min_len = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + return min_len; +} + +static u32 __bpf_skb_max_len(const struct sk_buff *skb) +{ + return skb->dev->mtu + skb->dev->hard_header_len; +} + +static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + unsigned int old_len = skb->len; + int ret; + + ret = __skb_grow_rcsum(skb, new_len); + if (!ret) + memset(skb->data + old_len, 0, new_len - old_len); + return ret; +} + +static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + return __skb_trim_rcsum(skb, new_len); +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 min_len = __bpf_skb_min_len(skb); + int ret; + + if (unlikely(flags || new_len > max_len || new_len < min_len)) + return -EINVAL; + if (skb->encapsulation) + return -ENOTSUPP; + + /* The basic idea of this helper is that it's performing the + * needed work to either grow or trim an skb, and eBPF program + * rewrites the rest via 
helpers like bpf_skb_store_bytes(), + * bpf_lX_csum_replace() and others rather than passing a raw + * buffer here. This one is a slow path helper and intended + * for replies with control messages. + * + * Like in bpf_skb_change_proto(), we want to keep this rather + * minimal and without protocol specifics so that we are able + * to separate concerns as in bpf_skb_store_bytes() should only + * be the one responsible for writing buffers. + * + * It's really expected to be a slow path operation here for + * control message replies, so we're implicitly linearizing, + * uncloning and drop offloads from the skb by this. + */ + ret = __bpf_try_make_writable(skb, skb->len); + if (!ret) { + if (new_len > skb->len) + ret = bpf_skb_grow_rcsum(skb, new_len); + else if (new_len < skb->len) + ret = bpf_skb_trim_rcsum(skb, new_len); + if (!ret && skb_is_gso(skb)) + skb_gso_reset(skb); + } + + bpf_compute_data_end(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_tail_proto = { + .func = bpf_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. 
+ */ + __skb_push(skb, head_room); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + } + + bpf_compute_data_end(skb); + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *data = xdp->data + offset; + + if (unlikely(data < xdp->data_hard_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) +{ + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_head || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head) return true; return false; @@ -2023,13 +2285,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, return 0; } -static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; @@ -2056,10 +2315,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags) return flags & BPF_F_TUNINFO_IPV6 ? 
AF_INET6 : AF_INET; } -static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; @@ -2124,10 +2382,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *to = (u8 *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; @@ -2162,10 +2418,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { static struct metadata_dst __percpu *md_dst; -static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; @@ -2183,7 +2438,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) */ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); - from = (struct bpf_tunnel_key *)compat; + from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; @@ -2233,10 +2488,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *from = (u8 *) (long) r2; struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); @@ -2282,28 +2536,24 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -#ifdef CONFIG_SOCK_CGROUP_DATA -static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) { - struct sk_buff *skb = (struct sk_buff *)(long)r1; - struct bpf_map *map = (struct bpf_map *)(long)r2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; - u32 i = (u32)r3; - sk = skb->sk; + sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; - - if (unlikely(i >= array->map.max_entries)) + if (unlikely(idx >= array->map.max_entries)) return -E2BIG; - cgrp = READ_ONCE(array->ptrs[i]); + cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; - return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); + return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { @@ -2314,7 +2564,38 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; -#endif + +static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, + unsigned long off, unsigned long len) +{ + memcpy(dst_buff, 
src_buff + off, len); + return 0; +} + +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) +{ + u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, + bpf_xdp_copy); +} + +static const struct bpf_func_proto bpf_xdp_event_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) @@ -2330,6 +2611,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: @@ -2350,8 +2633,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -2368,6 +2655,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: @@ -2382,14 +2671,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; -#ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; -#endif default: return sk_filter_func_proto(func_id); } @@ -2398,10 +2687,92 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id) { - return sk_filter_func_proto(func_id); + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_xdp_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +cg_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_inout_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case 
BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + default: + return lwt_inout_func_proto(func_id); + } } -static bool __is_valid_access(int off, int size, enum bpf_access_type type) +static bool __is_valid_access(int off, int size) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -2435,7 +2806,103 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return __is_valid_access(off, size, type); + return __is_valid_access(off, size); +} + +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, priority): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_access(off, size); +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) 
+ */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; } static bool tc_cls_act_is_valid_access(int off, int size, @@ -2465,17 +2932,16 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; } - return __is_valid_access(off, size, type); + return __is_valid_access(off, size); } -static bool __is_valid_xdp_access(int off, int size, - enum bpf_access_type type) +static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; if (off % size != 0) return false; - if (size != 4) + if (size != sizeof(__u32)) return false; return true; @@ -2497,7 +2963,7 @@ static bool xdp_is_valid_access(int off, int size, break; } - return __is_valid_xdp_access(off, size, type); + return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(u32 act) @@ -2506,10 +2972,10 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn_buf, - struct bpf_prog *prog) +static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -2556,7 +3022,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), dst_reg, src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); @@ -2597,7 +3063,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, dst_reg, src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); prog->cb_access = 1; @@ -2621,7 +3087,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, break; case offsetof(struct __sk_buff, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), dst_reg, src_reg, offsetof(struct sk_buff, data)); break; @@ -2630,8 +3096,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, data_end); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), - dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, + ctx_off); break; case offsetof(struct __sk_buff, tc_index): @@ -2657,6 +3123,76 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + + case offsetof(struct bpf_sock, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sock, sk_family)); + break; + + case offsetof(struct bpf_sock, type): + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock, protocol): + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, __sk_flags_offset)); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + default: + return sk_filter_convert_ctx_access(type, dst_reg, src_reg, + ctx_off, insn_buf, prog); + } + + return insn - insn_buf; +} + static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2666,12 +3202,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct xdp_md, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), 
dst_reg, src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), dst_reg, src_reg, offsetof(struct xdp_buff, data_end)); break; @@ -2683,13 +3219,14 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops xdp_ops = { @@ -2698,6 +3235,31 @@ static const struct bpf_verifier_ops xdp_ops = { .convert_ctx_access = xdp_convert_ctx_access, }; +static const struct bpf_verifier_ops cg_skb_ops = { + .get_func_proto = cg_skb_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_inout_ops = { + .get_func_proto = lwt_inout_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_xmit_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + +static const struct bpf_verifier_ops cg_sock_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = sock_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -2718,12 +3280,42 @@ static struct bpf_prog_type_list xdp_type __read_mostly = { .type = BPF_PROG_TYPE_XDP, }; +static struct bpf_prog_type_list cg_skb_type __read_mostly = { + .ops = &cg_skb_ops, + .type = BPF_PROG_TYPE_CGROUP_SKB, +}; + +static struct bpf_prog_type_list lwt_in_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_IN, +}; + +static struct bpf_prog_type_list lwt_out_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_OUT, +}; + +static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { + .ops = &lwt_xmit_ops, + .type = BPF_PROG_TYPE_LWT_XMIT, +}; + +static struct bpf_prog_type_list cg_sock_type __read_mostly = { + .ops = &cg_sock_ops, + .type = BPF_PROG_TYPE_CGROUP_SOCK +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); + bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&cg_sock_type); + bpf_register_prog_type(&lwt_in_type); + bpf_register_prog_type(&lwt_out_type); + bpf_register_prog_type(&lwt_xmit_type); return 0; } diff --git a/net/core/flow.c b/net/core/flow.c index 3937b1b68d5b..f765c11d8df5 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -95,7 +95,6 @@ static void flow_cache_gc_task(struct work_struct *work) list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) { 
flow_entry_kill(fce, xfrm); atomic_dec(&xfrm->flow_cache_gc_count); - WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0); } } @@ -236,9 +235,8 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, if (fcp->hash_count > fc->high_watermark) flow_cache_shrink(fc, fcp); - if (fcp->hash_count > 2 * fc->high_watermark || - atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) { - atomic_inc(&net->xfrm.flow_cache_genid); + if (atomic_read(&net->xfrm.flow_cache_gc_count) > + 2 * num_online_cpus() * fc->high_watermark) { flo = ERR_PTR(-ENOBUFS); goto ret_object; } @@ -419,28 +417,20 @@ static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) return 0; } -static int flow_cache_cpu(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node) { - struct flow_cache *fc = container_of(nfb, struct flow_cache, - hotcpu_notifier); - int res, cpu = (unsigned long) hcpu; + struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node); + + return flow_cache_cpu_prepare(fc, cpu); +} + +static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ + struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node); struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - res = flow_cache_cpu_prepare(fc, cpu); - if (res) - return notifier_from_errno(res); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - __flow_cache_shrink(fc, fcp, 0); - break; - } - return NOTIFY_OK; + __flow_cache_shrink(fc, fcp, 0); + return 0; } int flow_cache_init(struct net *net) @@ -467,18 +457,8 @@ int flow_cache_init(struct net *net) if (!fc->percpu) return -ENOMEM; - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - if (flow_cache_cpu_prepare(fc, i)) - goto err; - } - fc->hotcpu_notifier = (struct notifier_block){ - .notifier_call = flow_cache_cpu, - }; - __register_hotcpu_notifier(&fc->hotcpu_notifier); - - cpu_notifier_register_done(); + if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node)) + goto err; setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, (unsigned long) fc); @@ -494,8 +474,6 @@ err: fcp->hash_table = NULL; } - cpu_notifier_register_done(); - free_percpu(fc->percpu); fc->percpu = NULL; @@ -509,7 +487,8 @@ void flow_cache_fini(struct net *net) struct flow_cache *fc = &net->xfrm.flow_cache_global; del_timer_sync(&fc->rnd_timer); - unregister_hotcpu_notifier(&fc->hotcpu_notifier); + + cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node); for_each_possible_cpu(i) { struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); @@ -521,3 +500,14 @@ void flow_cache_fini(struct net *net) fc->percpu = NULL; } EXPORT_SYMBOL(flow_cache_fini); + +void __init flow_cache_hp_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE, + "net/flow:prepare", + flow_cache_cpu_up_prep, + flow_cache_cpu_dead); + WARN_ON(ret < 0); +} diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 52742a02814f..d6447dc10371 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -6,6 +6,8 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/gre.h> +#include <net/pptp.h> #include <linux/igmp.h> #include <linux/icmp.h> #include <linux/sctp.h> @@ -56,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, EXPORT_SYMBOL(skb_flow_dissector_init); /** + * 
skb_flow_get_be16 - extract be16 entity + * @skb: sk_buff to extract from + * @poff: offset to extract at + * @data: raw buffer pointer to the packet + * @hlen: packet header length + * + * The function will try to retrieve a be32 entity at + * offset poff + */ +__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data, + int hlen) +{ + __be16 *u, _u; + + u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); + if (u) + return *u; + + return 0; +} + +/** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset @@ -115,14 +139,18 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_vlan *key_vlan; struct flow_dissector_key_keyid *key_keyid; + bool skip_vlan = false; u8 ip_proto = 0; - bool ret = false; + bool ret; if (!data) { data = skb->data; - proto = skb->protocol; + proto = skb_vlan_tag_present(skb) ? + skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); } @@ -242,22 +270,42 @@ ipv6: case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan; struct vlan_hdr _vlan; + bool vlan_tag_present = skb && skb_vlan_tag_present(skb); - vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); - if (!vlan) - goto out_bad; + if (vlan_tag_present) + proto = skb->protocol; + + if (!vlan_tag_present || eth_type_vlan(skb->protocol)) { + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), + data, hlen, &_vlan); + if (!vlan) + goto out_bad; + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + if (skip_vlan) + goto again; + } + skip_vlan = true; if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID)) { - key_tags = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID, + FLOW_DISSECTOR_KEY_VLAN)) { + key_vlan = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLAN, target_container); - key_tags->vlan_id = skb_vlan_tag_get_id(skb); + if (vlan_tag_present) { + key_vlan->vlan_id = skb_vlan_tag_get_id(skb); + key_vlan->vlan_priority = + (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + } else { + key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & + VLAN_VID_MASK; + key_vlan->vlan_priority = + (ntohs(vlan->h_vlan_TCI) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } } - proto = vlan->h_vlan_encapsulated_proto; - nhoff += sizeof(*vlan); goto again; } case htons(ETH_P_PPP_SES): { @@ -338,32 +386,42 @@ mpls: ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: { - struct gre_hdr { - __be16 flags; - __be16 proto; - } *hdr, _hdr; + struct gre_base_hdr *hdr, _hdr; + u16 gre_ver; + int offset = 0; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) goto out_bad; - /* - * Only look inside GRE if version zero and no - * routing - */ - if (hdr->flags & (GRE_VERSION | GRE_ROUTING)) + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) break; - proto = hdr->proto; - nhoff += 4; + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + break; + + proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + break; + } + + offset += sizeof(struct gre_base_hdr); 
+ if (hdr->flags & GRE_CSUM) - nhoff += 4; + offset += sizeof(((struct gre_full_hdr *)0)->csum) + + sizeof(((struct gre_full_hdr *)0)->reserved1); + if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; - keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid), + keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid), data, hlen, &_keyid); - if (!keyid) goto out_bad; @@ -372,32 +430,65 @@ ip_proto_again: key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); - key_keyid->keyid = *keyid; + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } - nhoff += 4; + offset += sizeof(((struct gre_full_hdr *)0)->key); } + if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) + offset += sizeof(((struct pptp_gre_header *)0)->seq); + + if (gre_ver == 0) { + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, nhoff + offset, + sizeof(_eth), + data, hlen, &_eth); + if (!eth) + goto out_bad; + proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = (nhoff + offset); + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof(((struct pptp_gre_header *)0)->ack); + + ppp_hdr = skb_header_pointer(skb, nhoff + offset, + sizeof(_ppp_hdr), _ppp_hdr); + if (!ppp_hdr) goto out_bad; - proto = eth->h_proto; - nhoff += sizeof(*eth); - - /* Cap headers that we access via pointers at the - * end of the Ethernet header as our maximum alignment - * at that point is only 2 bytes. - */ - if (NET_IP_ALIGN) - hlen = nhoff; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; } + nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) goto out_good; @@ -478,15 +569,28 @@ ip_proto_again: data, hlen); } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP)) { + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); + } + out_good: ret = true; -out_bad: + key_control->thoff = (u16)nhoff; +out: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; return ret; + +out_bad: + ret = false; + key_control->thoff = min_t(u16, nhoff, skb ? 
skb->len : hlen); + goto out; } EXPORT_SYMBOL(__skb_flow_dissect); @@ -653,7 +757,7 @@ EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; -u32 __skb_get_hash_symmetric(struct sk_buff *skb) +u32 __skb_get_hash_symmetric(const struct sk_buff *skb) { struct flow_keys keys; @@ -874,8 +978,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .offset = offsetof(struct flow_keys, ports), }, { - .key_id = FLOW_DISSECTOR_KEY_VLANID, - .offset = offsetof(struct flow_keys, tags), + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, @@ -940,4 +1044,4 @@ static int __init init_default_flow_dissectors(void) return 0; } -late_initcall_sync(init_default_flow_dissectors); +core_initcall(init_default_flow_dissectors); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index cad8e791f28e..101b5d0e2142 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -7,6 +7,7 @@ * 2 of the License, or (at your option) any later version. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Eric Dumazet <edumazet@google.com> * * Changes: * Jamal Hadi Salim - moved it to net/core and reshulfed @@ -30,165 +31,79 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> -#include <linux/rbtree.h> #include <linux/slab.h> +#include <linux/seqlock.h> #include <net/sock.h> #include <net/gen_stats.h> -/* - This code is NOT intended to be used for statistics collection, - its purpose is to provide a base for statistical multiplexing - for controlled load service. - If you need only statistics, run a user level daemon which - periodically reads byte counters. - - Unfortunately, rate estimation is not a very easy task. - F.e. I did not find a simple way to estimate the current peak rate - and even failed to formulate the problem 8)8) - - So I preferred not to built an estimator into the scheduler, - but run this task separately. - Ideally, it should be kernel thread(s), but for now it runs - from timers, which puts apparent top bounds on the number of rated - flows, has minimal overhead on small, but is enough - to handle controlled load service, sets of aggregates. - - We measure rate over A=(1<<interval) seconds and evaluate EWMA: - - avrate = avrate*(1-W) + rate*W - - where W is chosen as negative power of 2: W = 2^(-ewma_log) - - The resulting time constant is: - - T = A/(-ln(1-W)) - - - NOTES. - - * avbps and avpps are scaled by 2^5. - * both values are reported as 32 bit unsigned values. bps can - overflow for fast links : max speed being 34360Mbit/sec - * Minimal interval is HZ/4=250msec (it is the greatest common divisor - for HZ=100 and HZ=1024 8)), maximal interval - is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals - are too expensive, longer ones can be implemented - at user level painlessly. +/* This code is NOT intended to be used for statistics collection, + * its purpose is to provide a base for statistical multiplexing + * for controlled load service. + * If you need only statistics, run a user level daemon which + * periodically reads byte counters. 
*/ -#define EST_MAX_INTERVAL 5 - -struct gen_estimator -{ - struct list_head list; +struct net_rate_estimator { struct gnet_stats_basic_packed *bstats; - struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; seqcount_t *running; - int ewma_log; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + u8 ewma_log; + u8 intvl_log; /* period : (250ms << intvl_log) */ + + seqcount_t seq; u32 last_packets; - unsigned long avpps; u64 last_bytes; + + u64 avpps; u64 avbps; - struct rcu_head e_rcu; - struct rb_node node; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; - struct rcu_head head; -}; -struct gen_estimator_head -{ - struct timer_list timer; - struct list_head list; + unsigned long next_jiffies; + struct timer_list timer; + struct rcu_head rcu; }; -static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; - -/* Protects against NULL dereference */ -static DEFINE_RWLOCK(est_lock); - -/* Protects against soft lockup during large deletion */ -static struct rb_root est_root = RB_ROOT; -static DEFINE_SPINLOCK(est_tree_lock); - -static void est_timer(unsigned long arg) +static void est_fetch_counters(struct net_rate_estimator *e, + struct gnet_stats_basic_packed *b) { - int idx = (int)arg; - struct gen_estimator *e; + if (e->stats_lock) + spin_lock(e->stats_lock); - rcu_read_lock(); - list_for_each_entry_rcu(e, &elist[idx].list, list) { - struct gnet_stats_basic_packed b = {0}; - unsigned long rate; - u64 brate; - - if (e->stats_lock) - spin_lock(e->stats_lock); - read_lock(&est_lock); - if (e->bstats == NULL) - goto skip; - - __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats); - - brate = (b.bytes - e->last_bytes)<<(7 - idx); - e->last_bytes = b.bytes; - e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); - WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5); - - rate = b.packets - e->last_packets; - rate <<= (7 - idx); - e->last_packets = b.packets; - e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); - WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5); -skip: - read_unlock(&est_lock); - if (e->stats_lock) - spin_unlock(e->stats_lock); - } + __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); + + if (e->stats_lock) + spin_unlock(e->stats_lock); - if (!list_empty(&elist[idx].list)) - mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); - rcu_read_unlock(); } -static void gen_add_node(struct gen_estimator *est) +static void est_timer(unsigned long arg) { - struct rb_node **p = &est_root.rb_node, *parent = NULL; + struct net_rate_estimator *est = (struct net_rate_estimator *)arg; + struct gnet_stats_basic_packed b; + u64 rate, brate; - while (*p) { - struct gen_estimator *e; + est_fetch_counters(est, &b); + brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log); + brate -= (est->avbps >> est->ewma_log); - parent = *p; - e = rb_entry(parent, struct gen_estimator, node); + rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log); + rate -= (est->avpps >> est->ewma_log); - if (est->bstats > e->bstats) - p = &parent->rb_right; - else - p = &parent->rb_left; - } - rb_link_node(&est->node, parent, p); - rb_insert_color(&est->node, &est_root); -} + write_seqcount_begin(&est->seq); + est->avbps += brate; + est->avpps += rate; + write_seqcount_end(&est->seq); -static -struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est64 *rate_est) -{ - struct rb_node *p = est_root.rb_node; - - while (p) { - struct gen_estimator *e; + est->last_bytes = b.bytes; + 
est->last_packets = b.packets; - e = rb_entry(p, struct gen_estimator, node); + est->next_jiffies += ((HZ/4) << est->intvl_log); - if (bstats > e->bstats) - p = p->rb_right; - else if (bstats < e->bstats || rate_est != e->rate_est) - p = p->rb_left; - else - return e; + if (unlikely(time_after_eq(jiffies, est->next_jiffies))) { + /* Ouch... timer was delayed. */ + est->next_jiffies = jiffies + 1; } - return NULL; + mod_timer(&est->timer, est->next_jiffies); } /** @@ -211,83 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, - struct gnet_stats_rate_est64 *rate_est, + struct net_rate_estimator __rcu **rate_est, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt) { - struct gen_estimator *est; struct gnet_estimator *parm = nla_data(opt); - struct gnet_stats_basic_packed b = {0}; - int idx; + struct net_rate_estimator *old, *est; + struct gnet_stats_basic_packed b; + int intvl_log; if (nla_len(opt) < sizeof(*parm)) return -EINVAL; + /* allowed timer periods are : + * -2 : 250ms, -1 : 500ms, 0 : 1 sec + * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec + */ if (parm->interval < -2 || parm->interval > 3) return -EINVAL; est = kzalloc(sizeof(*est), GFP_KERNEL); - if (est == NULL) + if (!est) return -ENOBUFS; - __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats); - - idx = parm->interval + 2; + seqcount_init(&est->seq); + intvl_log = parm->interval + 2; est->bstats = bstats; - est->rate_est = rate_est; est->stats_lock = stats_lock; est->running = running; est->ewma_log = parm->ewma_log; - est->last_bytes = b.bytes; - est->avbps = rate_est->bps<<5; - est->last_packets = b.packets; - est->avpps = rate_est->pps<<10; + est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; - spin_lock_bh(&est_tree_lock); - if (!elist[idx].timer.function) { - INIT_LIST_HEAD(&elist[idx].list); - setup_timer(&elist[idx].timer, est_timer, idx); + est_fetch_counters(est, &b); + est->last_bytes = b.bytes; + est->last_packets = b.packets; + old = rcu_dereference_protected(*rate_est, 1); + if (old) { + del_timer_sync(&old->timer); + est->avbps = old->avbps; + est->avpps = old->avpps; } - if (list_empty(&elist[idx].list)) - mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); - - list_add_rcu(&est->list, &elist[idx].list); - gen_add_node(est); - spin_unlock_bh(&est_tree_lock); + est->next_jiffies = jiffies + ((HZ/4) << intvl_log); + setup_timer(&est->timer, est_timer, (unsigned long)est); + mod_timer(&est->timer, est->next_jiffies); + rcu_assign_pointer(*rate_est, est); + if (old) + kfree_rcu(old, rcu); return 0; } EXPORT_SYMBOL(gen_new_estimator); /** * gen_kill_estimator - remove a rate estimator - * @bstats: basic statistics - * @rate_est: rate estimator statistics + * @rate_est: rate estimator * - * Removes the rate estimator specified by &bstats and &rate_est. + * Removes the rate estimator. 
* - * Note : Caller should respect an RCU grace period before freeing stats_lock */ -void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est64 *rate_est) +void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est) { - struct gen_estimator *e; - - spin_lock_bh(&est_tree_lock); - while ((e = gen_find_node(bstats, rate_est))) { - rb_erase(&e->node, &est_root); + struct net_rate_estimator *est; - write_lock(&est_lock); - e->bstats = NULL; - write_unlock(&est_lock); - - list_del_rcu(&e->list); - kfree_rcu(e, e_rcu); + est = xchg((__force struct net_rate_estimator **)rate_est, NULL); + if (est) { + del_timer_sync(&est->timer); + kfree_rcu(est, rcu); } - spin_unlock_bh(&est_tree_lock); } EXPORT_SYMBOL(gen_kill_estimator); @@ -307,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator); */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, - struct gnet_stats_rate_est64 *rate_est, + struct net_rate_estimator __rcu **rate_est, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt) { - gen_kill_estimator(bstats, rate_est); - return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt); + return gen_new_estimator(bstats, cpu_bstats, rate_est, + stats_lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); /** * gen_estimator_active - test if estimator is currently in use - * @bstats: basic statistics - * @rate_est: rate estimator statistics + * @rate_est: rate estimator * * Returns true if estimator is active, and false if not. */ -bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est64 *rate_est) +bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est) { - bool res; + return !!rcu_access_pointer(*rate_est); +} +EXPORT_SYMBOL(gen_estimator_active); - ASSERT_RTNL(); +bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est, + struct gnet_stats_rate_est64 *sample) +{ + struct net_rate_estimator *est; + unsigned seq; + + rcu_read_lock(); + est = rcu_dereference(*rate_est); + if (!est) { + rcu_read_unlock(); + return false; + } - spin_lock_bh(&est_tree_lock); - res = gen_find_node(bstats, rate_est) != NULL; - spin_unlock_bh(&est_tree_lock); + do { + seq = read_seqcount_begin(&est->seq); + sample->bps = est->avbps >> 8; + sample->pps = est->avpps >> 8; + } while (read_seqcount_retry(&est->seq, seq)); - return res; + rcu_read_unlock(); + return true; } -EXPORT_SYMBOL(gen_estimator_active); +EXPORT_SYMBOL(gen_estimator_read); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 508e051304fb..87f28557b329 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); /** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle - * @b: basic statistics - * @r: rate estimator statistics + * @rate_est: rate estimator * * Appends the rate estimator statistics to the top level TLV created by * gnet_stats_start_copy(). 
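A minimal usage sketch of the reworked estimator API above (hypothetical caller: every *_example identifier is invented for illustration; only the gen_replace_estimator()/gen_estimator_read()/gen_kill_estimator() signatures are taken from the diff):

/* Hypothetical consumer of the rebuilt rate estimator. Sketch only. */
#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <net/netlink.h>
#include <net/gen_stats.h>

struct sched_unit_example {
	struct gnet_stats_basic_packed	bstats;
	struct net_rate_estimator __rcu	*rate_est;
	spinlock_t			stats_lock;
	seqcount_t			running;
};

static int attach_estimator_example(struct sched_unit_example *u,
				    struct nlattr *opt)
{
	spin_lock_init(&u->stats_lock);
	seqcount_init(&u->running);
	/* Installs a fresh estimator; a previously attached one is
	 * replaced and freed after an RCU grace period.
	 */
	return gen_replace_estimator(&u->bstats, NULL, &u->rate_est,
				     &u->stats_lock, &u->running, opt);
}

static void report_rate_example(struct sched_unit_example *u)
{
	struct gnet_stats_rate_est64 sample;

	/* Lockless, seqcount-consistent sample; false if nothing attached. */
	if (gen_estimator_read(&u->rate_est, &sample))
		pr_info("est: %llu bps, %llu pps\n",
			(unsigned long long)sample.bps,
			(unsigned long long)sample.pps);
}

static void detach_estimator_example(struct sched_unit_example *u)
{
	/* Swaps the pointer to NULL, stops the timer, kfree_rcu()s the state. */
	gen_kill_estimator(&u->rate_est);
}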
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); */ int gnet_stats_copy_rate_est(struct gnet_dump *d, - const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est64 *r) + struct net_rate_estimator __rcu **rate_est) { + struct gnet_stats_rate_est64 sample; struct gnet_stats_rate_est est; int res; - if (b && !gen_estimator_active(b, r)) + if (!gen_estimator_read(rate_est, &sample)) return 0; - - est.bps = min_t(u64, UINT_MAX, r->bps); + est.bps = min_t(u64, UINT_MAX, sample.bps); /* we have some time before reaching 2^32 packets per second */ - est.pps = r->pps; + est.pps = sample.pps; if (d->compat_tc_stats) { d->tc_stats.bps = est.bps; @@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, if (d->tail) { res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), TCA_STATS_PAD); - if (res < 0 || est.bps == r->bps) + if (res < 0 || est.bps == sample.bps) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r), - TCA_STATS_PAD); + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample, + sizeof(sample), TCA_STATS_PAD); } return 0; diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000000..71bb3e2eca08 --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,396 @@ +/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/bpf.h> +#include <net/lwtunnel.h> + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + int ret; + + /* Preempt disable is needed to protect per-cpu redirect_info between + * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and + * access to maps strictly require a rcu_read_lock() for protection, + * mixing with BH RCU lock doesn't work. + */ + preempt_disable(); + rcu_read_lock(); + bpf_compute_data_end(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + rcu_read_unlock(); + + switch (ret) { + case BPF_OK: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? 
: "<unknown>"); + ret = BPF_OK; + } else { + ret = skb_do_redirect(skb); + if (ret == 0) + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + preempt_enable(); + + return ret; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_input)) { + pr_warn_once("orig_input not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb) +{ + int hh_len = skb_dst(skb)->dev->hard_header_len; + + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. 
+ */ + ret = xmit_check_hhlen(skb); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return 
nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, +}; + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 669ecc9f884e..a5d4e866ce88 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -39,6 +39,10 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "MPLS"; case LWTUNNEL_ENCAP_ILA: return "ILA"; + case LWTUNNEL_ENCAP_SEG6: + return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: @@ -130,6 +134,19 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, } EXPORT_SYMBOL(lwtunnel_build_state); +void lwtstate_free(struct lwtunnel_state *lws) +{ + const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; + + if (ops->destroy_state) { + ops->destroy_state(lws); + kfree_rcu(lws, rcu); + } else { + kfree(lws); + } +} +EXPORT_SYMBOL(lwtstate_free); + int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; @@ -251,6 +268,41 @@ drop: } EXPORT_SYMBOL(lwtunnel_output); +int lwtunnel_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->xmit)) + ret = ops->xmit(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_xmit); + int lwtunnel_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff 
--git a/net/core/neighbour.c b/net/core/neighbour.c index cf26e04c4046..782dd8663665 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } else goto out; } else { - if (lladdr == neigh->ha && new == NUD_STALE) + if (lladdr == neigh->ha && new == NUD_STALE && + !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } @@ -2290,13 +2291,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; n != NULL; n = rcu_dereference_bh(n->next)) { - if (!net_eq(dev_net(n->dev), net)) - continue; - if (neigh_ifindex_filtered(n->dev, filter_idx)) - continue; - if (neigh_master_filtered(n->dev, filter_master_idx)) - continue; - if (idx < s_idx) + if (idx < s_idx || !net_eq(dev_net(n->dev), net)) + goto next; + if (neigh_ifindex_filtered(n->dev, filter_idx) || + neigh_master_filtered(n->dev, filter_master_idx)) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -2331,9 +2329,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { - if (pneigh_net(n) != net) - continue; - if (idx < s_idx) + if (idx < s_idx || pneigh_net(n) != net) goto next; if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 6e4f34721080..b0c04cf4851d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -950,10 +950,13 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { + struct kobject *kobj = &dev->_rx[i].kobj; + + if (!list_empty(&dev_net(dev)->exit_list)) + kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) - sysfs_remove_group(&dev->_rx[i].kobj, - dev->sysfs_rx_queue_group); - kobject_put(&dev->_rx[i].kobj); + sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + kobject_put(kobj); } return error; @@ -1021,7 +1024,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue, return sprintf(buf, "%lu", trans_timeout); } -#ifdef CONFIG_XPS static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; @@ -1033,6 +1035,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } +static ssize_t show_traffic_class(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + struct net_device *dev = queue->dev; + int index = get_netdev_queue_index(queue); + int tc = netdev_txq_to_tc(dev, index); + + if (tc < 0) + return -EINVAL; + + return sprintf(buf, "%u\n", tc); +} + +#ifdef CONFIG_XPS static ssize_t show_tx_maxrate(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) @@ -1075,6 +1092,9 @@ static struct netdev_queue_attribute queue_tx_maxrate = static struct netdev_queue_attribute queue_trans_timeout = __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); +static struct netdev_queue_attribute queue_traffic_class = + __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL); + #ifdef CONFIG_BQL /* * Byte queue limits sysfs structures and functions. 
@@ -1190,29 +1210,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, char *buf) { struct net_device *dev = queue->dev; + int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; - int i, len; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; index = get_netdev_queue_index(queue); + if (dev->num_tc) { + num_tc = dev->num_tc; + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) + return -EINVAL; + } + rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { - for_each_possible_cpu(i) { - struct xps_map *map = - rcu_dereference(dev_maps->cpu_map[i]); - if (map) { - int j; - for (j = 0; j < map->len; j++) { - if (map->queues[j] == index) { - cpumask_set_cpu(i, mask); - break; - } + for_each_possible_cpu(cpu) { + int i, tci = cpu * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->cpu_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + cpumask_set_cpu(cpu, mask); + break; } } } @@ -1260,6 +1289,7 @@ static struct netdev_queue_attribute xps_cpus_attribute = static struct attribute *netdev_queue_default_attrs[] = { &queue_trans_timeout.attr, + &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, &queue_tx_maxrate.attr, @@ -1340,6 +1370,8 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; + if (!list_empty(&dev_net(dev)->exit_list)) + queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); #endif @@ -1525,6 +1557,9 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &(ndev->dev); + if (!list_empty(&dev_net(ndev)->exit_list)) + dev_set_uevent_suppress(dev, 1); + kobject_get(&dev->kobj); remove_queue_kobjects(ndev); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 10608dd1489f..3c4bbec39713 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -37,6 +37,11 @@ struct net init_net = { }; EXPORT_SYMBOL(init_net); +static bool init_net_initialized; + +#define MIN_PERNET_OPS_ID \ + ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) + #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; @@ -44,27 +49,28 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; - size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) - ng->len = max_gen_ptrs; + ng->s.len = max_gen_ptrs; return ng; } -static int net_assign_generic(struct net *net, int id, void *data) +static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; BUG_ON(!mutex_is_locked(&net_mutex)); - BUG_ON(id == 0); + BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, lockdep_is_held(&net_mutex)); - ng = old_ng; - if (old_ng->len >= id) - goto assign; + if (old_ng->s.len > id) { + old_ng->ptr[id] = data; + return 0; + } ng = net_alloc_generic(); if (ng == NULL) @@ -81,12 +87,12 @@ static int net_assign_generic(struct net *net, int id, void *data) * the old copy for kfree after a grace period. 
*/ - memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); + memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID], + (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *)); + ng->ptr[id] = data; rcu_assign_pointer(net->gen, ng); - kfree_rcu(old_ng, rcu); -assign: - ng->ptr[id - 1] = data; + kfree_rcu(old_ng, s.rcu); return 0; } @@ -120,8 +126,7 @@ out: static void ops_free(const struct pernet_operations *ops, struct net *net) { if (ops->id && ops->size) { - int id = *ops->id; - kfree(net_generic(net, id)); + kfree(net_generic(net, *ops->id)); } } @@ -216,6 +221,8 @@ int peernet2id_alloc(struct net *net, struct net *peer) bool alloc; int id; + if (atomic_read(&net->count) == 0) + return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); alloc = atomic_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); @@ -224,7 +231,6 @@ int peernet2id_alloc(struct net *net, struct net *peer) rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } -EXPORT_SYMBOL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ int peernet2id(struct net *net, struct net *peer) @@ -236,6 +242,7 @@ int peernet2id(struct net *net, struct net *peer) spin_unlock_bh(&net->nsid_lock); return id; } +EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. @@ -307,6 +314,16 @@ out_undo: #ifdef CONFIG_NET_NS +static struct ucounts *inc_net_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES); +} + +static void dec_net_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); +} + static struct kmem_cache *net_cachep; static struct workqueue_struct *netns_wq; @@ -348,19 +365,34 @@ void net_drop_ns(void *p) struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net) { + struct ucounts *ucounts; struct net *net; int rv; if (!(flags & CLONE_NEWNET)) return get_net(old_net); + ucounts = inc_net_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + net = net_alloc(); - if (!net) + if (!net) { + dec_net_namespaces(ucounts); return ERR_PTR(-ENOMEM); + } get_user_ns(user_ns); - mutex_lock(&net_mutex); + rv = mutex_lock_killable(&net_mutex); + if (rv < 0) { + net_free(net); + dec_net_namespaces(ucounts); + put_user_ns(user_ns); + return ERR_PTR(rv); + } + + net->ucounts = ucounts; rv = setup_net(net, user_ns); if (rv == 0) { rtnl_lock(); @@ -369,6 +401,7 @@ struct net *copy_net_ns(unsigned long flags, } mutex_unlock(&net_mutex); if (rv < 0) { + dec_net_namespaces(ucounts); put_user_ns(user_ns); net_drop_ns(net); return ERR_PTR(rv); @@ -441,6 +474,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + dec_net_namespaces(net->ucounts); put_user_ns(net->user_ns); net_drop_ns(net); } @@ -528,7 +562,7 @@ static struct pernet_operations __net_initdata net_ns_ops = { .exit = net_ns_net_exit, }; -static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { +static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NONE] = { .type = NLA_UNSPEC }, [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, @@ -745,6 +779,8 @@ static int __init net_ns_init(void) if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); + init_net_initialized = true; 
+ rtnl_lock(); list_add_tail_rcu(&init_net.list, &net_namespace_list); rtnl_unlock(); @@ -806,15 +842,24 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + if (!init_net_initialized) { + list_add_tail(&ops->list, list); + return 0; + } + return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { - LIST_HEAD(net_exit_list); - list_add(&init_net.exit_list, &net_exit_list); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + if (!init_net_initialized) { + list_del(&ops->list); + } else { + LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); + } } #endif /* CONFIG_NET_NS */ @@ -828,7 +873,7 @@ static int register_pernet_operations(struct list_head *list, if (ops->id) { again: - error = ida_get_new_above(&net_generic_ids, 1, ops->id); + error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id); if (error < 0) { if (error == -EAGAIN) { ida_pre_get(&net_generic_ids, GFP_KERNEL); @@ -836,7 +881,7 @@ again: } return error; } - max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); + max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); } error = __register_pernet_operations(list, ops); if (error) { @@ -991,11 +1036,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *netns_owner(struct ns_common *ns) +{ + return to_net_ns(ns)->user_ns; +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .owner = netns_owner, }; #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 53599bd0c82d..9424673009c1 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -171,12 +171,12 @@ static void poll_one_napi(struct napi_struct *napi) static void poll_napi(struct net_device *dev) { struct napi_struct *napi; + int cpu = smp_processor_id(); list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (napi->poll_owner != smp_processor_id() && - spin_trylock(&napi->poll_lock)) { + if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); - spin_unlock(&napi->poll_lock); + smp_store_release(&napi->poll_owner, -1); } } } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bbd118b19aef..8e69ce472236 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -216,8 +216,8 @@ #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ -#define if_lock(t) spin_lock(&(t->if_lock)); -#define if_unlock(t) spin_unlock(&(t->if_lock)); +#define if_lock(t) mutex_lock(&(t->if_lock)); +#define if_unlock(t) mutex_unlock(&(t->if_lock)); /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 @@ -413,7 +413,7 @@ struct pktgen_hdr { }; -static int pg_net_id __read_mostly; +static unsigned int pg_net_id __read_mostly; struct pktgen_net { struct net *net; @@ -423,7 +423,7 @@ struct pktgen_net { }; struct pktgen_thread { - spinlock_t if_lock; /* for list of devices */ + struct mutex if_lock; /* for list of devices */ struct list_head if_list; /* All device here */ struct list_head th_list; struct task_struct *tsk; @@ -2010,11 +2010,13 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d { struct pktgen_thread *t; + 
mutex_lock(&pktgen_thread_lock); + list_for_each_entry(t, &pn->pktgen_threads, th_list) { struct pktgen_dev *pkt_dev; - rcu_read_lock(); - list_for_each_entry_rcu(pkt_dev, &t->if_list, list) { + if_lock(t); + list_for_each_entry(pkt_dev, &t->if_list, list) { if (pkt_dev->odev != dev) continue; @@ -2029,8 +2031,9 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d dev->name); break; } - rcu_read_unlock(); + if_unlock(t); } + mutex_unlock(&pktgen_thread_lock); } static int pktgen_device_event(struct notifier_block *unused, @@ -2286,7 +2289,7 @@ out: static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { - pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev); + pkt_dev->pkt_overhead = 0; pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2777,13 +2780,13 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, - struct pktgen_dev *pkt_dev, - unsigned int extralen) + struct pktgen_dev *pkt_dev) { + unsigned int extralen = LL_RESERVED_SPACE(dev); struct sk_buff *skb = NULL; - unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + - pkt_dev->pkt_overhead; + unsigned int size; + size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead; if (pkt_dev->flags & F_NODE) { int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id(); @@ -2796,8 +2799,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } + /* the caller pre-fetches from skb->data and reserves for the mac hdr */ if (likely(skb)) - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, extralen - 16); return skb; } @@ -2830,16 +2834,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - datalen = (odev->hard_header_len + 16) & ~0xf; - - skb = pktgen_alloc_skb(odev, pkt_dev, datalen); + skb = pktgen_alloc_skb(odev, pkt_dev); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } prefetchw(skb->data); - skb_reserve(skb, datalen); + skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ eth = (__u8 *) skb_push(skb, 14); @@ -2959,7 +2961,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - skb = pktgen_alloc_skb(odev, pkt_dev, 16); + skb = pktgen_alloc_skb(odev, pkt_dev); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; @@ -3763,7 +3765,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) return -ENOMEM; } - spin_lock_init(&t->if_lock); + mutex_init(&t->if_lock); t->cpu = cpu; INIT_LIST_HEAD(&t->if_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 189cc78c77eb..c482491a63d8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -275,6 +275,7 @@ int rtnl_unregister(int protocol, int msgtype) rtnl_msg_handlers[protocol][msgindex].doit = NULL; rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + rtnl_msg_handlers[protocol][msgindex].calcit = NULL; return 0; } @@ -704,6 +705,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + if (!user_features) + continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto 
nla_put_failure; @@ -837,15 +840,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, if (dev->dev.parent && dev_is_pci(dev->dev.parent) && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); - size_t size = nla_total_size(sizeof(struct nlattr)); - size += nla_total_size(num_vfs * sizeof(struct nlattr)); + size_t size = nla_total_size(0); size += num_vfs * - (nla_total_size(sizeof(struct ifla_vf_mac)) + + (nla_total_size(0) + + nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + + nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ @@ -893,7 +901,8 @@ static size_t rtnl_port_size(const struct net_device *dev, static size_t rtnl_xdp_size(const struct net_device *dev) { - size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */ + size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ + nla_total_size(1); /* XDP_ATTACHED */ if (!dev->netdev_ops->ndo_xdp) return 0; @@ -922,8 +931,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ - + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */ - + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */ + + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1109,14 +1118,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct nlattr *vfinfo) { struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; - struct nlattr *vf, *vfstats; struct ifla_vf_mac vf_mac; struct ifla_vf_info ivi; @@ -1133,11 +1143,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; + memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); + vf_mac.vf = vf_vlan.vf = + vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = @@ -1148,6 +1163,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; @@ -1156,10 +1174,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf 
= nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vf) + goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), @@ -1175,17 +1191,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) - return -EMSGSIZE; + goto nla_put_vf_failure; + vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vfstats) + goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, @@ -1197,11 +1219,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) - return -EMSGSIZE; + vf_stats.multicast, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); +nla_put_vfinfo_failure: + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) @@ -1446,6 +1476,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, @@ -1474,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, + [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) @@ -1578,7 +1610,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (link_dump_filtered(dev, master_idx, kind_ops)) - continue; + goto cont; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, @@ -1702,7 +1734,37 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, - ivv->qos); + ivv->qos, + htons(ETH_P_8021Q)); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = 
-EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < NLA_HDRLEN) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; + } + if (len == 0) + return -EINVAL; + + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } @@ -2103,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; + u32 xdp_flags = 0; err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy); @@ -2113,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } + + if (xdp[IFLA_XDP_FLAGS]) { + xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); + if (xdp_flags & ~XDP_FLAGS_MASK) { + err = -EINVAL; + goto errout; + } + } + if (xdp[IFLA_XDP_FD]) { err = dev_change_xdp_fd(dev, - nla_get_s32(xdp[IFLA_XDP_FD])); + nla_get_s32(xdp[IFLA_XDP_FD]), + xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; @@ -2676,7 +2749,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) ext_filter_mask)); } - return min_ifinfo_dump_size; + return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) @@ -2791,7 +2864,10 @@ nla_put_failure: static inline size_t rtnl_fdb_nlmsg_size(void) { - return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN); + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(ETH_ALEN) + /* NDA_LLADDR */ + nla_total_size(sizeof(u16)) + /* NDA_VLAN */ + 0; } static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, @@ -3066,7 +3142,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[0]) + if (*idx < cb->args[2]) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, @@ -3093,19 +3169,18 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx) + int *idx) { int err; netif_addr_lock_bh(dev); - err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; - nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); - cb->args[1] = err; - return idx; + return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); @@ -3118,9 +3193,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) const struct net_device_ops *cops = NULL; struct ifinfomsg *ifm = nlmsg_data(cb->nlh); struct net *net = sock_net(skb->sk); + struct hlist_head *head; int brport_idx = 0; int br_idx = 0; - int idx = 0; + int h, s_h; + int idx = 0, s_idx; + int err = 0; + int fidx = 0; if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy) == 0) { @@ -3138,49 +3217,71 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - cb->args[1] = 0; - for_each_netdev(net, dev) { - if (brport_idx && (dev->ifindex != brport_idx)) - continue; + s_h = cb->args[0]; + s_idx = cb->args[1]; - if (!br_idx) { /* user did not specify a specific bridge */ - if (dev->priv_flags & IFF_BRIDGE_PORT) { - br_dev = netdev_master_upper_dev_get(dev); - cops = br_dev->netdev_ops; - } + for (h 
= s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, head, index_hlist) { - } else { - if (dev != br_dev && - !(dev->priv_flags & IFF_BRIDGE_PORT)) + if (brport_idx && (dev->ifindex != brport_idx)) continue; - if (br_dev != netdev_master_upper_dev_get(dev) && - !(dev->priv_flags & IFF_EBRIDGE)) - continue; + if (!br_idx) { /* user did not specify a specific bridge */ + if (dev->priv_flags & IFF_BRIDGE_PORT) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + } else { + if (dev != br_dev && + !(dev->priv_flags & IFF_BRIDGE_PORT)) + continue; - cops = ops; - } + if (br_dev != netdev_master_upper_dev_get(dev) && + !(dev->priv_flags & IFF_EBRIDGE)) + continue; + cops = ops; + } - if (dev->priv_flags & IFF_BRIDGE_PORT) { - if (cops && cops->ndo_fdb_dump) - idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, - idx); - } - if (cb->args[1] == -EMSGSIZE) - break; + if (idx < s_idx) + goto cont; - if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, - idx); - else - idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); - if (cb->args[1] == -EMSGSIZE) - break; + if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, + br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + goto out; + } + } - cops = NULL; + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, + dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, + &fidx); + if (err == -EMSGSIZE) + goto out; + + cops = NULL; + + /* reset fdb offset to 0 for rest of the interfaces */ + cb->args[2] = 0; + fidx = 0; +cont: + idx++; + } } - cb->args[0] = idx; +out: + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = fidx; + return skb->len; } @@ -3550,6 +3651,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } +#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) +static int rtnl_get_offload_stats_attr_size(int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return sizeof(struct rtnl_link_stats64); + } + + return 0; +} + +static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, + int *prividx) +{ + struct nlattr *attr = NULL; + int attr_id, size; + void *attr_data; + int err; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return -ENODATA; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (attr_id < *prividx) + continue; + + size = rtnl_get_offload_stats_attr_size(attr_id); + if (!size) + continue; + + if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) + continue; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, + attr_data); + if (err) + goto get_offload_stats_failure; + } + + if (!attr) + return -ENODATA; + + *prividx = 0; + return 0; + +nla_put_failure: + err = -EMSGSIZE; +get_offload_stats_failure: + *prividx = attr_id; + return err; +} + +static int rtnl_get_offload_stats_size(const struct net_device *dev) +{ + int nla_size = 0; + int attr_id; + int size; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return 0; + + for 
(attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) + continue; + size = rtnl_get_offload_stats_attr_size(attr_id); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, @@ -3559,6 +3745,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; + int err; ASSERT_RTNL(); @@ -3587,8 +3774,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS); @@ -3612,8 +3797,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS_SLAVE); @@ -3628,6 +3811,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) + goto nla_put_failure; + + err = rtnl_get_offload_stats(skb, dev, prividx); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3642,10 +3843,6 @@ nla_put_failure: return -EMSGSIZE; } -static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = { - [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) }, -}; - static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { @@ -3685,6 +3882,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) + size += rtnl_get_offload_stats_size(dev); + return size; } diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index fd3ce461fbe6..88a8e429fc3e 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -12,6 +12,7 @@ #include <net/secure_seq.h> #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#include <net/tcp.h> #define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; @@ -40,8 +41,8 @@ static u32 seq_scale(u32 seq) #endif #if IS_ENABLED(CONFIG_IPV6) -__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport) +u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport, u32 *tsoff) { u32 secret[MD5_MESSAGE_BYTES / 4]; u32 hash[MD5_DIGEST_WORDS]; @@ -58,6 +59,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, md5_transform(hash, secret); + *tsoff = sysctl_tcp_timestamps == 1 ? 
hash[1] : 0; return seq_scale(hash[0]); } EXPORT_SYMBOL(secure_tcpv6_sequence_number); @@ -86,8 +88,8 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #ifdef CONFIG_INET -__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport, u32 *tsoff) { u32 hash[MD5_DIGEST_WORDS]; @@ -99,6 +101,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, md5_transform(hash, net_secret); + *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; return seq_scale(hash[0]); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..65a74e13c45b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -354,7 +354,7 @@ EXPORT_SYMBOL(build_skb); struct napi_alloc_cache { struct page_frag_cache page; - size_t skb_count; + unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -1962,37 +1962,13 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return false; } -ssize_t skb_socket_splice(struct sock *sk, - struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) -{ - int ret; - - /* Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, spd); - lock_sock(sk); - - return ret; -} - /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags, - ssize_t (*splice_cb)(struct sock *, - struct pipe_inode_info *, - struct splice_pipe_desc *)) + unsigned int flags) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -2009,7 +1985,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); if (spd.nr_pages) - ret = splice_cb(sk, pipe, &spd); + ret = splice_to_pipe(pipe, &spd); return ret; } @@ -2445,6 +2421,25 @@ void skb_queue_purge(struct sk_buff_head *list) EXPORT_SYMBOL(skb_queue_purge); /** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). 
+ */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + +/** * skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue @@ -2661,7 +2656,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) struct skb_frag_struct *fragfrom, *fragto; BUG_ON(shiftlen > skb->len); - BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + if (skb_headlen(skb)) + return 0; todo = shiftlen; from = 0; @@ -3078,11 +3075,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); - /* GSO partial only requires that we trim off any excess that - * doesn't fit into an MSS sized block, so take care of that - * now. - */ - if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { + if (sg && csum && (mss != GSO_BY_FRAGS)) { + if (!(features & NETIF_F_GSO_PARTIAL)) { + struct sk_buff *iter; + + if (!list_skb || + !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) + goto normal; + + /* Split the buffer at the frag_list pointer. + * This is based on the assumption that all + * buffers in the chain excluding the last + * containing the same amount of data. + */ + skb_walk_frags(head_skb, iter) { + if (skb_headlen(iter)) + goto normal; + + len -= iter->len; + } + } + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ partial_segs = len / mss; if (partial_segs > 1) mss *= partial_segs; @@ -3090,6 +3107,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, partial_segs = 0; } +normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3281,21 +3299,29 @@ perform_csum_check: */ segs->prev = tail; - /* Update GSO info on first skb in partial sequence. */ if (partial_segs) { + struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; + unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ - type |= SKB_GSO_PARTIAL; + type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. 
*/ - skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; - skb_shinfo(segs)->gso_segs = partial_segs; - skb_shinfo(segs)->gso_type = type; - SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; + for (iter = segs; iter; iter = iter->next) { + skb_shinfo(iter)->gso_size = gso_size; + skb_shinfo(iter)->gso_segs = partial_segs; + skb_shinfo(iter)->gso_type = type; + SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; + } + + if (tail->len - doffset <= gso_size) + skb_shinfo(tail)->gso_size = 0; + else if (tail != segs) + skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* Following permits correct backpressure, for protocols @@ -3688,21 +3714,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); +static bool is_icmp_err_skb(const struct sk_buff *skb) +{ + return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); +} + struct sk_buff *sock_dequeue_err_skb(struct sock *sk) { struct sk_buff_head *q = &sk->sk_error_queue; - struct sk_buff *skb, *skb_next; + struct sk_buff *skb, *skb_next = NULL; + bool icmp_next = false; unsigned long flags; - int err = 0; spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); if (skb && (skb_next = skb_peek(q))) - err = SKB_EXT_ERR(skb_next)->ee.ee_errno; + icmp_next = is_icmp_err_skb(skb_next); spin_unlock_irqrestore(&q->lock, flags); - sk->sk_err = err; - if (err) + if (is_icmp_err_skb(skb) && !icmp_next) + sk->sk_err = 0; + + if (skb_next) sk->sk_error_report(sk); return skb; @@ -3814,10 +3848,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!skb_may_tx_timestamp(sk, tsonly)) return; - if (tsonly) - skb = alloc_skb(0, GFP_ATOMIC); - else + if (tsonly) { +#ifdef CONFIG_INET + if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + sk->sk_protocol == IPPROTO_TCP && + sk->sk_type == SOCK_STREAM) + skb = tcp_get_timestamping_opt_stats(sk); + else +#endif + skb = alloc_skb(0, GFP_ATOMIC); + } else { skb = skb_clone(orig_skb, GFP_ATOMIC); + } if (!skb) return; @@ -4474,17 +4516,24 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); -/* remove VLAN header from packet and update csum accordingly. */ -static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; - unsigned int offset = skb->data - skb_mac_header(skb); + int offset = skb->data - skb_mac_header(skb); int err; - __skb_push(skb, offset); + if (WARN_ONCE(offset, + "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); if (unlikely(err)) - goto pull; + return err; skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); @@ -4501,12 +4550,14 @@ static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); -pull: - __skb_pull(skb, offset); return err; } +EXPORT_SYMBOL(__skb_vlan_pop); +/* Pop a vlan tag either from hwaccel or from payload. + * Expects skb->data at mac header. 
+ */ int skb_vlan_pop(struct sk_buff *skb) { u16 vlan_tci; @@ -4516,9 +4567,7 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4526,9 +4575,7 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; @@ -4541,29 +4588,30 @@ int skb_vlan_pop(struct sk_buff *skb) } EXPORT_SYMBOL(skb_vlan_pop); +/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). + * Expects skb->data at mac header. + */ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { if (skb_vlan_tag_present(skb)) { - unsigned int offset = skb->data - skb_mac_header(skb); + int offset = skb->data - skb_mac_header(skb); int err; - /* __vlan_insert_tag expect skb->data pointing to mac header. - * So change skb->data before calling it and change back to - * original position later - */ - __skb_push(skb, offset); + if (WARN_ONCE(offset, + "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); - if (err) { - __skb_pull(skb, offset); + if (err) return err; - } skb->protocol = skb->vlan_proto; skb->mac_len += VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); - __skb_pull(skb, offset); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; @@ -4883,3 +4931,35 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, return clone; } EXPORT_SYMBOL(pskb_extract); + +/** + * skb_condense - try to get rid of fragments/frag_list if possible + * @skb: buffer + * + * Can be used to save memory before skb is added to a busy queue. + * If packet has bytes in frags and enough tail room in skb->head, + * pull all of them, so that we can free the frags right now and adjust + * truesize. + * Notes: + * We do not reallocate skb->head thus can not fail. + * Caller must re-evaluate skb->truesize if needed. + */ +void skb_condense(struct sk_buff *skb) +{ + if (skb->data_len) { + if (skb->data_len > skb->end - skb->tail || + skb_cloned(skb)) + return; + + /* Nice, we can free page frag(s) right now */ + __pskb_pull_tail(skb, skb->data_len); + } + /* At this point, skb->truesize might be over estimated, + * because skb had a fragment, and fragments do not tell + * their truesize. + * When we pulled its content into skb->head, fragment + * was freed, but __pskb_pull_tail() could not possibly + * adjust skb->truesize, not knowing the frag truesize. 
+ */ + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); +} diff --git a/net/core/sock.c b/net/core/sock.c index fd7b41edf1ce..9fa46b956bdc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -453,7 +453,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(sock_queue_rcv_skb); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, - const int nested, unsigned int trim_cap) + const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; @@ -487,7 +487,8 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, bh_unlock_sock(sk); out: - sock_put(sk); + if (refcounted) + sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); @@ -714,7 +715,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, val = min_t(u32, val, sysctl_wmem_max); set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); /* Wake up sending tasks if we upped the value. */ sk->sk_write_space(sk); break; @@ -750,7 +751,7 @@ set_rcvbuf: * returning the value we actually used in getsockopt * is the most desirable behavior. */ - sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF); + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_RCVBUFFORCE: @@ -853,6 +854,13 @@ set_rcvbuf: sk->sk_tskey = 0; } } + + if (val & SOF_TIMESTAMPING_OPT_STATS && + !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { + ret = -EINVAL; + break; + } + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, @@ -1315,24 +1323,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #endif } -void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) -{ - unsigned long nulls1, nulls2; - - nulls1 = offsetof(struct sock, __sk_common.skc_node.next); - nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); - if (nulls1 > nulls2) - swap(nulls1, nulls2); - - if (nulls1 != 0) - memset((char *)sk, 0, nulls1); - memset((char *)sk + nulls1 + sizeof(void *), 0, - nulls2 - nulls1 - sizeof(void *)); - memset((char *)sk + nulls2 + sizeof(void *), 0, - size - nulls2 - sizeof(void *)); -} -EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); - static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { @@ -1344,12 +1334,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; - if (priority & __GFP_ZERO) { - if (prot->clear_sk) - prot->clear_sk(sk, prot->obj_size); - else - sk_prot_clear_nulls(sk, prot->obj_size); - } + if (priority & __GFP_ZERO) + sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); @@ -1385,6 +1371,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) slab = prot->slab; cgroup_sk_free(&sk->sk_cgrp_data); + mem_cgroup_sk_free(sk); security_sk_free(sk); if (slab != NULL) kmem_cache_free(slab, sk); @@ -1421,6 +1408,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); + mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); @@ -1563,10 +1551,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); newsk->sk_err = 0; + newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); 
atomic64_set(&newsk->sk_cookie, 0); + mem_cgroup_sk_alloc(newsk); cgroup_sk_alloc(&newsk->sk_cgrp_data); /* @@ -1591,9 +1581,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - sock_update_memcg(newsk); - if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); @@ -2100,37 +2087,31 @@ void __sk_flush_backlog(struct sock *sk) */ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc; - DEFINE_WAIT(wait); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); + rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); return rc; } EXPORT_SYMBOL(sk_wait_data); /** - * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * __sk_mem_raise_allocated - increase memory_allocated * @sk: socket * @size: memory size to allocate + * @amt: pages to allocate * @kind: allocation type * - * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means - * rmem allocation. This function assumes that protocols which have - * memory_pressure use sk_wmem_queued as write buffer accounting. + * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc */ -int __sk_mem_schedule(struct sock *sk, int size, int kind) +int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct proto *prot = sk->sk_prot; - int amt = sk_mem_pages(size); - long allocated; - - sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - - allocated = sk_memory_allocated_add(sk, amt); + long allocated = sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg && !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) @@ -2191,9 +2172,6 @@ suppress_allocation: trace_sock_exceed_buf_limit(sk, prot, allocated); - /* Alas. Undo changes. */ - sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - sk_memory_allocated_sub(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) @@ -2201,18 +2179,40 @@ suppress_allocation: return 0; } +EXPORT_SYMBOL(__sk_mem_raise_allocated); + +/** + * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * @sk: socket + * @size: memory size to allocate + * @kind: allocation type + * + * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means + * rmem allocation. This function assumes that protocols which have + * memory_pressure use sk_wmem_queued as write buffer accounting. 
+ */ +int __sk_mem_schedule(struct sock *sk, int size, int kind) +{ + int ret, amt = sk_mem_pages(size); + + sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; + ret = __sk_mem_raise_allocated(sk, size, amt, kind); + if (!ret) + sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; + return ret; +} EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_mem_reclaim - reclaim memory_allocated + * __sk_mem_reduce_allocated - reclaim memory_allocated * @sk: socket - * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) + * @amount: number of quanta + * + * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc */ -void __sk_mem_reclaim(struct sock *sk, int amount) +void __sk_mem_reduce_allocated(struct sock *sk, int amount) { - amount >>= SK_MEM_QUANTUM_SHIFT; sk_memory_allocated_sub(sk, amount); - sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); @@ -2221,6 +2221,19 @@ void __sk_mem_reclaim(struct sock *sk, int amount) (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } +EXPORT_SYMBOL(__sk_mem_reduce_allocated); + +/** + * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated + * @sk: socket + * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) + */ +void __sk_mem_reclaim(struct sock *sk, int amount) +{ + amount >>= SK_MEM_QUANTUM_SHIFT; + sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + __sk_mem_reduce_allocated(sk, amount); +} EXPORT_SYMBOL(__sk_mem_reclaim); int sk_set_peek_off(struct sock *sk, int val) @@ -2456,8 +2469,11 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_type = sock->type; sk->sk_wq = sock->wq; sock->sk = sk; - } else + sk->sk_uid = SOCK_INODE(sock)->i_uid; + } else { sk->sk_wq = NULL; + sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); + } rwlock_init(&sk->sk_callback_lock); lockdep_set_class_and_name(&sk->sk_callback_lock, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index e92b759d906c..9a1a352fd1eb 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -129,7 +129,6 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2) return 0; } -EXPORT_SYMBOL(reuseport_add_sock); static void reuseport_free_rcu(struct rcu_head *head) { diff --git a/net/core/stream.c b/net/core/stream.c index 159516a11b7e..f575bcf64af2 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -43,7 +43,6 @@ void sk_stream_write_space(struct sock *sk) rcu_read_unlock(); } } -EXPORT_SYMBOL(sk_stream_write_space); /** * sk_stream_wait_connect - Wait for a socket to get into the connected state @@ -54,8 +53,8 @@ EXPORT_SYMBOL(sk_stream_write_space); */ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); struct task_struct *tsk = current; - DEFINE_WAIT(wait); int done; do { @@ -69,13 +68,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) if (signal_pending(tsk)) return sock_intr_errno(*timeo_p); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, !sk->sk_err && !((1 << sk->sk_state) & - ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); - finish_wait(sk_sleep(sk), &wait); + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); + remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); return 0; @@ -95,16 +94,16 @@ static inline int sk_stream_closing(struct sock *sk) 
void sk_stream_wait_close(struct sock *sk, long timeout) { if (timeout) { - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); do { - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); - if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) + if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait)) break; } while (!signal_pending(current) && timeout); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); } } EXPORT_SYMBOL(sk_stream_wait_close); @@ -120,16 +119,16 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) long vm_wait = 0; long current_timeo = *timeo_p; bool noblock = (*timeo_p ? false : true); - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; + add_wait_queue(sk_sleep(sk), &wait); + while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; if (!*timeo_p) { @@ -148,7 +147,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) sk_wait_event(sk, ¤t_timeo, sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || (sk_stream_memory_free(sk) && - !vm_wait)); + !vm_wait), &wait); sk->sk_write_pending--; if (vm_wait) { @@ -162,7 +161,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0df2aa652530..2a46e4009f62 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); - if (sock_table) + if (sock_table) { static_key_slow_inc(&rps_needed); + static_key_slow_inc(&rfs_needed); + } if (orig_sock_table) { static_key_slow_dec(&rps_needed); + static_key_slow_dec(&rfs_needed); synchronize_rcu(); vfree(orig_sock_table); } |
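
Note (not part of the diff above): the stream.c and sock.c hunks replace the open-coded prepare_to_wait()/finish_wait() loops with DEFINE_WAIT_FUNC(wait, woken_wake_function), and sk_wait_event() now takes the wait-queue entry as an extra argument. The sketch below is purely illustrative; the function name and the wait condition are invented, but the calling convention follows the converted sites shown above (add_wait_queue(), sk_wait_event(..., &wait), remove_wait_queue()).

	#include <net/sock.h>

	/* Hypothetical example, not from the patch: a protocol-side loop
	 * waiting for receive-queue data using the woken_wake_function
	 * style that this series converts stream.c and sock.c to.
	 */
	static int example_wait_for_data(struct sock *sk, long *timeo)
	{
		DEFINE_WAIT_FUNC(wait, woken_wake_function);
		int rc;

		add_wait_queue(sk_sleep(sk), &wait);
		/* Sleeps until the condition becomes true, a signal arrives
		 * or the timeout elapses; the wait entry stays queued for
		 * the whole sleep instead of being re-armed each iteration.
		 */
		rc = sk_wait_event(sk, timeo,
				   !skb_queue_empty(&sk->sk_receive_queue),
				   &wait);
		remove_wait_queue(sk_sleep(sk), &wait);

		return rc;
	}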