summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile1
-rw-r--r--net/core/datagram.c72
-rw-r--r--net/core/dev.c1018
-rw-r--r--net/core/devlink.c100
-rw-r--r--net/core/drop_monitor.c23
-rw-r--r--net/core/ethtool.c98
-rw-r--r--net/core/fib_rules.c78
-rw-r--r--net/core/filter.c848
-rw-r--r--net/core/flow.c66
-rw-r--r--net/core/flow_dissector.c206
-rw-r--r--net/core/gen_estimator.c294
-rw-r--r--net/core/gen_stats.c20
-rw-r--r--net/core/lwt_bpf.c396
-rw-r--r--net/core/lwtunnel.c52
-rw-r--r--net/core/neighbour.c18
-rw-r--r--net/core/net-sysfs.c65
-rw-r--r--net/core/net_namespace.c97
-rw-r--r--net/core/netpoll.c6
-rw-r--r--net/core/pktgen.c40
-rw-r--r--net/core/rtnetlink.c344
-rw-r--r--net/core/secure_seq.c11
-rw-r--r--net/core/skbuff.c220
-rw-r--r--net/core/sock.c126
-rw-r--r--net/core/sock_reuseport.c1
-rw-r--r--net/core/stream.c29
-rw-r--r--net/core/sysctl_net_core.c5
26 files changed, 2992 insertions, 1242 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2ddca5..f6761b6e3b29 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b7de71f8d5d3..9482037a5c8c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -165,6 +165,7 @@ done:
* __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
+ * @destructor: invoked under the receive lock on successful dequeue
* @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
@@ -197,6 +198,8 @@ done:
* the standard around please.
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
int *peeked, int *off, int *err,
struct sk_buff **last)
{
@@ -211,6 +214,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
+ *peeked = 0;
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -224,26 +228,28 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
spin_lock_irqsave(&queue->lock, cpu_flags);
skb_queue_walk(queue, skb) {
*last = skb;
- *peeked = skb->peeked;
if (flags & MSG_PEEK) {
if (_off >= skb->len && (skb->len || _off ||
skb->peeked)) {
_off -= skb->len;
continue;
}
-
- skb = skb_set_peeked(skb);
- error = PTR_ERR(skb);
- if (IS_ERR(skb)) {
- spin_unlock_irqrestore(&queue->lock,
- cpu_flags);
- goto no_packet;
+ if (!skb->len) {
+ skb = skb_set_peeked(skb);
+ if (IS_ERR(skb)) {
+ error = PTR_ERR(skb);
+ spin_unlock_irqrestore(&queue->lock,
+ cpu_flags);
+ goto no_packet;
+ }
}
-
+ *peeked = 1;
atomic_inc(&skb->users);
- } else
+ } else {
__skb_unlink(skb, queue);
-
+ if (destructor)
+ destructor(sk, skb);
+ }
spin_unlock_irqrestore(&queue->lock, cpu_flags);
*off = _off;
return skb;
@@ -262,6 +268,8 @@ no_packet:
EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb),
int *peeked, int *off, int *err)
{
struct sk_buff *skb, *last;
@@ -270,8 +278,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
- &last);
+ skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
+ off, err, &last);
if (skb)
return skb;
@@ -290,7 +298,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int peeked, off = 0;
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- &peeked, &off, err);
+ NULL, &peeked, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
@@ -323,6 +331,27 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
}
EXPORT_SYMBOL(__skb_free_datagram_locked);
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags)
+{
+ int err = 0;
+
+ if (flags & MSG_PEEK) {
+ err = -ENOENT;
+ spin_lock_bh(&sk->sk_receive_queue.lock);
+ if (skb == skb_peek(&sk->sk_receive_queue)) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ atomic_dec(&skb->users);
+ err = 0;
+ }
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
+ }
+
+ atomic_inc(&sk->sk_drops);
+ return err;
+}
+EXPORT_SYMBOL(__sk_queue_drop_skb);
+
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
@@ -346,23 +375,10 @@ EXPORT_SYMBOL(__skb_free_datagram_locked);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
- int err = 0;
-
- if (flags & MSG_PEEK) {
- err = -ENOENT;
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (skb == skb_peek(&sk->sk_receive_queue)) {
- __skb_unlink(skb, &sk->sk_receive_queue);
- atomic_dec(&skb->users);
- err = 0;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- }
+ int err = __sk_queue_drop_skb(sk, skb, flags);
kfree_skb(skb);
- atomic_inc(&sk->sk_drops);
sk_mem_reclaim_partial(sk);
-
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
index ea6312057a71..6372117f653f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -139,7 +139,6 @@
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
-#include <linux/sctp.h>
#include <linux/crash_dump.h>
#include "net-sysfs.h"
@@ -1766,19 +1765,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
- if (skb_orphan_frags(skb, GFP_ATOMIC) ||
- unlikely(!is_skb_forwardable(dev, skb))) {
- atomic_long_inc(&dev->rx_dropped);
- kfree_skb(skb);
- return NET_RX_DROP;
- }
+ int ret = ____dev_forward_skb(dev, skb);
- skb_scrub_packet(skb, true);
- skb->priority = 0;
- skb->protocol = eth_type_trans(skb, dev);
- skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ if (likely(!ret)) {
+ skb->protocol = eth_type_trans(skb, dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+ }
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
@@ -1949,37 +1943,80 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
}
}
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
+{
+ if (dev->num_tc) {
+ struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+ int i;
+
+ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
+ if ((txq - tc->offset) < tc->count)
+ return i;
+ }
+
+ return -1;
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
-static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
- int cpu, u16 index)
+static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
if (dev_maps)
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ if (!map)
+ return false;
- for (pos = 0; map && pos < map->len; pos++) {
- if (map->queues[pos] == index) {
- if (map->len > 1) {
- map->queues[pos] = map->queues[--map->len];
- } else {
- RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
- kfree_rcu(map, rcu);
- map = NULL;
- }
+ for (pos = map->len; pos--;) {
+ if (map->queues[pos] != index)
+ continue;
+
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
break;
}
+
+ RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+ kfree_rcu(map, rcu);
+ return false;
}
- return map;
+ return true;
}
-static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+static bool remove_xps_queue_cpu(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ int cpu, u16 offset, u16 count)
+{
+ int num_tc = dev->num_tc ? : 1;
+ bool active = false;
+ int tci;
+
+ for (tci = cpu * num_tc; num_tc--; tci++) {
+ int i, j;
+
+ for (i = count, j = offset; i--; j++) {
+ if (!remove_xps_queue(dev_maps, cpu, j))
+ break;
+ }
+
+ active |= i < 0;
+ }
+
+ return active;
+}
+
+static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
+ u16 count)
{
struct xps_dev_maps *dev_maps;
int cpu, i;
@@ -1991,21 +2028,16 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
if (!dev_maps)
goto out_no_maps;
- for_each_possible_cpu(cpu) {
- for (i = index; i < dev->num_tx_queues; i++) {
- if (!remove_xps_queue(dev_maps, cpu, i))
- break;
- }
- if (i == dev->num_tx_queues)
- active = true;
- }
+ for_each_possible_cpu(cpu)
+ active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
+ offset, count);
if (!active) {
RCU_INIT_POINTER(dev->xps_maps, NULL);
kfree_rcu(dev_maps, rcu);
}
- for (i = index; i < dev->num_tx_queues; i++)
+ for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
NUMA_NO_NODE);
@@ -2013,6 +2045,11 @@ out_no_maps:
mutex_unlock(&xps_map_mutex);
}
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
+}
+
static struct xps_map *expand_xps_map(struct xps_map *map,
int cpu, u16 index)
{
@@ -2052,20 +2089,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index)
{
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ int i, cpu, tci, numa_node_id = -2;
+ int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
- int cpu, numa_node_id = -2;
bool active = false;
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
+ maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
+ if (maps_sz < L1_CACHE_BYTES)
+ maps_sz = L1_CACHE_BYTES;
+
mutex_lock(&xps_map_mutex);
dev_maps = xmap_dereference(dev->xps_maps);
/* allocate memory for queue storage */
- for_each_online_cpu(cpu) {
- if (!cpumask_test_cpu(cpu, mask))
- continue;
-
+ for_each_cpu_and(cpu, cpu_online_mask, mask) {
if (!new_dev_maps)
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
@@ -2073,25 +2118,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return -ENOMEM;
}
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ tci = cpu * num_tc + tc;
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
NULL;
map = expand_xps_map(map, cpu, index);
if (!map)
goto error;
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
for_each_possible_cpu(cpu) {
+ /* copy maps belonging to foreign traffic classes */
+ for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ }
+
+ /* We need to explicitly update tci as prevous loop
+ * could break out early if dev_maps is NULL.
+ */
+ tci = cpu * num_tc + tc;
+
if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
/* add queue to CPU maps */
int pos = 0;
- map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(new_dev_maps->cpu_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
@@ -2105,26 +2163,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
#endif
} else if (dev_maps) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
}
+ /* copy maps belonging to foreign traffic classes */
+ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ }
}
rcu_assign_pointer(dev->xps_maps, new_dev_maps);
/* Cleanup old maps */
- if (dev_maps) {
- for_each_possible_cpu(cpu) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
- map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ if (!dev_maps)
+ goto out_no_old_maps;
+
+ for_each_possible_cpu(cpu) {
+ for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = xmap_dereference(dev_maps->cpu_map[tci]);
if (map && map != new_map)
kfree_rcu(map, rcu);
}
-
- kfree_rcu(dev_maps, rcu);
}
+ kfree_rcu(dev_maps, rcu);
+
+out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
@@ -2139,11 +2207,12 @@ out_no_new_maps:
/* removes queue from unused CPUs */
for_each_possible_cpu(cpu) {
- if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
- continue;
-
- if (remove_xps_queue(dev_maps, cpu, index))
- active = true;
+ for (i = tc, tci = cpu * num_tc; i--; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
+ if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+ active |= remove_xps_queue(dev_maps, tci, index);
+ for (i = num_tc - tc, tci++; --i; tci++)
+ active |= remove_xps_queue(dev_maps, tci, index);
}
/* free map if not active */
@@ -2159,11 +2228,14 @@ out_no_maps:
error:
/* remove any maps that we added */
for_each_possible_cpu(cpu) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
- NULL;
- if (new_map && new_map != map)
- kfree(new_map);
+ for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = dev_maps ?
+ xmap_dereference(dev_maps->cpu_map[tci]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
}
mutex_unlock(&xps_map_mutex);
@@ -2174,6 +2246,44 @@ error:
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
+void netdev_reset_tc(struct net_device *dev)
+{
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ dev->num_tc = 0;
+ memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+ memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+EXPORT_SYMBOL(netdev_reset_tc);
+
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+ if (tc >= dev->num_tc)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues(dev, offset, count);
+#endif
+ dev->tc_to_txq[tc].count = count;
+ dev->tc_to_txq[tc].offset = offset;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_tc_queue);
+
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+ if (num_tc > TC_MAX_QUEUE)
+ return -EINVAL;
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ dev->num_tc = num_tc;
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_num_tc);
+
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -2484,7 +2594,7 @@ int skb_checksum_help(struct sk_buff *skb)
goto out;
}
- *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+ *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
skb->ip_summed = CHECKSUM_NONE;
out:
@@ -2492,141 +2602,6 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-/* skb_csum_offload_check - Driver helper function to determine if a device
- * with limited checksum offload capabilities is able to offload the checksum
- * for a given packet.
- *
- * Arguments:
- * skb - sk_buff for the packet in question
- * spec - contains the description of what device can offload
- * csum_encapped - returns true if the checksum being offloaded is
- * encpasulated. That is it is checksum for the transport header
- * in the inner headers.
- * checksum_help - when set indicates that helper function should
- * call skb_checksum_help if offload checks fail
- *
- * Returns:
- * true: Packet has passed the checksum checks and should be offloadable to
- * the device (a driver may still need to check for additional
- * restrictions of its device)
- * false: Checksum is not offloadable. If checksum_help was set then
- * skb_checksum_help was called to resolve checksum for non-GSO
- * packets and when IP protocol is not SCTP
- */
-bool __skb_csum_offload_chk(struct sk_buff *skb,
- const struct skb_csum_offl_spec *spec,
- bool *csum_encapped,
- bool csum_help)
-{
- struct iphdr *iph;
- struct ipv6hdr *ipv6;
- void *nhdr;
- int protocol;
- u8 ip_proto;
-
- if (skb->protocol == htons(ETH_P_8021Q) ||
- skb->protocol == htons(ETH_P_8021AD)) {
- if (!spec->vlan_okay)
- goto need_help;
- }
-
- /* We check whether the checksum refers to a transport layer checksum in
- * the outermost header or an encapsulated transport layer checksum that
- * corresponds to the inner headers of the skb. If the checksum is for
- * something else in the packet we need help.
- */
- if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
- /* Non-encapsulated checksum */
- protocol = eproto_to_ipproto(vlan_get_protocol(skb));
- nhdr = skb_network_header(skb);
- *csum_encapped = false;
- if (spec->no_not_encapped)
- goto need_help;
- } else if (skb->encapsulation && spec->encap_okay &&
- skb_checksum_start_offset(skb) ==
- skb_inner_transport_offset(skb)) {
- /* Encapsulated checksum */
- *csum_encapped = true;
- switch (skb->inner_protocol_type) {
- case ENCAP_TYPE_ETHER:
- protocol = eproto_to_ipproto(skb->inner_protocol);
- break;
- case ENCAP_TYPE_IPPROTO:
- protocol = skb->inner_protocol;
- break;
- }
- nhdr = skb_inner_network_header(skb);
- } else {
- goto need_help;
- }
-
- switch (protocol) {
- case IPPROTO_IP:
- if (!spec->ipv4_okay)
- goto need_help;
- iph = nhdr;
- ip_proto = iph->protocol;
- if (iph->ihl != 5 && !spec->ip_options_okay)
- goto need_help;
- break;
- case IPPROTO_IPV6:
- if (!spec->ipv6_okay)
- goto need_help;
- if (spec->no_encapped_ipv6 && *csum_encapped)
- goto need_help;
- ipv6 = nhdr;
- nhdr += sizeof(*ipv6);
- ip_proto = ipv6->nexthdr;
- break;
- default:
- goto need_help;
- }
-
-ip_proto_again:
- switch (ip_proto) {
- case IPPROTO_TCP:
- if (!spec->tcp_okay ||
- skb->csum_offset != offsetof(struct tcphdr, check))
- goto need_help;
- break;
- case IPPROTO_UDP:
- if (!spec->udp_okay ||
- skb->csum_offset != offsetof(struct udphdr, check))
- goto need_help;
- break;
- case IPPROTO_SCTP:
- if (!spec->sctp_okay ||
- skb->csum_offset != offsetof(struct sctphdr, checksum))
- goto cant_help;
- break;
- case NEXTHDR_HOP:
- case NEXTHDR_ROUTING:
- case NEXTHDR_DEST: {
- u8 *opthdr = nhdr;
-
- if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
- goto need_help;
-
- ip_proto = opthdr[0];
- nhdr += (opthdr[1] + 1) << 3;
-
- goto ip_proto_again;
- }
- default:
- goto need_help;
- }
-
- /* Passed the tests for offloading checksum */
- return true;
-
-need_help:
- if (csum_help && !skb_shinfo(skb)->gso_size)
- skb_checksum_help(skb);
-cant_help:
- return false;
-}
-EXPORT_SYMBOL(__skb_csum_offload_chk);
-
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
@@ -3035,6 +3010,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
}
return head;
}
+EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
@@ -3220,8 +3196,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
- map = rcu_dereference(
- dev_maps->cpu_map[skb->sender_cpu - 1]);
+ unsigned int tci = skb->sender_cpu - 1;
+
+ if (dev->num_tc) {
+ tci *= dev->num_tc;
+ tci += netdev_get_prio_tc_map(dev, skb->priority);
+ }
+
+ map = rcu_dereference(dev_maps->cpu_map[tci]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
@@ -3355,16 +3337,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
else
skb_dst_force(skb);
-#ifdef CONFIG_NET_SWITCHDEV
- /* Don't forward if offload device already forwarded */
- if (skb->offload_fwd_mark &&
- skb->offload_fwd_mark == dev->offload_fwd_mark) {
- consume_skb(skb);
- rc = NET_XMIT_SUCCESS;
- goto out;
- }
-#endif
-
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@@ -3475,6 +3447,8 @@ EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
+struct static_key rfs_needed __read_mostly;
+EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
@@ -3855,7 +3829,7 @@ int netif_rx_ni(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_rx_ni);
-static void net_tx_action(struct softirq_action *h)
+static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -3914,8 +3888,7 @@ static void net_tx_action(struct softirq_action *h)
}
}
-#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
- (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
unsigned char *addr) __read_mostly;
@@ -4066,12 +4039,17 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
{
#ifdef CONFIG_NETFILTER_INGRESS
if (nf_hook_ingress_active(skb)) {
+ int ingress_retval;
+
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
- return nf_hook_ingress(skb);
+ rcu_read_lock();
+ ingress_retval = nf_hook_ingress(skb);
+ rcu_read_unlock();
+ return ingress_retval;
}
#endif /* CONFIG_NETFILTER_INGRESS */
return 0;
@@ -4308,32 +4286,53 @@ int netif_receive_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_receive_skb);
-/* Network device is going away, flush any packets still pending
- * Called with irqs disabled.
- */
-static void flush_backlog(void *arg)
+DEFINE_PER_CPU(struct work_struct, flush_works);
+
+/* Network device is going away, flush any packets still pending */
+static void flush_backlog(struct work_struct *work)
{
- struct net_device *dev = arg;
- struct softnet_data *sd = this_cpu_ptr(&softnet_data);
struct sk_buff *skb, *tmp;
+ struct softnet_data *sd;
+
+ local_bh_disable();
+ sd = this_cpu_ptr(&softnet_data);
+ local_irq_disable();
rps_lock(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- if (skb->dev == dev) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
rps_unlock(sd);
+ local_irq_enable();
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev == dev) {
+ if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
+ local_bh_enable();
+}
+
+static void flush_all_backlogs(void)
+{
+ unsigned int cpu;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(&flush_works, cpu));
+
+ put_online_cpus();
}
static int napi_gro_complete(struct sk_buff *skb)
@@ -4480,7 +4479,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
- if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
+ if (skb->csum_bad)
goto normal;
gro_list_prepare(napi, skb);
@@ -4493,9 +4492,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
skb_set_network_header(skb, skb_gro_offset(skb));
skb_reset_mac_len(skb);
NAPI_GRO_CB(skb)->same_flow = 0;
- NAPI_GRO_CB(skb)->flush = 0;
+ NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
+ NAPI_GRO_CB(skb)->recursion_counter = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
@@ -4821,8 +4821,9 @@ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
static int process_backlog(struct napi_struct *napi, int quota)
{
- int work = 0;
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+ bool again = true;
+ int work = 0;
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
@@ -4833,23 +4834,20 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
napi->weight = weight_p;
- local_irq_disable();
- while (1) {
+ while (again) {
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sd->process_queue))) {
rcu_read_lock();
- local_irq_enable();
__netif_receive_skb(skb);
rcu_read_unlock();
- local_irq_disable();
input_queue_head_incr(sd);
- if (++work >= quota) {
- local_irq_enable();
+ if (++work >= quota)
return work;
- }
+
}
+ local_irq_disable();
rps_lock(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
@@ -4861,16 +4859,14 @@ static int process_backlog(struct napi_struct *napi, int quota)
* and we dont need an smp_mb() memory barrier.
*/
napi->state = 0;
- rps_unlock(sd);
-
- break;
+ again = false;
+ } else {
+ skb_queue_splice_tail_init(&sd->input_pkt_queue,
+ &sd->process_queue);
}
-
- skb_queue_splice_tail_init(&sd->input_pkt_queue,
- &sd->process_queue);
rps_unlock(sd);
+ local_irq_enable();
}
- local_irq_enable();
return work;
}
@@ -4904,26 +4900,36 @@ void __napi_schedule_irqoff(struct napi_struct *n)
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
-void __napi_complete(struct napi_struct *n)
+bool __napi_complete(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+ /* Some drivers call us directly, instead of calling
+ * napi_complete_done().
+ */
+ if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
+ return false;
+
list_del_init(&n->poll_list);
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
+ return true;
}
EXPORT_SYMBOL(__napi_complete);
-void napi_complete_done(struct napi_struct *n, int work_done)
+bool napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags;
/*
- * don't let napi dequeue from the cpu poll list
- * just in case its running on a different cpu
+ * 1) Don't let napi dequeue from the cpu poll list
+ * just in case its running on a different cpu.
+ * 2) If we are busy polling, do nothing here, we have
+ * the guarantee we will be called later.
*/
- if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
- return;
+ if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+ NAPIF_STATE_IN_BUSY_POLL)))
+ return false;
if (n->gro_list) {
unsigned long timeout = 0;
@@ -4945,6 +4951,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
__napi_complete(n);
local_irq_restore(flags);
}
+ return true;
}
EXPORT_SYMBOL(napi_complete_done);
@@ -4962,13 +4969,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
+
#define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+ int rc;
+
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+
+ /* All we really want here is to re-enable device interrupts.
+ * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+ */
+ rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ netpoll_poll_unlock(have_poll_lock);
+ if (rc == BUSY_POLL_BUDGET)
+ __napi_schedule(napi);
+ local_bh_enable();
+ if (local_softirq_pending())
+ do_softirq();
+}
+
bool sk_busy_loop(struct sock *sk, int nonblock)
{
unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+ int (*napi_poll)(struct napi_struct *napi, int budget);
int (*busy_poll)(struct napi_struct *dev);
+ void *have_poll_lock = NULL;
struct napi_struct *napi;
- int rc = false;
+ int rc;
+
+restart:
+ rc = false;
+ napi_poll = NULL;
rcu_read_lock();
@@ -4979,24 +5014,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
/* Note: ndo_busy_poll method is optional in linux-4.5 */
busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
- do {
+ preempt_disable();
+ for (;;) {
rc = 0;
local_bh_disable();
if (busy_poll) {
rc = busy_poll(napi);
- } else if (napi_schedule_prep(napi)) {
- void *have = netpoll_poll_lock(napi);
-
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
- if (rc == BUSY_POLL_BUDGET) {
- napi_complete_done(napi, rc);
- napi_schedule(napi);
- }
- }
- netpoll_poll_unlock(have);
+ goto count;
+ }
+ if (!napi_poll) {
+ unsigned long val = READ_ONCE(napi->state);
+
+ /* If multiple threads are competing for this napi,
+ * we avoid dirtying napi->state as much as we can.
+ */
+ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+ NAPIF_STATE_IN_BUSY_POLL))
+ goto count;
+ if (cmpxchg(&napi->state, val,
+ val | NAPIF_STATE_IN_BUSY_POLL |
+ NAPIF_STATE_SCHED) != val)
+ goto count;
+ have_poll_lock = netpoll_poll_lock(napi);
+ napi_poll = napi->poll;
}
+ rc = napi_poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
if (rc > 0)
__NET_ADD_STATS(sock_net(sk),
LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -5005,10 +5049,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
if (rc == LL_FLUSH_FAILED)
break; /* permanent failure */
- cpu_relax();
- } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
- !need_resched() && !busy_loop_timeout(end_time));
+ if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+ busy_loop_timeout(end_time))
+ break;
+ if (unlikely(need_resched())) {
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
+ rcu_read_unlock();
+ cond_resched();
+ rc = !skb_queue_empty(&sk->sk_receive_queue);
+ if (rc || busy_loop_timeout(end_time))
+ return rc;
+ goto restart;
+ }
+ cpu_relax();
+ }
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
rcu_read_unlock();
@@ -5018,7 +5078,7 @@ EXPORT_SYMBOL(sk_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
-void napi_hash_add(struct napi_struct *napi)
+static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
@@ -5038,7 +5098,6 @@ void napi_hash_add(struct napi_struct *napi)
spin_unlock(&napi_hash_lock);
}
-EXPORT_SYMBOL_GPL(napi_hash_add);
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
@@ -5086,7 +5145,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
#ifdef CONFIG_NETPOLL
- spin_lock_init(&napi->poll_lock);
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
@@ -5187,7 +5245,7 @@ out_unlock:
return work;
}
-static void net_rx_action(struct softirq_action *h)
+static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies + 2;
@@ -5204,7 +5262,7 @@ static void net_rx_action(struct softirq_action *h)
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- return;
+ goto out;
break;
}
@@ -5222,7 +5280,6 @@ static void net_rx_action(struct softirq_action *h)
}
}
- __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
@@ -5232,6 +5289,8 @@ static void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
+out:
+ __kfree_skb_flush();
}
struct netdev_adjacent {
@@ -5262,6 +5321,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
return NULL;
}
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+ struct net_device *dev = data;
+
+ return upper_dev == dev;
+}
+
/**
* netdev_has_upper_dev - Check if device is linked to an upper device
* @dev: device
@@ -5276,11 +5342,30 @@ bool netdev_has_upper_dev(struct net_device *dev,
{
ASSERT_RTNL();
- return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
+ return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
+ * netdev_has_upper_dev_all - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks the entire upper device chain.
+ * The caller must hold rcu lock.
+ */
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ upper_dev);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
+
+/**
* netdev_has_any_upper_dev - Check if device is linked to some device
* @dev: device
*
@@ -5291,7 +5376,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
- return !list_empty(&dev->all_adj_list.upper);
+ return !list_empty(&dev->adj_list.upper);
}
/**
@@ -5318,6 +5403,20 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+/**
+ * netdev_has_any_lower_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to a lower device and return true in case
+ * it is. The caller must hold the RTNL lock.
+ */
+static bool netdev_has_any_lower_dev(struct net_device *dev)
+{
+ ASSERT_RTNL();
+
+ return !list_empty(&dev->adj_list.lower);
+}
+
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
struct netdev_adjacent *adj;
@@ -5354,16 +5453,8 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
-/**
- * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next device from the dev's upper list, starting from iter
- * position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *upper;
@@ -5371,14 +5462,41 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
- if (&upper->list == &dev->all_adj_list.upper)
+ if (&upper->list == &dev->adj_list.upper)
return NULL;
*iter = &upper->list;
return upper->dev;
}
-EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
+
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *udev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.upper,
+ udev = netdev_next_upper_dev_rcu(dev, &iter);
+ udev;
+ udev = netdev_next_upper_dev_rcu(dev, &iter)) {
+ /* first is the upper device itself */
+ ret = fn(udev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its upper devices */
+ ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
/**
* netdev_lower_get_next_private - Get the next ->private from the
@@ -5461,51 +5579,90 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
}
EXPORT_SYMBOL(netdev_lower_get_next);
-/**
- * netdev_all_lower_get_next - Get the next device from all lower neighbour list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RTNL lock or
- * its own locking that guarantees that the neighbour all lower
- * list will remain unchanged.
- */
-struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
+static struct net_device *netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
- lower = list_entry(*iter, struct netdev_adjacent, list);
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
- if (&lower->list == &dev->all_adj_list.lower)
+ if (&lower->list == &dev->adj_list.lower)
return NULL;
- *iter = lower->list.next;
+ *iter = &lower->list;
return lower->dev;
}
-EXPORT_SYMBOL(netdev_all_lower_get_next);
-/**
- * netdev_all_lower_get_next_rcu - Get the next device from all
- * lower neighbour list, RCU variant
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
- struct list_head **iter)
+int netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev(dev, &iter)) {
+ /* first is the lower device itself */
+ ret = fn(ldev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its lower devices */
+ ret = netdev_walk_all_lower_dev(ldev, fn, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
+
+static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
- lower = list_first_or_null_rcu(&dev->all_adj_list.lower,
- struct netdev_adjacent, list);
+ lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+
+ return lower->dev;
+}
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev;
+ struct list_head *iter;
+ int ret;
+
+ for (iter = &dev->adj_list.lower,
+ ldev = netdev_next_lower_dev_rcu(dev, &iter);
+ ldev;
+ ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
+ /* first is the lower device itself */
+ ret = fn(ldev, data);
+ if (ret)
+ return ret;
+
+ /* then look at all of its lower devices */
+ ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
+ if (ret)
+ return ret;
+ }
- return lower ? lower->dev : NULL;
+ return 0;
}
-EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
@@ -5587,7 +5744,10 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
- adj->ref_nr++;
+ adj->ref_nr += 1;
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
+ dev->name, adj_dev->name, adj->ref_nr);
+
return 0;
}
@@ -5601,8 +5761,8 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->private = private;
dev_hold(adj_dev);
- pr_debug("dev_hold for %s, because of link added from %s to %s\n",
- adj_dev->name, dev->name, adj_dev->name);
+ pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
+ dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
@@ -5636,22 +5796,28 @@ free_adj:
static void __netdev_adjacent_dev_remove(struct net_device *dev,
struct net_device *adj_dev,
+ u16 ref_nr,
struct list_head *dev_list)
{
struct netdev_adjacent *adj;
+ pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
+ dev->name, adj_dev->name, ref_nr);
+
adj = __netdev_find_adj(adj_dev, dev_list);
if (!adj) {
- pr_err("tried to remove device %s from %s\n",
+ pr_err("Adjacency does not exist for device %s from %s\n",
dev->name, adj_dev->name);
- BUG();
+ WARN_ON(1);
+ return;
}
- if (adj->ref_nr > 1) {
- pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
- adj->ref_nr-1);
- adj->ref_nr--;
+ if (adj->ref_nr > ref_nr) {
+ pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
+ dev->name, adj_dev->name, ref_nr,
+ adj->ref_nr - ref_nr);
+ adj->ref_nr -= ref_nr;
return;
}
@@ -5662,7 +5828,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
- pr_debug("dev_put for %s, because link removed from %s to %s\n",
+ pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
dev_put(adj_dev);
kfree_rcu(adj, rcu);
@@ -5676,73 +5842,45 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
{
int ret;
- ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
- master);
+ ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
+ private, master);
if (ret)
return ret;
- ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
- false);
+ ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
+ private, false);
if (ret) {
- __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
+ __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
return ret;
}
return 0;
}
-static int __netdev_adjacent_dev_link(struct net_device *dev,
- struct net_device *upper_dev)
-{
- return __netdev_adjacent_dev_link_lists(dev, upper_dev,
- &dev->all_adj_list.upper,
- &upper_dev->all_adj_list.lower,
- NULL, false);
-}
-
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
struct net_device *upper_dev,
+ u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list)
{
- __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
- __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
-}
-
-static void __netdev_adjacent_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev)
-{
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
- &dev->all_adj_list.upper,
- &upper_dev->all_adj_list.lower);
+ __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
+ __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
struct net_device *upper_dev,
void *private, bool master)
{
- int ret = __netdev_adjacent_dev_link(dev, upper_dev);
-
- if (ret)
- return ret;
-
- ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
- &dev->adj_list.upper,
- &upper_dev->adj_list.lower,
- private, master);
- if (ret) {
- __netdev_adjacent_dev_unlink(dev, upper_dev);
- return ret;
- }
-
- return 0;
+ return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+ &dev->adj_list.upper,
+ &upper_dev->adj_list.lower,
+ private, master);
}
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
struct net_device *upper_dev)
{
- __netdev_adjacent_dev_unlink(dev, upper_dev);
- __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
+ __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower);
}
@@ -5752,7 +5890,6 @@ static int __netdev_upper_dev_link(struct net_device *dev,
void *upper_priv, void *upper_info)
{
struct netdev_notifier_changeupper_info changeupper_info;
- struct netdev_adjacent *i, *j, *to_i, *to_j;
int ret = 0;
ASSERT_RTNL();
@@ -5761,10 +5898,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
+ if (netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
- if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
+ if (netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
@@ -5786,80 +5923,15 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
return ret;
- /* Now that we linked these devs, make all the upper_dev's
- * all_adj_list.upper visible to every dev's all_adj_list.lower an
- * versa, and don't forget the devices itself. All of these
- * links are non-neighbours.
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
- pr_debug("Interlinking %s with %s, non-neighbour\n",
- i->dev->name, j->dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, j->dev);
- if (ret)
- goto rollback_mesh;
- }
- }
-
- /* add dev to every upper_dev's upper device */
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
- pr_debug("linking %s's upper device %s with %s\n",
- upper_dev->name, i->dev->name, dev->name);
- ret = __netdev_adjacent_dev_link(dev, i->dev);
- if (ret)
- goto rollback_upper_mesh;
- }
-
- /* add upper_dev to every dev's lower device */
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- pr_debug("linking %s's lower device %s with %s\n", dev->name,
- i->dev->name, upper_dev->name);
- ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
- if (ret)
- goto rollback_lower_mesh;
- }
-
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
ret = notifier_to_errno(ret);
if (ret)
- goto rollback_lower_mesh;
+ goto rollback;
return 0;
-rollback_lower_mesh:
- to_i = i;
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- if (i == to_i)
- break;
- __netdev_adjacent_dev_unlink(i->dev, upper_dev);
- }
-
- i = NULL;
-
-rollback_upper_mesh:
- to_i = i;
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
- if (i == to_i)
- break;
- __netdev_adjacent_dev_unlink(dev, i->dev);
- }
-
- i = j = NULL;
-
-rollback_mesh:
- to_i = i;
- to_j = j;
- list_for_each_entry(i, &dev->all_adj_list.lower, list) {
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
- if (i == to_i && j == to_j)
- break;
- __netdev_adjacent_dev_unlink(i->dev, j->dev);
- }
- if (i == to_i)
- break;
- }
-
+rollback:
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
return ret;
@@ -5916,7 +5988,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_notifier_changeupper_info changeupper_info;
- struct netdev_adjacent *i, *j;
ASSERT_RTNL();
changeupper_info.upper_dev = upper_dev;
@@ -5928,23 +5999,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
- /* Here is the tricky part. We must remove all dev's lower
- * devices from all upper_dev's upper devices and vice
- * versa, to maintain the graph relationship.
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list)
- list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(i->dev, j->dev);
-
- /* remove also the devices itself from lower/upper device
- * list
- */
- list_for_each_entry(i, &dev->all_adj_list.lower, list)
- __netdev_adjacent_dev_unlink(i->dev, upper_dev);
-
- list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
- __netdev_adjacent_dev_unlink(dev, i->dev);
-
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
}
@@ -6482,9 +6536,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
if (new_mtu == dev->mtu)
return 0;
- /* MTU must be positive. */
- if (new_mtu < 0)
+ /* MTU must be positive, and in range */
+ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
+ net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
+ dev->name, new_mtu, dev->min_mtu);
return -EINVAL;
+ }
+
+ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
+ net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
+ dev->name, new_mtu, dev->max_mtu);
+ return -EINVAL;
+ }
if (!netif_device_present(dev))
return -ENODEV;
@@ -6631,26 +6694,42 @@ EXPORT_SYMBOL(dev_change_proto_down);
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
* @fd: new program fd or negative value to clear
+ * @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
-int dev_change_xdp_fd(struct net_device *dev, int fd)
+int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
- struct netdev_xdp xdp = {};
+ struct netdev_xdp xdp;
int err;
+ ASSERT_RTNL();
+
if (!ops->ndo_xdp)
return -EOPNOTSUPP;
if (fd >= 0) {
+ if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+
+ err = ops->ndo_xdp(dev, &xdp);
+ if (err < 0)
+ return err;
+ if (xdp.prog_attached)
+ return -EBUSY;
+ }
+
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
+ memset(&xdp, 0, sizeof(xdp));
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
+
err = ops->ndo_xdp(dev, &xdp);
if (err < 0 && prog)
bpf_prog_put(prog);
@@ -6723,8 +6802,8 @@ static void rollback_registered_many(struct list_head *head)
unlist_netdevice(dev);
dev->reg_state = NETREG_UNREGISTERING;
- on_each_cpu(flush_backlog, dev, 1);
}
+ flush_all_backlogs();
synchronize_net();
@@ -6759,6 +6838,7 @@ static void rollback_registered_many(struct list_head *head)
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
@@ -7637,16 +7717,17 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
- INIT_LIST_HEAD(&dev->all_adj_list.upper);
- INIT_LIST_HEAD(&dev->all_adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
+#ifdef CONFIG_NET_SCHED
+ hash_init(dev->qdisc_hash);
+#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
- dev->tx_queue_len = 1;
+ dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
@@ -7927,18 +8008,13 @@ out:
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
-static int dev_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *ocpu)
+static int dev_cpu_dead(unsigned int oldcpu)
{
struct sk_buff **list_skb;
struct sk_buff *skb;
- unsigned int cpu, oldcpu = (unsigned long)ocpu;
+ unsigned int cpu;
struct softnet_data *sd, *oldsd;
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
-
local_irq_disable();
cpu = smp_processor_id();
sd = &per_cpu(softnet_data, cpu);
@@ -7988,10 +8064,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
input_queue_head_incr(oldsd);
}
- return NOTIFY_OK;
+ return 0;
}
-
/**
* netdev_increment_features - increment feature set by one
* @all: current feature set
@@ -8286,8 +8361,11 @@ static int __init net_dev_init(void)
*/
for_each_possible_cpu(i) {
+ struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
+ INIT_WORK(flush, flush_backlog);
+
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
INIT_LIST_HEAD(&sd->poll_list);
@@ -8322,7 +8400,9 @@ static int __init net_dev_init(void)
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
- hotcpu_notifier(dev_cpu_callback, 0);
+ rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
+ NULL, dev_cpu_dead);
+ WARN_ON(rc < 0);
dst_subsys_init();
rc = 0;
out:
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1b5063088f1a..2b5bf9efa720 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -341,15 +341,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
mutex_unlock(&devlink_mutex);
}
-static struct genl_family devlink_nl_family = {
- .id = GENL_ID_GENERATE,
- .name = DEVLINK_GENL_NAME,
- .version = DEVLINK_GENL_VERSION,
- .maxattr = DEVLINK_ATTR_MAX,
- .netnsok = true,
- .pre_doit = devlink_nl_pre_doit,
- .post_doit = devlink_nl_post_doit,
-};
+static struct genl_family devlink_nl_family;
enum devlink_multicast_groups {
DEVLINK_MCGRP_CONFIG,
@@ -608,6 +600,8 @@ static int devlink_port_type_set(struct devlink *devlink,
if (devlink->ops && devlink->ops->port_type_set) {
if (port_type == DEVLINK_PORT_TYPE_NOTSET)
return -EINVAL;
+ if (port_type == devlink_port->type)
+ return 0;
err = devlink->ops->port_type_set(devlink_port, port_type);
if (err)
return err;
@@ -1400,26 +1394,45 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
- u32 seq, int flags, u16 mode)
+ u32 seq, int flags)
{
+ const struct devlink_ops *ops = devlink->ops;
void *hdr;
+ int err = 0;
+ u16 mode;
+ u8 inline_mode;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto out;
- if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
- goto nla_put_failure;
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ goto out;
+ err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+ if (err)
+ goto out;
+
+ if (ops->eswitch_inline_mode_get) {
+ err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+ if (err)
+ goto out;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+ inline_mode);
+ if (err)
+ goto out;
+ }
genlmsg_end(msg, hdr);
return 0;
-nla_put_failure:
+out:
genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ return err;
}
static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
@@ -1428,22 +1441,17 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
struct sk_buff *msg;
- u16 mode;
int err;
if (!ops || !ops->eswitch_mode_get)
return -EOPNOTSUPP;
- err = ops->eswitch_mode_get(devlink, &mode);
- if (err)
- return err;
-
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
- info->snd_portid, info->snd_seq, 0, mode);
+ info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
@@ -1459,15 +1467,32 @@ static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
u16 mode;
+ u8 inline_mode;
+ int err = 0;
- if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
- return -EINVAL;
+ if (!ops)
+ return -EOPNOTSUPP;
- mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+ if (!ops->eswitch_mode_set)
+ return -EOPNOTSUPP;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = ops->eswitch_mode_set(devlink, mode);
+ if (err)
+ return err;
+ }
- if (ops && ops->eswitch_mode_set)
- return ops->eswitch_mode_set(devlink, mode);
- return -EOPNOTSUPP;
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
+ if (!ops->eswitch_inline_mode_set)
+ return -EOPNOTSUPP;
+ inline_mode = nla_get_u8(
+ info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
@@ -1484,6 +1509,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
[DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -1618,6 +1644,20 @@ static const struct genl_ops devlink_nl_ops[] = {
},
};
+static struct genl_family devlink_nl_family __ro_after_init = {
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .maxattr = DEVLINK_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = devlink_nl_pre_doit,
+ .post_doit = devlink_nl_post_doit,
+ .module = THIS_MODULE,
+ .ops = devlink_nl_ops,
+ .n_ops = ARRAY_SIZE(devlink_nl_ops),
+ .mcgrps = devlink_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
+};
+
/**
* devlink_alloc - Allocate new devlink instance resources
*
@@ -1840,9 +1880,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
static int __init devlink_module_init(void)
{
- return genl_register_family_with_ops_groups(&devlink_nl_family,
- devlink_nl_ops,
- devlink_nl_mcgrps);
+ return genl_register_family(&devlink_nl_family);
}
static void __exit devlink_module_exit(void)
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index d6b3b579560d..8e0c0635ee97 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -59,12 +59,7 @@ struct dm_hw_stat_delta {
unsigned long last_drop_val;
};
-static struct genl_family net_drop_monitor_family = {
- .id = GENL_ID_GENERATE,
- .hdrsize = 0,
- .name = "NET_DM",
- .version = 2,
-};
+static struct genl_family net_drop_monitor_family;
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
@@ -105,7 +100,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
return skb;
}
-static struct genl_multicast_group dropmon_mcgrps[] = {
+static const struct genl_multicast_group dropmon_mcgrps[] = {
{ .name = "events", },
};
@@ -351,6 +346,17 @@ static const struct genl_ops dropmon_ops[] = {
},
};
+static struct genl_family net_drop_monitor_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 2,
+ .module = THIS_MODULE,
+ .ops = dropmon_ops,
+ .n_ops = ARRAY_SIZE(dropmon_ops),
+ .mcgrps = dropmon_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
+};
+
static struct notifier_block dropmon_net_notifier = {
.notifier_call = dropmon_net_event
};
@@ -367,8 +373,7 @@ static int __init init_net_drop_monitor(void)
return -ENOSPC;
}
- rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
- dropmon_ops, dropmon_mcgrps);
+ rc = genl_register_family(&net_drop_monitor_family);
if (rc) {
pr_err("Could not create drop monitor netlink family\n");
return rc;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 977489820eb9..e23766c7e3ba 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -119,6 +119,12 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
[ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
};
+static const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+};
+
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -227,6 +233,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_TUNABLES)
return ARRAY_SIZE(tunable_strings);
+ if (sset == ETH_SS_PHY_TUNABLES)
+ return ARRAY_SIZE(phy_tunable_strings);
+
if (sset == ETH_SS_PHY_STATS) {
if (dev->phydev)
return phy_get_sset_count(dev->phydev);
@@ -253,6 +262,8 @@ static void __ethtool_get_strings(struct net_device *dev,
sizeof(rss_hash_func_strings));
else if (stringset == ETH_SS_TUNABLES)
memcpy(data, tunable_strings, sizeof(tunable_strings));
+ else if (stringset == ETH_SS_PHY_TUNABLES)
+ memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
else if (stringset == ETH_SS_PHY_STATS) {
struct phy_device *phydev = dev->phydev;
@@ -2422,6 +2433,85 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
};
}
+static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
+{
+ switch (tuna->id) {
+ case ETHTOOL_PHY_DOWNSHIFT:
+ if (tuna->len != sizeof(u8) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U8)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->get_tunable))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ if (ret)
+ goto out;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, data, tuna.len))
+ goto out;
+ ret = 0;
+
+out:
+ kfree(data);
+ return ret;
+}
+
+static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+ int ret;
+ struct ethtool_tunable tuna;
+ struct phy_device *phydev = dev->phydev;
+ void *data;
+
+ if (!(phydev && phydev->drv && phydev->drv->set_tunable))
+ return -EOPNOTSUPP;
+ if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+ return -EFAULT;
+ ret = ethtool_phy_tunable_valid(&tuna);
+ if (ret)
+ return ret;
+ data = kmalloc(tuna.len, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+ useraddr += sizeof(tuna);
+ ret = -EFAULT;
+ if (copy_from_user(data, useraddr, tuna.len))
+ goto out;
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+
+out:
+ kfree(data);
+ return ret;
+}
+
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -2479,6 +2569,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GET_TS_INFO:
case ETHTOOL_GEEE:
case ETHTOOL_GTUNABLE:
+ case ETHTOOL_PHY_GTUNABLE:
+ case ETHTOOL_GLINKSETTINGS:
break;
default:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -2684,6 +2776,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SLINKSETTINGS:
rc = ethtool_set_link_ksettings(dev, useraddr);
break;
+ case ETHTOOL_PHY_GTUNABLE:
+ rc = get_phy_tunable(dev, useraddr);
+ break;
+ case ETHTOOL_PHY_STUNABLE:
+ rc = set_phy_tunable(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c344a6..b6791d94841d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,11 @@
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
+static const struct fib_kuid_range fib_kuid_range_unset = {
+ KUIDT_INIT(0),
+ KUIDT_INIT(~0),
+};
+
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
@@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->table = table;
r->flags = flags;
r->fr_net = ops->fro_net;
+ r->uid_range = fib_kuid_range_unset;
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
@@ -172,6 +178,34 @@ void fib_rules_unregister(struct fib_rules_ops *ops)
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
+static int uid_range_set(struct fib_kuid_range *range)
+{
+ return uid_valid(range->start) && uid_valid(range->end);
+}
+
+static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
+{
+ struct fib_rule_uid_range *in;
+ struct fib_kuid_range out;
+
+ in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);
+
+ out.start = make_kuid(current_user_ns(), in->start);
+ out.end = make_kuid(current_user_ns(), in->end);
+
+ return out;
+}
+
+static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
+{
+ struct fib_rule_uid_range out = {
+ from_kuid_munged(current_user_ns(), range->start),
+ from_kuid_munged(current_user_ns(), range->end)
+ };
+
+ return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
@@ -193,6 +227,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
+ if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
+ uid_gt(fl->flowi_uid, rule->uid_range.end))
+ goto out;
+
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -305,6 +343,10 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->l3mdev != rule->l3mdev)
continue;
+ if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+ !uid_eq(r->uid_range.end, rule->uid_range.end))
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -429,6 +471,21 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (rule->l3mdev && rule->table)
goto errout_free;
+ if (tb[FRA_UID_RANGE]) {
+ if (current_user_ns() != net->user_ns) {
+ err = -EPERM;
+ goto errout_free;
+ }
+
+ rule->uid_range = nla_get_kuid_range(tb);
+
+ if (!uid_range_set(&rule->uid_range) ||
+ !uid_lte(rule->uid_range.start, rule->uid_range.end))
+ goto errout_free;
+ } else {
+ rule->uid_range = fib_kuid_range_unset;
+ }
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -497,6 +554,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
+ struct fib_kuid_range range;
int err = -EINVAL;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
@@ -516,6 +574,14 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
goto errout;
+ if (tb[FRA_UID_RANGE]) {
+ range = nla_get_kuid_range(tb);
+ if (!uid_range_set(&range))
+ goto errout;
+ } else {
+ range = fib_kuid_range_unset;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
if (frh->action && (frh->action != rule->action))
continue;
@@ -552,6 +618,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh)
(rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
continue;
+ if (uid_range_set(&range) &&
+ (!uid_eq(rule->uid_range.start, range.start) ||
+ !uid_eq(rule->uid_range.end, range.end)))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -619,7 +690,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
- + nla_total_size_64bit(8); /* FRA_TUN_ID */
+ + nla_total_size_64bit(8) /* FRA_TUN_ID */
+ + nla_total_size(sizeof(struct fib_kuid_range));
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -679,7 +751,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->tun_id &&
nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) ||
(rule->l3mdev &&
- nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)))
+ nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
+ (uid_range_set(&rule->uid_range) &&
+ nla_put_uid_range(skb, &rule->uid_range)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index cb06aceb512a..b1461708a977 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -30,6 +30,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
+#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -78,6 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
return -ENOMEM;
+ err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
+ if (err)
+ return err;
+
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
@@ -94,14 +99,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
}
EXPORT_SYMBOL(sk_filter_trim_cap);
-static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
{
- return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+ return skb_get_poff(skb);
}
-static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
- struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
struct nlattr *nla;
if (skb_is_nonlinear(skb))
@@ -120,9 +124,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return 0;
}
-static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
- struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
struct nlattr *nla;
if (skb_is_nonlinear(skb))
@@ -145,7 +148,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
return 0;
}
-static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+BPF_CALL_0(__get_raw_cpu_id)
{
return raw_smp_processor_id();
}
@@ -233,9 +236,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
case SKF_AD_OFF + SKF_AD_HATYPE:
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
- BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
BPF_REG_TMP, BPF_REG_CTX,
offsetof(struct sk_buff, dev));
/* if (tmp != 0) goto pc + 1 */
@@ -1350,17 +1352,26 @@ struct bpf_scratchpad {
static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
+static inline int __bpf_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ return skb_ensure_writable(skb, write_len);
+}
+
static inline int bpf_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
- int err;
+ int err = __bpf_try_make_writable(skb, write_len);
- err = skb_ensure_writable(skb, write_len);
bpf_compute_data_end(skb);
-
return err;
}
+static int bpf_try_make_head_writable(struct sk_buff *skb)
+{
+ return bpf_try_make_writable(skb, skb_headlen(skb));
+}
+
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
if (skb_at_tc_ingress(skb))
@@ -1373,12 +1384,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}
-static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
+BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
+ const void *, from, u32, len, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- unsigned int offset = (unsigned int) r2;
- void *from = (void *) (long) r3;
- unsigned int len = (unsigned int) r4;
void *ptr;
if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
@@ -1413,12 +1421,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
+ void *, to, u32, len)
{
- const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
- unsigned int offset = (unsigned int) r2;
- void *to = (void *)(unsigned long) r3;
- unsigned int len = (unsigned int) r4;
void *ptr;
if (unlikely(offset > 0xffff))
@@ -1446,10 +1451,31 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_STACK_SIZE,
};
-static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* Idea is the following: should the needed direct read/write
+ * test fail during runtime, we can pull in more data and redo
+ * again, since implicitly, we invalidate previous checks here.
+ *
+ * Or, since we know how much we need to make read/writeable,
+ * this can be done once at the program beginning for direct
+ * access case. By this we overcome limitations of only current
+ * headroom being accessible.
+ */
+ return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto bpf_skb_pull_data_proto = {
+ .func = bpf_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- unsigned int offset = (unsigned int) r2;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
@@ -1491,12 +1517,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
+BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
+ u64, from, u64, to, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
- unsigned int offset = (unsigned int) r2;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
@@ -1544,12 +1569,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
-static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
+ __be32 *, to, u32, to_size, __wsum, seed)
{
struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
- u64 diff_size = from_size + to_size;
- __be32 *from = (__be32 *) (long) r1;
- __be32 *to = (__be32 *) (long) r3;
+ u32 diff_size = from_size + to_size;
int i, j = 0;
/* This is quite flexible, some examples:
@@ -1575,6 +1599,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
static const struct bpf_func_proto bpf_csum_diff_proto = {
.func = bpf_csum_diff,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_STACK,
.arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
@@ -1583,11 +1608,44 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.arg5_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
+{
+ /* The interface is to be used in combination with bpf_csum_diff()
+ * for direct packet writes. csum rotation for alignment as well
+ * as emulating csum_sub() can be done from the eBPF program.
+ */
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ return (skb->csum = csum_add(skb->csum, csum));
+
+ return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_csum_update_proto = {
+ .func = bpf_csum_update,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
return dev_forward_skb(dev, skb);
}
+static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ int ret = ____dev_forward_skb(dev, skb);
+
+ if (likely(!ret)) {
+ skb->dev = dev;
+ ret = netif_rx(skb);
+ }
+
+ return ret;
+}
+
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret;
@@ -1607,10 +1665,55 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
return ret;
}
-static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
+static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ /* skb->mac_len is not set on normal egress */
+ unsigned int mlen = skb->network_header - skb->mac_header;
+
+ __skb_pull(skb, mlen);
+
+ /* At ingress, the mac header has already been pulled once.
+ * At egress, skb_pospull_rcsum has to be done in case that
+ * the skb is originated from ingress (i.e. a forwarded skb)
+ * to ensure that rcsum starts at net header.
+ */
+ if (!skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+ skb_pop_mac_header(skb);
+ skb_reset_mac_len(skb);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ /* Verify that a link layer header is carried */
+ if (unlikely(skb->mac_header >= skb->network_header)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
+
+ bpf_push_mac_rcsum(skb);
+ return flags & BPF_F_INGRESS ?
+ __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+}
+
+static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
+ u32 flags)
+{
+ if (dev_is_mac_header_xmit(dev))
+ return __bpf_redirect_common(skb, dev, flags);
+ else
+ return __bpf_redirect_no_mac(skb, dev, flags);
+}
+
+BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
struct net_device *dev;
+ struct sk_buff *clone;
+ int ret;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return -EINVAL;
@@ -1619,14 +1722,22 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
if (unlikely(!dev))
return -EINVAL;
- skb = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!skb))
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!clone))
return -ENOMEM;
- bpf_push_mac_rcsum(skb);
+ /* For direct write, we need to keep the invariant that the skbs
+ * we're dealing with need to be uncloned. Should uncloning fail
+ * here, we need to free the just generated clone to unclone once
+ * again.
+ */
+ ret = bpf_try_make_head_writable(skb);
+ if (unlikely(ret)) {
+ kfree_skb(clone);
+ return -ENOMEM;
+ }
- return flags & BPF_F_INGRESS ?
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+ return __bpf_redirect(clone, dev, flags);
}
static const struct bpf_func_proto bpf_clone_redirect_proto = {
@@ -1645,7 +1756,7 @@ struct redirect_info {
static DEFINE_PER_CPU(struct redirect_info, redirect_info);
-static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
@@ -1670,10 +1781,7 @@ int skb_do_redirect(struct sk_buff *skb)
return -EINVAL;
}
- bpf_push_mac_rcsum(skb);
-
- return ri->flags & BPF_F_INGRESS ?
- __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
+ return __bpf_redirect(skb, dev, ri->flags);
}
static const struct bpf_func_proto bpf_redirect_proto = {
@@ -1684,9 +1792,9 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
- return task_get_classid((struct sk_buff *) (unsigned long) r1);
+ return task_get_classid(skb);
}
static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
@@ -1696,9 +1804,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
- return dst_tclassid((struct sk_buff *) (unsigned long) r1);
+ return dst_tclassid(skb);
}
static const struct bpf_func_proto bpf_get_route_realm_proto = {
@@ -1708,14 +1816,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
/* If skb_clear_hash() was called due to mangling, we can
* trigger SW recalculation here. Later access to hash
* can then use the inline skb->hash via context directly
* instead of calling this helper again.
*/
- return skb_get_hash((struct sk_buff *) (unsigned long) r1);
+ return skb_get_hash(skb);
}
static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
@@ -1725,10 +1833,25 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
+BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
+{
+ /* After all direct packet write, this can be used once for
+ * triggering a lazy recalc on next skb_get_hash() invocation.
+ */
+ skb_clear_hash(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
+ .func = bpf_set_hash_invalid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
+ u16, vlan_tci)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- __be16 vlan_proto = (__force __be16) r2;
int ret;
if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
@@ -1753,9 +1876,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
-static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
int ret;
bpf_push_mac_rcsum(skb);
@@ -1930,10 +2052,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
return -ENOTSUPP;
}
-static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
+ u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- __be16 proto = (__force __be16) r2;
int ret;
if (unlikely(flags))
@@ -1970,14 +2091,11 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = {
.arg3_type = ARG_ANYTHING,
};
-static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u32 pkt_type = r2;
-
/* We only allow a restricted subset to be changed for now. */
- if (unlikely(skb->pkt_type > PACKET_OTHERHOST ||
- pkt_type > PACKET_OTHERHOST))
+ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
+ !skb_pkt_type_ok(pkt_type)))
return -EINVAL;
skb->pkt_type = pkt_type;
@@ -1992,19 +2110,163 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = {
.arg2_type = ARG_ANYTHING,
};
-bool bpf_helper_changes_skb_data(void *func)
+static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
- if (func == bpf_skb_vlan_push)
- return true;
- if (func == bpf_skb_vlan_pop)
- return true;
- if (func == bpf_skb_store_bytes)
- return true;
- if (func == bpf_skb_change_proto)
- return true;
- if (func == bpf_l3_csum_replace)
- return true;
- if (func == bpf_l4_csum_replace)
+ u32 min_len = skb_network_offset(skb);
+
+ if (skb_transport_header_was_set(skb))
+ min_len = skb_transport_offset(skb);
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ min_len = skb_checksum_start_offset(skb) +
+ skb->csum_offset + sizeof(__sum16);
+ return min_len;
+}
+
+static u32 __bpf_skb_max_len(const struct sk_buff *skb)
+{
+ return skb->dev->mtu + skb->dev->hard_header_len;
+}
+
+static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+ unsigned int old_len = skb->len;
+ int ret;
+
+ ret = __skb_grow_rcsum(skb, new_len);
+ if (!ret)
+ memset(skb->data + old_len, 0, new_len - old_len);
+ return ret;
+}
+
+static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+ return __skb_trim_rcsum(skb, new_len);
+}
+
+BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+ u64, flags)
+{
+ u32 max_len = __bpf_skb_max_len(skb);
+ u32 min_len = __bpf_skb_min_len(skb);
+ int ret;
+
+ if (unlikely(flags || new_len > max_len || new_len < min_len))
+ return -EINVAL;
+ if (skb->encapsulation)
+ return -ENOTSUPP;
+
+ /* The basic idea of this helper is that it's performing the
+ * needed work to either grow or trim an skb, and eBPF program
+ * rewrites the rest via helpers like bpf_skb_store_bytes(),
+ * bpf_lX_csum_replace() and others rather than passing a raw
+ * buffer here. This one is a slow path helper and intended
+ * for replies with control messages.
+ *
+ * Like in bpf_skb_change_proto(), we want to keep this rather
+ * minimal and without protocol specifics so that we are able
+ * to separate concerns as in bpf_skb_store_bytes() should only
+ * be the one responsible for writing buffers.
+ *
+ * It's really expected to be a slow path operation here for
+ * control message replies, so we're implicitly linearizing,
+ * uncloning and drop offloads from the skb by this.
+ */
+ ret = __bpf_try_make_writable(skb, skb->len);
+ if (!ret) {
+ if (new_len > skb->len)
+ ret = bpf_skb_grow_rcsum(skb, new_len);
+ else if (new_len < skb->len)
+ ret = bpf_skb_trim_rcsum(skb, new_len);
+ if (!ret && skb_is_gso(skb))
+ skb_gso_reset(skb);
+ }
+
+ bpf_compute_data_end(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_tail_proto = {
+ .func = bpf_skb_change_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ u32 max_len = __bpf_skb_max_len(skb);
+ u32 new_len = skb->len + head_room;
+ int ret;
+
+ if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+ new_len < skb->len))
+ return -EINVAL;
+
+ ret = skb_cow(skb, head_room);
+ if (likely(!ret)) {
+ /* Idea for this helper is that we currently only
+ * allow to expand on mac header. This means that
+ * skb->protocol network header, etc, stay as is.
+ * Compared to bpf_skb_change_tail(), we're more
+ * flexible due to not needing to linearize or
+ * reset GSO. Intention for this helper is to be
+ * used by an L3 skb that needs to push mac header
+ * for redirection into L2 device.
+ */
+ __skb_push(skb, head_room);
+ memset(skb->data, 0, head_room);
+ skb_reset_mac_header(skb);
+ }
+
+ bpf_compute_data_end(skb);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+ .func = bpf_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+ void *data = xdp->data + offset;
+
+ if (unlikely(data < xdp->data_hard_start ||
+ data > xdp->data_end - ETH_HLEN))
+ return -EINVAL;
+
+ xdp->data = data;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+ .func = bpf_xdp_adjust_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
+{
+ if (func == bpf_skb_vlan_push ||
+ func == bpf_skb_vlan_pop ||
+ func == bpf_skb_store_bytes ||
+ func == bpf_skb_change_proto ||
+ func == bpf_skb_change_head ||
+ func == bpf_skb_change_tail ||
+ func == bpf_skb_pull_data ||
+ func == bpf_l3_csum_replace ||
+ func == bpf_l4_csum_replace ||
+ func == bpf_xdp_adjust_head)
return true;
return false;
@@ -2023,13 +2285,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
return 0;
}
-static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
- u64 meta_size)
+BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
{
- struct sk_buff *skb = (struct sk_buff *)(long) r1;
- struct bpf_map *map = (struct bpf_map *)(long) r2;
u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
- void *meta = (void *)(long) r4;
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -2056,10 +2315,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags)
return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}
-static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
+ u32, size, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
u8 compat[sizeof(struct bpf_tunnel_key)];
void *to_orig = to;
@@ -2124,10 +2382,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u8 *to = (u8 *) (long) r2;
const struct ip_tunnel_info *info = skb_tunnel_info(skb);
int err;
@@ -2162,10 +2418,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
static struct metadata_dst __percpu *md_dst;
-static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
+BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
+ const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
struct metadata_dst *md = this_cpu_ptr(md_dst);
u8 compat[sizeof(struct bpf_tunnel_key)];
struct ip_tunnel_info *info;
@@ -2183,7 +2438,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
*/
memcpy(compat, from, size);
memset(compat + size, 0, sizeof(compat) - size);
- from = (struct bpf_tunnel_key *)compat;
+ from = (const struct bpf_tunnel_key *) compat;
break;
default:
return -EINVAL;
@@ -2233,10 +2488,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
+ const u8 *, from, u32, size)
{
- struct sk_buff *skb = (struct sk_buff *) (long) r1;
- u8 *from = (u8 *) (long) r2;
struct ip_tunnel_info *info = skb_tunnel_info(skb);
const struct metadata_dst *md = this_cpu_ptr(md_dst);
@@ -2282,28 +2536,24 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
}
}
-#ifdef CONFIG_SOCK_CGROUP_DATA
-static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
+ u32, idx)
{
- struct sk_buff *skb = (struct sk_buff *)(long)r1;
- struct bpf_map *map = (struct bpf_map *)(long)r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct cgroup *cgrp;
struct sock *sk;
- u32 i = (u32)r3;
- sk = skb->sk;
+ sk = skb_to_full_sk(skb);
if (!sk || !sk_fullsock(sk))
return -ENOENT;
-
- if (unlikely(i >= array->map.max_entries))
+ if (unlikely(idx >= array->map.max_entries))
return -E2BIG;
- cgrp = READ_ONCE(array->ptrs[i]);
+ cgrp = READ_ONCE(array->ptrs[idx]);
if (unlikely(!cgrp))
return -EAGAIN;
- return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
+ return sk_under_cgroup_hierarchy(sk, cgrp);
}
static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
@@ -2314,7 +2564,38 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
};
-#endif
+
+static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+ unsigned long off, unsigned long len)
+{
+ memcpy(dst_buff, src_buff + off, len);
+ return 0;
+}
+
+BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
+ u64, flags, void *, meta, u64, meta_size)
+{
+ u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+
+ if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+ return -EFAULT;
+
+ return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
+ bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_event_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_STACK,
+ .arg5_type = ARG_CONST_STACK_SIZE,
+};
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
@@ -2330,6 +2611,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_ktime_get_ns:
@@ -2350,8 +2633,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
case BPF_FUNC_csum_diff:
return &bpf_csum_diff_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -2368,6 +2655,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_skb_change_proto_proto;
case BPF_FUNC_skb_change_type:
return &bpf_skb_change_type_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
@@ -2382,14 +2671,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
-#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
-#endif
default:
return sk_filter_func_proto(func_id);
}
@@ -2398,10 +2687,92 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id)
{
- return sk_filter_func_proto(func_id);
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_xdp_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_xdp_adjust_head:
+ return &bpf_xdp_adjust_head_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_skb_pull_data:
+ return &bpf_skb_pull_data_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_proto;
+ case BPF_FUNC_get_route_realm:
+ return &bpf_get_route_realm_proto;
+ case BPF_FUNC_get_hash_recalc:
+ return &bpf_get_hash_recalc_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_skb_under_cgroup:
+ return &bpf_skb_under_cgroup_proto;
+ default:
+ return sk_filter_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_get_tunnel_key:
+ return &bpf_skb_get_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_key:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_redirect:
+ return &bpf_redirect_proto;
+ case BPF_FUNC_clone_redirect:
+ return &bpf_clone_redirect_proto;
+ case BPF_FUNC_skb_change_tail:
+ return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
+ case BPF_FUNC_skb_store_bytes:
+ return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_csum_update:
+ return &bpf_csum_update_proto;
+ case BPF_FUNC_l3_csum_replace:
+ return &bpf_l3_csum_replace_proto;
+ case BPF_FUNC_l4_csum_replace:
+ return &bpf_l4_csum_replace_proto;
+ case BPF_FUNC_set_hash_invalid:
+ return &bpf_set_hash_invalid_proto;
+ default:
+ return lwt_inout_func_proto(func_id);
+ }
}
-static bool __is_valid_access(int off, int size, enum bpf_access_type type)
+static bool __is_valid_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
@@ -2435,7 +2806,103 @@ static bool sk_filter_is_valid_access(int off, int size,
}
}
- return __is_valid_access(off, size, type);
+ return __is_valid_access(off, size);
+}
+
+static bool lwt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ switch (off) {
+ case offsetof(struct __sk_buff, tc_classid):
+ return false;
+ }
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct __sk_buff, mark):
+ case offsetof(struct __sk_buff, priority):
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetof(struct __sk_buff, cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct __sk_buff, data):
+ *reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct __sk_buff, data_end):
+ *reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return __is_valid_access(off, size);
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ if (off < 0 || off + size > sizeof(struct bpf_sock))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u32))
+ return false;
+
+ return true;
+}
+
+static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!direct_write)
+ return 0;
+
+ /* if (!skb->cloned)
+ * goto start;
+ *
+ * (Fast-path, otherwise approximation that we might be
+ * a clone, do the rest in helper.)
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
+
+ /* ret = bpf_skb_pull_data(skb, 0); */
+ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+ *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
+ *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_pull_data);
+ /* if (!ret)
+ * goto restore;
+ * return TC_ACT_SHOT;
+ */
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
+ *insn++ = BPF_EXIT_INSN();
+
+ /* restore: */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+ /* start: */
+ *insn++ = prog->insnsi[0];
+
+ return insn - insn_buf;
}
static bool tc_cls_act_is_valid_access(int off, int size,
@@ -2465,17 +2932,16 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
}
- return __is_valid_access(off, size, type);
+ return __is_valid_access(off, size);
}
-static bool __is_valid_xdp_access(int off, int size,
- enum bpf_access_type type)
+static bool __is_valid_xdp_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct xdp_md))
return false;
if (off % size != 0)
return false;
- if (size != 4)
+ if (size != sizeof(__u32))
return false;
return true;
@@ -2497,7 +2963,7 @@ static bool xdp_is_valid_access(int off, int size,
break;
}
- return __is_valid_xdp_access(off, size, type);
+ return __is_valid_xdp_access(off, size);
}
void bpf_warn_invalid_xdp_action(u32 act)
@@ -2506,10 +2972,10 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
-static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
- struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
@@ -2556,7 +3022,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
case offsetof(struct __sk_buff, ifindex):
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
dst_reg, src_reg,
offsetof(struct sk_buff, dev));
*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
@@ -2597,7 +3063,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
dst_reg, src_reg, insn);
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]):
BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
prog->cb_access = 1;
@@ -2621,7 +3087,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
break;
case offsetof(struct __sk_buff, data):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
dst_reg, src_reg,
offsetof(struct sk_buff, data));
break;
@@ -2630,8 +3096,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
ctx_off -= offsetof(struct __sk_buff, data_end);
ctx_off += offsetof(struct sk_buff, cb);
ctx_off += offsetof(struct bpf_skb_data_end, data_end);
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
- dst_reg, src_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
+ ctx_off);
break;
case offsetof(struct __sk_buff, tc_index):
@@ -2657,6 +3123,76 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
return insn - insn_buf;
}
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+ int dst_reg, int src_reg,
+ int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, sk_bound_dev_if));
+ break;
+
+ case offsetof(struct bpf_sock, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+ offsetof(struct sock, sk_family));
+ break;
+
+ case offsetof(struct bpf_sock, type):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sock, __sk_flags_offset));
+ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+ int src_reg, int ctx_off,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (ctx_off) {
+ case offsetof(struct __sk_buff, ifindex):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
+ dst_reg, src_reg,
+ offsetof(struct sk_buff, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
+ default:
+ return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
+ ctx_off, insn_buf, prog);
+ }
+
+ return insn - insn_buf;
+}
+
static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
@@ -2666,12 +3202,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
switch (ctx_off) {
case offsetof(struct xdp_md, data):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
dst_reg, src_reg,
offsetof(struct xdp_buff, data));
break;
case offsetof(struct xdp_md, data_end):
- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
dst_reg, src_reg,
offsetof(struct xdp_buff, data_end));
break;
@@ -2683,13 +3219,14 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
static const struct bpf_verifier_ops sk_filter_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
};
static const struct bpf_verifier_ops tc_cls_act_ops = {
.get_func_proto = tc_cls_act_func_proto,
.is_valid_access = tc_cls_act_is_valid_access,
- .convert_ctx_access = bpf_net_convert_ctx_access,
+ .convert_ctx_access = tc_cls_act_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
};
static const struct bpf_verifier_ops xdp_ops = {
@@ -2698,6 +3235,31 @@ static const struct bpf_verifier_ops xdp_ops = {
.convert_ctx_access = xdp_convert_ctx_access,
};
+static const struct bpf_verifier_ops cg_skb_ops = {
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = sk_filter_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_inout_ops = {
+ .get_func_proto = lwt_inout_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+ .get_func_proto = lwt_xmit_func_proto,
+ .is_valid_access = lwt_is_valid_access,
+ .convert_ctx_access = sk_filter_convert_ctx_access,
+ .gen_prologue = tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops cg_sock_ops = {
+ .get_func_proto = sk_filter_func_proto,
+ .is_valid_access = sock_filter_is_valid_access,
+ .convert_ctx_access = sock_filter_convert_ctx_access,
+};
+
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
.ops = &sk_filter_ops,
.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2718,12 +3280,42 @@ static struct bpf_prog_type_list xdp_type __read_mostly = {
.type = BPF_PROG_TYPE_XDP,
};
+static struct bpf_prog_type_list cg_skb_type __read_mostly = {
+ .ops = &cg_skb_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SKB,
+};
+
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+ .ops = &lwt_inout_ops,
+ .type = BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+ .ops = &lwt_xmit_ops,
+ .type = BPF_PROG_TYPE_LWT_XMIT,
+};
+
+static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+ .ops = &cg_sock_ops,
+ .type = BPF_PROG_TYPE_CGROUP_SOCK
+};
+
static int __init register_sk_filter_ops(void)
{
bpf_register_prog_type(&sk_filter_type);
bpf_register_prog_type(&sched_cls_type);
bpf_register_prog_type(&sched_act_type);
bpf_register_prog_type(&xdp_type);
+ bpf_register_prog_type(&cg_skb_type);
+ bpf_register_prog_type(&cg_sock_type);
+ bpf_register_prog_type(&lwt_in_type);
+ bpf_register_prog_type(&lwt_out_type);
+ bpf_register_prog_type(&lwt_xmit_type);
return 0;
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 3937b1b68d5b..f765c11d8df5 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -95,7 +95,6 @@ static void flow_cache_gc_task(struct work_struct *work)
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) {
flow_entry_kill(fce, xfrm);
atomic_dec(&xfrm->flow_cache_gc_count);
- WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0);
}
}
@@ -236,9 +235,8 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
if (fcp->hash_count > fc->high_watermark)
flow_cache_shrink(fc, fcp);
- if (fcp->hash_count > 2 * fc->high_watermark ||
- atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) {
- atomic_inc(&net->xfrm.flow_cache_genid);
+ if (atomic_read(&net->xfrm.flow_cache_gc_count) >
+ 2 * num_online_cpus() * fc->high_watermark) {
flo = ERR_PTR(-ENOBUFS);
goto ret_object;
}
@@ -419,28 +417,20 @@ static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
return 0;
}
-static int flow_cache_cpu(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
{
- struct flow_cache *fc = container_of(nfb, struct flow_cache,
- hotcpu_notifier);
- int res, cpu = (unsigned long) hcpu;
+ struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
+
+ return flow_cache_cpu_prepare(fc, cpu);
+}
+
+static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- res = flow_cache_cpu_prepare(fc, cpu);
- if (res)
- return notifier_from_errno(res);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- __flow_cache_shrink(fc, fcp, 0);
- break;
- }
- return NOTIFY_OK;
+ __flow_cache_shrink(fc, fcp, 0);
+ return 0;
}
int flow_cache_init(struct net *net)
@@ -467,18 +457,8 @@ int flow_cache_init(struct net *net)
if (!fc->percpu)
return -ENOMEM;
- cpu_notifier_register_begin();
-
- for_each_online_cpu(i) {
- if (flow_cache_cpu_prepare(fc, i))
- goto err;
- }
- fc->hotcpu_notifier = (struct notifier_block){
- .notifier_call = flow_cache_cpu,
- };
- __register_hotcpu_notifier(&fc->hotcpu_notifier);
-
- cpu_notifier_register_done();
+ if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node))
+ goto err;
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -494,8 +474,6 @@ err:
fcp->hash_table = NULL;
}
- cpu_notifier_register_done();
-
free_percpu(fc->percpu);
fc->percpu = NULL;
@@ -509,7 +487,8 @@ void flow_cache_fini(struct net *net)
struct flow_cache *fc = &net->xfrm.flow_cache_global;
del_timer_sync(&fc->rnd_timer);
- unregister_hotcpu_notifier(&fc->hotcpu_notifier);
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node);
for_each_possible_cpu(i) {
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
@@ -521,3 +500,14 @@ void flow_cache_fini(struct net *net)
fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);
+
+void __init flow_cache_hp_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE,
+ "net/flow:prepare",
+ flow_cache_cpu_up_prep,
+ flow_cache_cpu_dead);
+ WARN_ON(ret < 0);
+}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 52742a02814f..d6447dc10371 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -6,6 +6,8 @@
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/ipv6.h>
+#include <net/gre.h>
+#include <net/pptp.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
@@ -56,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
EXPORT_SYMBOL(skb_flow_dissector_init);
/**
+ * skb_flow_get_be16 - extract be16 entity
+ * @skb: sk_buff to extract from
+ * @poff: offset to extract at
+ * @data: raw buffer pointer to the packet
+ * @hlen: packet header length
+ *
+ * The function will try to retrieve a be32 entity at
+ * offset poff
+ */
+__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data,
+ int hlen)
+{
+ __be16 *u, _u;
+
+ u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
+ if (u)
+ return *u;
+
+ return 0;
+}
+
+/**
* __skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
@@ -115,14 +139,18 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
+ struct flow_dissector_key_vlan *key_vlan;
struct flow_dissector_key_keyid *key_keyid;
+ bool skip_vlan = false;
u8 ip_proto = 0;
- bool ret = false;
+ bool ret;
if (!data) {
data = skb->data;
- proto = skb->protocol;
+ proto = skb_vlan_tag_present(skb) ?
+ skb->vlan_proto : skb->protocol;
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
}
@@ -242,22 +270,42 @@ ipv6:
case htons(ETH_P_8021Q): {
const struct vlan_hdr *vlan;
struct vlan_hdr _vlan;
+ bool vlan_tag_present = skb && skb_vlan_tag_present(skb);
- vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
- if (!vlan)
- goto out_bad;
+ if (vlan_tag_present)
+ proto = skb->protocol;
+
+ if (!vlan_tag_present || eth_type_vlan(skb->protocol)) {
+ vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
+ data, hlen, &_vlan);
+ if (!vlan)
+ goto out_bad;
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ if (skip_vlan)
+ goto again;
+ }
+ skip_vlan = true;
if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_VLANID)) {
- key_tags = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_VLANID,
+ FLOW_DISSECTOR_KEY_VLAN)) {
+ key_vlan = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_VLAN,
target_container);
- key_tags->vlan_id = skb_vlan_tag_get_id(skb);
+ if (vlan_tag_present) {
+ key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
+ key_vlan->vlan_priority =
+ (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
+ } else {
+ key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
+ VLAN_VID_MASK;
+ key_vlan->vlan_priority =
+ (ntohs(vlan->h_vlan_TCI) &
+ VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+ }
}
- proto = vlan->h_vlan_encapsulated_proto;
- nhoff += sizeof(*vlan);
goto again;
}
case htons(ETH_P_PPP_SES): {
@@ -338,32 +386,42 @@ mpls:
ip_proto_again:
switch (ip_proto) {
case IPPROTO_GRE: {
- struct gre_hdr {
- __be16 flags;
- __be16 proto;
- } *hdr, _hdr;
+ struct gre_base_hdr *hdr, _hdr;
+ u16 gre_ver;
+ int offset = 0;
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
goto out_bad;
- /*
- * Only look inside GRE if version zero and no
- * routing
- */
- if (hdr->flags & (GRE_VERSION | GRE_ROUTING))
+
+ /* Only look inside GRE without routing */
+ if (hdr->flags & GRE_ROUTING)
break;
- proto = hdr->proto;
- nhoff += 4;
+ /* Only look inside GRE for version 0 and 1 */
+ gre_ver = ntohs(hdr->flags & GRE_VERSION);
+ if (gre_ver > 1)
+ break;
+
+ proto = hdr->protocol;
+ if (gre_ver) {
+ /* Version1 must be PPTP, and check the flags */
+ if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
+ break;
+ }
+
+ offset += sizeof(struct gre_base_hdr);
+
if (hdr->flags & GRE_CSUM)
- nhoff += 4;
+ offset += sizeof(((struct gre_full_hdr *)0)->csum) +
+ sizeof(((struct gre_full_hdr *)0)->reserved1);
+
if (hdr->flags & GRE_KEY) {
const __be32 *keyid;
__be32 _keyid;
- keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid),
+ keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid),
data, hlen, &_keyid);
-
if (!keyid)
goto out_bad;
@@ -372,32 +430,65 @@ ip_proto_again:
key_keyid = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_GRE_KEYID,
target_container);
- key_keyid->keyid = *keyid;
+ if (gre_ver == 0)
+ key_keyid->keyid = *keyid;
+ else
+ key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
}
- nhoff += 4;
+ offset += sizeof(((struct gre_full_hdr *)0)->key);
}
+
if (hdr->flags & GRE_SEQ)
- nhoff += 4;
- if (proto == htons(ETH_P_TEB)) {
- const struct ethhdr *eth;
- struct ethhdr _eth;
-
- eth = __skb_header_pointer(skb, nhoff,
- sizeof(_eth),
- data, hlen, &_eth);
- if (!eth)
+ offset += sizeof(((struct pptp_gre_header *)0)->seq);
+
+ if (gre_ver == 0) {
+ if (proto == htons(ETH_P_TEB)) {
+ const struct ethhdr *eth;
+ struct ethhdr _eth;
+
+ eth = __skb_header_pointer(skb, nhoff + offset,
+ sizeof(_eth),
+ data, hlen, &_eth);
+ if (!eth)
+ goto out_bad;
+ proto = eth->h_proto;
+ offset += sizeof(*eth);
+
+ /* Cap headers that we access via pointers at the
+ * end of the Ethernet header as our maximum alignment
+ * at that point is only 2 bytes.
+ */
+ if (NET_IP_ALIGN)
+ hlen = (nhoff + offset);
+ }
+ } else { /* version 1, must be PPTP */
+ u8 _ppp_hdr[PPP_HDRLEN];
+ u8 *ppp_hdr;
+
+ if (hdr->flags & GRE_ACK)
+ offset += sizeof(((struct pptp_gre_header *)0)->ack);
+
+ ppp_hdr = skb_header_pointer(skb, nhoff + offset,
+ sizeof(_ppp_hdr), _ppp_hdr);
+ if (!ppp_hdr)
goto out_bad;
- proto = eth->h_proto;
- nhoff += sizeof(*eth);
-
- /* Cap headers that we access via pointers at the
- * end of the Ethernet header as our maximum alignment
- * at that point is only 2 bytes.
- */
- if (NET_IP_ALIGN)
- hlen = nhoff;
+
+ switch (PPP_PROTOCOL(ppp_hdr)) {
+ case PPP_IP:
+ proto = htons(ETH_P_IP);
+ break;
+ case PPP_IPV6:
+ proto = htons(ETH_P_IPV6);
+ break;
+ default:
+ /* Could probably catch some more like MPLS */
+ break;
+ }
+
+ offset += PPP_HDRLEN;
}
+ nhoff += offset;
key_control->flags |= FLOW_DIS_ENCAPSULATION;
if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
goto out_good;
@@ -478,15 +569,28 @@ ip_proto_again:
data, hlen);
}
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP)) {
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+ key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
+ }
+
out_good:
ret = true;
-out_bad:
+ key_control->thoff = (u16)nhoff;
+out:
key_basic->n_proto = proto;
key_basic->ip_proto = ip_proto;
- key_control->thoff = (u16)nhoff;
return ret;
+
+out_bad:
+ ret = false;
+ key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
+ goto out;
}
EXPORT_SYMBOL(__skb_flow_dissect);
@@ -653,7 +757,7 @@ EXPORT_SYMBOL(make_flow_keys_digest);
static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
-u32 __skb_get_hash_symmetric(struct sk_buff *skb)
+u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
{
struct flow_keys keys;
@@ -874,8 +978,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
.offset = offsetof(struct flow_keys, ports),
},
{
- .key_id = FLOW_DISSECTOR_KEY_VLANID,
- .offset = offsetof(struct flow_keys, tags),
+ .key_id = FLOW_DISSECTOR_KEY_VLAN,
+ .offset = offsetof(struct flow_keys, vlan),
},
{
.key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
@@ -940,4 +1044,4 @@ static int __init init_default_flow_dissectors(void)
return 0;
}
-late_initcall_sync(init_default_flow_dissectors);
+core_initcall(init_default_flow_dissectors);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index cad8e791f28e..101b5d0e2142 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -7,6 +7,7 @@
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Eric Dumazet <edumazet@google.com>
*
* Changes:
* Jamal Hadi Salim - moved it to net/core and reshulfed
@@ -30,165 +31,79 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
-#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>
-/*
- This code is NOT intended to be used for statistics collection,
- its purpose is to provide a base for statistical multiplexing
- for controlled load service.
- If you need only statistics, run a user level daemon which
- periodically reads byte counters.
-
- Unfortunately, rate estimation is not a very easy task.
- F.e. I did not find a simple way to estimate the current peak rate
- and even failed to formulate the problem 8)8)
-
- So I preferred not to built an estimator into the scheduler,
- but run this task separately.
- Ideally, it should be kernel thread(s), but for now it runs
- from timers, which puts apparent top bounds on the number of rated
- flows, has minimal overhead on small, but is enough
- to handle controlled load service, sets of aggregates.
-
- We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
- avrate = avrate*(1-W) + rate*W
-
- where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
- The resulting time constant is:
-
- T = A/(-ln(1-W))
-
-
- NOTES.
-
- * avbps and avpps are scaled by 2^5.
- * both values are reported as 32 bit unsigned values. bps can
- overflow for fast links : max speed being 34360Mbit/sec
- * Minimal interval is HZ/4=250msec (it is the greatest common divisor
- for HZ=100 and HZ=1024 8)), maximal interval
- is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
- are too expensive, longer ones can be implemented
- at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
*/
-#define EST_MAX_INTERVAL 5
-
-struct gen_estimator
-{
- struct list_head list;
+struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
- struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
seqcount_t *running;
- int ewma_log;
+ struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ u8 ewma_log;
+ u8 intvl_log; /* period : (250ms << intvl_log) */
+
+ seqcount_t seq;
u32 last_packets;
- unsigned long avpps;
u64 last_bytes;
+
+ u64 avpps;
u64 avbps;
- struct rcu_head e_rcu;
- struct rb_node node;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
- struct rcu_head head;
-};
-struct gen_estimator_head
-{
- struct timer_list timer;
- struct list_head list;
+ unsigned long next_jiffies;
+ struct timer_list timer;
+ struct rcu_head rcu;
};
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+ struct gnet_stats_basic_packed *b)
{
- int idx = (int)arg;
- struct gen_estimator *e;
+ if (e->stats_lock)
+ spin_lock(e->stats_lock);
- rcu_read_lock();
- list_for_each_entry_rcu(e, &elist[idx].list, list) {
- struct gnet_stats_basic_packed b = {0};
- unsigned long rate;
- u64 brate;
-
- if (e->stats_lock)
- spin_lock(e->stats_lock);
- read_lock(&est_lock);
- if (e->bstats == NULL)
- goto skip;
-
- __gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
- brate = (b.bytes - e->last_bytes)<<(7 - idx);
- e->last_bytes = b.bytes;
- e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
- rate = b.packets - e->last_packets;
- rate <<= (7 - idx);
- e->last_packets = b.packets;
- e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
- WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
- read_unlock(&est_lock);
- if (e->stats_lock)
- spin_unlock(e->stats_lock);
- }
+ __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+
+ if (e->stats_lock)
+ spin_unlock(e->stats_lock);
- if (!list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
- rcu_read_unlock();
}
-static void gen_add_node(struct gen_estimator *est)
+static void est_timer(unsigned long arg)
{
- struct rb_node **p = &est_root.rb_node, *parent = NULL;
+ struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+ struct gnet_stats_basic_packed b;
+ u64 rate, brate;
- while (*p) {
- struct gen_estimator *e;
+ est_fetch_counters(est, &b);
+ brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+ brate -= (est->avbps >> est->ewma_log);
- parent = *p;
- e = rb_entry(parent, struct gen_estimator, node);
+ rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+ rate -= (est->avpps >> est->ewma_log);
- if (est->bstats > e->bstats)
- p = &parent->rb_right;
- else
- p = &parent->rb_left;
- }
- rb_link_node(&est->node, parent, p);
- rb_insert_color(&est->node, &est_root);
-}
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+ est->avpps += rate;
+ write_seqcount_end(&est->seq);
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
-{
- struct rb_node *p = est_root.rb_node;
-
- while (p) {
- struct gen_estimator *e;
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
- e = rb_entry(p, struct gen_estimator, node);
+ est->next_jiffies += ((HZ/4) << est->intvl_log);
- if (bstats > e->bstats)
- p = p->rb_right;
- else if (bstats < e->bstats || rate_est != e->rate_est)
- p = p->rb_left;
- else
- return e;
+ if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+ /* Ouch... timer was delayed. */
+ est->next_jiffies = jiffies + 1;
}
- return NULL;
+ mod_timer(&est->timer, est->next_jiffies);
}
/**
@@ -211,83 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
- struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
- struct gnet_stats_basic_packed b = {0};
- int idx;
+ struct net_rate_estimator *old, *est;
+ struct gnet_stats_basic_packed b;
+ int intvl_log;
if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
+ /* allowed timer periods are :
+ * -2 : 250ms, -1 : 500ms, 0 : 1 sec
+ * 1 : 2 sec, 2 : 4 sec, 3 : 8 sec
+ */
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
est = kzalloc(sizeof(*est), GFP_KERNEL);
- if (est == NULL)
+ if (!est)
return -ENOBUFS;
- __gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
- idx = parm->interval + 2;
+ seqcount_init(&est->seq);
+ intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->running = running;
est->ewma_log = parm->ewma_log;
- est->last_bytes = b.bytes;
- est->avbps = rate_est->bps<<5;
- est->last_packets = b.packets;
- est->avpps = rate_est->pps<<10;
+ est->intvl_log = intvl_log;
est->cpu_bstats = cpu_bstats;
- spin_lock_bh(&est_tree_lock);
- if (!elist[idx].timer.function) {
- INIT_LIST_HEAD(&elist[idx].list);
- setup_timer(&elist[idx].timer, est_timer, idx);
+ est_fetch_counters(est, &b);
+ est->last_bytes = b.bytes;
+ est->last_packets = b.packets;
+ old = rcu_dereference_protected(*rate_est, 1);
+ if (old) {
+ del_timer_sync(&old->timer);
+ est->avbps = old->avbps;
+ est->avpps = old->avpps;
}
- if (list_empty(&elist[idx].list))
- mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
-
- list_add_rcu(&est->list, &elist[idx].list);
- gen_add_node(est);
- spin_unlock_bh(&est_tree_lock);
+ est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+ setup_timer(&est->timer, est_timer, (unsigned long)est);
+ mod_timer(&est->timer, est->next_jiffies);
+ rcu_assign_pointer(*rate_est, est);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
/**
* gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
*
- * Note : Caller should respect an RCU grace period before freeing stats_lock
*/
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
- struct gen_estimator *e;
-
- spin_lock_bh(&est_tree_lock);
- while ((e = gen_find_node(bstats, rate_est))) {
- rb_erase(&e->node, &est_root);
+ struct net_rate_estimator *est;
- write_lock(&est_lock);
- e->bstats = NULL;
- write_unlock(&est_lock);
-
- list_del_rcu(&e->list);
- kfree_rcu(e, e_rcu);
+ est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ if (est) {
+ del_timer_sync(&est->timer);
+ kfree_rcu(est, rcu);
}
- spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
@@ -307,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
- struct gnet_stats_rate_est64 *rate_est,
+ struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
- gen_kill_estimator(bstats, rate_est);
- return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+ return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
/**
* gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
*
* Returns true if estimator is active, and false if not.
*/
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
- const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
{
- bool res;
+ return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
- ASSERT_RTNL();
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+ struct gnet_stats_rate_est64 *sample)
+{
+ struct net_rate_estimator *est;
+ unsigned seq;
+
+ rcu_read_lock();
+ est = rcu_dereference(*rate_est);
+ if (!est) {
+ rcu_read_unlock();
+ return false;
+ }
- spin_lock_bh(&est_tree_lock);
- res = gen_find_node(bstats, rate_est) != NULL;
- spin_unlock_bh(&est_tree_lock);
+ do {
+ seq = read_seqcount_begin(&est->seq);
+ sample->bps = est->avbps >> 8;
+ sample->pps = est->avpps >> 8;
+ } while (read_seqcount_retry(&est->seq, seq));
- return res;
+ rcu_read_unlock();
+ return true;
}
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051304fb..87f28557b329 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
- * @b: basic statistics
- * @r: rate estimator statistics
+ * @rate_est: rate estimator
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
- const struct gnet_stats_basic_packed *b,
- struct gnet_stats_rate_est64 *r)
+ struct net_rate_estimator __rcu **rate_est)
{
+ struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
- if (b && !gen_estimator_active(b, r))
+ if (!gen_estimator_read(rate_est, &sample))
return 0;
-
- est.bps = min_t(u64, UINT_MAX, r->bps);
+ est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
- est.pps = r->pps;
+ est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
- if (res < 0 || est.bps == r->bps)
+ if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
- TCA_STATS_PAD);
+ return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
+ sizeof(sample), TCA_STATS_PAD);
}
return 0;
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000000..71bb3e2eca08
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,396 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+ struct bpf_prog *prog;
+ char *name;
+};
+
+struct bpf_lwt {
+ struct bpf_lwt_prog in;
+ struct bpf_lwt_prog out;
+ struct bpf_lwt_prog xmit;
+ int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+ struct dst_entry *dst, bool can_redirect)
+{
+ int ret;
+
+ /* Preempt disable is needed to protect per-cpu redirect_info between
+ * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+ * access to maps strictly require a rcu_read_lock() for protection,
+ * mixing with BH RCU lock doesn't work.
+ */
+ preempt_disable();
+ rcu_read_lock();
+ bpf_compute_data_end(skb);
+ ret = bpf_prog_run_save_cb(lwt->prog, skb);
+ rcu_read_unlock();
+
+ switch (ret) {
+ case BPF_OK:
+ break;
+
+ case BPF_REDIRECT:
+ if (unlikely(!can_redirect)) {
+ pr_warn_once("Illegal redirect return code in prog %s\n",
+ lwt->name ? : "<unknown>");
+ ret = BPF_OK;
+ } else {
+ ret = skb_do_redirect(skb);
+ if (ret == 0)
+ ret = BPF_REDIRECT;
+ }
+ break;
+
+ case BPF_DROP:
+ kfree_skb(skb);
+ ret = -EPERM;
+ break;
+
+ default:
+ pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+ kfree_skb(skb);
+ ret = -EINVAL;
+ break;
+ }
+
+ preempt_enable();
+
+ return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->in.prog) {
+ ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_input)) {
+ pr_warn_once("orig_input not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+ int ret;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->out.prog) {
+ ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (unlikely(!dst->lwtstate->orig_output)) {
+ pr_warn_once("orig_output not set on dst for prog %s\n",
+ bpf->out.name);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+ int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct bpf_lwt *bpf;
+
+ bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+ if (bpf->xmit.prog) {
+ int ret;
+
+ ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+ switch (ret) {
+ case BPF_OK:
+ /* If the header was expanded, headroom might be too
+ * small for L2 header to come, expand as needed.
+ */
+ ret = xmit_check_hhlen(skb);
+ if (unlikely(ret))
+ return ret;
+
+ return LWTUNNEL_XMIT_CONTINUE;
+ case BPF_REDIRECT:
+ return LWTUNNEL_XMIT_DONE;
+ default:
+ return ret;
+ }
+ }
+
+ return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+ if (prog->prog)
+ bpf_prog_put(prog->prog);
+
+ kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ bpf_lwt_prog_destroy(&bpf->in);
+ bpf_lwt_prog_destroy(&bpf->out);
+ bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+ [LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+ [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+ .len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+ enum bpf_prog_type type)
+{
+ struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+ struct bpf_prog *p;
+ int ret;
+ u32 fd;
+
+ ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+ return -EINVAL;
+
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+ if (!prog->name)
+ return -ENOMEM;
+
+ fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+ p = bpf_prog_get_type(fd, type);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ prog->prog = p;
+
+ return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+ [LWT_BPF_IN] = { .type = NLA_NESTED, },
+ [LWT_BPF_OUT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT] = { .type = NLA_NESTED, },
+ [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts)
+{
+ struct nlattr *tb[LWT_BPF_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct bpf_lwt *bpf;
+ int ret;
+
+ if (family != AF_INET && family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(sizeof(*bpf));
+ if (!newts)
+ return -ENOMEM;
+
+ newts->type = LWTUNNEL_ENCAP_BPF;
+ bpf = bpf_lwt_lwtunnel(newts);
+
+ if (tb[LWT_BPF_IN]) {
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+ BPF_PROG_TYPE_LWT_IN);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_OUT]) {
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+ BPF_PROG_TYPE_LWT_OUT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT]) {
+ newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+ ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+ BPF_PROG_TYPE_LWT_XMIT);
+ if (ret < 0)
+ goto errout;
+ }
+
+ if (tb[LWT_BPF_XMIT_HEADROOM]) {
+ u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+ if (headroom > LWT_BPF_MAX_HEADROOM) {
+ ret = -ERANGE;
+ goto errout;
+ }
+
+ newts->headroom = headroom;
+ }
+
+ bpf->family = family;
+ *ts = newts;
+
+ return 0;
+
+errout:
+ bpf_destroy_state(newts);
+ kfree(newts);
+ return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+ struct bpf_lwt_prog *prog)
+{
+ struct nlattr *nest;
+
+ if (!prog->prog)
+ return 0;
+
+ nest = nla_nest_start(skb, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (prog->name &&
+ nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+ struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+ if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+ bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ int nest_len = nla_total_size(sizeof(struct nlattr)) +
+ nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+ 0;
+
+ return nest_len + /* LWT_BPF_IN */
+ nest_len + /* LWT_BPF_OUT */
+ nest_len + /* LWT_BPF_XMIT */
+ 0;
+}
+
+int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+ /* FIXME:
+ * The LWT state is currently rebuilt for delete requests which
+ * results in a new bpf_prog instance. Comparing names for now.
+ */
+ if (!a->name && !b->name)
+ return 0;
+
+ if (!a->name || !b->name)
+ return 1;
+
+ return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+ struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+ return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+ bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+ bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+ .build_state = bpf_build_state,
+ .destroy_state = bpf_destroy_state,
+ .input = bpf_input,
+ .output = bpf_output,
+ .xmit = bpf_xmit,
+ .fill_encap = bpf_fill_encap_info,
+ .get_encap_size = bpf_encap_nlsize,
+ .cmp_encap = bpf_encap_cmp,
+};
+
+static int __init bpf_lwt_init(void)
+{
+ return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 669ecc9f884e..a5d4e866ce88 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -39,6 +39,10 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "MPLS";
case LWTUNNEL_ENCAP_ILA:
return "ILA";
+ case LWTUNNEL_ENCAP_SEG6:
+ return "SEG6";
+ case LWTUNNEL_ENCAP_BPF:
+ return "BPF";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -130,6 +134,19 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
}
EXPORT_SYMBOL(lwtunnel_build_state);
+void lwtstate_free(struct lwtunnel_state *lws)
+{
+ const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type];
+
+ if (ops->destroy_state) {
+ ops->destroy_state(lws);
+ kfree_rcu(lws, rcu);
+ } else {
+ kfree(lws);
+ }
+}
+EXPORT_SYMBOL(lwtstate_free);
+
int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
{
const struct lwtunnel_encap_ops *ops;
@@ -251,6 +268,41 @@ drop:
}
EXPORT_SYMBOL(lwtunnel_output);
+int lwtunnel_xmit(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ const struct lwtunnel_encap_ops *ops;
+ struct lwtunnel_state *lwtstate;
+ int ret = -EINVAL;
+
+ if (!dst)
+ goto drop;
+
+ lwtstate = dst->lwtstate;
+
+ if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+ lwtstate->type > LWTUNNEL_ENCAP_MAX)
+ return 0;
+
+ ret = -EOPNOTSUPP;
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+ if (likely(ops && ops->xmit))
+ ret = ops->xmit(skb);
+ rcu_read_unlock();
+
+ if (ret == -EOPNOTSUPP)
+ goto drop;
+
+ return ret;
+
+drop:
+ kfree_skb(skb);
+
+ return ret;
+}
+EXPORT_SYMBOL(lwtunnel_xmit);
+
int lwtunnel_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index cf26e04c4046..782dd8663665 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
} else
goto out;
} else {
- if (lladdr == neigh->ha && new == NUD_STALE)
+ if (lladdr == neigh->ha && new == NUD_STALE &&
+ !(flags & NEIGH_UPDATE_F_ADMIN))
new = old;
}
}
@@ -2290,13 +2291,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
n != NULL;
n = rcu_dereference_bh(n->next)) {
- if (!net_eq(dev_net(n->dev), net))
- continue;
- if (neigh_ifindex_filtered(n->dev, filter_idx))
- continue;
- if (neigh_master_filtered(n->dev, filter_master_idx))
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || !net_eq(dev_net(n->dev), net))
+ goto next;
+ if (neigh_ifindex_filtered(n->dev, filter_idx) ||
+ neigh_master_filtered(n->dev, filter_master_idx))
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2331,9 +2329,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
- if (pneigh_net(n) != net)
- continue;
- if (idx < s_idx)
+ if (idx < s_idx || pneigh_net(n) != net)
goto next;
if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6e4f34721080..b0c04cf4851d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -950,10 +950,13 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
}
while (--i >= new_num) {
+ struct kobject *kobj = &dev->_rx[i].kobj;
+
+ if (!list_empty(&dev_net(dev)->exit_list))
+ kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
- sysfs_remove_group(&dev->_rx[i].kobj,
- dev->sysfs_rx_queue_group);
- kobject_put(&dev->_rx[i].kobj);
+ sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ kobject_put(kobj);
}
return error;
@@ -1021,7 +1024,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
return sprintf(buf, "%lu", trans_timeout);
}
-#ifdef CONFIG_XPS
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
@@ -1033,6 +1035,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
+static ssize_t show_traffic_class(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ struct net_device *dev = queue->dev;
+ int index = get_netdev_queue_index(queue);
+ int tc = netdev_txq_to_tc(dev, index);
+
+ if (tc < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%u\n", tc);
+}
+
+#ifdef CONFIG_XPS
static ssize_t show_tx_maxrate(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute,
char *buf)
@@ -1075,6 +1092,9 @@ static struct netdev_queue_attribute queue_tx_maxrate =
static struct netdev_queue_attribute queue_trans_timeout =
__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+static struct netdev_queue_attribute queue_traffic_class =
+ __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL);
+
#ifdef CONFIG_BQL
/*
* Byte queue limits sysfs structures and functions.
@@ -1190,29 +1210,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute, char *buf)
{
struct net_device *dev = queue->dev;
+ int cpu, len, num_tc = 1, tc = 0;
struct xps_dev_maps *dev_maps;
cpumask_var_t mask;
unsigned long index;
- int i, len;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
index = get_netdev_queue_index(queue);
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
- for_each_possible_cpu(i) {
- struct xps_map *map =
- rcu_dereference(dev_maps->cpu_map[i]);
- if (map) {
- int j;
- for (j = 0; j < map->len; j++) {
- if (map->queues[j] == index) {
- cpumask_set_cpu(i, mask);
- break;
- }
+ for_each_possible_cpu(cpu) {
+ int i, tci = cpu * num_tc + tc;
+ struct xps_map *map;
+
+ map = rcu_dereference(dev_maps->cpu_map[tci]);
+ if (!map)
+ continue;
+
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ cpumask_set_cpu(cpu, mask);
+ break;
}
}
}
@@ -1260,6 +1289,7 @@ static struct netdev_queue_attribute xps_cpus_attribute =
static struct attribute *netdev_queue_default_attrs[] = {
&queue_trans_timeout.attr,
+ &queue_traffic_class.attr,
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
&queue_tx_maxrate.attr,
@@ -1340,6 +1370,8 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
+ if (!list_empty(&dev_net(dev)->exit_list))
+ queue->kobj.uevent_suppress = 1;
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
#endif
@@ -1525,6 +1557,9 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &(ndev->dev);
+ if (!list_empty(&dev_net(ndev)->exit_list))
+ dev_set_uevent_suppress(dev, 1);
+
kobject_get(&dev->kobj);
remove_queue_kobjects(ndev);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 10608dd1489f..3c4bbec39713 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -37,6 +37,11 @@ struct net init_net = {
};
EXPORT_SYMBOL(init_net);
+static bool init_net_initialized;
+
+#define MIN_PERNET_OPS_ID \
+ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -44,27 +49,28 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
- size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+ unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
- ng->len = max_gen_ptrs;
+ ng->s.len = max_gen_ptrs;
return ng;
}
-static int net_assign_generic(struct net *net, int id, void *data)
+static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
struct net_generic *ng, *old_ng;
BUG_ON(!mutex_is_locked(&net_mutex));
- BUG_ON(id == 0);
+ BUG_ON(id < MIN_PERNET_OPS_ID);
old_ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&net_mutex));
- ng = old_ng;
- if (old_ng->len >= id)
- goto assign;
+ if (old_ng->s.len > id) {
+ old_ng->ptr[id] = data;
+ return 0;
+ }
ng = net_alloc_generic();
if (ng == NULL)
@@ -81,12 +87,12 @@ static int net_assign_generic(struct net *net, int id, void *data)
* the old copy for kfree after a grace period.
*/
- memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
+ (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
+ ng->ptr[id] = data;
rcu_assign_pointer(net->gen, ng);
- kfree_rcu(old_ng, rcu);
-assign:
- ng->ptr[id - 1] = data;
+ kfree_rcu(old_ng, s.rcu);
return 0;
}
@@ -120,8 +126,7 @@ out:
static void ops_free(const struct pernet_operations *ops, struct net *net)
{
if (ops->id && ops->size) {
- int id = *ops->id;
- kfree(net_generic(net, id));
+ kfree(net_generic(net, *ops->id));
}
}
@@ -216,6 +221,8 @@ int peernet2id_alloc(struct net *net, struct net *peer)
bool alloc;
int id;
+ if (atomic_read(&net->count) == 0)
+ return NETNSA_NSID_NOT_ASSIGNED;
spin_lock_bh(&net->nsid_lock);
alloc = atomic_read(&peer->count) == 0 ? false : true;
id = __peernet2id_alloc(net, peer, &alloc);
@@ -224,7 +231,6 @@ int peernet2id_alloc(struct net *net, struct net *peer)
rtnl_net_notifyid(net, RTM_NEWNSID, id);
return id;
}
-EXPORT_SYMBOL(peernet2id_alloc);
/* This function returns, if assigned, the id of a peer netns. */
int peernet2id(struct net *net, struct net *peer)
@@ -236,6 +242,7 @@ int peernet2id(struct net *net, struct net *peer)
spin_unlock_bh(&net->nsid_lock);
return id;
}
+EXPORT_SYMBOL(peernet2id);
/* This function returns true is the peer netns has an id assigned into the
* current netns.
@@ -307,6 +314,16 @@ out_undo:
#ifdef CONFIG_NET_NS
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
+}
+
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
static struct kmem_cache *net_cachep;
static struct workqueue_struct *netns_wq;
@@ -348,19 +365,34 @@ void net_drop_ns(void *p)
struct net *copy_net_ns(unsigned long flags,
struct user_namespace *user_ns, struct net *old_net)
{
+ struct ucounts *ucounts;
struct net *net;
int rv;
if (!(flags & CLONE_NEWNET))
return get_net(old_net);
+ ucounts = inc_net_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
net = net_alloc();
- if (!net)
+ if (!net) {
+ dec_net_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
+ }
get_user_ns(user_ns);
- mutex_lock(&net_mutex);
+ rv = mutex_lock_killable(&net_mutex);
+ if (rv < 0) {
+ net_free(net);
+ dec_net_namespaces(ucounts);
+ put_user_ns(user_ns);
+ return ERR_PTR(rv);
+ }
+
+ net->ucounts = ucounts;
rv = setup_net(net, user_ns);
if (rv == 0) {
rtnl_lock();
@@ -369,6 +401,7 @@ struct net *copy_net_ns(unsigned long flags,
}
mutex_unlock(&net_mutex);
if (rv < 0) {
+ dec_net_namespaces(ucounts);
put_user_ns(user_ns);
net_drop_ns(net);
return ERR_PTR(rv);
@@ -441,6 +474,7 @@ static void cleanup_net(struct work_struct *work)
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
+ dec_net_namespaces(net->ucounts);
put_user_ns(net->user_ns);
net_drop_ns(net);
}
@@ -528,7 +562,7 @@ static struct pernet_operations __net_initdata net_ns_ops = {
.exit = net_ns_net_exit,
};
-static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
+static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
[NETNSA_NONE] = { .type = NLA_UNSPEC },
[NETNSA_NSID] = { .type = NLA_S32 },
[NETNSA_PID] = { .type = NLA_U32 },
@@ -745,6 +779,8 @@ static int __init net_ns_init(void)
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
+ init_net_initialized = true;
+
rtnl_lock();
list_add_tail_rcu(&init_net.list, &net_namespace_list);
rtnl_unlock();
@@ -806,15 +842,24 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
+ if (!init_net_initialized) {
+ list_add_tail(&ops->list, list);
+ return 0;
+ }
+
return ops_init(ops, &init_net);
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
- LIST_HEAD(net_exit_list);
- list_add(&init_net.exit_list, &net_exit_list);
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+ if (!init_net_initialized) {
+ list_del(&ops->list);
+ } else {
+ LIST_HEAD(net_exit_list);
+ list_add(&init_net.exit_list, &net_exit_list);
+ ops_exit_list(ops, &net_exit_list);
+ ops_free_list(ops, &net_exit_list);
+ }
}
#endif /* CONFIG_NET_NS */
@@ -828,7 +873,7 @@ static int register_pernet_operations(struct list_head *list,
if (ops->id) {
again:
- error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+ error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id);
if (error < 0) {
if (error == -EAGAIN) {
ida_pre_get(&net_generic_ids, GFP_KERNEL);
@@ -836,7 +881,7 @@ again:
}
return error;
}
- max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
+ max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
}
error = __register_pernet_operations(list, ops);
if (error) {
@@ -991,11 +1036,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static struct user_namespace *netns_owner(struct ns_common *ns)
+{
+ return to_net_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations netns_operations = {
.name = "net",
.type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
+ .owner = netns_owner,
};
#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 53599bd0c82d..9424673009c1 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -171,12 +171,12 @@ static void poll_one_napi(struct napi_struct *napi)
static void poll_napi(struct net_device *dev)
{
struct napi_struct *napi;
+ int cpu = smp_processor_id();
list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner != smp_processor_id() &&
- spin_trylock(&napi->poll_lock)) {
+ if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
poll_one_napi(napi);
- spin_unlock(&napi->poll_lock);
+ smp_store_release(&napi->poll_owner, -1);
}
}
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index bbd118b19aef..8e69ce472236 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -216,8 +216,8 @@
#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
/* If lock -- protects updating of if_list */
-#define if_lock(t) spin_lock(&(t->if_lock));
-#define if_unlock(t) spin_unlock(&(t->if_lock));
+#define if_lock(t) mutex_lock(&(t->if_lock));
+#define if_unlock(t) mutex_unlock(&(t->if_lock));
/* Used to help with determining the pkts on receive */
#define PKTGEN_MAGIC 0xbe9be955
@@ -413,7 +413,7 @@ struct pktgen_hdr {
};
-static int pg_net_id __read_mostly;
+static unsigned int pg_net_id __read_mostly;
struct pktgen_net {
struct net *net;
@@ -423,7 +423,7 @@ struct pktgen_net {
};
struct pktgen_thread {
- spinlock_t if_lock; /* for list of devices */
+ struct mutex if_lock; /* for list of devices */
struct list_head if_list; /* All device here */
struct list_head th_list;
struct task_struct *tsk;
@@ -2010,11 +2010,13 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
{
struct pktgen_thread *t;
+ mutex_lock(&pktgen_thread_lock);
+
list_for_each_entry(t, &pn->pktgen_threads, th_list) {
struct pktgen_dev *pkt_dev;
- rcu_read_lock();
- list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
+ if_lock(t);
+ list_for_each_entry(pkt_dev, &t->if_list, list) {
if (pkt_dev->odev != dev)
continue;
@@ -2029,8 +2031,9 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
dev->name);
break;
}
- rcu_read_unlock();
+ if_unlock(t);
}
+ mutex_unlock(&pktgen_thread_lock);
}
static int pktgen_device_event(struct notifier_block *unused,
@@ -2286,7 +2289,7 @@ out:
static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
{
- pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev);
+ pkt_dev->pkt_overhead = 0;
pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32);
pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev);
pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev);
@@ -2777,13 +2780,13 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
}
static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
- struct pktgen_dev *pkt_dev,
- unsigned int extralen)
+ struct pktgen_dev *pkt_dev)
{
+ unsigned int extralen = LL_RESERVED_SPACE(dev);
struct sk_buff *skb = NULL;
- unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen +
- pkt_dev->pkt_overhead;
+ unsigned int size;
+ size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead;
if (pkt_dev->flags & F_NODE) {
int node = pkt_dev->node >= 0 ? pkt_dev->node : numa_node_id();
@@ -2796,8 +2799,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
+ /* the caller pre-fetches from skb->data and reserves for the mac hdr */
if (likely(skb))
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+ skb_reserve(skb, extralen - 16);
return skb;
}
@@ -2830,16 +2834,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
mod_cur_headers(pkt_dev);
queue_map = pkt_dev->cur_queue_map;
- datalen = (odev->hard_header_len + 16) & ~0xf;
-
- skb = pktgen_alloc_skb(odev, pkt_dev, datalen);
+ skb = pktgen_alloc_skb(odev, pkt_dev);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
}
prefetchw(skb->data);
- skb_reserve(skb, datalen);
+ skb_reserve(skb, 16);
/* Reserve for ethernet and IP header */
eth = (__u8 *) skb_push(skb, 14);
@@ -2959,7 +2961,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
mod_cur_headers(pkt_dev);
queue_map = pkt_dev->cur_queue_map;
- skb = pktgen_alloc_skb(odev, pkt_dev, 16);
+ skb = pktgen_alloc_skb(odev, pkt_dev);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
return NULL;
@@ -3763,7 +3765,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
return -ENOMEM;
}
- spin_lock_init(&t->if_lock);
+ mutex_init(&t->if_lock);
t->cpu = cpu;
INIT_LIST_HEAD(&t->if_list);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 189cc78c77eb..c482491a63d8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -275,6 +275,7 @@ int rtnl_unregister(int protocol, int msgtype)
rtnl_msg_handlers[protocol][msgindex].doit = NULL;
rtnl_msg_handlers[protocol][msgindex].dumpit = NULL;
+ rtnl_msg_handlers[protocol][msgindex].calcit = NULL;
return 0;
}
@@ -704,6 +705,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
} else if (i == RTAX_FEATURES - 1) {
u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
+ if (!user_features)
+ continue;
BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
if (nla_put_u32(skb, i + 1, user_features))
goto nla_put_failure;
@@ -837,15 +840,20 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
if (dev->dev.parent && dev_is_pci(dev->dev.parent) &&
(ext_filter_mask & RTEXT_FILTER_VF)) {
int num_vfs = dev_num_vf(dev->dev.parent);
- size_t size = nla_total_size(sizeof(struct nlattr));
- size += nla_total_size(num_vfs * sizeof(struct nlattr));
+ size_t size = nla_total_size(0);
size += num_vfs *
- (nla_total_size(sizeof(struct ifla_vf_mac)) +
+ (nla_total_size(0) +
+ nla_total_size(sizeof(struct ifla_vf_mac)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
+ nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
+ nla_total_size(MAX_VLAN_LIST_LEN *
+ sizeof(struct ifla_vf_vlan_info)) +
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
+ nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
+ nla_total_size(0) + /* nest IFLA_VF_STATS */
/* IFLA_VF_STATS_RX_PACKETS */
nla_total_size_64bit(sizeof(__u64)) +
/* IFLA_VF_STATS_TX_PACKETS */
@@ -893,7 +901,8 @@ static size_t rtnl_port_size(const struct net_device *dev,
static size_t rtnl_xdp_size(const struct net_device *dev)
{
- size_t xdp_size = nla_total_size(1); /* XDP_ATTACHED */
+ size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
+ nla_total_size(1); /* XDP_ATTACHED */
if (!dev->netdev_ops->ndo_xdp)
return 0;
@@ -922,8 +931,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
- + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */
- + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
+ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -1109,14 +1118,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct nlattr *vfinfo)
{
struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct nlattr *vf, *vfstats, *vfvlanlist;
struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_vlan_info vf_vlan_info;
struct ifla_vf_spoofchk vf_spoofchk;
struct ifla_vf_tx_rate vf_tx_rate;
struct ifla_vf_stats vf_stats;
struct ifla_vf_trust vf_trust;
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
- struct nlattr *vf, *vfstats;
struct ifla_vf_mac vf_mac;
struct ifla_vf_info ivi;
@@ -1133,11 +1143,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
* IFLA_VF_LINK_STATE_AUTO which equals zero
*/
ivi.linkstate = 0;
+ /* VLAN Protocol by default is 802.1Q */
+ ivi.vlan_proto = htons(ETH_P_8021Q);
if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
return 0;
+ memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
+
vf_mac.vf =
vf_vlan.vf =
+ vf_vlan_info.vf =
vf_rate.vf =
vf_tx_rate.vf =
vf_spoofchk.vf =
@@ -1148,6 +1163,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
+ vf_vlan_info.vlan = ivi.vlan;
+ vf_vlan_info.qos = ivi.qos;
+ vf_vlan_info.vlan_proto = ivi.vlan_proto;
vf_tx_rate.rate = ivi.max_tx_rate;
vf_rate.min_tx_rate = ivi.min_tx_rate;
vf_rate.max_tx_rate = ivi.max_tx_rate;
@@ -1156,10 +1174,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_rss_query_en.setting = ivi.rss_query_en;
vf_trust.setting = ivi.trusted;
vf = nla_nest_start(skb, IFLA_VF_INFO);
- if (!vf) {
- nla_nest_cancel(skb, vfinfo);
- return -EMSGSIZE;
- }
+ if (!vf)
+ goto nla_put_vfinfo_failure;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
@@ -1175,17 +1191,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
&vf_rss_query_en) ||
nla_put(skb, IFLA_VF_TRUST,
sizeof(vf_trust), &vf_trust))
- return -EMSGSIZE;
+ goto nla_put_vf_failure;
+ vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
+ if (!vfvlanlist)
+ goto nla_put_vf_failure;
+ if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
+ &vf_vlan_info)) {
+ nla_nest_cancel(skb, vfvlanlist);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfvlanlist);
memset(&vf_stats, 0, sizeof(vf_stats));
if (dev->netdev_ops->ndo_get_vf_stats)
dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
&vf_stats);
vfstats = nla_nest_start(skb, IFLA_VF_STATS);
- if (!vfstats) {
- nla_nest_cancel(skb, vf);
- nla_nest_cancel(skb, vfinfo);
- return -EMSGSIZE;
- }
+ if (!vfstats)
+ goto nla_put_vf_failure;
if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
@@ -1197,11 +1219,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast, IFLA_VF_STATS_PAD))
- return -EMSGSIZE;
+ vf_stats.multicast, IFLA_VF_STATS_PAD)) {
+ nla_nest_cancel(skb, vfstats);
+ goto nla_put_vf_failure;
+ }
nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
return 0;
+
+nla_put_vf_failure:
+ nla_nest_cancel(skb, vf);
+nla_put_vfinfo_failure:
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
}
static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
@@ -1446,6 +1476,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
+ [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
[IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
[IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
@@ -1474,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
[IFLA_XDP_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
+ [IFLA_XDP_FLAGS] = { .type = NLA_U32 },
};
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -1578,7 +1610,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
head = &net->dev_index_head[h];
hlist_for_each_entry(dev, head, index_hlist) {
if (link_dump_filtered(dev, master_idx, kind_ops))
- continue;
+ goto cont;
if (idx < s_idx)
goto cont;
err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
@@ -1702,7 +1734,37 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_vlan)
err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
- ivv->qos);
+ ivv->qos,
+ htons(ETH_P_8021Q));
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[IFLA_VF_VLAN_LIST]) {
+ struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
+ struct nlattr *attr;
+ int rem, len = 0;
+
+ err = -EOPNOTSUPP;
+ if (!ops->ndo_set_vf_vlan)
+ return err;
+
+ nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
+ if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
+ nla_len(attr) < NLA_HDRLEN) {
+ return -EINVAL;
+ }
+ if (len >= MAX_VLAN_LIST_LEN)
+ return -EOPNOTSUPP;
+ ivvl[len] = nla_data(attr);
+
+ len++;
+ }
+ if (len == 0)
+ return -EINVAL;
+
+ err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
+ ivvl[0]->qos, ivvl[0]->vlan_proto);
if (err < 0)
return err;
}
@@ -2103,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_XDP]) {
struct nlattr *xdp[IFLA_XDP_MAX + 1];
+ u32 xdp_flags = 0;
err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
ifla_xdp_policy);
@@ -2113,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+
+ if (xdp[IFLA_XDP_FLAGS]) {
+ xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
+ if (xdp_flags & ~XDP_FLAGS_MASK) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
if (xdp[IFLA_XDP_FD]) {
err = dev_change_xdp_fd(dev,
- nla_get_s32(xdp[IFLA_XDP_FD]));
+ nla_get_s32(xdp[IFLA_XDP_FD]),
+ xdp_flags);
if (err)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2676,7 +2749,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
ext_filter_mask));
}
- return min_ifinfo_dump_size;
+ return nlmsg_total_size(min_ifinfo_dump_size);
}
static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
@@ -2791,7 +2864,10 @@ nla_put_failure:
static inline size_t rtnl_fdb_nlmsg_size(void)
{
- return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
+ return NLMSG_ALIGN(sizeof(struct ndmsg)) +
+ nla_total_size(ETH_ALEN) + /* NDA_LLADDR */
+ nla_total_size(sizeof(u16)) + /* NDA_VLAN */
+ 0;
}
static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
@@ -3066,7 +3142,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
seq = cb->nlh->nlmsg_seq;
list_for_each_entry(ha, &list->list, list) {
- if (*idx < cb->args[0])
+ if (*idx < cb->args[2])
goto skip;
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
@@ -3093,19 +3169,18 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
struct net_device *filter_dev,
- int idx)
+ int *idx)
{
int err;
netif_addr_lock_bh(dev);
- err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
if (err)
goto out;
- nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
+ err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);
out:
netif_addr_unlock_bh(dev);
- cb->args[1] = err;
- return idx;
+ return err;
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);
@@ -3118,9 +3193,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
const struct net_device_ops *cops = NULL;
struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
struct net *net = sock_net(skb->sk);
+ struct hlist_head *head;
int brport_idx = 0;
int br_idx = 0;
- int idx = 0;
+ int h, s_h;
+ int idx = 0, s_idx;
+ int err = 0;
+ int fidx = 0;
if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
ifla_policy) == 0) {
@@ -3138,49 +3217,71 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
- cb->args[1] = 0;
- for_each_netdev(net, dev) {
- if (brport_idx && (dev->ifindex != brport_idx))
- continue;
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
- if (!br_idx) { /* user did not specify a specific bridge */
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
- br_dev = netdev_master_upper_dev_get(dev);
- cops = br_dev->netdev_ops;
- }
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(dev, head, index_hlist) {
- } else {
- if (dev != br_dev &&
- !(dev->priv_flags & IFF_BRIDGE_PORT))
+ if (brport_idx && (dev->ifindex != brport_idx))
continue;
- if (br_dev != netdev_master_upper_dev_get(dev) &&
- !(dev->priv_flags & IFF_EBRIDGE))
- continue;
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
+ }
+ } else {
+ if (dev != br_dev &&
+ !(dev->priv_flags & IFF_BRIDGE_PORT))
+ continue;
- cops = ops;
- }
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !(dev->priv_flags & IFF_EBRIDGE))
+ continue;
+ cops = ops;
+ }
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
- if (cops && cops->ndo_fdb_dump)
- idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
- idx);
- }
- if (cb->args[1] == -EMSGSIZE)
- break;
+ if (idx < s_idx)
+ goto cont;
- if (dev->netdev_ops->ndo_fdb_dump)
- idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
- idx);
- else
- idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
- if (cb->args[1] == -EMSGSIZE)
- break;
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb,
+ br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ goto out;
+ }
+ }
- cops = NULL;
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
+ dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ if (err == -EMSGSIZE)
+ goto out;
+
+ cops = NULL;
+
+ /* reset fdb offset to 0 for rest of the interfaces */
+ cb->args[2] = 0;
+ fidx = 0;
+cont:
+ idx++;
+ }
}
- cb->args[0] = idx;
+out:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = fidx;
+
return skb->len;
}
@@ -3550,6 +3651,91 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
(!idxattr || idxattr == attrid);
}
+#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
+static int rtnl_get_offload_stats_attr_size(int attr_id)
+{
+ switch (attr_id) {
+ case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+ return sizeof(struct rtnl_link_stats64);
+ }
+
+ return 0;
+}
+
+static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
+ int *prividx)
+{
+ struct nlattr *attr = NULL;
+ int attr_id, size;
+ void *attr_data;
+ int err;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return -ENODATA;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (attr_id < *prividx)
+ continue;
+
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ if (!size)
+ continue;
+
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
+ continue;
+
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ goto nla_put_failure;
+
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
+ attr_data);
+ if (err)
+ goto get_offload_stats_failure;
+ }
+
+ if (!attr)
+ return -ENODATA;
+
+ *prividx = 0;
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+get_offload_stats_failure:
+ *prividx = attr_id;
+ return err;
+}
+
+static int rtnl_get_offload_stats_size(const struct net_device *dev)
+{
+ int nla_size = 0;
+ int attr_id;
+ int size;
+
+ if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats))
+ return 0;
+
+ for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
+ attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
+ if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
+ continue;
+ size = rtnl_get_offload_stats_attr_size(attr_id);
+ nla_size += nla_total_size_64bit(size);
+ }
+
+ if (nla_size != 0)
+ nla_size += nla_total_size(0);
+
+ return nla_size;
+}
+
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, unsigned int filter_mask,
@@ -3559,6 +3745,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
struct nlmsghdr *nlh;
struct nlattr *attr;
int s_prividx = *prividx;
+ int err;
ASSERT_RTNL();
@@ -3587,8 +3774,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
- int err;
-
*idxattr = IFLA_STATS_LINK_XSTATS;
attr = nla_nest_start(skb,
IFLA_STATS_LINK_XSTATS);
@@ -3612,8 +3797,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (master)
ops = master->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
- int err;
-
*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
attr = nla_nest_start(skb,
IFLA_STATS_LINK_XSTATS_SLAVE);
@@ -3628,6 +3811,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
+ *idxattr)) {
+ *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
+ attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ if (!attr)
+ goto nla_put_failure;
+
+ err = rtnl_get_offload_stats(skb, dev, prividx);
+ if (err == -ENODATA)
+ nla_nest_cancel(skb, attr);
+ else
+ nla_nest_end(skb, attr);
+
+ if (err && err != -ENODATA)
+ goto nla_put_failure;
+ *idxattr = 0;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -3642,10 +3843,6 @@ nla_put_failure:
return -EMSGSIZE;
}
-static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = {
- [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) },
-};
-
static size_t if_nlmsg_stats_size(const struct net_device *dev,
u32 filter_mask)
{
@@ -3685,6 +3882,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
}
}
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
+ size += rtnl_get_offload_stats_size(dev);
+
return size;
}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce461fbe6..88a8e429fc3e 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -12,6 +12,7 @@
#include <net/secure_seq.h>
#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#include <net/tcp.h>
#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
@@ -40,8 +41,8 @@ static u32 seq_scale(u32 seq)
#endif
#if IS_ENABLED(CONFIG_IPV6)
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport)
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
{
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
@@ -58,6 +59,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
md5_transform(hash, secret);
+ *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
return seq_scale(hash[0]);
}
EXPORT_SYMBOL(secure_tcpv6_sequence_number);
@@ -86,8 +88,8 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#ifdef CONFIG_INET
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport, u32 *tsoff)
{
u32 hash[MD5_DIGEST_WORDS];
@@ -99,6 +101,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
md5_transform(hash, net_secret);
+ *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
return seq_scale(hash[0]);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3864b4b68fa1..65a74e13c45b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -354,7 +354,7 @@ EXPORT_SYMBOL(build_skb);
struct napi_alloc_cache {
struct page_frag_cache page;
- size_t skb_count;
+ unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -1962,37 +1962,13 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
return false;
}
-ssize_t skb_socket_splice(struct sock *sk,
- struct pipe_inode_info *pipe,
- struct splice_pipe_desc *spd)
-{
- int ret;
-
- /* Drop the socket lock, otherwise we have reverse
- * locking dependencies between sk_lock and i_mutex
- * here as compared to sendfile(). We enter here
- * with the socket lock held, and splice_to_pipe() will
- * grab the pipe inode lock. For sendfile() emulation,
- * we call into ->sendpage() with the i_mutex lock held
- * and networking will grab the socket lock.
- */
- release_sock(sk);
- ret = splice_to_pipe(pipe, spd);
- lock_sock(sk);
-
- return ret;
-}
-
/*
* Map data from the skb to a pipe. Should handle both the linear part,
* the fragments, and the frag list.
*/
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
- unsigned int flags,
- ssize_t (*splice_cb)(struct sock *,
- struct pipe_inode_info *,
- struct splice_pipe_desc *))
+ unsigned int flags)
{
struct partial_page partial[MAX_SKB_FRAGS];
struct page *pages[MAX_SKB_FRAGS];
@@ -2009,7 +1985,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
if (spd.nr_pages)
- ret = splice_cb(sk, pipe, &spd);
+ ret = splice_to_pipe(pipe, &spd);
return ret;
}
@@ -2445,6 +2421,25 @@ void skb_queue_purge(struct sk_buff_head *list)
EXPORT_SYMBOL(skb_queue_purge);
/**
+ * skb_rbtree_purge - empty a skb rbtree
+ * @root: root of the rbtree to empty
+ *
+ * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
+ * the list and one reference dropped. This function does not take
+ * any lock. Synchronization should be handled by the caller (e.g., TCP
+ * out-of-order queue is protected by the socket lock).
+ */
+void skb_rbtree_purge(struct rb_root *root)
+{
+ struct sk_buff *skb, *next;
+
+ rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
+ kfree_skb(skb);
+
+ *root = RB_ROOT;
+}
+
+/**
* skb_queue_head - queue a buffer at the list head
* @list: list to use
* @newsk: buffer to queue
@@ -2661,7 +2656,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
struct skb_frag_struct *fragfrom, *fragto;
BUG_ON(shiftlen > skb->len);
- BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
+
+ if (skb_headlen(skb))
+ return 0;
todo = shiftlen;
from = 0;
@@ -3078,11 +3075,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
- /* GSO partial only requires that we trim off any excess that
- * doesn't fit into an MSS sized block, so take care of that
- * now.
- */
- if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) {
+ if (sg && csum && (mss != GSO_BY_FRAGS)) {
+ if (!(features & NETIF_F_GSO_PARTIAL)) {
+ struct sk_buff *iter;
+
+ if (!list_skb ||
+ !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
+ goto normal;
+
+ /* Split the buffer at the frag_list pointer.
+ * This is based on the assumption that all
+ * buffers in the chain excluding the last
+ * containing the same amount of data.
+ */
+ skb_walk_frags(head_skb, iter) {
+ if (skb_headlen(iter))
+ goto normal;
+
+ len -= iter->len;
+ }
+ }
+
+ /* GSO partial only requires that we trim off any excess that
+ * doesn't fit into an MSS sized block, so take care of that
+ * now.
+ */
partial_segs = len / mss;
if (partial_segs > 1)
mss *= partial_segs;
@@ -3090,6 +3107,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
partial_segs = 0;
}
+normal:
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3281,21 +3299,29 @@ perform_csum_check:
*/
segs->prev = tail;
- /* Update GSO info on first skb in partial sequence. */
if (partial_segs) {
+ struct sk_buff *iter;
int type = skb_shinfo(head_skb)->gso_type;
+ unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
/* Update type to add partial and then remove dodgy if set */
- type |= SKB_GSO_PARTIAL;
+ type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
type &= ~SKB_GSO_DODGY;
/* Update GSO info and prepare to start updating headers on
* our way back down the stack of protocols.
*/
- skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size;
- skb_shinfo(segs)->gso_segs = partial_segs;
- skb_shinfo(segs)->gso_type = type;
- SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset;
+ for (iter = segs; iter; iter = iter->next) {
+ skb_shinfo(iter)->gso_size = gso_size;
+ skb_shinfo(iter)->gso_segs = partial_segs;
+ skb_shinfo(iter)->gso_type = type;
+ SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
+ }
+
+ if (tail->len - doffset <= gso_size)
+ skb_shinfo(tail)->gso_size = 0;
+ else if (tail != segs)
+ skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
}
/* Following permits correct backpressure, for protocols
@@ -3688,21 +3714,29 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
+static bool is_icmp_err_skb(const struct sk_buff *skb)
+{
+ return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
+}
+
struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
struct sk_buff_head *q = &sk->sk_error_queue;
- struct sk_buff *skb, *skb_next;
+ struct sk_buff *skb, *skb_next = NULL;
+ bool icmp_next = false;
unsigned long flags;
- int err = 0;
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q)))
- err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
+ icmp_next = is_icmp_err_skb(skb_next);
spin_unlock_irqrestore(&q->lock, flags);
- sk->sk_err = err;
- if (err)
+ if (is_icmp_err_skb(skb) && !icmp_next)
+ sk->sk_err = 0;
+
+ if (skb_next)
sk->sk_error_report(sk);
return skb;
@@ -3814,10 +3848,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!skb_may_tx_timestamp(sk, tsonly))
return;
- if (tsonly)
- skb = alloc_skb(0, GFP_ATOMIC);
- else
+ if (tsonly) {
+#ifdef CONFIG_INET
+ if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
+ skb = tcp_get_timestamping_opt_stats(sk);
+ else
+#endif
+ skb = alloc_skb(0, GFP_ATOMIC);
+ } else {
skb = skb_clone(orig_skb, GFP_ATOMIC);
+ }
if (!skb)
return;
@@ -4474,17 +4516,24 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len)
}
EXPORT_SYMBOL(skb_ensure_writable);
-/* remove VLAN header from packet and update csum accordingly. */
-static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+/* remove VLAN header from packet and update csum accordingly.
+ * expects a non skb_vlan_tag_present skb with a vlan tag payload
+ */
+int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_hdr *vhdr;
- unsigned int offset = skb->data - skb_mac_header(skb);
+ int offset = skb->data - skb_mac_header(skb);
int err;
- __skb_push(skb, offset);
+ if (WARN_ONCE(offset,
+ "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
if (unlikely(err))
- goto pull;
+ return err;
skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
@@ -4501,12 +4550,14 @@ static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
skb_set_network_header(skb, ETH_HLEN);
skb_reset_mac_len(skb);
-pull:
- __skb_pull(skb, offset);
return err;
}
+EXPORT_SYMBOL(__skb_vlan_pop);
+/* Pop a vlan tag either from hwaccel or from payload.
+ * Expects skb->data at mac header.
+ */
int skb_vlan_pop(struct sk_buff *skb)
{
u16 vlan_tci;
@@ -4516,9 +4567,7 @@ int skb_vlan_pop(struct sk_buff *skb)
if (likely(skb_vlan_tag_present(skb))) {
skb->vlan_tci = 0;
} else {
- if (unlikely((skb->protocol != htons(ETH_P_8021Q) &&
- skb->protocol != htons(ETH_P_8021AD)) ||
- skb->len < VLAN_ETH_HLEN))
+ if (unlikely(!eth_type_vlan(skb->protocol)))
return 0;
err = __skb_vlan_pop(skb, &vlan_tci);
@@ -4526,9 +4575,7 @@ int skb_vlan_pop(struct sk_buff *skb)
return err;
}
/* move next vlan tag to hw accel tag */
- if (likely((skb->protocol != htons(ETH_P_8021Q) &&
- skb->protocol != htons(ETH_P_8021AD)) ||
- skb->len < VLAN_ETH_HLEN))
+ if (likely(!eth_type_vlan(skb->protocol)))
return 0;
vlan_proto = skb->protocol;
@@ -4541,29 +4588,30 @@ int skb_vlan_pop(struct sk_buff *skb)
}
EXPORT_SYMBOL(skb_vlan_pop);
+/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
+ * Expects skb->data at mac header.
+ */
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
if (skb_vlan_tag_present(skb)) {
- unsigned int offset = skb->data - skb_mac_header(skb);
+ int offset = skb->data - skb_mac_header(skb);
int err;
- /* __vlan_insert_tag expect skb->data pointing to mac header.
- * So change skb->data before calling it and change back to
- * original position later
- */
- __skb_push(skb, offset);
+ if (WARN_ONCE(offset,
+ "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
+ offset)) {
+ return -EINVAL;
+ }
+
err = __vlan_insert_tag(skb, skb->vlan_proto,
skb_vlan_tag_get(skb));
- if (err) {
- __skb_pull(skb, offset);
+ if (err)
return err;
- }
skb->protocol = skb->vlan_proto;
skb->mac_len += VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
- __skb_pull(skb, offset);
}
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
@@ -4883,3 +4931,35 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
return clone;
}
EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ * We do not reallocate skb->head thus can not fail.
+ * Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+ if (skb->data_len) {
+ if (skb->data_len > skb->end - skb->tail ||
+ skb_cloned(skb))
+ return;
+
+ /* Nice, we can free page frag(s) right now */
+ __pskb_pull_tail(skb, skb->data_len);
+ }
+ /* At this point, skb->truesize might be over estimated,
+ * because skb had a fragment, and fragments do not tell
+ * their truesize.
+ * When we pulled its content into skb->head, fragment
+ * was freed, but __pskb_pull_tail() could not possibly
+ * adjust skb->truesize, not knowing the frag truesize.
+ */
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index fd7b41edf1ce..9fa46b956bdc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -453,7 +453,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
EXPORT_SYMBOL(sock_queue_rcv_skb);
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
- const int nested, unsigned int trim_cap)
+ const int nested, unsigned int trim_cap, bool refcounted)
{
int rc = NET_RX_SUCCESS;
@@ -487,7 +487,8 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
bh_unlock_sock(sk);
out:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return rc;
discard_and_relse:
kfree_skb(skb);
@@ -714,7 +715,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
+ sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -750,7 +751,7 @@ set_rcvbuf:
* returning the value we actually used in getsockopt
* is the most desirable behavior.
*/
- sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
+ sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
break;
case SO_RCVBUFFORCE:
@@ -853,6 +854,13 @@ set_rcvbuf:
sk->sk_tskey = 0;
}
}
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
+ ret = -EINVAL;
+ break;
+ }
+
sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
@@ -1315,24 +1323,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
#endif
}
-void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
-{
- unsigned long nulls1, nulls2;
-
- nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
- nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
- if (nulls1 > nulls2)
- swap(nulls1, nulls2);
-
- if (nulls1 != 0)
- memset((char *)sk, 0, nulls1);
- memset((char *)sk + nulls1 + sizeof(void *), 0,
- nulls2 - nulls1 - sizeof(void *));
- memset((char *)sk + nulls2 + sizeof(void *), 0,
- size - nulls2 - sizeof(void *));
-}
-EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
-
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
@@ -1344,12 +1334,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
if (!sk)
return sk;
- if (priority & __GFP_ZERO) {
- if (prot->clear_sk)
- prot->clear_sk(sk, prot->obj_size);
- else
- sk_prot_clear_nulls(sk, prot->obj_size);
- }
+ if (priority & __GFP_ZERO)
+ sk_prot_clear_nulls(sk, prot->obj_size);
} else
sk = kmalloc(prot->obj_size, priority);
@@ -1385,6 +1371,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
slab = prot->slab;
cgroup_sk_free(&sk->sk_cgrp_data);
+ mem_cgroup_sk_free(sk);
security_sk_free(sk);
if (slab != NULL)
kmem_cache_free(slab, sk);
@@ -1421,6 +1408,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
+ mem_cgroup_sk_alloc(sk);
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
@@ -1563,10 +1551,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
newsk->sk_priority = 0;
newsk->sk_incoming_cpu = raw_smp_processor_id();
atomic64_set(&newsk->sk_cookie, 0);
+ mem_cgroup_sk_alloc(newsk);
cgroup_sk_alloc(&newsk->sk_cgrp_data);
/*
@@ -1591,9 +1581,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
sk_set_socket(newsk, NULL);
newsk->sk_wq = NULL;
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- sock_update_memcg(newsk);
-
if (newsk->sk_prot->sockets_allocated)
sk_sockets_allocated_inc(newsk);
@@ -2100,37 +2087,31 @@ void __sk_flush_backlog(struct sock *sk)
*/
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
int rc;
- DEFINE_WAIT(wait);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
+ rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
EXPORT_SYMBOL(sk_wait_data);
/**
- * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * __sk_mem_raise_allocated - increase memory_allocated
* @sk: socket
* @size: memory size to allocate
+ * @amt: pages to allocate
* @kind: allocation type
*
- * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
- * rmem allocation. This function assumes that protocols which have
- * memory_pressure use sk_wmem_queued as write buffer accounting.
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
*/
-int __sk_mem_schedule(struct sock *sk, int size, int kind)
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct proto *prot = sk->sk_prot;
- int amt = sk_mem_pages(size);
- long allocated;
-
- sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-
- allocated = sk_memory_allocated_add(sk, amt);
+ long allocated = sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
!mem_cgroup_charge_skmem(sk->sk_memcg, amt))
@@ -2191,9 +2172,6 @@ suppress_allocation:
trace_sock_exceed_buf_limit(sk, prot, allocated);
- /* Alas. Undo changes. */
- sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-
sk_memory_allocated_sub(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
@@ -2201,18 +2179,40 @@ suppress_allocation:
return 0;
}
+EXPORT_SYMBOL(__sk_mem_raise_allocated);
+
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ int ret, amt = sk_mem_pages(size);
+
+ sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ ret = __sk_mem_raise_allocated(sk, size, amt, kind);
+ if (!ret)
+ sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ return ret;
+}
EXPORT_SYMBOL(__sk_mem_schedule);
/**
- * __sk_mem_reclaim - reclaim memory_allocated
+ * __sk_mem_reduce_allocated - reclaim memory_allocated
* @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of quanta
+ *
+ * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
*/
-void __sk_mem_reclaim(struct sock *sk, int amount)
+void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
sk_memory_allocated_sub(sk, amount);
- sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
@@ -2221,6 +2221,19 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
+EXPORT_SYMBOL(__sk_mem_reduce_allocated);
+
+/**
+ * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ */
+void __sk_mem_reclaim(struct sock *sk, int amount)
+{
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ __sk_mem_reduce_allocated(sk, amount);
+}
EXPORT_SYMBOL(__sk_mem_reclaim);
int sk_set_peek_off(struct sock *sk, int val)
@@ -2456,8 +2469,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk;
- } else
+ sk->sk_uid = SOCK_INODE(sock)->i_uid;
+ } else {
sk->sk_wq = NULL;
+ sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ }
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index e92b759d906c..9a1a352fd1eb 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -129,7 +129,6 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2)
return 0;
}
-EXPORT_SYMBOL(reuseport_add_sock);
static void reuseport_free_rcu(struct rcu_head *head)
{
diff --git a/net/core/stream.c b/net/core/stream.c
index 159516a11b7e..f575bcf64af2 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -43,7 +43,6 @@ void sk_stream_write_space(struct sock *sk)
rcu_read_unlock();
}
}
-EXPORT_SYMBOL(sk_stream_write_space);
/**
* sk_stream_wait_connect - Wait for a socket to get into the connected state
@@ -54,8 +53,8 @@ EXPORT_SYMBOL(sk_stream_write_space);
*/
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
int done;
do {
@@ -69,13 +68,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
if (signal_pending(tsk))
return sock_intr_errno(*timeo_p);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
!sk->sk_err &&
!((1 << sk->sk_state) &
- ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
- finish_wait(sk_sleep(sk), &wait);
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
return 0;
@@ -95,16 +94,16 @@ static inline int sk_stream_closing(struct sock *sk)
void sk_stream_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
do {
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
- if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
+ if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
break;
} while (!signal_pending(current) && timeout);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
}
}
EXPORT_SYMBOL(sk_stream_wait_close);
@@ -120,16 +119,16 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
long vm_wait = 0;
long current_timeo = *timeo_p;
bool noblock = (*timeo_p ? false : true);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+ add_wait_queue(sk_sleep(sk), &wait);
+
while (1) {
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
if (!*timeo_p) {
@@ -148,7 +147,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
sk_wait_event(sk, &current_timeo, sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
(sk_stream_memory_free(sk) &&
- !vm_wait));
+ !vm_wait), &wait);
sk->sk_write_pending--;
if (vm_wait) {
@@ -162,7 +161,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0df2aa652530..2a46e4009f62 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
- if (sock_table)
+ if (sock_table) {
static_key_slow_inc(&rps_needed);
+ static_key_slow_inc(&rfs_needed);
+ }
if (orig_sock_table) {
static_key_slow_dec(&rps_needed);
+ static_key_slow_dec(&rfs_needed);
synchronize_rcu();
vfree(orig_sock_table);
}