From 8010d7feb2f0367ae573ad601b2905e29db50cd3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 11 Dec 2016 20:09:23 +0100 Subject: netfilter: nft_quota: reset quota after dump Dumping of netlink attributes may fail due to insufficient room in the skbuff, so let's reset consumed quota if we succeed to put netlink attributes into the skbuff. Fixes: 43da04a593d8 ("netfilter: nf_tables: atomic dump and reset for stateful objects") Reported-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_quota.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index bd6efc53f26d..2d6fe3559912 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -110,30 +110,32 @@ static int nft_quota_obj_init(const struct nlattr * const tb[], static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv, bool reset) { + u64 consumed, consumed_cap; u32 flags = priv->flags; - u64 consumed; - - if (reset) { - consumed = atomic64_xchg(&priv->consumed, 0); - if (test_and_clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) - flags |= NFT_QUOTA_F_DEPLETED; - } else { - consumed = atomic64_read(&priv->consumed); - } /* Since we inconditionally increment consumed quota for each packet * that we see, don't go over the quota boundary in what we send to * userspace. */ - if (consumed > priv->quota) - consumed = priv->quota; + consumed = atomic64_read(&priv->consumed); + if (consumed >= priv->quota) { + consumed_cap = priv->quota; + flags |= NFT_QUOTA_F_DEPLETED; + } else { + consumed_cap = consumed; + } if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), NFTA_QUOTA_PAD) || - nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed), + nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap), NFTA_QUOTA_PAD) || nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) goto nla_put_failure; + + if (reset) { + atomic64_sub(consumed, &priv->consumed); + clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags); + } return 0; nla_put_failure: -- cgit v1.2.3 From c2e756ff9e699865d294cdc112acfc36419cf5cc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 11 Dec 2016 20:46:51 +0100 Subject: netfilter: nft_queue: use raw_smp_processor_id() Using smp_processor_id() causes splats with PREEMPT_RCU: [19379.552780] BUG: using smp_processor_id() in preemptible [00000000] code: ping/32389 [19379.552793] caller is debug_smp_processor_id+0x17/0x19 [...] [19379.552823] Call Trace: [19379.552832] [] dump_stack+0x67/0x90 [19379.552837] [] check_preemption_disabled+0xe5/0xf5 [19379.552842] [] debug_smp_processor_id+0x17/0x19 [19379.552849] [] nft_queue_eval+0x35/0x20c [nft_queue] No need to disable preemption since we only fetch the numeric value, so let's use raw_smp_processor_id() instead. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 3e19fa1230dc..dbb6aaff67ec 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -38,7 +38,7 @@ static void nft_queue_eval(const struct nft_expr *expr, if (priv->queues_total > 1) { if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); queue = priv->queuenum + cpu % priv->queues_total; } else { -- cgit v1.2.3 From 3e38df136e453aa69eb4472108ebce2fb00b1ba6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 13 Dec 2016 13:59:33 +0100 Subject: netfilter: nf_tables: fix oob access BUG: KASAN: slab-out-of-bounds in nf_tables_rule_destroy+0xf1/0x130 at addr ffff88006a4c35c8 Read of size 8 by task nft/1607 When we've destroyed last valid expr, nft_expr_next() returns an invalid expr. We must not dereference it unless it passes != nft_expr_last() check. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a019a87e58ee..0db5f9782265 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2115,7 +2115,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, * is called on error from nf_tables_newrule(). */ expr = nft_expr_first(rule); - while (expr->ops && expr != nft_expr_last(rule)) { + while (expr != nft_expr_last(rule) && expr->ops) { nf_tables_expr_destroy(ctx, expr); expr = nft_expr_next(expr); } -- cgit v1.2.3 From 053d20f5712529016eae10356e0dea9b360325bd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 14 Dec 2016 12:15:49 +0100 Subject: netfilter: nft_payload: mangle ckecksum if NFT_PAYLOAD_L4CSUM_PSEUDOHDR is set If the NFT_PAYLOAD_L4CSUM_PSEUDOHDR flag is set, then mangle layer 4 checksum. This should not depend on csum_type NFT_PAYLOAD_CSUM_INET since IPv6 header has no checksum field, but still an update of any of the pseudoheader fields may trigger a layer 4 checksum update. Fixes: 1814096980bb ("netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader fields") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 36d2b1096546..7d699bbd45b0 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -250,6 +250,22 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, return 0; } +static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, + __wsum fsum, __wsum tsum, int csum_offset) +{ + __sum16 sum; + + if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + nft_csum_replace(&sum, fsum, tsum); + if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + return -1; + + return 0; +} + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -259,7 +275,6 @@ static void nft_payload_set_eval(const struct nft_expr *expr, const u32 *src = ®s->data[priv->sreg]; int offset, csum_offset; __wsum fsum, tsum; - __sum16 sum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: @@ -282,18 +297,14 @@ static void nft_payload_set_eval(const struct nft_expr *expr, csum_offset = offset + priv->csum_offset; offset += priv->offset; - if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && + if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) && (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER || skb->ip_summed != CHECKSUM_PARTIAL)) { - if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) - goto err; - fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); - nft_csum_replace(&sum, fsum, tsum); - if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || - skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) + if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && + nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset)) goto err; if (priv->csum_flags && -- cgit v1.2.3 From 6c5d5cfbe3c59429e6d6f66477a7609aacf69751 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 20 Dec 2016 19:14:34 +0800 Subject: netfilter: ipt_CLUSTERIP: check duplicate config when initializing Now when adding an ipt_CLUSTERIP rule, it only checks duplicate config in clusterip_config_find_get(). But after that, there may be still another thread to insert a config with the same ip, then it leaves proc_create_data to do duplicate check. It's more reasonable to check duplicate config by ipt_CLUSTERIP itself, instead of checking it by proc fs duplicate file check. Before, when proc fs allowed duplicate name files in a directory, It could even crash kernel because of use-after-free. This patch is to check duplicate config under the protection of clusterip net lock when initializing a new config and correct the return err. Note that it also moves proc file node creation after adding new config, as proc_create_data may sleep, it couldn't be called under the clusterip_net lock. clusterip_config_find_get returns NULL if c->pde is null to make sure it can't be used until the proc file node creation is done. Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 21db00d0362b..a6b8c1a4102b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -144,7 +144,7 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) rcu_read_lock_bh(); c = __clusterip_config_find(net, clusterip); if (c) { - if (unlikely(!atomic_inc_not_zero(&c->refcount))) + if (!c->pde || unlikely(!atomic_inc_not_zero(&c->refcount))) c = NULL; else if (entry) atomic_inc(&c->entries); @@ -166,14 +166,15 @@ clusterip_config_init_nodelist(struct clusterip_config *c, static struct clusterip_config * clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, - struct net_device *dev) + struct net_device *dev) { + struct net *net = dev_net(dev); struct clusterip_config *c; - struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id); + struct clusterip_net *cn = net_generic(net, clusterip_net_id); c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) - return NULL; + return ERR_PTR(-ENOMEM); c->dev = dev; c->clusterip = ip; @@ -185,6 +186,17 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, atomic_set(&c->refcount, 1); atomic_set(&c->entries, 1); + spin_lock_bh(&cn->lock); + if (__clusterip_config_find(net, ip)) { + spin_unlock_bh(&cn->lock); + kfree(c); + + return ERR_PTR(-EBUSY); + } + + list_add_rcu(&c->list, &cn->configs); + spin_unlock_bh(&cn->lock); + #ifdef CONFIG_PROC_FS { char buffer[16]; @@ -195,16 +207,16 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, cn->procdir, &clusterip_proc_fops, c); if (!c->pde) { + spin_lock_bh(&cn->lock); + list_del_rcu(&c->list); + spin_unlock_bh(&cn->lock); kfree(c); - return NULL; + + return ERR_PTR(-ENOMEM); } } #endif - spin_lock_bh(&cn->lock); - list_add_rcu(&c->list, &cn->configs); - spin_unlock_bh(&cn->lock); - return c; } @@ -410,9 +422,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) config = clusterip_config_init(cipinfo, e->ip.dst.s_addr, dev); - if (!config) { + if (IS_ERR(config)) { dev_put(dev); - return -ENOMEM; + return PTR_ERR(config); } dev_mc_add(config->dev, config->clustermac); } -- cgit v1.2.3 From 0df0f207aab4f42e5c96a807adf9a6845b69e984 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 28 Dec 2016 14:54:47 +0200 Subject: net/sched: cls_flower: Fix missing addr_type in classify Since we now use a non zero mask on addr_type, we are matching on its value (IPV4/IPV6). So before this fix, matching on enc_src_ip/enc_dst_ip failed in SW/classify path since its value was zero. This patch sets the proper value of addr_type for encapsulated packets. Fixes: 970bfcd09791 ('net/sched: cls_flower: Use mask for addr_type') Signed-off-by: Paul Blakey Reviewed-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 333f8e268431..970db7a41684 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -153,10 +153,14 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, switch (ip_tunnel_info_af(info)) { case AF_INET: + skb_key.enc_control.addr_type = + FLOW_DISSECTOR_KEY_IPV4_ADDRS; skb_key.enc_ipv4.src = key->u.ipv4.src; skb_key.enc_ipv4.dst = key->u.ipv4.dst; break; case AF_INET6: + skb_key.enc_control.addr_type = + FLOW_DISSECTOR_KEY_IPV6_ADDRS; skb_key.enc_ipv6.src = key->u.ipv6.src; skb_key.enc_ipv6.dst = key->u.ipv6.dst; break; -- cgit v1.2.3 From 9dd0f896d2cc5815d859e945db90915071cd44b3 Mon Sep 17 00:00:00 2001 From: Augusto Mecking Caringi Date: Wed, 28 Dec 2016 16:02:05 +0000 Subject: net: atm: Fix warnings in net/atm/lec.c when !CONFIG_PROC_FS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes the following warnings when CONFIG_PROC_FS is not set: linux/net/atm/lec.c: In function ‘lane_module_cleanup’: linux/net/atm/lec.c:1062:27: error: ‘atm_proc_root’ undeclared (first use in this function) remove_proc_entry("lec", atm_proc_root); ^ linux/net/atm/lec.c:1062:27: note: each undeclared identifier is reported only once for each function it appears in Signed-off-by: Augusto Mecking Caringi Signed-off-by: David S. Miller --- net/atm/lec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/atm/lec.c b/net/atm/lec.c index 019557d0a11d..09cfe87f0a44 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1059,7 +1059,9 @@ static void __exit lane_module_cleanup(void) { int i; +#ifdef CONFIG_PROC_FS remove_proc_entry("lec", atm_proc_root); +#endif deregister_atm_ioctl(&lane_ioctl_ops); -- cgit v1.2.3 From e4c5e13aa45c23692e4acf56f0b3533f328199b2 Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Wed, 28 Dec 2016 23:23:46 +0800 Subject: ipv6: Should use consistent conditional judgement for ip6 fragment between __ip6_append_data and ip6_finish_output There is an inconsistent conditional judgement between __ip6_append_data and ip6_finish_output functions, the variable length in __ip6_append_data just include the length of application's payload and udp6 header, don't include the length of ipv6 header, but in ip6_finish_output use (skb->len > ip6_skb_dst_mtu(skb)) as judgement, and skb->len include the length of ipv6 header. That causes some particular application's udp6 payloads whose length are between (MTU - IPv6 Header) and MTU were fragmented by ip6_fragment even though the rst->dev support UFO feature. Add the length of ipv6 header to length in __ip6_append_data to keep consistent conditional judgement as ip6_finish_output for ip6 fragment. Signed-off-by: Zheng Li Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 70d0de404197..38122d04fadc 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1373,7 +1373,7 @@ emsgsize: */ cork->length += length; - if (((length > mtu) || + if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && -- cgit v1.2.3 From 4775cc1f2d5abca894ac32774eefc22c45347d1c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 28 Dec 2016 17:52:15 +0100 Subject: rtnl: stats - add missing netlink message size checks We miss to check if the netlink message is actually big enough to contain a struct if_stats_msg. Add a check to prevent userland from sending us short messages that would make us access memory beyond the end of the message. Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump...") Signed-off-by: Mathias Krause Cc: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 18b5aae99bec..75e3ea7bda08 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3898,6 +3898,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh) u32 filter_mask; int err; + if (nlmsg_len(nlh) < sizeof(*ifsm)) + return -EINVAL; + ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); @@ -3947,6 +3950,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = net->dev_base_seq; + if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) + return -EINVAL; + ifsm = nlmsg_data(cb->nlh); filter_mask = ifsm->filter_mask; if (!filter_mask) -- cgit v1.2.3 From f0c16ba8933ed217c2688b277410b2a37ba81591 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Thu, 29 Dec 2016 16:45:04 +0800 Subject: net: fix incorrect original ingress device index in PKTINFO When we send a packet for our own local address on a non-loopback interface (e.g. eth0), due to the change had been introduced from commit 0b922b7a829c ("net: original ingress device index in PKTINFO"), the original ingress device index would be set as the loopback interface. However, the packet should be considered as if it is being arrived via the sending interface (eth0), otherwise it would break the expectation of the userspace application (e.g. the DHCPRELEASE message from dhcp_release binary would be ignored by the dnsmasq daemon, since it come from lo which is not the interface dnsmasq bind to) Fixes: 0b922b7a829c ("net: original ingress device index in PKTINFO") Acked-by: David Ahern Signed-off-by: Wei Zhang Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 57e1405e8282..53ae0c6315ad 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1225,8 +1225,14 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) * which has interface index (iif) as the first member of the * underlying inet{6}_skb_parm struct. This code then overlays * PKTINFO_SKB_CB and in_pktinfo also has iif as the first - * element so the iif is picked up from the prior IPCB + * element so the iif is picked up from the prior IPCB. If iif + * is the loopback interface, then return the sending interface + * (e.g., process binds socket to eth0 for Tx which is + * redirected to loopback in the rtable/dst). */ + if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX) + pktinfo->ipi_ifindex = inet_iif(skb); + pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; -- cgit v1.2.3 From f5a0aab84b74de68523599817569c057c7ac1622 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 29 Dec 2016 15:29:03 -0800 Subject: net: ipv4: dst for local input routes should use l3mdev if relevant IPv4 output routes already use l3mdev device instead of loopback for dst's if it is applicable. Change local input routes to do the same. This fixes icmp responses for unreachable UDP ports which are directed to the wrong table after commit 9d1a6c4ea43e4 because local_input routes use the loopback device. Moving from ingress device to loopback loses the L3 domain causing responses based on the dst to get to lost. Fixes: 9d1a6c4ea43e4 ("net: icmp_route_lookup should use rt dev to determine L3 domain") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a82a11747b3f..0fcac8e7a2b2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1914,7 +1914,8 @@ local_input: } } - rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type, + rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, + flags | RTCF_LOCAL, res.type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; -- cgit v1.2.3 From 14221cc45caad2fcab3a8543234bb7eda9b540d5 Mon Sep 17 00:00:00 2001 From: Artur Molchanov Date: Fri, 30 Dec 2016 19:46:36 +0300 Subject: bridge: netfilter: Fix dropping packets that moving through bridge interface Problem: br_nf_pre_routing_finish() calls itself instead of br_nf_pre_routing_finish_bridge(). Due to this bug reverse path filter drops packets that go through bridge interface. User impact: Local docker containers with bridge network can not communicate with each other. Fixes: c5136b15ea36 ("netfilter: bridge: add and use br_nf_hook_thresh") Signed-off-by: Artur Molchanov Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index b12501a77f18..135cc8ab813c 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -399,7 +399,7 @@ bridged_dnat: br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish); + br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); -- cgit v1.2.3 From e1a3a60a2ebe991605acb14cd58e39c0545e174e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 30 Dec 2016 17:42:32 -0600 Subject: net: socket: don't set sk_uid to garbage value in ->setattr() ->setattr() was recently implemented for socket files to sync the socket inode's uid to the new 'sk_uid' member of struct sock. It does this by copying over the ia_uid member of struct iattr. However, ia_uid is actually only valid when ATTR_UID is set in ia_valid, indicating that the uid is being changed, e.g. by chown. Other metadata operations such as chmod or utimes leave ia_uid uninitialized. Therefore, sk_uid could be set to a "garbage" value from the stack. Fix this by only copying the uid over when ATTR_UID is set. Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Signed-off-by: Eric Biggers Tested-by: Lorenzo Colitti Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 8487bf136e5c..a8c2307590b8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -537,7 +537,7 @@ int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); - if (!err) { + if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); sock->sk->sk_uid = iattr->ia_uid; -- cgit v1.2.3 From 4200462d88f47f3759bdf4705f87e207b0f5b2e4 Mon Sep 17 00:00:00 2001 From: Reiter Wolfgang Date: Sat, 31 Dec 2016 21:11:57 +0100 Subject: drop_monitor: add missing call to genlmsg_end Update nlmsg_len field with genlmsg_end to enable userspace processing using nlmsg_next helper. Also adds error handling. Signed-off-by: Reiter Wolfgang Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 8e0c0635ee97..f465bad2ef2c 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -75,6 +75,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) struct nlattr *nla; struct sk_buff *skb; unsigned long flags; + void *msg_header; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); @@ -82,17 +83,31 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) skb = genlmsg_new(al, GFP_KERNEL); - if (skb) { - genlmsg_put(skb, 0, 0, &net_drop_monitor_family, - 0, NET_DM_CMD_ALERT); - nla = nla_reserve(skb, NLA_UNSPEC, - sizeof(struct net_dm_alert_msg)); - msg = nla_data(nla); - memset(msg, 0, al); - } else { - mod_timer(&data->send_timer, jiffies + HZ / 10); + if (!skb) + goto err; + + msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + if (!msg_header) { + nlmsg_free(skb); + skb = NULL; + goto err; + } + nla = nla_reserve(skb, NLA_UNSPEC, + sizeof(struct net_dm_alert_msg)); + if (!nla) { + nlmsg_free(skb); + skb = NULL; + goto err; } + msg = nla_data(nla); + memset(msg, 0, al); + genlmsg_end(skb, msg_header); + goto out; +err: + mod_timer(&data->send_timer, jiffies + HZ / 10); +out: spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); spin_unlock_irqrestore(&data->lock, flags); -- cgit v1.2.3 From 97b84fd6d91766ea57dcc350d78f42639e011c30 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 30 Dec 2016 19:48:19 +0100 Subject: l2tp: consider '::' as wildcard address in l2tp_ip6 socket lookup An L2TP socket bound to the unspecified address should match with any address. If not, it can't receive any packet and __l2tp_ip6_bind_lookup() can't prevent another socket from binding on the same device/tunnel ID. While there, rename the 'addr' variable to 'sk_laddr' (local addr), to make following patch clearer. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index f092ac441fdd..3135b9d55df5 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -64,7 +64,7 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net, struct sock *sk; sk_for_each_bound(sk, &l2tp_ip6_bind_table) { - const struct in6_addr *addr = inet6_rcv_saddr(sk); + const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); if (l2tp == NULL) @@ -72,7 +72,7 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net, if ((l2tp->conn_id == tunnel_id) && net_eq(sock_net(sk), net) && - (!addr || ipv6_addr_equal(addr, laddr)) && + (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) && (!sk->sk_bound_dev_if || !dif || sk->sk_bound_dev_if == dif)) goto found; -- cgit v1.2.3 From a9b2dff80be979432484afaf7f8d8e73f9e8838a Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 30 Dec 2016 19:48:20 +0100 Subject: l2tp: take remote address into account in l2tp_ip and l2tp_ip6 socket lookups For connected sockets, __l2tp_ip{,6}_bind_lookup() needs to check the remote IP when looking for a matching socket. Otherwise a connected socket can receive traffic not originating from its peer. Drop l2tp_ip_bind_lookup() and l2tp_ip6_bind_lookup() instead of updating their prototype, as these functions aren't used. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 19 ++++++------------- net/l2tp/l2tp_ip6.c | 20 ++++++-------------- 2 files changed, 12 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 8938b6ba57a0..3d73278b86ca 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -47,7 +47,8 @@ static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) return (struct l2tp_ip_sock *)sk; } -static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id) +static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, + __be32 raddr, int dif, u32 tunnel_id) { struct sock *sk; @@ -61,6 +62,7 @@ static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif if ((l2tp->conn_id == tunnel_id) && net_eq(sock_net(sk), net) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && + (!inet->inet_daddr || !raddr || inet->inet_daddr == raddr) && (!sk->sk_bound_dev_if || !dif || sk->sk_bound_dev_if == dif)) goto found; @@ -71,15 +73,6 @@ found: return sk; } -static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id) -{ - struct sock *sk = __l2tp_ip_bind_lookup(net, laddr, dif, tunnel_id); - if (sk) - sock_hold(sk); - - return sk; -} - /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header @@ -183,8 +176,8 @@ pass_up: struct iphdr *iph = (struct iphdr *) skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); - sk = __l2tp_ip_bind_lookup(net, iph->daddr, inet_iif(skb), - tunnel_id); + sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, + inet_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&l2tp_ip_lock); goto discard; @@ -280,7 +273,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ write_lock_bh(&l2tp_ip_lock); - if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, + if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0, sk->sk_bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&l2tp_ip_lock); ret = -EADDRINUSE; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 3135b9d55df5..331ccf5a7bad 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -59,12 +59,14 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) static struct sock *__l2tp_ip6_bind_lookup(struct net *net, struct in6_addr *laddr, + const struct in6_addr *raddr, int dif, u32 tunnel_id) { struct sock *sk; sk_for_each_bound(sk, &l2tp_ip6_bind_table) { const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); + const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); if (l2tp == NULL) @@ -73,6 +75,7 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net, if ((l2tp->conn_id == tunnel_id) && net_eq(sock_net(sk), net) && (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) && + (!raddr || ipv6_addr_any(sk_raddr) || ipv6_addr_equal(sk_raddr, raddr)) && (!sk->sk_bound_dev_if || !dif || sk->sk_bound_dev_if == dif)) goto found; @@ -83,17 +86,6 @@ found: return sk; } -static inline struct sock *l2tp_ip6_bind_lookup(struct net *net, - struct in6_addr *laddr, - int dif, u32 tunnel_id) -{ - struct sock *sk = __l2tp_ip6_bind_lookup(net, laddr, dif, tunnel_id); - if (sk) - sock_hold(sk); - - return sk; -} - /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header @@ -197,8 +189,8 @@ pass_up: struct ipv6hdr *iph = ipv6_hdr(skb); read_lock_bh(&l2tp_ip6_lock); - sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, inet6_iif(skb), - tunnel_id); + sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, + inet6_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&l2tp_ip6_lock); goto discard; @@ -330,7 +322,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) rcu_read_unlock(); write_lock_bh(&l2tp_ip6_lock); - if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, bound_dev_if, + if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&l2tp_ip6_lock); err = -EADDRINUSE; -- cgit v1.2.3 From 35f432a03e41d3bf08c51ede917f94e2288fbe8c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 2 Jan 2017 11:19:29 +0100 Subject: mac80211: initialize fast-xmit 'info' later In ieee80211_xmit_fast(), 'info' is initialized to point to the skb that's passed in, but that skb may later be replaced by a clone (if it was shared), leading to an invalid pointer. This can lead to use-after-free and also later crashes since the real SKB's info->hw_queue doesn't get initialized properly. Fix this by assigning info only later, when it's needed, after the skb replacement (may have) happened. Cc: stable@vger.kernel.org Reported-by: Ben Greear Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2c21b7039136..0d8b716e509e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3287,7 +3287,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); int hw_headroom = sdata->local->hw.extra_tx_headroom; struct ethhdr eth; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_info *info; struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; struct ieee80211_tx_data tx; ieee80211_tx_result r; @@ -3351,6 +3351,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, memcpy(skb->data + fast_tx->da_offs, eth.h_dest, ETH_ALEN); memcpy(skb->data + fast_tx->sa_offs, eth.h_source, ETH_ALEN); + info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); info->band = fast_tx->band; info->control.vif = &sdata->vif; -- cgit v1.2.3 From d0af683407a26a4437d8fa6e283ea201f2ae8146 Mon Sep 17 00:00:00 2001 From: Ian Kumlien Date: Mon, 2 Jan 2017 09:18:35 +0100 Subject: flow_dissector: Update pptp handling to avoid null pointer deref. __skb_flow_dissect can be called with a skb or a data packet, either can be NULL. All calls seems to have been moved to __skb_header_pointer except the pptp handling which is still calling skb_header_pointer. skb_header_pointer will use skb->data and thus: [ 109.556866] BUG: unable to handle kernel NULL pointer dereference at 0000000000000080 [ 109.557102] IP: [] __skb_flow_dissect+0xa88/0xce0 [ 109.557263] PGD 0 [ 109.557338] [ 109.557484] Oops: 0000 [#1] SMP [ 109.557562] Modules linked in: chaoskey [ 109.557783] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.0 #79 [ 109.557867] Hardware name: Supermicro A1SRM-LN7F/LN5F/A1SRM-LN7F-2758, BIOS 1.0c 11/04/2015 [ 109.557957] task: ffff94085c27bc00 task.stack: ffffb745c0068000 [ 109.558041] RIP: 0010:[] [] __skb_flow_dissect+0xa88/0xce0 [ 109.558203] RSP: 0018:ffff94087fc83d40 EFLAGS: 00010206 [ 109.558286] RAX: 0000000000000130 RBX: ffffffff8975bf80 RCX: ffff94084fab6800 [ 109.558373] RDX: 0000000000000010 RSI: 000000000000000c RDI: 0000000000000000 [ 109.558460] RBP: 0000000000000b88 R08: 0000000000000000 R09: 0000000000000022 [ 109.558547] R10: 0000000000000008 R11: ffff94087fc83e04 R12: 0000000000000000 [ 109.558763] R13: ffff94084fab6800 R14: ffff94087fc83e04 R15: 000000000000002f [ 109.558979] FS: 0000000000000000(0000) GS:ffff94087fc80000(0000) knlGS:0000000000000000 [ 109.559326] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 109.559539] CR2: 0000000000000080 CR3: 0000000281809000 CR4: 00000000001026e0 [ 109.559753] Stack: [ 109.559957] 000000000000000c ffff94084fab6822 0000000000000001 ffff94085c2b5fc0 [ 109.560578] 0000000000000001 0000000000002000 0000000000000000 0000000000000000 [ 109.561200] 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 109.561820] Call Trace: [ 109.562027] [ 109.562108] [] ? eth_get_headlen+0x7a/0xf0 [ 109.562522] [] ? igb_poll+0x96a/0xe80 [ 109.562737] [] ? net_rx_action+0x20b/0x350 [ 109.562953] [] ? __do_softirq+0xe8/0x280 [ 109.563169] [] ? irq_exit+0xaa/0xb0 [ 109.563382] [] ? do_IRQ+0x4b/0xc0 [ 109.563597] [] ? common_interrupt+0x7f/0x7f [ 109.563810] [ 109.563890] [] ? cpuidle_enter_state+0x130/0x2c0 [ 109.564304] [] ? cpuidle_enter_state+0x120/0x2c0 [ 109.564520] [] ? cpu_startup_entry+0x19f/0x1f0 [ 109.564737] [] ? start_secondary+0x12a/0x140 [ 109.564950] Code: 83 e2 20 a8 80 0f 84 60 01 00 00 c7 04 24 08 00 00 00 66 85 d2 0f 84 be fe ff ff e9 69 fe ff ff 8b 34 24 89 f2 83 c2 04 66 85 c0 <41> 8b 84 24 80 00 00 00 0f 49 d6 41 8d 31 01 d6 41 2b 84 24 84 [ 109.569959] RIP [] __skb_flow_dissect+0xa88/0xce0 [ 109.570245] RSP [ 109.570453] CR2: 0000000000000080 Fixes: ab10dccb1160 ("rps: Inspect PPTP encapsulated by GRE to get flow hash") Signed-off-by: Ian Kumlien Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d6447dc10371..fe4e1531976c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -468,8 +468,9 @@ ip_proto_again: if (hdr->flags & GRE_ACK) offset += sizeof(((struct pptp_gre_header *)0)->ack); - ppp_hdr = skb_header_pointer(skb, nhoff + offset, - sizeof(_ppp_hdr), _ppp_hdr); + ppp_hdr = __skb_header_pointer(skb, nhoff + offset, + sizeof(_ppp_hdr), + data, hlen, _ppp_hdr); if (!ppp_hdr) goto out_bad; -- cgit v1.2.3 From 7ababb782690e03b78657e27bd051e20163af2d6 Mon Sep 17 00:00:00 2001 From: Michal Tesar Date: Mon, 2 Jan 2017 14:38:36 +0100 Subject: igmp: Make igmp group member RFC 3376 compliant 5.2. Action on Reception of a Query When a system receives a Query, it does not respond immediately. Instead, it delays its response by a random amount of time, bounded by the Max Resp Time value derived from the Max Resp Code in the received Query message. A system may receive a variety of Queries on different interfaces and of different kinds (e.g., General Queries, Group-Specific Queries, and Group-and-Source-Specific Queries), each of which may require its own delayed response. Before scheduling a response to a Query, the system must first consider previously scheduled pending responses and in many cases schedule a combined response. Therefore, the system must be able to maintain the following state: o A timer per interface for scheduling responses to General Queries. o A per-group and interface timer for scheduling responses to Group- Specific and Group-and-Source-Specific Queries. o A per-group and interface list of sources to be reported in the response to a Group-and-Source-Specific Query. When a new Query with the Router-Alert option arrives on an interface, provided the system has state to report, a delay for a response is randomly selected in the range (0, [Max Resp Time]) where Max Resp Time is derived from Max Resp Code in the received Query message. The following rules are then used to determine if a Report needs to be scheduled and the type of Report to schedule. The rules are considered in order and only the first matching rule is applied. 1. If there is a pending response to a previous General Query scheduled sooner than the selected delay, no additional response needs to be scheduled. 2. If the received Query is a General Query, the interface timer is used to schedule a response to the General Query after the selected delay. Any previously pending response to a General Query is canceled. --8<-- Currently the timer is rearmed with new random expiration time for every incoming query regardless of possibly already pending report. Which is not aligned with the above RFE. It also might happen that higher rate of incoming queries can postpone the report after the expiration time of the first query causing group membership loss. Now the per interface general query timer is rearmed only when there is no pending report already scheduled on that interface or the newly selected expiration time is before the already pending scheduled report. Signed-off-by: Michal Tesar Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 68d622133f53..5b15459955f8 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -219,9 +219,14 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { int tv = prandom_u32() % in_dev->mr_maxdelay; + unsigned long exp = jiffies + tv + 2; + + if (in_dev->mr_gq_running && + time_after_eq(exp, (in_dev->mr_gq_timer).expires)) + return; in_dev->mr_gq_running = 1; - if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) + if (!mod_timer(&in_dev->mr_gq_timer, exp)) in_dev_hold(in_dev); } -- cgit v1.2.3 From 5350d54f6cd12eaff623e890744c79b700bd3f17 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 2 Jan 2017 13:32:54 -0800 Subject: ipv4: Do not allow MAIN to be alias for new LOCAL w/ custom rules In the case of custom rules being present we need to handle the case of the LOCAL table being intialized after the new rule has been added. To address that I am adding a new check so that we can make certain we don't use an alias of MAIN for LOCAL when allocating a new table. Fixes: 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") Reported-by: Oliver Brunel Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3ff8938893ec..eae0332b0e8c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -85,7 +85,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) if (tb) return tb; - if (id == RT_TABLE_LOCAL) + if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); -- cgit v1.2.3 From 3b48ab2248e61408910e792fe84d6ec466084c1a Mon Sep 17 00:00:00 2001 From: Reiter Wolfgang Date: Tue, 3 Jan 2017 01:39:10 +0100 Subject: drop_monitor: consider inserted data in genlmsg_end Final nlmsg_len field update must reflect inserted net_dm_drop_point data. This patch depends on previous patch: "drop_monitor: add missing call to genlmsg_end" Signed-off-by: Reiter Wolfgang Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index f465bad2ef2c..fb55327dcfea 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -102,7 +102,6 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) } msg = nla_data(nla); memset(msg, 0, al); - genlmsg_end(skb, msg_header); goto out; err: @@ -112,6 +111,13 @@ out: swap(data->skb, skb); spin_unlock_irqrestore(&data->lock, flags); + if (skb) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; + struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh); + + genlmsg_end(skb, genlmsg_data(gnlh)); + } + return skb; } -- cgit v1.2.3 From 4ea33ef0f9e95b69db9131d7afd98563713e81b0 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 27 Dec 2016 08:51:17 +0100 Subject: batman-adv: Decrease hardif refcnt on fragmentation send error An error before the hardif is found has to free the skb. But every error after that has to free the skb + put the hard interface. Fixes: 8def0be82dd1 ("batman-adv: Consume skb in batadv_frag_send_packet") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/fragmentation.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 9c561e683f4b..0854ebd8613e 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -474,7 +474,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) { ret = -EINVAL; - goto put_primary_if; + goto free_skb; } /* Create one header to be copied to all fragments */ @@ -502,7 +502,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, skb_fragment = batadv_frag_create(skb, &frag_header, mtu); if (!skb_fragment) { ret = -ENOMEM; - goto free_skb; + goto put_primary_if; } batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX); @@ -511,7 +511,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, ret = batadv_send_unicast_skb(skb_fragment, neigh_node); if (ret != NET_XMIT_SUCCESS) { ret = NET_XMIT_DROP; - goto free_skb; + goto put_primary_if; } frag_header.no++; @@ -519,7 +519,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, /* The initial check in this function should cover this case */ if (frag_header.no == BATADV_FRAG_MAX_FRAGMENTS - 1) { ret = -EINVAL; - goto free_skb; + goto put_primary_if; } } @@ -527,7 +527,7 @@ int batadv_frag_send_packet(struct sk_buff *skb, if (batadv_skb_head_push(skb, header_size) < 0 || pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) { ret = -ENOMEM; - goto free_skb; + goto put_primary_if; } memcpy(skb->data, &frag_header, header_size); -- cgit v1.2.3 From 753aacfd2e95df6a0caf23c03dc309020765bea9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Jan 2017 10:57:14 +0100 Subject: nl80211: fix sched scan netlink socket owner destruction A single netlink socket might own multiple interfaces *and* a scheduled scan request (which might belong to another interface), so when it goes away both may need to be destroyed. Remove the schedule_scan_stop indirection to fix this - it's only needed for interface destruction because of the way this works right now, with a single work taking care of all interfaces. Cc: stable@vger.kernel.org Fixes: 93a1e86ce10e4 ("nl80211: Stop scheduled scan if netlink client disappears") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3df85a751a85..ef5eff93a8b8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14502,13 +14502,17 @@ static int nl80211_netlink_notify(struct notifier_block * nb, list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { bool schedule_destroy_work = false; - bool schedule_scan_stop = false; struct cfg80211_sched_scan_request *sched_scan_req = rcu_dereference(rdev->sched_scan_req); if (sched_scan_req && notify->portid && - sched_scan_req->owner_nlportid == notify->portid) - schedule_scan_stop = true; + sched_scan_req->owner_nlportid == notify->portid) { + sched_scan_req->owner_nlportid = 0; + + if (rdev->ops->sched_scan_stop && + rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + schedule_work(&rdev->sched_scan_stop_wk); + } list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) { cfg80211_mlme_unregister_socket(wdev, notify->portid); @@ -14539,12 +14543,6 @@ static int nl80211_netlink_notify(struct notifier_block * nb, spin_unlock(&rdev->destroy_list_lock); schedule_work(&rdev->destroy_work); } - } else if (schedule_scan_stop) { - sched_scan_req->owner_nlportid = 0; - - if (rdev->ops->sched_scan_stop && - rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) - schedule_work(&rdev->sched_scan_stop_wk); } } -- cgit v1.2.3 From 93e246f783e6bd1bc64fdfbfe68b18161f69b28e Mon Sep 17 00:00:00 2001 From: David Forster Date: Fri, 6 Jan 2017 10:27:59 +0000 Subject: vti6: fix device register to report IFLA_INFO_KIND vti6 interface is registered before the rtnl_link_ops block is attached. As a result the resulting RTM_NEWLINK is missing IFLA_INFO_KIND. Re-order attachment of rtnl_link_ops block to fix. Signed-off-by: Dave Forster Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index f4b4a4a5f4ba..d82042c8d8fd 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -189,12 +189,12 @@ static int vti6_tnl_create2(struct net_device *dev) struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; + dev->rtnl_link_ops = &vti6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); - dev->rtnl_link_ops = &vti6_link_ops; dev_hold(dev); vti6_tnl_link(ip6n, t); -- cgit v1.2.3 From bcd5e1a49f0d54afd3c5411bed2f59996e1c53e4 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 6 Jan 2017 14:26:54 -0500 Subject: netlabel: add CALIPSO to the list of built-in protocols When we added CALIPSO support in Linux v4.8 we forgot to add it to the list of supported protocols with display at boot. Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/netlabel/netlabel_kapi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 28c56b95fb7f..ea7c67050792 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1502,10 +1502,7 @@ static int __init netlbl_init(void) printk(KERN_INFO "NetLabel: Initializing\n"); printk(KERN_INFO "NetLabel: domain hash size = %u\n", (1 << NETLBL_DOMHSH_BITSIZE)); - printk(KERN_INFO "NetLabel: protocols =" - " UNLABELED" - " CIPSOv4" - "\n"); + printk(KERN_INFO "NetLabel: protocols = UNLABELED CIPSOv4 CALIPSO\n"); ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE); if (ret_val != 0) -- cgit v1.2.3 From eeb0d56fab4cd7848cf2be6704fa48900dbc1381 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 14 Dec 2016 16:47:43 +0100 Subject: mac80211: implement multicast forwarding on fast-RX path In AP (or VLAN) mode, when unicast 802.11 packets are received, they might actually be multicast after conversion. In this case the fast-RX path didn't handle them properly to send them back to the wireless medium. Implement that by copying the SKB and sending it back out. The possible alternative would be to just punt the packet back to the regular (slow) RX path, but since we have almost all of the required code here already it's not so complicated to add here. Punting it back would also mean acquiring the spinlock, which would be bad for the stated purpose of the fast-RX path, to enable well-performing parallel RX. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3e289a64ed43..c037c5bb6167 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3942,21 +3942,31 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, u64_stats_update_end(&stats->syncp); if (fast_rx->internal_forward) { - struct sta_info *dsta = sta_info_get(rx->sdata, skb->data); + struct sk_buff *xmit_skb = NULL; + bool multicast = is_multicast_ether_addr(skb->data); - if (dsta) { + if (multicast) { + xmit_skb = skb_copy(skb, GFP_ATOMIC); + } else if (sta_info_get(rx->sdata, skb->data)) { + xmit_skb = skb; + skb = NULL; + } + + if (xmit_skb) { /* * Send to wireless media and increase priority by 256 * to keep the received priority instead of * reclassifying the frame (see cfg80211_classify8021d). */ - skb->priority += 256; - skb->protocol = htons(ETH_P_802_3); - skb_reset_network_header(skb); - skb_reset_mac_header(skb); - dev_queue_xmit(skb); - return true; + xmit_skb->priority += 256; + xmit_skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(xmit_skb); + skb_reset_mac_header(xmit_skb); + dev_queue_xmit(xmit_skb); } + + if (!skb) + return true; } /* deliver to local stack */ -- cgit v1.2.3 From cc31d43b4154ad5a7d8aa5543255a93b7e89edc2 Mon Sep 17 00:00:00 2001 From: Pau Espin Pedrol Date: Fri, 6 Jan 2017 20:33:27 +0100 Subject: netfilter: use fwmark_reflect in nf_send_reset Otherwise, RST packets generated by ipt_REJECT always have mark 0 when the routing is checked later in the same code path. Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies") Cc: Lorenzo Colitti Signed-off-by: Pau Espin Pedrol Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_reject_ipv4.c | 2 ++ net/ipv6/netfilter/nf_reject_ipv6.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index fd8220213afc..146d86105183 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -126,6 +126,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); + nskb->mark = IP4_REPLY_MARK(net, oldskb->mark); + skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, ip4_dst_hoplimit(skb_dst(nskb))); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 10090400c72f..eedee5d108d9 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -157,6 +157,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); + fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { @@ -180,6 +181,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) skb_dst_set(nskb, dst); + nskb->mark = fl6.flowi6_mark; + skb_reserve(nskb, hh_len + dst->header_len); ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); -- cgit v1.2.3 From bf99b4ded5f8a4767dbb9d180626f06c51f9881f Mon Sep 17 00:00:00 2001 From: Pau Espin Pedrol Date: Fri, 6 Jan 2017 20:33:28 +0100 Subject: tcp: fix mark propagation with fwmark_reflect enabled Otherwise, RST packets generated by the TCP stack for non-existing sockets always have mark 0. The mark from the original packet is assigned to the netns_ipv4/6 socket used to send the response so that it can get copied into the response skb when the socket sends it. Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies") Cc: Lorenzo Colitti Signed-off-by: Pau Espin Pedrol Signed-off-by: Pablo Neira Ayuso --- net/ipv4/ip_output.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index fac275c48108..b67719f45953 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1629,6 +1629,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if (unlikely(err)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 73bc8fc68acd..2b20622a5824 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -840,6 +840,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); + ctl_sk->sk_mark = fl6.flowi6_mark; ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) -- cgit v1.2.3 From 67c408cfa8862fe7e45b3a1f762f7140e03b7217 Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Sat, 7 Jan 2017 23:53:00 +0100 Subject: ipv6: fix typos o s/approriate/appropriate o s/discouvery/discovery Signed-off-by: Alexander Alemayhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8417c41d8ec8..ce5aaf448c54 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1464,7 +1464,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_node *fn; /* Get the "current" route for this destination and - * check if the redirect has come from approriate router. + * check if the redirect has come from appropriate router. * * RFC 4861 specifies that redirects should only be * accepted if they come from the nexthop to the target. @@ -2768,7 +2768,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) old MTU is the lowest MTU in the path, update the route PMTU to reflect the increase. In this case if the other nodes' MTU also have the lowest MTU, TOO BIG MESSAGE will be lead to - PMTU discouvery. + PMTU discovery. */ if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && -- cgit v1.2.3 From b007f09072ca8afa118ade333e717ba443e8d807 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 9 Jan 2017 10:45:49 +0300 Subject: ipv4: make tcp_notsent_lowat sysctl knob behave as true unsigned int > cat /proc/sys/net/ipv4/tcp_notsent_lowat -1 > echo 4294967295 > /proc/sys/net/ipv4/tcp_notsent_lowat -bash: echo: write error: Invalid argument > echo -2147483648 > /proc/sys/net/ipv4/tcp_notsent_lowat > cat /proc/sys/net/ipv4/tcp_notsent_lowat -2147483648 but in documentation we have "tcp_notsent_lowat - UNSIGNED INTEGER" v2: simplify to just proc_douintvec Signed-off-by: Pavel Tikhomirov Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 22cbd61079b5..b2fa498b15d1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -951,7 +951,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec, }, { .procname = "tcp_tw_reuse", -- cgit v1.2.3 From 6bb629db5e7daa619f5242b6ad93e4dd9bf7432c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Jan 2017 08:51:32 -0800 Subject: tcp: do not export tcp_peer_is_proven() After commit 1fb6f159fd21 ("tcp: add tcp_conn_request"), tcp_peer_is_proven() no longer needs to be exported. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d46f4d5b1c62..ba8f02d0f283 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -606,7 +606,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, return ret; } -EXPORT_SYMBOL_GPL(tcp_peer_is_proven); void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) { -- cgit v1.2.3 From dc647ec88e029307e60e6bf9988056605f11051a Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 10 Jan 2017 09:30:51 +0100 Subject: net: socket: Make unnecessarily global sockfs_setattr() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sockfs_setattr() static as it is not used outside of net/socket.c This fixes the following GCC warning: net/socket.c:534:5: warning: no previous prototype for ‘sockfs_setattr’ [-Wmissing-prototypes] Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Cc: Lorenzo Colitti Signed-off-by: Tobias Klauser Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index a8c2307590b8..0758e13754e2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -533,7 +533,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, return used; } -int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) +static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) { int err = simple_setattr(dentry, iattr); -- cgit v1.2.3 From d9584d8ccc06ba98f4fad8ec720de66b6659fd35 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Jan 2017 11:18:01 -0800 Subject: net: skb_flow_get_be16() can be static Removes following sparse complain : net/core/flow_dissector.c:70:8: warning: symbol 'skb_flow_get_be16' was not declared. Should it be static? Fixes: 972d3876faa8 ("flow dissector: ICMP support") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index fe4e1531976c..1b7673aac59d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -67,8 +67,8 @@ EXPORT_SYMBOL(skb_flow_dissector_init); * The function will try to retrieve a be32 entity at * offset poff */ -__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data, - int hlen) +static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, + void *data, int hlen) { __be16 *u, _u; -- cgit v1.2.3 From faf3a932fbeb77860226a8323eacb835edc98648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 9 Jan 2017 11:58:34 -0800 Subject: net: dsa: Ensure validity of dst->ds[0] It is perfectly possible to have non zero indexed switches being present in a DSA switch tree, in such a case, we will be deferencing a NULL pointer while dsa_cpu_port_ethtool_{setup,restore}. Be more defensive and ensure that dst->ds[0] is valid before doing anything with it. Fixes: 0c73c523cf73 ("net: dsa: Initialize CPU port ethtool ops per tree") Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 5fff951a0a49..da3862124545 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -394,9 +394,11 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - err = dsa_cpu_port_ethtool_setup(dst->ds[0]); - if (err) - return err; + if (dst->ds[0]) { + err = dsa_cpu_port_ethtool_setup(dst->ds[0]); + if (err) + return err; + } /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get @@ -433,7 +435,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - dsa_cpu_port_ethtool_restore(dst->ds[0]); + if (dst->ds[0]) + dsa_cpu_port_ethtool_restore(dst->ds[0]); pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; -- cgit v1.2.3 From 3512a1ad56174308a9fd3e10f4b1e3e152e9ec01 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 9 Jan 2017 14:31:58 -0800 Subject: net: qrtr: Mark 'buf' as little endian Failure to mark this pointer as __le32 causes checkers like sparse to complain: net/qrtr/qrtr.c:274:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:274:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:274:16: got restricted __le32 [usertype] net/qrtr/qrtr.c:275:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:275:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:275:16: got restricted __le32 [usertype] net/qrtr/qrtr.c:276:16: warning: incorrect type in assignment (different base types) net/qrtr/qrtr.c:276:16: expected unsigned int [unsigned] [usertype] net/qrtr/qrtr.c:276:16: got restricted __le32 [usertype] Silence it. Cc: Bjorn Andersson Signed-off-by: Stephen Boyd Acked-by: Bjorn Andersson Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index c985ecbe9bd6..ae5ac175b2be 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -252,7 +252,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, const int pkt_len = 20; struct qrtr_hdr *hdr; struct sk_buff *skb; - u32 *buf; + __le32 *buf; skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); if (!skb) @@ -269,7 +269,7 @@ static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, hdr->dst_node_id = cpu_to_le32(dst_node); hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); - buf = (u32 *)skb_put(skb, pkt_len); + buf = (__le32 *)skb_put(skb, pkt_len); memset(buf, 0, pkt_len); buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); buf[1] = cpu_to_le32(src_node); -- cgit v1.2.3 From 5d722b3024f6762addb8642ffddc9f275b5107ae Mon Sep 17 00:00:00 2001 From: "Anna, Suman" Date: Mon, 9 Jan 2017 21:48:56 -0600 Subject: net: add the AF_QIPCRTR entries to family name tables Commit bdabad3e363d ("net: Add Qualcomm IPC router") introduced a new address family. Update the family name tables accordingly so that the lockdep initialization can use the proper names for this family. Cc: Courtney Cavin Cc: Bjorn Andersson Signed-off-by: Suman Anna Signed-off-by: David S. Miller --- net/core/sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index f560e0826009..4eca27dc5c94 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , - "sk_lock-AF_MAX" + "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , - "slock-AF_MAX" + "slock-AF_QIPCRTR", "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , - "clock-AF_MAX" + "clock-AF_QIPCRTR", "clock-AF_MAX" }; /* -- cgit v1.2.3 From dc5367bcc556e97555fc94a32cd1aadbebdff47e Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 10 Jan 2017 17:10:34 +0100 Subject: net/af_iucv: don't use paged skbs for TX on HiperSockets With commit e53743994e21 ("af_iucv: use paged SKBs for big outbound messages"), we transmit paged skbs for both of AF_IUCV's transport modes (IUCV or HiperSockets). The qeth driver for Layer 3 HiperSockets currently doesn't support NETIF_F_SG, so these skbs would just be linearized again by the stack. Avoid that overhead by using paged skbs only for IUCV transport. cc stable, since this also circumvents a significant skb leak when sending large messages (where the skb then needs to be linearized). Signed-off-by: Julian Wiedmann Signed-off-by: Ursula Braun Cc: # v4.8+ Fixes: e53743994e21 ("af_iucv: use paged SKBs for big outbound messages") Signed-off-by: David S. Miller --- net/iucv/af_iucv.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cfb9e5f4e28f..13190b38f22e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1044,7 +1044,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); - size_t headroom, linear; + size_t headroom = 0; + size_t linear; struct sk_buff *skb; struct iucv_message txmsg = {0}; struct cmsghdr *cmsg; @@ -1122,18 +1123,20 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, * this is fine for SOCK_SEQPACKET (unless we want to support * segmented records using the MSG_EOR flag), but * for SOCK_STREAM we might want to improve it in future */ - headroom = (iucv->transport == AF_IUCV_TRANS_HIPER) - ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0; - if (headroom + len < PAGE_SIZE) { + if (iucv->transport == AF_IUCV_TRANS_HIPER) { + headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN; linear = len; } else { - /* In nonlinear "classic" iucv skb, - * reserve space for iucv_array - */ - if (iucv->transport != AF_IUCV_TRANS_HIPER) - headroom += sizeof(struct iucv_array) * - (MAX_SKB_FRAGS + 1); - linear = PAGE_SIZE - headroom; + if (len < PAGE_SIZE) { + linear = len; + } else { + /* In nonlinear "classic" iucv skb, + * reserve space for iucv_array + */ + headroom = sizeof(struct iucv_array) * + (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } } skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear, noblock, &err, 0); -- cgit v1.2.3 From 1272ce87fa017ca4cf32920764d879656b7a005a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2017 12:24:01 -0800 Subject: gro: Enter slow-path if there is no tailroom The GRO path has a fast-path where we avoid calling pskb_may_pull and pskb_expand by directly accessing frag0. However, this should only be done if we have enough tailroom in the skb as otherwise we'll have to expand it later anyway. This patch adds the check by capping frag0_len with the skb tailroom. Fixes: cb18978cbf45 ("gro: Open-code final pskb_may_pull") Reported-by: Slava Shwartsman Signed-off-by: Herbert Xu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8db5a0b4b520..88d2907ca2cd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4441,7 +4441,8 @@ static void skb_gro_reset_offset(struct sk_buff *skb) pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + NAPI_GRO_CB(skb)->frag0_len = min(skb_frag_size(frag0), + skb->end - skb->tail); } } -- cgit v1.2.3 From 57ea52a865144aedbcd619ee0081155e658b6f7d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2017 12:24:15 -0800 Subject: gro: Disable frag0 optimization on IPv6 ext headers The GRO fast path caches the frag0 address. This address becomes invalid if frag0 is modified by pskb_may_pull or its variants. So whenever that happens we must disable the frag0 optimization. This is usually done through the combination of gro_header_hard and gro_header_slow, however, the IPv6 extension header path did the pulling directly and would continue to use the GRO fast path incorrectly. This patch fixes it by disabling the fast path when we enter the IPv6 extension header path. Fixes: 78a478d0efd9 ("gro: Inline skb_gro_header and cache frag0 virtual address") Reported-by: Slava Shwartsman Signed-off-by: Herbert Xu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_offload.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 89c59e656f44..fc7b4017ba24 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -191,6 +191,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { __pskb_pull(skb, skb_gro_offset(skb)); + skb_gro_frag0_invalidate(skb); proto = ipv6_gso_pull_exthdrs(skb, proto); skb_gro_pull(skb, -skb_transport_offset(skb)); skb_reset_transport_header(skb); -- cgit v1.2.3 From 8c2dd3e4a4bae78093c4a5cee6494877651be3c9 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:06 -0800 Subject: mm: rename __alloc_page_frag to page_frag_alloc and __free_page_frag to page_frag_free Patch series "Page fragment updates", v4. This patch series takes care of a few cleanups for the page fragments API. First we do some renames so that things are much more consistent. First we move the page_frag_ portion of the name to the front of the functions names. Secondly we split out the cache specific functions from the other page fragment functions by adding the word "cache" to the name. Finally I added a bit of documentation that will hopefully help to explain some of this. I plan to revisit this later as we get things more ironed out in the near future with the changes planned for the DMA setup to support eXpress Data Path. This patch (of 3): This patch renames the page frag functions to be more consistent with other APIs. Specifically we place the name page_frag first in the name and then have either an alloc or free call name that we append as the suffix. This makes it a bit clearer in terms of naming. In addition we drop the leading double underscores since we are technically no longer a backing interface and instead the front end that is called from the networking APIs. Link: http://lkml.kernel.org/r/20170104023854.13451.67390.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/core/skbuff.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a03730fbc1a..734c71468b01 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -369,7 +369,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, fragsz, gfp_mask); + data = page_frag_alloc(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -391,7 +391,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(&nc->page, fragsz, gfp_mask); + return page_frag_alloc(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -441,7 +441,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, len, gfp_mask); + data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; local_irq_restore(flags); @@ -505,7 +505,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(&nc->page, len, gfp_mask); + data = page_frag_alloc(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; -- cgit v1.2.3 From 7cfd5fd5a9813f1430290d20c0fead9b4582a307 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Jan 2017 19:52:43 -0800 Subject: gro: use min_t() in skb_gro_reset_offset() On 32bit arches, (skb->end - skb->data) is not 'unsigned int', so we shall use min_t() instead of min() to avoid a compiler error. Fixes: 1272ce87fa01 ("gro: Enter slow-path if there is no tailroom") Reported-by: kernel test robot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 88d2907ca2cd..07b307b0b414 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4441,8 +4441,9 @@ static void skb_gro_reset_offset(struct sk_buff *skb) pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = min(skb_frag_size(frag0), - skb->end - skb->tail); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); } } -- cgit v1.2.3 From 73b351473547e543e9c8166dd67fd99c64c15b0b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jan 2017 13:08:06 +0100 Subject: cgroup: move CONFIG_SOCK_CGROUP_DATA to init/Kconfig We now 'select SOCK_CGROUP_DATA' but Kconfig complains that this is not right when CONFIG_NET is disabled and there is no socket interface: warning: (CGROUP_BPF) selects SOCK_CGROUP_DATA which has unmet direct dependencies (NET) I don't know what the correct solution for this is, but simply removing the dependency on NET from SOCK_CGROUP_DATA by moving it out of the 'if NET' section avoids the warning and does not produce other build errors. Fixes: 483c4933ea09 ("cgroup: Fix CGROUP_BPF config") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/Kconfig | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index a1005007224c..a29bb4b41c50 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -258,10 +258,6 @@ config XPS config HWBM bool -config SOCK_CGROUP_DATA - bool - default n - config CGROUP_NET_PRIO bool "Network priority cgroup" depends on CGROUPS -- cgit v1.2.3 From 7a18c5b9fb31a999afc62b0e60978aa896fc89e9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 10 Jan 2017 14:37:35 -0800 Subject: net: ipv4: Fix multipath selection with vrf fib_select_path does not call fib_select_multipath if oif is set in the flow struct. For VRF use cases oif is always set, so multipath route selection is bypassed. Use the FLOWI_FLAG_SKIP_NH_OIF to skip the oif check similar to what is done in fib_table_lookup. Add saddr and proto to the flow struct for the fib lookup done by the VRF driver to better match hash computation for a flow. Fixes: 613d09b30f8b ("net: Use VRF device index for lookups on TX") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7a5b4c7d9a87..eba1546b5031 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1618,8 +1618,13 @@ void fib_select_multipath(struct fib_result *res, int hash) void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, int mp_hash) { + bool oif_check; + + oif_check = (fl4->flowi4_oif == 0 || + fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF); + #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (res->fi->fib_nhs > 1 && oif_check) { if (mp_hash < 0) mp_hash = get_hash_from_flowi4(fl4) >> 1; @@ -1629,7 +1634,7 @@ void fib_select_path(struct net *net, struct fib_result *res, #endif if (!res->prefixlen && res->table->tb_num_default > 1 && - res->type == RTN_UNICAST && !fl4->flowi4_oif) + res->type == RTN_UNICAST && oif_check) fib_select_default(fl4, res); if (!fl4->saddr) -- cgit v1.2.3 From eb004603c857f3e3bfcda437b6c68fd258c54960 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 10 Jan 2017 22:53:06 +0000 Subject: sctp: Fix spelling mistake: "Atempt" -> "Attempt" Trivial fix to spelling mistake in WARN_ONCE message Signed-off-by: Colin Ian King Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e54082699520..34efaa4ef2f6 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1048,7 +1048,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) (new_transport->state == SCTP_PF))) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { - WARN_ONCE(1, "Atempt to send packet on unconfirmed path."); + WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; -- cgit v1.2.3 From c38c39bf7cc04d688291f382469e84ec2a8548a4 Mon Sep 17 00:00:00 2001 From: Cedric Izoard Date: Wed, 11 Jan 2017 14:39:07 +0000 Subject: mac80211: Fix headroom allocation when forwarding mesh pkt This patch fix issue introduced by my previous commit that tried to ensure enough headroom was present, and instead broke it. When forwarding mesh pkt, mac80211 may also add security header, and it must therefore be taken into account in the needed headroom. Fixes: d8da0b5d64d5 ("mac80211: Ensure enough headroom when forwarding mesh pkt") Signed-off-by: Cedric Izoard Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c037c5bb6167..c87e61358b77 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2472,7 +2472,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (!ifmsh->mshcfg.dot11MeshForwarding) goto out; - fwd_skb = skb_copy_expand(skb, local->tx_headroom, 0, GFP_ATOMIC); + fwd_skb = skb_copy_expand(skb, local->tx_headroom + + sdata->encrypt_headroom, 0, GFP_ATOMIC); if (!fwd_skb) { net_info_ratelimited("%s: failed to clone mesh frame\n", sdata->name); -- cgit v1.2.3 From d7f842442f766db3f39fc5d166ddcc24bf817056 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Tue, 25 Oct 2016 10:32:16 +0300 Subject: mac80211: fix the TID on NDPs sent as EOSP carrier In the commit below, I forgot to translate the mac80211's AC to QoS IE order. Moreover, the condition in the if was wrong. Fix both issues. This bug would hit only with clients that didn't set all the ACs as delivery enabled. Fixes: f438ceb81d4 ("mac80211: uapsd_queues is in QoS IE order") Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index b6cfcf038c11..50c309094c37 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1501,8 +1501,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, /* This will evaluate to 1, 3, 5 or 7. */ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++) - if (ignored_acs & BIT(ac)) - continue; + if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac])) + break; tid = 7 - 2 * ac; ieee80211_send_null_response(sta, tid, reason, true, false); -- cgit v1.2.3 From 06f7c88c107fb469f4f1344142e80df5175c6836 Mon Sep 17 00:00:00 2001 From: Beni Lev Date: Tue, 19 Jul 2016 19:28:56 +0300 Subject: cfg80211: consider VHT opmode on station update Currently, this attribute is only fetched on station addition, but not on station change. Since this info is only present in the assoc request, with full station state support in the driver it cannot be present when the station is added. Thus, add support for changing the VHT opmode on station update if done before (or while) the station is marked as associated. After this, ignore it, since it used to be ignored. Signed-off-by: Beni Lev Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ef5eff93a8b8..5c1b267e22be 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4615,6 +4615,15 @@ int cfg80211_check_station_change(struct wiphy *wiphy, break; } + /* + * Older kernel versions ignored this attribute entirely, so don't + * reject attempts to update it but mark it as unused instead so the + * driver won't look at the data. + */ + if (statype != CFG80211_STA_AP_CLIENT_UNASSOC && + statype != CFG80211_STA_TDLS_PEER_SETUP) + params->opmode_notif_used = false; + return 0; } EXPORT_SYMBOL(cfg80211_check_station_change); @@ -4854,6 +4863,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) params.local_pm = pm; } + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { + params.opmode_notif_used = true; + params.opmode_notif = + nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); + } + /* Include parameters for TDLS peer (will check later) */ err = nl80211_set_station_tdls(info, ¶ms); if (err) -- cgit v1.2.3 From 96aa2e7cf126773b16c6c19b7474a8a38d3c707e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 7 Oct 2016 12:23:49 +0200 Subject: mac80211: calculate min channel width correctly In the current minimum chandef code there's an issue in that the recalculation can happen after rate control is initialized for a station that has a wider bandwidth than the current chanctx, and then rate control can immediately start using those higher rates which could cause problems. Observe that first of all that this problem is because we don't take non-associated and non-uploaded stations into account. The restriction to non-associated is quite pointless and is one of the causes for the problem described above, since the rate init will happen before the station is set to associated; no frames could actually be sent until associated, but the rate table can already contain higher rates and that might cause problems. Also, rejecting non-uploaded stations is wrong, since the rate control can select higher rates for those as well. Secondly, it's then necessary to recalculate the minimal config before initializing rate control, so that when rate control is initialized, the higher rates are already available. This can be done easily by adding the necessary function call in rate init. Change-Id: Ib9bc02d34797078db55459d196993f39dcd43070 Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 3 --- net/mac80211/rate.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e75cbf6ecc26..a0d901d8992e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -231,9 +231,6 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata) !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) continue; - if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC)) - continue; - max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta)); } rcu_read_unlock(); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 206698bc93f4..9e2641d45587 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -40,6 +40,8 @@ void rate_control_rate_init(struct sta_info *sta) ieee80211_sta_set_rx_nss(sta); + ieee80211_recalc_min_chandef(sta->sdata); + if (!ref) return; -- cgit v1.2.3 From d2941df8fbd9708035d66d889ada4d3d160170ce Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 20 Oct 2016 08:52:50 +0200 Subject: mac80211: recalculate min channel width on VHT opmode changes When an associated station changes its VHT operating mode this can/will affect the bandwidth it's using, and consequently we must recalculate the minimum bandwidth we need to use. Failure to do so can lead to one of two scenarios: 1) we use a too high bandwidth, this is benign 2) we use a too narrow bandwidth, causing rate control and actual PHY configuration to be out of sync, which can in turn cause problems/crashes Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 21 +++++++++++++++++++++ net/mac80211/rx.c | 9 +-------- net/mac80211/vht.c | 4 +++- 3 files changed, 25 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 41497b670e2b..d37ae7dc114b 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -6,6 +6,7 @@ * Copyright (c) 2006 Jiri Benc * Copyright 2008, Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1295,6 +1296,26 @@ static void ieee80211_iface_work(struct work_struct *work) } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_VHT) { switch (mgmt->u.action.u.vht_group_notif.action_code) { + case WLAN_VHT_ACTION_OPMODE_NOTIF: { + struct ieee80211_rx_status *status; + enum nl80211_band band; + u8 opmode; + + status = IEEE80211_SKB_RXCB(skb); + band = status->band; + opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; + + mutex_lock(&local->sta_mtx); + sta = sta_info_get_bss(sdata, mgmt->sa); + + if (sta) + ieee80211_vht_handle_opmode(sdata, sta, + opmode, + band); + + mutex_unlock(&local->sta_mtx); + break; + } case WLAN_VHT_ACTION_GROUPID_MGMT: ieee80211_process_mu_groups(sdata, mgmt); break; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c87e61358b77..3090dd4342f6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2881,17 +2881,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.u.vht_opmode_notif.action_code) { case WLAN_VHT_ACTION_OPMODE_NOTIF: { - u8 opmode; - /* verify opmode is present */ if (len < IEEE80211_MIN_ACTION_SIZE + 2) goto invalid; - - opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; - - ieee80211_vht_handle_opmode(rx->sdata, rx->sta, - opmode, status->band); - goto handled; + goto queue; } case WLAN_VHT_ACTION_GROUPID_MGMT: { if (len < IEEE80211_MIN_ACTION_SIZE + 25) diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 6832bf6ab69f..43e45bb660bc 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -527,8 +527,10 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band); - if (changed > 0) + if (changed > 0) { + ieee80211_recalc_min_chandef(sdata); rate_control_rate_update(local, sband, sta, changed); + } } void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, -- cgit v1.2.3 From ea7a80858f57d8878b1499ea0f1b8a635cc48de7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 11 Jan 2017 14:29:54 -0800 Subject: net: lwtunnel: Handle lwtunnel_fill_encap failure Handle failure in lwtunnel_fill_encap adding attributes to skb. Fixes: 571e722676fe ("ipv4: support for fib route lwtunnel encap attributes") Fixes: 19e42e451506 ("ipv6: support for fib route lwtunnel encap attributes") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 11 +++++++---- net/ipv6/route.c | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index eba1546b5031..9a375b908d01 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1279,8 +1279,9 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif - if (fi->fib_nh->nh_lwtstate) - lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); + if (fi->fib_nh->nh_lwtstate && + lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0) + goto nla_put_failure; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { @@ -1316,8 +1317,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif - if (nh->nh_lwtstate) - lwtunnel_fill_encap(skb, nh->nh_lwtstate); + if (nh->nh_lwtstate && + lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0) + goto nla_put_failure; + /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; } endfor_nexthops(fi); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ce5aaf448c54..4f6b067c8753 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3317,7 +3317,8 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; - lwtunnel_fill_encap(skb, rt->dst.lwtstate); + if (lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 8a430ed50bb1b19ca14a46661f3b1b35f2fb5c39 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 11 Jan 2017 15:42:17 -0800 Subject: net: ipv4: fix table id in getroute response rtm_table is an 8-bit field while table ids are allowed up to u32. Commit 709772e6e065 ("net: Fix routing tables with id > 255 for legacy software") added the preference to set rtm_table in dumps to RT_TABLE_COMPAT if the table id is > 255. The table id returned on get route requests should do the same. Fixes: c36ba6603a11 ("net: Allow user to get table id from route lookup") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0fcac8e7a2b2..709ffe67d1de 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2472,7 +2472,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = fl4->flowi4_tos; - r->rtm_table = table_id; + r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; -- cgit v1.2.3 From 78794d1890708cf94e3961261e52dcec2cc34722 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 9 Jan 2017 17:15:18 -0500 Subject: svcrpc: don't leak contexts on PROC_DESTROY Context expiry times are in units of seconds since boot, not unix time. The use of get_seconds() here therefore sets the expiry time decades in the future. This prevents timely freeing of contexts destroyed by client RPC_GSS_PROC_DESTROY requests. We'd still free them eventually (when the module is unloaded or the container shut down), but a lot of contexts could pile up before then. Cc: stable@vger.kernel.org Fixes: c5b29f885afe "sunrpc: use seconds since boot in expiry cache" Reported-by: Andy Adamson Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 886e9d381771..153082598522 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1489,7 +1489,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; - rsci->h.expiry_time = get_seconds(); + rsci->h.expiry_time = seconds_since_boot(); set_bit(CACHE_NEGATIVE, &rsci->h.flags); if (resv->iov_len + 4 > PAGE_SIZE) goto drop; -- cgit v1.2.3 From 546125d1614264d26080817d0c8cddb9b25081fa Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 5 Jan 2017 16:34:51 -0500 Subject: sunrpc: don't call sleeping functions from the notifier block callbacks The inet6addr_chain is an atomic notifier chain, so we can't call anything that might sleep (like lock_sock)... instead of closing the socket from svc_age_temp_xprts_now (which is called by the notifier function), just have the rpc service threads do it instead. Cc: stable@vger.kernel.org Fixes: c3d4879e01be "sunrpc: Add a function to close..." Signed-off-by: Scott Mayhew Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 3bc1d61694cb..9c9db55a0c1e 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -799,6 +799,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { dprintk("svc_recv: found XPT_CLOSE\n"); + if (test_and_clear_bit(XPT_KILL_TEMP, &xprt->xpt_flags)) + xprt->xpt_ops->xpo_kill_temp_xprt(xprt); svc_delete_xprt(xprt); /* Leave XPT_BUSY set on the dead xprt: */ goto out; @@ -1020,9 +1022,11 @@ void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) le = to_be_closed.next; list_del_init(le); xprt = list_entry(le, struct svc_xprt, xpt_list); - dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); - xprt->xpt_ops->xpo_kill_temp_xprt(xprt); - svc_close_xprt(xprt); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_KILL_TEMP, &xprt->xpt_flags); + dprintk("svc_age_temp_xprts_now: queuing xprt %p for closing\n", + xprt); + svc_xprt_enqueue(xprt); } } EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); -- cgit v1.2.3 From ce1ca7d2d140a1f4aaffd297ac487f246963dd2f Mon Sep 17 00:00:00 2001 From: Sriharsha Basavapatna Date: Mon, 9 Jan 2017 16:00:44 +0530 Subject: svcrdma: avoid duplicate dma unmapping during error recovery In rdma_read_chunk_frmr() when ib_post_send() fails, the error code path invokes ib_dma_unmap_sg() to unmap the sg list. It then invokes svc_rdma_put_frmr() which in turn tries to unmap the same sg list through ib_dma_unmap_sg() again. This second unmap is invalid and could lead to problems when the iova being unmapped is subsequently reused. Remove the call to unmap in rdma_read_chunk_frmr() and let svc_rdma_put_frmr() handle it. Fixes: 412a15c0fe53 ("svcrdma: Port to new memory registration API") Cc: stable@vger.kernel.org Signed-off-by: Sriharsha Basavapatna Reviewed-by: Chuck Lever Reviewed-by: Yuval Shaia Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 57d35fbb1c28..172b537f8cfc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -347,8 +347,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, atomic_inc(&rdma_stat_read); return ret; err: - ib_dma_unmap_sg(xprt->sc_cm_id->device, - frmr->sg, frmr->sg_nents, frmr->direction); svc_rdma_put_context(ctxt, 0); svc_rdma_put_frmr(xprt, frmr); return ret; -- cgit v1.2.3 From 43071d8fb3b7f589d72663c496a6880fb097533c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 13 Jan 2017 11:28:25 +0100 Subject: mac80211: initialize SMPS field in HT capabilities ibss and mesh modes copy the ht capabilites from the band without overriding the SMPS state. Unfortunately the default value 0 for the SMPS field means static SMPS instead of disabled. This results in HT ibss and mesh setups using only single-stream rates, even though SMPS is not supposed to be active. Initialize SMPS to disabled for all bands on ieee80211_hw_register to ensure that the value is sane where it is not overriden with the real SMPS state. Reported-by: Elektra Wagenrad Signed-off-by: Felix Fietkau [move VHT TODO comment to a better place] Signed-off-by: Johannes Berg --- net/mac80211/main.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 1822c77f2b1c..56fb47953b72 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -913,12 +913,17 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; - if (sband->ht_cap.ht_supported) - local->rx_chains = - max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs), - local->rx_chains); + if (!sband->ht_cap.ht_supported) + continue; /* TODO: consider VHT for RX chains, hopefully it's the same */ + local->rx_chains = + max(ieee80211_mcs_to_chains(&sband->ht_cap.mcs), + local->rx_chains); + + /* no need to mask, SM_PS_DISABLED has all bits set */ + sband->ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED << + IEEE80211_HT_CAP_SM_PS_SHIFT; } /* if low-level driver supports AP, we also support VLAN */ -- cgit v1.2.3 From dbef53621116474bb883f76f0ba6b7640bc42332 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Fri, 13 Jan 2017 13:32:51 +0100 Subject: mac80211: prevent skb/txq mismatch Station structure is considered as not uploaded (to driver) until drv_sta_state() finishes. This call is however done after the structure is attached to mac80211 internal lists and hashes. This means mac80211 can lookup (and use) station structure before it is uploaded to a driver. If this happens (structure exists, but sta->uploaded is false) fast_tx path can still be taken. Deep in the fastpath call the sta->uploaded is checked against to derive "pubsta" argument for ieee80211_get_txq(). If sta->uploaded is false (and sta is actually non-NULL) ieee80211_get_txq() effectively downgraded to vif->txq. At first glance this may look innocent but coerces mac80211 into a state that is almost guaranteed (codel may drop offending skb) to crash because a station-oriented skb gets queued up on vif-oriented txq. The ieee80211_tx_dequeue() ends up looking at info->control.flags and tries to use txq->sta which in the fail case is NULL. It's probably pointless to pretend one can downgrade skb from sta-txq to vif-txq. Since downgrading unicast traffic to vif->txq must not be done there's no txq to put a frame on if sta->uploaded is false. Therefore the code is made to fall back to regular tx() op path if the described condition is hit. Only drivers using wake_tx_queue were affected. Example crash dump before fix: Unable to handle kernel paging request at virtual address ffffe26c PC is at ieee80211_tx_dequeue+0x204/0x690 [mac80211] [] (ieee80211_tx_dequeue [mac80211]) from [] (ath10k_mac_tx_push_txq+0x54/0x1c0 [ath10k_core]) [] (ath10k_mac_tx_push_txq [ath10k_core]) from [] (ath10k_htt_txrx_compl_task+0xd78/0x11d0 [ath10k_core]) [] (ath10k_htt_txrx_compl_task [ath10k_core]) [] (ath10k_pci_napi_poll+0x54/0xe8 [ath10k_pci]) [] (ath10k_pci_napi_poll [ath10k_pci]) from [] (net_rx_action+0xac/0x160) Reported-by: Mohammed Shafi Shajakhan Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 0d8b716e509e..797e847cbc49 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1243,7 +1243,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, + struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; @@ -1257,10 +1257,13 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, if (!ieee80211_is_data(hdr->frame_control)) return NULL; - if (pubsta) { + if (sta) { u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; - txq = pubsta->txq[tid]; + if (!sta->uploaded) + return NULL; + + txq = sta->sta.txq[tid]; } else if (vif) { txq = vif->txq; } @@ -1503,23 +1506,17 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, struct fq *fq = &local->fq; struct ieee80211_vif *vif; struct txq_info *txqi; - struct ieee80211_sta *pubsta; if (!local->ops->wake_tx_queue || sdata->vif.type == NL80211_IFTYPE_MONITOR) return false; - if (sta && sta->uploaded) - pubsta = &sta->sta; - else - pubsta = NULL; - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); vif = &sdata->vif; - txqi = ieee80211_get_txq(local, vif, pubsta, skb); + txqi = ieee80211_get_txq(local, vif, sta, skb); if (!txqi) return false; -- cgit v1.2.3 From fa79581ea66ca43d56ef065346ac5be767fcb418 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Thu, 12 Jan 2017 21:30:01 +0100 Subject: ipv6: sr: fix several BUGs when preemption is enabled When CONFIG_PREEMPT=y, CONFIG_IPV6=m and CONFIG_SEG6_HMAC=y, seg6_hmac_init() is called during the initialization of the ipv6 module. This causes a subsequent call to smp_processor_id() with preemption enabled, resulting in the following trace. [ 20.451460] BUG: using smp_processor_id() in preemptible [00000000] code: systemd/1 [ 20.452556] caller is debug_smp_processor_id+0x17/0x19 [ 20.453304] CPU: 0 PID: 1 Comm: systemd Not tainted 4.9.0-rc5-00973-g46738b1 #1 [ 20.454406] ffffc9000062fc18 ffffffff813607b2 0000000000000000 ffffffff81a7f782 [ 20.455528] ffffc9000062fc48 ffffffff813778dc 0000000000000000 00000000001dcf98 [ 20.456539] ffffffffa003bd08 ffffffff81af93e0 ffffc9000062fc58 ffffffff81377905 [ 20.456539] Call Trace: [ 20.456539] [] dump_stack+0x63/0x7f [ 20.456539] [] check_preemption_disabled+0xd1/0xe3 [ 20.456539] [] debug_smp_processor_id+0x17/0x19 [ 20.460260] [] seg6_hmac_init+0xfa/0x192 [ipv6] [ 20.460260] [] seg6_init+0x39/0x6f [ipv6] [ 20.460260] [] inet6_init+0x21a/0x321 [ipv6] [ 20.460260] [] ? 0xffffffffa0061000 [ 20.460260] [] do_one_initcall+0x8b/0x115 [ 20.460260] [] do_init_module+0x53/0x1c4 [ 20.460260] [] load_module+0x1153/0x14ec [ 20.460260] [] SYSC_finit_module+0x8c/0xb9 [ 20.460260] [] ? SYSC_finit_module+0x8c/0xb9 [ 20.460260] [] SyS_finit_module+0x9/0xb [ 20.460260] [] do_syscall_64+0x62/0x75 [ 20.460260] [] entry_SYSCALL64_slow_path+0x25/0x25 Moreover, dst_cache_* functions also call smp_processor_id(), generating a similar trace. This patch uses raw_cpu_ptr() in seg6_hmac_init() rather than this_cpu_ptr() and disable preemption when using dst_cache_* functions. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- net/ipv6/seg6_hmac.c | 2 +- net/ipv6/seg6_iptunnel.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index ef1c8a46e7ac..03a064803626 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -400,7 +400,7 @@ static int seg6_hmac_init_algo(void) *p_tfm = tfm; } - p_tfm = this_cpu_ptr(algo->tfms); + p_tfm = raw_cpu_ptr(algo->tfms); tfm = *p_tfm; shsize = sizeof(*shash) + crypto_shash_descsize(tfm); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index bbfca22c34ae..1d60cb132835 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -265,7 +265,9 @@ int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); #ifdef CONFIG_DST_CACHE + preempt_disable(); dst = dst_cache_get(&slwt->cache); + preempt_enable(); #endif if (unlikely(!dst)) { @@ -286,7 +288,9 @@ int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) } #ifdef CONFIG_DST_CACHE + preempt_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + preempt_enable(); #endif } -- cgit v1.2.3 From 003c941057eaa868ca6fedd29a274c863167230d Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Jan 2017 14:24:58 -0800 Subject: tcp: fix tcp_fastopen unaligned access complaints on sparc Fix up a data alignment issue on sparc by swapping the order of the cookie byte array field with the length field in struct tcp_fastopen_cookie, and making it a proper union to clean up the typecasting. This addresses log complaints like these: log_unaligned: 113 callbacks suppressed Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360 Kernel unaligned access at TPC[9764ac] tcp_try_fastopen+0x2ec/0x360 Kernel unaligned access at TPC[9764c8] tcp_try_fastopen+0x308/0x360 Kernel unaligned access at TPC[9764e4] tcp_try_fastopen+0x324/0x360 Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360 Cc: Eric Dumazet Signed-off-by: Shannon Nelson Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 4e777a3243f9..f51919535ca7 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -113,7 +113,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, struct tcp_fastopen_cookie tmp; if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { - struct in6_addr *buf = (struct in6_addr *) tmp.val; + struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) -- cgit v1.2.3 From 75f01a4c9cc291ff5cb28ca1216adb163b7a20ee Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Thu, 12 Jan 2017 19:33:18 -0500 Subject: openvswitch: maintain correct checksum state in conntrack actions When executing conntrack actions on skbuffs with checksum mode CHECKSUM_COMPLETE, the checksum must be updated to account for header pushes and pulls. Otherwise we get "hw csum failure" logs similar to this (ICMP packet received on geneve tunnel via ixgbe NIC): [ 405.740065] genev_sys_6081: hw csum failure [ 405.740106] CPU: 3 PID: 0 Comm: swapper/3 Tainted: G I 4.10.0-rc3+ #1 [ 405.740108] Call Trace: [ 405.740110] [ 405.740113] dump_stack+0x63/0x87 [ 405.740116] netdev_rx_csum_fault+0x3a/0x40 [ 405.740118] __skb_checksum_complete+0xcf/0xe0 [ 405.740120] nf_ip_checksum+0xc8/0xf0 [ 405.740124] icmp_error+0x1de/0x351 [nf_conntrack_ipv4] [ 405.740132] nf_conntrack_in+0xe1/0x550 [nf_conntrack] [ 405.740137] ? find_bucket.isra.2+0x62/0x70 [openvswitch] [ 405.740143] __ovs_ct_lookup+0x95/0x980 [openvswitch] [ 405.740145] ? netif_rx_internal+0x44/0x110 [ 405.740149] ovs_ct_execute+0x147/0x4b0 [openvswitch] [ 405.740153] do_execute_actions+0x22e/0xa70 [openvswitch] [ 405.740157] ovs_execute_actions+0x40/0x120 [openvswitch] [ 405.740161] ovs_dp_process_packet+0x84/0x120 [openvswitch] [ 405.740166] ovs_vport_receive+0x73/0xd0 [openvswitch] [ 405.740168] ? udp_rcv+0x1a/0x20 [ 405.740170] ? ip_local_deliver_finish+0x93/0x1e0 [ 405.740172] ? ip_local_deliver+0x6f/0xe0 [ 405.740174] ? ip_rcv_finish+0x3a0/0x3a0 [ 405.740176] ? ip_rcv_finish+0xdb/0x3a0 [ 405.740177] ? ip_rcv+0x2a7/0x400 [ 405.740180] ? __netif_receive_skb_core+0x970/0xa00 [ 405.740185] netdev_frame_hook+0xd3/0x160 [openvswitch] [ 405.740187] __netif_receive_skb_core+0x1dc/0xa00 [ 405.740194] ? ixgbe_clean_rx_irq+0x46d/0xa20 [ixgbe] [ 405.740197] __netif_receive_skb+0x18/0x60 [ 405.740199] netif_receive_skb_internal+0x40/0xb0 [ 405.740201] napi_gro_receive+0xcd/0x120 [ 405.740204] gro_cell_poll+0x57/0x80 [geneve] [ 405.740206] net_rx_action+0x260/0x3c0 [ 405.740209] __do_softirq+0xc9/0x28c [ 405.740211] irq_exit+0xd9/0xf0 [ 405.740213] do_IRQ+0x51/0xd0 [ 405.740215] common_interrupt+0x93/0x93 Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Signed-off-by: Lance Richardson Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 6b78bab27755..54253ea5976e 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -514,7 +514,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, int hooknum, nh_off, err = NF_ACCEPT; nh_off = skb_network_offset(skb); - skb_pull(skb, nh_off); + skb_pull_rcsum(skb, nh_off); /* See HOOK2MANIP(). */ if (maniptype == NF_NAT_MANIP_SRC) @@ -579,6 +579,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, err = nf_nat_packet(ct, ctinfo, hooknum, skb); push: skb_push(skb, nh_off); + skb_postpush_rcsum(skb, skb->data, nh_off); return err; } @@ -886,7 +887,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, /* The conntrack module expects to be working at L3. */ nh_ofs = skb_network_offset(skb); - skb_pull(skb, nh_ofs); + skb_pull_rcsum(skb, nh_ofs); if (key->ip.frag != OVS_FRAG_TYPE_NONE) { err = handle_fragments(net, key, info->zone.id, skb); @@ -900,6 +901,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, err = ovs_ct_lookup(net, key, info, skb); skb_push(skb, nh_ofs); + skb_postpush_rcsum(skb, skb->data, nh_ofs); if (err) kfree_skb(skb); return err; -- cgit v1.2.3 From 6443ebc3fdd6f3c766d9442c18be274b3d736050 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 7 Jan 2017 20:49:18 +0800 Subject: netfilter: rpfilter: fix incorrect loopback packet judgment Currently, we check the existing rtable in PREROUTING hook, if RTCF_LOCAL is set, we assume that the packet is loopback. But this assumption is incorrect, for example, a packet encapsulated in ipsec transport mode was received and routed to local, after decapsulation, it would be delivered to local again, and the rtable was not dropped, so RTCF_LOCAL check would trigger. But actually, the packet was not loopback. So for these normal loopback packets, we can check whether the in device is IFF_LOOPBACK or not. For these locally generated broadcast/multicast, we can check whether the skb->pkt_type is PACKET_LOOPBACK or not. Finally, there's a subtle difference between nft fib expr and xtables rpfilter extension, user can add the following nft rule to do strict rpfilter check: # nft add rule x y meta iif eth0 fib saddr . iif oif != eth0 drop So when the packet is loopback, it's better to store the in device instead of the LOOPBACK_IFINDEX, otherwise, after adding the above nft rule, locally generated broad/multicast packets will be dropped incorrectly. Fixes: f83a7ea2075c ("netfilter: xt_rpfilter: skip locally generated broadcast/multicast, too") Fixes: f6d0cbcf09c5 ("netfilter: nf_tables: add fib expression") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_rpfilter.c | 8 ++++---- net/ipv4/netfilter/nft_fib_ipv4.c | 15 +++++---------- net/ipv6/netfilter/ip6t_rpfilter.c | 8 ++++---- net/ipv6/netfilter/nft_fib_ipv6.c | 13 ++++--------- 4 files changed, 17 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index f273098e48fd..37fb9552e858 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -63,10 +63,10 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, return dev_match || flags & XT_RPFILTER_LOOSE; } -static bool rpfilter_is_local(const struct sk_buff *skb) +static bool +rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in) { - const struct rtable *rt = skb_rtable(skb); - return rt && (rt->rt_flags & RTCF_LOCAL); + return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) @@ -79,7 +79,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) info = par->matchinfo; invert = info->flags & XT_RPFILTER_INVERT; - if (rpfilter_is_local(skb)) + if (rpfilter_is_loopback(skb, xt_in(par))) return true ^ invert; iph = ip_hdr(skb); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 965b1a161369..2981291910dd 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -26,13 +26,6 @@ static __be32 get_saddr(__be32 addr) return addr; } -static bool fib4_is_local(const struct sk_buff *skb) -{ - const struct rtable *rt = skb_rtable(skb); - - return rt && (rt->rt_flags & RTCF_LOCAL); -} - #define DSCP_BITS 0xfc void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, @@ -95,8 +88,10 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && fib4_is_local(pkt->skb)) { - nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX); + if (nft_hook(pkt) == NF_INET_PRE_ROUTING && + nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { + nft_fib_store_result(dest, priv->result, pkt, + nft_in(pkt)->ifindex); return; } @@ -131,7 +126,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, switch (res.type) { case RTN_UNICAST: break; - case RTN_LOCAL: /* should not appear here, see fib4_is_local() above */ + case RTN_LOCAL: /* Should not see RTN_LOCAL here */ return; default: break; diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index d5263dc364a9..b12e61b7b16c 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -72,10 +72,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, return ret; } -static bool rpfilter_is_local(const struct sk_buff *skb) +static bool +rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in) { - const struct rt6_info *rt = (const void *) skb_dst(skb); - return rt && (rt->rt6i_flags & RTF_LOCAL); + return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) @@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) struct ipv6hdr *iph; bool invert = info->flags & XT_RPFILTER_INVERT; - if (rpfilter_is_local(skb)) + if (rpfilter_is_loopback(skb, xt_in(par))) return true ^ invert; iph = ipv6_hdr(skb); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index c947aad8bcc6..765facf03d45 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -18,13 +18,6 @@ #include #include -static bool fib6_is_local(const struct sk_buff *skb) -{ - const struct rt6_info *rt = (const void *)skb_dst(skb); - - return rt && (rt->rt6i_flags & RTF_LOCAL); -} - static int get_ifindex(const struct net_device *dev) { return dev ? dev->ifindex : 0; @@ -164,8 +157,10 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif); - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && fib6_is_local(pkt->skb)) { - nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX); + if (nft_hook(pkt) == NF_INET_PRE_ROUTING && + nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { + nft_fib_store_result(dest, priv->result, pkt, + nft_in(pkt)->ifindex); return; } -- cgit v1.2.3 From d21e540b4dd74a26df7a66ebab75c693a4a6a861 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 7 Jan 2017 20:51:50 +0800 Subject: netfilter: nf_tables: fix possible oops when dumping stateful objects When dumping nft stateful objects, if NFTA_OBJ_TABLE and NFTA_OBJ_TYPE attributes are not specified either, filter will become NULL, so oops will happen(actually nft utility will always set NFTA_OBJ_TABLE attr, so I write a test program to make this happen): BUG: unable to handle kernel NULL pointer dereference at (null) IP: nf_tables_dump_obj+0x17c/0x330 [nf_tables] [...] Call Trace: ? nf_tables_dump_obj+0x5/0x330 [nf_tables] ? __kmalloc_reserve.isra.35+0x31/0x90 ? __alloc_skb+0x5b/0x1e0 netlink_dump+0x124/0x2a0 __netlink_dump_start+0x161/0x190 nf_tables_getobj+0xe8/0x280 [nf_tables] Fixes: a9fea2a3c3cf ("netfilter: nf_tables: allow to filter stateful object dumps by type") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0db5f9782265..091d2dcc63b2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4262,10 +4262,11 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (filter->table[0] && + if (filter && filter->table[0] && strcmp(filter->table, table->name)) goto cont; - if (filter->type != NFT_OBJECT_UNSPEC && + if (filter && + filter->type != NFT_OBJECT_UNSPEC && obj->type->type != filter->type) goto cont; -- cgit v1.2.3 From e4670b058af64639ec1aef4db845c39bfdfff7c4 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Mon, 9 Jan 2017 17:24:18 -0500 Subject: netfilter: Fix typo in NF_CONNTRACK Kconfig option description The NF_CONNTRACK Kconfig option description makes an incorrect reference to the "meta" expression where the "ct" expression would be correct.This patch fixes the respective typographical error. Fixes: d497c6352736 ("netfilter: add help information to new nf_tables Kconfig options") Signed-off-by: William Breathitt Gray Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 63729b489c2c..bbc45f8a7b2d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -494,7 +494,7 @@ config NFT_CT depends on NF_CONNTRACK tristate "Netfilter nf_tables conntrack module" help - This option adds the "meta" expression that you can use to match + This option adds the "ct" expression that you can use to match connection tracking information such as the flow state. config NFT_SET_RBTREE -- cgit v1.2.3 From 1666d49e1d416fcc2cce708242a52fe3317ea8ba Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 12 Jan 2017 21:19:37 +0800 Subject: mld: do not remove mld souce list info when set link down This is an IPv6 version of commit 24803f38a5c0 ("igmp: do not remove igmp souce list..."). In mld_del_delrec(), we will restore back all source filter info instead of flush them. Move mld_clear_delrec() from ipv6_mc_down() to ipv6_mc_destroy_dev() since we should not remove source list info when set link down. Remove igmp6_group_dropped() in ipv6_mc_destroy_dev() since we have called it in ipv6_mc_down(). Also clear all source info after igmp6_group_dropped() instead of in it because ipv6_mc_down() will call igmp6_group_dropped(). Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 14a3903f1c82..7139fffd61b6 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -81,7 +81,7 @@ static void mld_gq_timer_expire(unsigned long data); static void mld_ifc_timer_expire(unsigned long data); static void mld_ifc_event(struct inet6_dev *idev); static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); -static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr); +static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); static void mld_clear_delrec(struct inet6_dev *idev); static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); @@ -692,9 +692,9 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) dev_mc_del(dev, buf); } - if (mc->mca_flags & MAF_NOREPORT) - goto done; spin_unlock_bh(&mc->mca_lock); + if (mc->mca_flags & MAF_NOREPORT) + return; if (!mc->idev->dead) igmp6_leave_group(mc); @@ -702,8 +702,6 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) spin_lock_bh(&mc->mca_lock); if (del_timer(&mc->mca_timer)) atomic_dec(&mc->mca_refcnt); -done: - ip6_mc_clear_src(mc); spin_unlock_bh(&mc->mca_lock); } @@ -748,10 +746,11 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) spin_unlock_bh(&idev->mc_lock); } -static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) +static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ifmcaddr6 *pmc, *pmc_prev; - struct ip6_sf_list *psf, *psf_next; + struct ip6_sf_list *psf; + struct in6_addr *pmca = &im->mca_addr; spin_lock_bh(&idev->mc_lock); pmc_prev = NULL; @@ -768,14 +767,20 @@ static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) } spin_unlock_bh(&idev->mc_lock); + spin_lock_bh(&im->mca_lock); if (pmc) { - for (psf = pmc->mca_tomb; psf; psf = psf_next) { - psf_next = psf->sf_next; - kfree(psf); + im->idev = pmc->idev; + im->mca_crcount = idev->mc_qrv; + im->mca_sfmode = pmc->mca_sfmode; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + im->mca_tomb = pmc->mca_tomb; + im->mca_sources = pmc->mca_sources; + for (psf = im->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = im->mca_crcount; } in6_dev_put(pmc->idev); - kfree(pmc); } + spin_unlock_bh(&im->mca_lock); } static void mld_clear_delrec(struct inet6_dev *idev) @@ -904,7 +909,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) mca_get(mc); write_unlock_bh(&idev->lock); - mld_del_delrec(idev, &mc->mca_addr); + mld_del_delrec(idev, mc); igmp6_group_added(mc); ma_put(mc); return 0; @@ -927,6 +932,7 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) write_unlock_bh(&idev->lock); igmp6_group_dropped(ma); + ip6_mc_clear_src(ma); ma_put(ma); return 0; @@ -2501,15 +2507,17 @@ void ipv6_mc_down(struct inet6_dev *idev) /* Withdraw multicast list */ read_lock_bh(&idev->lock); - mld_ifc_stop_timer(idev); - mld_gq_stop_timer(idev); - mld_dad_stop_timer(idev); for (i = idev->mc_list; i; i = i->next) igmp6_group_dropped(i); - read_unlock_bh(&idev->lock); - mld_clear_delrec(idev); + /* Should stop timer after group drop. or we will + * start timer again in mld_ifc_event() + */ + mld_ifc_stop_timer(idev); + mld_gq_stop_timer(idev); + mld_dad_stop_timer(idev); + read_unlock_bh(&idev->lock); } static void ipv6_mc_reset(struct inet6_dev *idev) @@ -2531,8 +2539,10 @@ void ipv6_mc_up(struct inet6_dev *idev) read_lock_bh(&idev->lock); ipv6_mc_reset(idev); - for (i = idev->mc_list; i; i = i->next) + for (i = idev->mc_list; i; i = i->next) { + mld_del_delrec(idev, i); igmp6_group_added(i); + } read_unlock_bh(&idev->lock); } @@ -2565,6 +2575,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) /* Deactivate timers */ ipv6_mc_down(idev); + mld_clear_delrec(idev); /* Delete all-nodes address. */ /* We cannot call ipv6_dev_mc_dec() directly, our caller in @@ -2579,11 +2590,9 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) write_lock_bh(&idev->lock); while ((i = idev->mc_list) != NULL) { idev->mc_list = i->next; - write_unlock_bh(&idev->lock); - igmp6_group_dropped(i); + write_unlock_bh(&idev->lock); ma_put(i); - write_lock_bh(&idev->lock); } write_unlock_bh(&idev->lock); -- cgit v1.2.3 From 02ca0423fd65a0a9c4d70da0dbb8f4b8503f08c7 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 13 Jan 2017 10:12:20 +0100 Subject: ip6_tunnel: Account for tunnel header in tunnel MTU With ip6gre we have a tunnel header which also makes the tunnel MTU smaller. We need to reserve room for it. Previously we were using up space reserved for the Tunnel Encapsulation Limit option header (RFC 2473). Also, after commit b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE functions") our contract with the caller has changed. Now we check if the packet length exceeds the tunnel MTU after the tunnel header has been pushed, unlike before. This is reflected in the check where we look at the packet length minus the size of the tunnel header, which is already accounted for in tunnel MTU. Fixes: b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE functions") Signed-off-by: Jakub Sitnicki Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 36d292180942..753d6d0860fb 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1108,7 +1108,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen; + mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1117,7 +1117,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; -- cgit v1.2.3 From 57d5f64d83ab5b5a5118b1597386dd76eaf4340d Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Fri, 13 Jan 2017 15:46:25 +0100 Subject: tipc: allocate user memory with GFP_KERNEL flag Until now, we allocate memory always with GFP_ATOMIC flag. When the system is under memory pressure and a user tries to send, the send fails due to low memory. However, the user application can wait for free memory if we allocate it using GFP_KERNEL flag. In this commit, we use allocate memory with GFP_KERNEL for all user allocation. Reported-by: Rune Torgersen Acked-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/discover.c | 4 ++-- net/tipc/link.c | 2 +- net/tipc/msg.c | 16 ++++++++-------- net/tipc/msg.h | 2 +- net/tipc/name_distr.c | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 6b109a808d4c..02462d67d191 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -169,7 +169,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, /* Send response, if necessary */ if (respond && (mtyp == DSC_REQ_MSG)) { - rskb = tipc_buf_acquire(MAX_H_SIZE); + rskb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); if (!rskb) return; tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer); @@ -278,7 +278,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b, req = kmalloc(sizeof(*req), GFP_ATOMIC); if (!req) return -ENOMEM; - req->buf = tipc_buf_acquire(MAX_H_SIZE); + req->buf = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); if (!req->buf) { kfree(req); return -ENOMEM; diff --git a/net/tipc/link.c b/net/tipc/link.c index bda89bf9f4ff..4e8647aef01c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1395,7 +1395,7 @@ tnl: msg_set_seqno(hdr, seqno++); pktlen = msg_size(hdr); msg_set_size(&tnlhdr, pktlen + INT_H_SIZE); - tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE); + tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC); if (!tnlskb) { pr_warn("%sunable to send packet\n", link_co_err); return; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index a22be502f1bd..ab02d0742476 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -58,12 +58,12 @@ static unsigned int align(unsigned int i) * NOTE: Headroom is reserved to allow prepending of a data link header. * There may also be unrequested tailroom present at the buffer's end. */ -struct sk_buff *tipc_buf_acquire(u32 size) +struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; - skb = alloc_skb_fclone(buf_size, GFP_ATOMIC); + skb = alloc_skb_fclone(buf_size, gfp); if (skb) { skb_reserve(skb, BUF_HEADROOM); skb_put(skb, size); @@ -95,7 +95,7 @@ struct sk_buff *tipc_msg_create(uint user, uint type, struct tipc_msg *msg; struct sk_buff *buf; - buf = tipc_buf_acquire(hdr_sz + data_sz); + buf = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC); if (unlikely(!buf)) return NULL; @@ -261,7 +261,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, /* No fragmentation needed? */ if (likely(msz <= pktmax)) { - skb = tipc_buf_acquire(msz); + skb = tipc_buf_acquire(msz, GFP_KERNEL); if (unlikely(!skb)) return -ENOMEM; skb_orphan(skb); @@ -282,7 +282,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, msg_set_importance(&pkthdr, msg_importance(mhdr)); /* Prepare first fragment */ - skb = tipc_buf_acquire(pktmax); + skb = tipc_buf_acquire(pktmax, GFP_KERNEL); if (!skb) return -ENOMEM; skb_orphan(skb); @@ -313,7 +313,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, pktsz = drem + INT_H_SIZE; else pktsz = pktmax; - skb = tipc_buf_acquire(pktsz); + skb = tipc_buf_acquire(pktsz, GFP_KERNEL); if (!skb) { rc = -ENOMEM; goto error; @@ -448,7 +448,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, if (msz > (max / 2)) return false; - _skb = tipc_buf_acquire(max); + _skb = tipc_buf_acquire(max, GFP_ATOMIC); if (!_skb) return false; @@ -496,7 +496,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) /* Never return SHORT header; expand by replacing buffer if necessary */ if (msg_short(hdr)) { - *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen); + *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC); if (!*skb) goto exit; memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 8d408612ffa4..2c3dc38abf9c 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -820,7 +820,7 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } -struct sk_buff *tipc_buf_acquire(u32 size); +struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index c1cfd92de17a..23f8899e0f8c 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -69,7 +69,7 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size); + struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); struct tipc_msg *msg; if (buf != NULL) { -- cgit v1.2.3 From f1f7714ea51c56b7163fb1a5acf39c6a204dd758 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Jan 2017 23:38:15 +0100 Subject: bpf: rework prog_digest into prog_tag Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink") was recently discussed, partially due to admittedly suboptimal name of "prog_digest" in combination with sha1 hash usage, thus inevitably and rightfully concerns about its security in terms of collision resistance were raised with regards to use-cases. The intended use cases are for debugging resp. introspection only for providing a stable "tag" over the instruction sequence that both kernel and user space can calculate independently. It's not usable at all for making a security relevant decision. So collisions where two different instruction sequences generate the same tag can happen, but ideally at a rather low rate. The "tag" will be dumped in hex and is short enough to introspect in tracepoints or kallsyms output along with other data such as stack trace, etc. Thus, this patch performs a rename into prog_tag and truncates the tag to a short output (64 bits) to make it obvious it's not collision-free. Should in future a hash or facility be needed with a security relevant focus, then we can think about requirements, constraints, etc that would fit to that situation. For now, rework the exposed parts for the current use cases as long as nothing has been released yet. Tested on x86_64 and s390x. Fixes: 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 5 ++--- net/sched/cls_bpf.c | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 1c60317f0121..520baa41cba3 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -123,12 +123,11 @@ static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; - nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST, - sizeof(prog->filter->digest)); + nla = nla_reserve(skb, TCA_ACT_BPF_TAG, sizeof(prog->filter->tag)); if (nla == NULL) return -EMSGSIZE; - memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + memcpy(nla_data(nla), prog->filter->tag, nla_len(nla)); return 0; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index adc776048d1a..d9c97018317d 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -555,11 +555,11 @@ static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; - nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest)); + nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag)); if (nla == NULL) return -EMSGSIZE; - memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + memcpy(nla_data(nla), prog->filter->tag, nla_len(nla)); return 0; } -- cgit v1.2.3 From 8a367e74c0120ef68c8c70d5a025648c96626dff Mon Sep 17 00:00:00 2001 From: Basil Gunn Date: Sat, 14 Jan 2017 12:18:55 -0800 Subject: ax25: Fix segfault after sock connection timeout The ax.25 socket connection timed out & the sock struct has been previously taken down ie. sock struct is now a NULL pointer. Checking the sock_flag causes the segfault. Check if the socket struct pointer is NULL before checking sock_flag. This segfault is seen in timed out netrom connections. Please submit to -stable. Signed-off-by: Basil Gunn Signed-off-by: David S. Miller --- net/ax25/ax25_subr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 4855d18a8511..038b109b2be7 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -264,7 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason) { ax25_clear_queues(ax25); - if (!sock_flag(ax25->sk, SOCK_DESTROY)) + if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) ax25_stop_heartbeat(ax25); ax25_stop_t1timer(ax25); ax25_stop_t2timer(ax25); -- cgit v1.2.3 From 0faa9cb5b3836a979864a6357e01d2046884ad52 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 15 Jan 2017 10:14:06 -0500 Subject: net sched actions: fix refcnt when GETing of action after bind Demonstrating the issue: .. add a drop action $sudo $TC actions add action drop index 10 .. retrieve it $ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 2 bind 0 installed 29 sec used 29 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 ... bug 1 above: reference is two. Reference is actually 1 but we forget to subtract 1. ... do a GET again and we see the same issue try a few times and nothing changes ~$ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 2 bind 0 installed 31 sec used 31 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 ... lets try to bind the action to a filter.. $ sudo $TC qdisc add dev lo ingress $ sudo $TC filter add dev lo parent ffff: protocol ip prio 1 \ u32 match ip dst 127.0.0.1/32 flowid 1:1 action gact index 10 ... and now a few GETs: $ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 3 bind 1 installed 204 sec used 204 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 $ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 4 bind 1 installed 206 sec used 206 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 $ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 5 bind 1 installed 235 sec used 235 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 .... as can be observed the reference count keeps going up. After the fix $ sudo $TC actions add action drop index 10 $ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 1 bind 0 installed 4 sec used 4 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 $ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 1 bind 0 installed 6 sec used 6 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 $ sudo $TC qdisc add dev lo ingress $ sudo $TC filter add dev lo parent ffff: protocol ip prio 1 \ u32 match ip dst 127.0.0.1/32 flowid 1:1 action gact index 10 $ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 2 bind 1 installed 32 sec used 32 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 $ sudo $TC -s actions get action gact index 10 action order 1: gact action drop random type none pass val 0 index 10 ref 2 bind 1 installed 33 sec used 33 sec Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 Fixes: aecc5cefc389 ("net sched actions: fix GETing actions") Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_api.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2095c83ce773..e10456ef6f7a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -900,8 +900,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } act->order = i; - if (event == RTM_GETACTION) - act->tcfa_refcnt++; list_add_tail(&act->list, &actions); } @@ -914,7 +912,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; } err: - tcf_action_destroy(&actions, 0); + if (event != RTM_GETACTION) + tcf_action_destroy(&actions, 0); return ret; } -- cgit v1.2.3 From 124f930b8cbc4ac11236e6eb1c5f008318864588 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2017 19:16:46 +0100 Subject: libceph: make sure ceph_aes_crypt() IV is aligned ... otherwise the crypto stack will align it for us with a GFP_ATOMIC allocation and a memcpy() -- see skcipher_walk_first(). Signed-off-by: Ilya Dryomov --- net/ceph/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 3949ce70be07..292e33bd916e 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -214,7 +214,7 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt, SKCIPHER_REQUEST_ON_STACK(req, key->tfm); struct sg_table sgt; struct scatterlist prealloc_sg; - char iv[AES_BLOCK_SIZE]; + char iv[AES_BLOCK_SIZE] __aligned(8); int pad_byte = AES_BLOCK_SIZE - (in_len & (AES_BLOCK_SIZE - 1)); int crypt_len = encrypt ? in_len + pad_byte : in_len; int ret; -- cgit v1.2.3 From 31a86d137219373c3222ca5f4f912e9a4d8065bb Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 17 Jan 2017 19:19:17 +0200 Subject: net: ethtool: Initialize buffer when querying device channel settings Ethtool channels respond struct was uninitialized when querying device channel boundaries settings. As a result, unreported fields by the driver hold garbage. This may cause sending unsupported params to driver. Fixes: 8bf368620486 ('ethtool: ensure channel counts are within bounds ...') Signed-off-by: Eran Ben Elisha Signed-off-by: Tariq Toukan CC: John W. Linville Signed-off-by: David S. Miller --- net/core/ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e23766c7e3ba..236a21e3c878 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1712,7 +1712,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels, max; + struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; u32 max_rx_in_use = 0; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) -- cgit v1.2.3 From 3fd0b634de7d6b9a85f34a4cf9d8afc1df465cc9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jan 2017 16:41:03 +0100 Subject: netfilter: ipt_CLUSTERIP: fix build error without procfs We can't access c->pde if CONFIG_PROC_FS is disabled: net/ipv4/netfilter/ipt_CLUSTERIP.c: In function 'clusterip_config_find_get': net/ipv4/netfilter/ipt_CLUSTERIP.c:147:9: error: 'struct clusterip_config' has no member named 'pde' This moves the check inside of another #ifdef. Fixes: 6c5d5cfbe3c5 ("netfilter: ipt_CLUSTERIP: check duplicate config when initializing") Signed-off-by: Arnd Bergmann Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index a6b8c1a4102b..0a783cd73faf 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -144,7 +144,12 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) rcu_read_lock_bh(); c = __clusterip_config_find(net, clusterip); if (c) { - if (!c->pde || unlikely(!atomic_inc_not_zero(&c->refcount))) +#ifdef CONFIG_PROC_FS + if (!c->pde) + c = NULL; + else +#endif + if (unlikely(!atomic_inc_not_zero(&c->refcount))) c = NULL; else if (entry) atomic_inc(&c->entries); -- cgit v1.2.3 From 7be2c82cfd5d28d7adb66821a992604eb6dd112e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Jan 2017 12:12:17 -0800 Subject: net: fix harmonize_features() vs NETIF_F_HIGHDMA Ashizuka reported a highmem oddity and sent a patch for freescale fec driver. But the problem root cause is that core networking stack must ensure no skb with highmem fragment is ever sent through a device that does not assert NETIF_F_HIGHDMA in its features. We need to call illegal_highdma() from harmonize_features() regardless of CSUM checks. Fixes: ec5f06156423 ("net: Kill link between CSUM and SG features.") Signed-off-by: Eric Dumazet Cc: Pravin Shelar Reported-by: "Ashizuka, Yuusuke" Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 07b307b0b414..7f218e095361 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2795,9 +2795,9 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); - } else if (illegal_highdma(skb->dev, skb)) { - features &= ~NETIF_F_SG; } + if (illegal_highdma(skb->dev, skb)) + features &= ~NETIF_F_SG; return features; } -- cgit v1.2.3 From 9ed59592e3e379b2e9557dc1d9e9ec8fcbb33f16 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 17 Jan 2017 14:57:36 -0800 Subject: lwtunnel: fix autoload of lwt modules Trying to add an mpls encap route when the MPLS modules are not loaded hangs. For example: CONFIG_MPLS=y CONFIG_NET_MPLS_GSO=m CONFIG_MPLS_ROUTING=m CONFIG_MPLS_IPTUNNEL=m $ ip route add 10.10.10.10/32 encap mpls 100 via inet 10.100.1.2 The ip command hangs: root 880 826 0 21:25 pts/0 00:00:00 ip route add 10.10.10.10/32 encap mpls 100 via inet 10.100.1.2 $ cat /proc/880/stack [] call_usermodehelper_exec+0xd6/0x134 [] __request_module+0x27b/0x30a [] lwtunnel_build_state+0xe4/0x178 [] fib_create_info+0x47f/0xdd4 [] fib_table_insert+0x90/0x41f [] inet_rtm_newroute+0x4b/0x52 ... modprobe is trying to load rtnl-lwt-MPLS: root 881 5 0 21:25 ? 00:00:00 /sbin/modprobe -q -- rtnl-lwt-MPLS and it hangs after loading mpls_router: $ cat /proc/881/stack [] rtnl_lock+0x12/0x14 [] register_netdevice_notifier+0x16/0x179 [] mpls_init+0x25/0x1000 [mpls_router] [] do_one_initcall+0x8e/0x13f [] do_init_module+0x5a/0x1e5 [] load_module+0x13bd/0x17d6 ... The problem is that lwtunnel_build_state is called with rtnl lock held preventing mpls_init from registering. Given the potential references held by the time lwtunnel_build_state it can not drop the rtnl lock to the load module. So, extract the module loading code from lwtunnel_build_state into a new function to validate the encap type. The new function is called while converting the user request into a fib_config which is well before any table, device or fib entries are examined. Fixes: 745041e2aaf1 ("lwtunnel: autoload of lwt modules") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/lwtunnel.c | 62 ++++++++++++++++++++++++++++++++++++++++++++----- net/ipv4/fib_frontend.c | 8 +++++++ net/ipv6/route.c | 12 +++++++++- 3 files changed, 75 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index a5d4e866ce88..47b1dd65947b 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -26,6 +26,7 @@ #include #include #include +#include #ifdef CONFIG_MODULES @@ -114,25 +115,74 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, family, cfg, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_valid_encap_type(u16 encap_type) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + rcu_read_unlock(); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - rcu_read_unlock(); + __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); + rtnl_lock(); + rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); + rcu_read_unlock(); } } #endif - if (likely(ops && ops->build_state)) - ret = ops->build_state(dev, encap, family, cfg, lws); - rcu_read_unlock(); + return ops ? 0 : -EOPNOTSUPP; +} +EXPORT_SYMBOL(lwtunnel_valid_encap_type); - return ret; +int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) +{ + struct rtnexthop *rtnh = (struct rtnexthop *)attr; + struct nlattr *nla_entype; + struct nlattr *attrs; + struct nlattr *nla; + u16 encap_type; + int attrlen; + + while (rtnh_ok(rtnh, remaining)) { + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + attrs = rtnh_attrs(rtnh); + nla = nla_find(attrs, attrlen, RTA_ENCAP); + nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + + if (nla_entype) { + encap_type = nla_get_u16(nla_entype); + + if (lwtunnel_valid_encap_type(encap_type) != 0) + return -EOPNOTSUPP; + } + } + rtnh = rtnh_next(rtnh, &remaining); + } + + return 0; } -EXPORT_SYMBOL(lwtunnel_build_state); +EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr); void lwtstate_free(struct lwtunnel_state *lws) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index eae0332b0e8c..7db2ad2e82d3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -677,6 +678,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: + err = lwtunnel_valid_encap_type_attr(nla_data(attr), + nla_len(attr)); + if (err < 0) + goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; @@ -691,6 +696,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type); + if (err < 0) + goto errout; break; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4f6b067c8753..7ea85370c11c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2896,6 +2896,11 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_MULTIPATH]) { cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); + + err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, + cfg->fc_mp_len); + if (err < 0) + goto errout; } if (tb[RTA_PREF]) { @@ -2909,9 +2914,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP]) cfg->fc_encap = tb[RTA_ENCAP]; - if (tb[RTA_ENCAP_TYPE]) + if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type); + if (err < 0) + goto errout; + } + if (tb[RTA_EXPIRES]) { unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); -- cgit v1.2.3 From 524b698db06b9b6da7192e749f637904e2f62d7b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 16 Jan 2017 18:24:56 +0100 Subject: netfilter: conntrack: remove GC_MAX_EVICTS break Instead of breaking loop and instant resched, don't bother checking this in first place (the loop calls cond_resched for every bucket anyway). Suggested-by: Nicolas Dichtel Signed-off-by: Florian Westphal Acked-by: Nicolas Dichtel Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3a073cd9fcf4..6feb5d370319 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -88,8 +88,6 @@ static __read_mostly bool nf_conntrack_locks_all; #define GC_MAX_BUCKETS_DIV 64u /* upper bound of scan intervals */ #define GC_INTERVAL_MAX (2 * HZ) -/* maximum conntracks to evict per gc run */ -#define GC_MAX_EVICTS 256u static struct conntrack_gc_work conntrack_gc_work; @@ -979,8 +977,7 @@ static void gc_worker(struct work_struct *work) */ rcu_read_unlock(); cond_resched_rcu_qs(); - } while (++buckets < goal && - expired_count < GC_MAX_EVICTS); + } while (++buckets < goal); if (gc_work->exiting) return; @@ -1005,7 +1002,7 @@ static void gc_worker(struct work_struct *work) * In case we have lots of evictions next scan is done immediately. */ ratio = scanned ? expired_count * 100 / scanned : 0; - if (ratio >= 90 || expired_count == GC_MAX_EVICTS) { + if (ratio >= 90) { gc_work->next_gc_run = 0; next_run = 0; } else if (expired_count) { -- cgit v1.2.3 From e5072053b09642b8ff417d47da05b84720aea3ee Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Jan 2017 02:01:22 +0100 Subject: netfilter: conntrack: refine gc worker heuristics, redux This further refines the changes made to conntrack gc_worker in commit e0df8cae6c16 ("netfilter: conntrack: refine gc worker heuristics"). The main idea of that change was to reduce the scan interval when evictions take place. However, on the reporters' setup, there are 1-2 million conntrack entries in total and roughly 8k new (and closing) connections per second. In this case we'll always evict at least one entry per gc cycle and scan interval is always at 1 jiffy because of this test: } else if (expired_count) { gc_work->next_gc_run /= 2U; next_run = msecs_to_jiffies(1); being true almost all the time. Given we scan ~10k entries per run its clearly wrong to reduce interval based on nonzero eviction count, it will only waste cpu cycles since a vast majorities of conntracks are not timed out. Thus only look at the ratio (scanned entries vs. evicted entries) to make a decision on whether to reduce or not. Because evictor is supposed to only kick in when system turns idle after a busy period, pick a high ratio -- this makes it 50%. We thus keep the idea of increasing scan rate when its likely that table contains many expired entries. In order to not let timed-out entries hang around for too long (important when using event logging, in which case we want to timely destroy events), we now scan the full table within at most GC_MAX_SCAN_JIFFIES (16 seconds) even in worst-case scenario where all timed-out entries sit in same slot. I tested this with a vm under synflood (with sysctl net.netfilter.nf_conntrack_tcp_timeout_syn_recv=3). While flood is ongoing, interval now stays at its max rate (GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV -> 125ms). With feedback from Nicolas Dichtel. Reported-by: Denys Fedoryshchenko Cc: Nicolas Dichtel Fixes: b87a2f9199ea82eaadc ("netfilter: conntrack: add gc worker to remove timed-out entries") Signed-off-by: Florian Westphal Tested-by: Nicolas Dichtel Acked-by: Nicolas Dichtel Tested-by: Denys Fedoryshchenko Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6feb5d370319..4e8083c5e01d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -85,9 +85,11 @@ static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; /* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ -#define GC_MAX_BUCKETS_DIV 64u -/* upper bound of scan intervals */ -#define GC_INTERVAL_MAX (2 * HZ) +#define GC_MAX_BUCKETS_DIV 128u +/* upper bound of full table scan */ +#define GC_MAX_SCAN_JIFFIES (16u * HZ) +/* desired ratio of entries found to be expired */ +#define GC_EVICT_RATIO 50u static struct conntrack_gc_work conntrack_gc_work; @@ -936,6 +938,7 @@ static noinline int early_drop(struct net *net, unsigned int _hash) static void gc_worker(struct work_struct *work) { + unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); unsigned int i, goal, buckets = 0, expired_count = 0; struct conntrack_gc_work *gc_work; unsigned int ratio, scanned = 0; @@ -994,27 +997,25 @@ static void gc_worker(struct work_struct *work) * 1. Minimize time until we notice a stale entry * 2. Maximize scan intervals to not waste cycles * - * Normally, expired_count will be 0, this increases the next_run time - * to priorize 2) above. + * Normally, expire ratio will be close to 0. * - * As soon as a timed-out entry is found, move towards 1) and increase - * the scan frequency. - * In case we have lots of evictions next scan is done immediately. + * As soon as a sizeable fraction of the entries have expired + * increase scan frequency. */ ratio = scanned ? expired_count * 100 / scanned : 0; - if (ratio >= 90) { - gc_work->next_gc_run = 0; - next_run = 0; - } else if (expired_count) { - gc_work->next_gc_run /= 2U; - next_run = msecs_to_jiffies(1); + if (ratio > GC_EVICT_RATIO) { + gc_work->next_gc_run = min_interval; } else { - if (gc_work->next_gc_run < GC_INTERVAL_MAX) - gc_work->next_gc_run += msecs_to_jiffies(1); + unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; - next_run = gc_work->next_gc_run; + BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); + + gc_work->next_gc_run += min_interval; + if (gc_work->next_gc_run > max) + gc_work->next_gc_run = max; } + next_run = gc_work->next_gc_run; gc_work->last_bucket = i; queue_delayed_work(system_long_wq, &gc_work->dwork, next_run); } @@ -1022,7 +1023,7 @@ static void gc_worker(struct work_struct *work) static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); - gc_work->next_gc_run = GC_INTERVAL_MAX; + gc_work->next_gc_run = HZ; gc_work->exiting = false; } @@ -1914,7 +1915,7 @@ int nf_conntrack_init_start(void) nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); conntrack_gc_work_init(&conntrack_gc_work); - queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, GC_INTERVAL_MAX); + queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ); return 0; -- cgit v1.2.3 From 03e4deff4987f79c34112c5ba4eb195d4f9382b0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 19 Jan 2017 16:26:21 +0800 Subject: ipv6: addrconf: Avoid addrconf_disable_change() using RCU read-side lock Just like commit 4acd4945cd1e ("ipv6: addrconf: Avoid calling netdevice notifiers with RCU read-side lock"), it is unnecessary to make addrconf_disable_change() use RCU iteration over the netdev list, since it already holds the RTNL lock, or we may meet Illegal context switch in RCU read-side critical section. Signed-off-by: Kefeng Wang Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c1e124bc8e1e..f60e88e56255 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5540,8 +5540,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf) struct net_device *dev; struct inet6_dev *idev; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.disable_ipv6) ^ (!newf); @@ -5550,7 +5549,6 @@ static void addrconf_disable_change(struct net *net, __s32 newf) dev_disable_change(idev); } } - rcu_read_unlock(); } static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) -- cgit v1.2.3 From 0dbd7ff3ac5017a46033a9d0a87a8267d69119d9 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 19 Jan 2017 16:36:39 +0300 Subject: tcp: initialize max window for a new fastopen socket Found that if we run LTP netstress test with large MSS (65K), the first attempt from server to send data comparable to this MSS on fastopen connection will be delayed by the probe timer. Here is an example: < S seq 0:0 win 43690 options [mss 65495 wscale 7 tfo cookie] length 32 > S. seq 0:0 ack 1 win 43690 options [mss 65495 wscale 7] length 0 < . ack 1 win 342 length 0 Inside tcp_sendmsg(), tcp_send_mss() returns max MSS in 'mss_now', as well as in 'size_goal'. This results the segment not queued for transmition until all the data copied from user buffer. Then, inside __tcp_push_pending_frames(), it breaks on send window test and continues with the check probe timer. Fragmentation occurs in tcp_write_wakeup()... +0.2 > P. seq 1:43777 ack 1 win 342 length 43776 < . ack 43777, win 1365 length 0 > P. seq 43777:65001 ack 1 win 342 options [...] length 21224 ... This also contradicts with the fact that we should bound to the half of the window if it is large. Fix this flaw by correctly initializing max_window. Before that, it could have large values that affect further calculations of 'size_goal'. Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Signed-off-by: Alexey Kodanev Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_fastopen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f51919535ca7..dd2560c83a85 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -205,6 +205,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, * scaled. So correct it appropriately. */ tp->snd_wnd = ntohs(tcp_hdr(skb)->window); + tp->max_window = tp->snd_wnd; /* Activate the retrans timer so that SYNACK can be retransmitted. * The request socket is not added to the ehash -- cgit v1.2.3 From 6391a4481ba0796805d6581e42f9f0418c099e34 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 20 Jan 2017 14:32:42 +0800 Subject: virtio-net: restore VIRTIO_HDR_F_DATA_VALID on receiving Commit 501db511397f ("virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit") in fact disables VIRTIO_HDR_F_DATA_VALID on receiving path too, fixing this by adding a hint (has_data_valid) and set it only on the receiving path. Cc: Rolf Neugebauer Signed-off-by: Jason Wang Acked-by: Rolf Neugebauer Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b9e1a13b4ba3..3d555c79a7b5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1976,7 +1976,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, return -EINVAL; *len -= sizeof(vnet_hdr); - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le())) + if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); @@ -2237,7 +2237,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (po->has_vnet_hdr) { if (virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), - vio_le())) { + vio_le(), true)) { spin_lock(&sk->sk_receive_queue.lock); goto drop_n_account; } -- cgit v1.2.3 From e363116b90906f326c9cde5473b4b9a99ba476df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jan 2017 07:57:42 -0800 Subject: ipv6: seg6_genl_set_tunsrc() must check kmemdup() return value seg6_genl_get_tunsrc() and set_tun_src() do not handle tun_src being possibly NULL, so we must check kmemdup() return value and abort if it is NULL Fixes: 915d7e5e5930 ("ipv6: sr: add code base for control plane support of SR-IPv6") Signed-off-by: Eric Dumazet Cc: David Lebrun Acked-by: David Lebrun Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index b172d85c650a..a855eb325b03 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -176,6 +176,8 @@ static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info) val = nla_data(info->attrs[SEG6_ATTR_DST]); t_new = kmemdup(val, sizeof(*val), GFP_KERNEL); + if (!t_new) + return -ENOMEM; mutex_lock(&sdata->lock); -- cgit v1.2.3 From 0e73fc9a56f22f2eec4d2b2910c649f7af67b74d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 20 Jan 2017 13:01:57 +0000 Subject: net: sctp: fix array overrun read on sctp_timer_tbl The comparison on the timeout can lead to an array overrun read on sctp_timer_tbl because of an off-by-one error. Fix this by using < instead of <= and also compare to the array size rather than SCTP_EVENT_TIMEOUT_MAX. Fixes CoverityScan CID#1397639 ("Out-of-bounds read") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/sctp/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 95d7b15dad21..e371a0d90068 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -166,7 +166,7 @@ static const char *const sctp_timer_tbl[] = { /* Lookup timer debug name. */ const char *sctp_tname(const sctp_subtype_t id) { - if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX) + if (id.timeout < ARRAY_SIZE(sctp_timer_tbl)) return sctp_timer_tbl[id.timeout]; return "unknown_timer"; } -- cgit v1.2.3 From 91e744653cb80554f3fdfd1d31c5ddf7b6169f37 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 20 Jan 2017 11:29:43 -0500 Subject: Revert "net: sctp: fix array overrun read on sctp_timer_tbl" This reverts commit 0e73fc9a56f22f2eec4d2b2910c649f7af67b74d. This fix wasn't correct, a better one is coming right up. Signed-off-by: David S. Miller --- net/sctp/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/debug.c b/net/sctp/debug.c index e371a0d90068..95d7b15dad21 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -166,7 +166,7 @@ static const char *const sctp_timer_tbl[] = { /* Lookup timer debug name. */ const char *sctp_tname(const sctp_subtype_t id) { - if (id.timeout < ARRAY_SIZE(sctp_timer_tbl)) + if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX) return sctp_timer_tbl[id.timeout]; return "unknown_timer"; } -- cgit v1.2.3 From b6677449dff674cf5b81429b11d5c7f358852ef9 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 20 Jan 2017 18:12:17 +0100 Subject: bridge: netlink: call br_changelink() during br_dev_newlink() Any bridge options specified during link creation (e.g. ip link add) are ignored as br_dev_newlink() does not process them. Use br_changelink() to do it. Fixes: 133235161721 ("bridge: implement rtnl_link_ops->changelink") Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 71c7453268c1..7109b389ea58 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -781,20 +781,6 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } -static int br_dev_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -{ - struct net_bridge *br = netdev_priv(dev); - - if (tb[IFLA_ADDRESS]) { - spin_lock_bh(&br->lock); - br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); - spin_unlock_bh(&br->lock); - } - - return register_netdevice(dev); -} - static int br_port_slave_changelink(struct net_device *brdev, struct net_device *dev, struct nlattr *tb[], @@ -1115,6 +1101,25 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return 0; } +static int br_dev_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct net_bridge *br = netdev_priv(dev); + int err; + + if (tb[IFLA_ADDRESS]) { + spin_lock_bh(&br->lock); + br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); + spin_unlock_bh(&br->lock); + } + + err = br_changelink(dev, tb, data); + if (err) + return err; + + return register_netdevice(dev); +} + static size_t br_get_size(const struct net_device *brdev) { return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ -- cgit v1.2.3 From 9f427a0e474a67b454420c131709600d44850486 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 20 Jan 2017 12:58:34 -0800 Subject: net: mpls: Fix multipath selection for LSR use case MPLS multipath for LSR is broken -- always selecting the first nexthop in the one label case. For example: $ ip -f mpls ro ls 100 nexthop as to 200 via inet 172.16.2.2 dev virt12 nexthop as to 300 via inet 172.16.3.2 dev virt13 101 nexthop as to 201 via inet6 2000:2::2 dev virt12 nexthop as to 301 via inet6 2000:3::2 dev virt13 In this example incoming packets have a single MPLS labels which means BOS bit is set. The BOS bit is passed from mpls_forward down to mpls_multipath_hash which never processes the hash loop because BOS is 1. Update mpls_multipath_hash to process the entire label stack. mpls_hdr_len tracks the total mpls header length on each pass (on pass N mpls_hdr_len is N * sizeof(mpls_shim_hdr)). When the label is found with the BOS set it verifies the skb has sufficient header for ipv4 or ipv6, and find the IPv4 and IPv6 header by using the last mpls_hdr pointer and adding 1 to advance past it. With these changes I have verified the code correctly sees the label, BOS, IPv4 and IPv6 addresses in the network header and icmp/tcp/udp traffic for ipv4 and ipv6 are distributed across the nexthops. Fixes: 1c78efa8319ca ("mpls: flow-based multipath selection") Acked-by: Robert Shearman Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 15fe97644ffe..5b77377e5a15 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -98,18 +98,19 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); -static u32 mpls_multipath_hash(struct mpls_route *rt, - struct sk_buff *skb, bool bos) +static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) { struct mpls_entry_decoded dec; + unsigned int mpls_hdr_len = 0; struct mpls_shim_hdr *hdr; bool eli_seen = false; int label_index; u32 hash = 0; - for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos; + for (label_index = 0; label_index < MAX_MP_SELECT_LABELS; label_index++) { - if (!pskb_may_pull(skb, sizeof(*hdr) * label_index)) + mpls_hdr_len += sizeof(*hdr); + if (!pskb_may_pull(skb, mpls_hdr_len)) break; /* Read and decode the current label */ @@ -134,37 +135,38 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, eli_seen = true; } - bos = dec.bos; - if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index + - sizeof(struct iphdr))) { + if (!dec.bos) + continue; + + /* found bottom label; does skb have room for a header? */ + if (pskb_may_pull(skb, mpls_hdr_len + sizeof(struct iphdr))) { const struct iphdr *v4hdr; - v4hdr = (const struct iphdr *)(mpls_hdr(skb) + - label_index); + v4hdr = (const struct iphdr *)(hdr + 1); if (v4hdr->version == 4) { hash = jhash_3words(ntohl(v4hdr->saddr), ntohl(v4hdr->daddr), v4hdr->protocol, hash); } else if (v4hdr->version == 6 && - pskb_may_pull(skb, sizeof(*hdr) * label_index + - sizeof(struct ipv6hdr))) { + pskb_may_pull(skb, mpls_hdr_len + + sizeof(struct ipv6hdr))) { const struct ipv6hdr *v6hdr; - v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) + - label_index); - + v6hdr = (const struct ipv6hdr *)(hdr + 1); hash = __ipv6_addr_jhash(&v6hdr->saddr, hash); hash = __ipv6_addr_jhash(&v6hdr->daddr, hash); hash = jhash_1word(v6hdr->nexthdr, hash); } } + + break; } return hash; } static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, - struct sk_buff *skb, bool bos) + struct sk_buff *skb) { int alive = ACCESS_ONCE(rt->rt_nhn_alive); u32 hash = 0; @@ -180,7 +182,7 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, if (alive <= 0) return NULL; - hash = mpls_multipath_hash(rt, skb, bos); + hash = mpls_multipath_hash(rt, skb); nh_index = hash % alive; if (alive == rt->rt_nhn) goto out; @@ -278,17 +280,11 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, hdr = mpls_hdr(skb); dec = mpls_entry_decode(hdr); - /* Pop the label */ - skb_pull(skb, sizeof(*hdr)); - skb_reset_network_header(skb); - - skb_orphan(skb); - rt = mpls_route_input_rcu(net, dec.label); if (!rt) goto drop; - nh = mpls_select_multipath(rt, skb, dec.bos); + nh = mpls_select_multipath(rt, skb); if (!nh) goto drop; @@ -297,6 +293,12 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (!mpls_output_possible(out_dev)) goto drop; + /* Pop the label */ + skb_pull(skb, sizeof(*hdr)); + skb_reset_network_header(skb); + + skb_orphan(skb); + if (skb_warn_if_lro(skb)) goto drop; -- cgit v1.2.3 From 4078b76cac68e50ccf1f76a74e7d3d5788aec3fe Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 20 Jan 2017 16:05:05 -0800 Subject: net: dsa: Check return value of phy_connect_direct() We need to check the return value of phy_connect_direct() in dsa_slave_phy_connect() otherwise we may be continuing the initialization of a slave network device with a PHY that already attached somewhere else and which will soon be in error because the PHY device is in error. The conditions for such an error to occur are that we have a port of our switch that is not disabled, and has the same port number as a PHY address (say both 5) that can be probed using the DSA slave MII bus. We end-up having this slave network device find a PHY at the same address as our port number, and we try to attach to it. A slave network (e.g: port 0) has already attached to our PHY device, and we try to re-attach it with a different network device, but since we ignore the error we would end-up initializating incorrect device references by the time the slave network interface is opened. The code has been (re)organized several times, making it hard to provide an exact Fixes tag, this is a bugfix nonetheless. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 68c9eea00518..ba1b6b9630d2 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1105,10 +1105,8 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) p->phy_interface = p->phy->interface; - phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, - p->phy_interface); - - return 0; + return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, + p->phy_interface); } static int dsa_slave_phy_setup(struct dsa_slave_priv *p, -- cgit v1.2.3 From b2fbd04498789def80ceba3d5bbc5af7f2f70a5f Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Fri, 20 Jan 2017 21:03:03 +0800 Subject: netfilter: nf_tables: validate the name size when possible Currently, if the user add a stateful object with the name size exceed NFT_OBJ_MAXNAMELEN - 1 (i.e. 31), we truncate it down to 31 silently. This is not friendly, furthermore, this will cause duplicated stateful objects when the first 31 characters of the name is same. So limit the stateful object's name size to NFT_OBJ_MAXNAMELEN - 1. After apply this patch, error message will be printed out like this: # name_32=$(printf "%0.sQ" {1..32}) # nft add counter filter $name_32 :1:1-52: Error: Could not process rule: Numerical result out of range add counter filter QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Also this patch cleans up the codes which missing the name size limit validation in nftables. Fixes: e50092404c1b ("netfilter: nf_tables: add stateful objects") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 21 ++++++++++++++------- net/netfilter/nft_dynset.c | 3 ++- net/netfilter/nft_lookup.c | 3 ++- net/netfilter/nft_objref.c | 6 ++++-- 4 files changed, 22 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 091d2dcc63b2..b84c7b25219b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -928,7 +928,8 @@ static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, } static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { - [NFTA_CHAIN_TABLE] = { .type = NLA_STRING }, + [NFTA_CHAIN_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_CHAIN_HANDLE] = { .type = NLA_U64 }, [NFTA_CHAIN_NAME] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, @@ -1854,7 +1855,8 @@ static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { - [NFTA_RULE_TABLE] = { .type = NLA_STRING }, + [NFTA_RULE_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_RULE_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, [NFTA_RULE_HANDLE] = { .type = NLA_U64 }, @@ -2443,7 +2445,8 @@ nft_select_set_ops(const struct nlattr * const nla[], } static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { - [NFTA_SET_TABLE] = { .type = NLA_STRING }, + [NFTA_SET_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_SET_NAME] = { .type = NLA_STRING, .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, @@ -3192,8 +3195,10 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { }; static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = { - [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING }, - [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING }, + [NFTA_SET_ELEM_LIST_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, + [NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; @@ -4032,8 +4037,10 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { - [NFTA_OBJ_TABLE] = { .type = NLA_STRING }, - [NFTA_OBJ_NAME] = { .type = NLA_STRING }, + [NFTA_OBJ_TABLE] = { .type = NLA_STRING, + .len = NFT_TABLE_MAXNAMELEN - 1 }, + [NFTA_OBJ_NAME] = { .type = NLA_STRING, + .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, }; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 7de2f46734a4..049ad2d9ee66 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -98,7 +98,8 @@ out: } static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { - [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING }, + [NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_DYNSET_SET_ID] = { .type = NLA_U32 }, [NFTA_DYNSET_OP] = { .type = NLA_U32 }, [NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 }, diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index d4f97fa7e21d..e21aea7e5ec8 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -49,7 +49,8 @@ static void nft_lookup_eval(const struct nft_expr *expr, } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { - [NFTA_LOOKUP_SET] = { .type = NLA_STRING }, + [NFTA_LOOKUP_SET] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 }, [NFTA_LOOKUP_SREG] = { .type = NLA_U32 }, [NFTA_LOOKUP_DREG] = { .type = NLA_U32 }, diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 415a65ba2b85..1ae8c49ca4a1 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -193,10 +193,12 @@ nft_objref_select_ops(const struct nft_ctx *ctx, } static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { - [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING }, + [NFTA_OBJREF_IMM_NAME] = { .type = NLA_STRING, + .len = NFT_OBJ_MAXNAMELEN - 1 }, [NFTA_OBJREF_IMM_TYPE] = { .type = NLA_U32 }, [NFTA_OBJREF_SET_SREG] = { .type = NLA_U32 }, - [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING }, + [NFTA_OBJREF_SET_NAME] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From 115865fa0826ed18ca04717cf72d0fe874c0fe7f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 23 Jan 2017 09:29:09 +0100 Subject: mac80211: don't try to sleep in rate_control_rate_init() In my previous patch, I missed that rate_control_rate_init() is called from some places that cannot sleep, so it cannot call ieee80211_recalc_min_chandef(). Remove that call for now to fix the context bug, we'll have to find a different way to fix the minimum channel width issue. Fixes: 96aa2e7cf126 ("mac80211: calculate min channel width correctly") Reported-by: Xiaolong Ye (via lkp-robot) Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 9e2641d45587..206698bc93f4 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -40,8 +40,6 @@ void rate_control_rate_init(struct sta_info *sta) ieee80211_sta_set_rx_nss(sta); - ieee80211_recalc_min_chandef(sta->sdata); - if (!ref) return; -- cgit v1.2.3 From 0fb44559ffd67de8517098b81f675fa0210f13f0 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 23 Jan 2017 11:17:35 -0800 Subject: af_unix: move unix_mknod() out of bindlock Dmitry reported a deadlock scenario: unix_bind() path: u->bindlock ==> sb_writer do_splice() path: sb_writer ==> pipe->mutex ==> u->bindlock In the unix_bind() code path, unix_mknod() does not have to be done with u->bindlock held, since it is a pure fs operation, so we can just move unix_mknod() out. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Cc: Rainer Weikusat Cc: Al Viro Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/unix/af_unix.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 127656ebe7be..cef79873b09d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -995,6 +995,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) unsigned int hash; struct unix_address *addr; struct hlist_head *list; + struct path path = { NULL, NULL }; err = -EINVAL; if (sunaddr->sun_family != AF_UNIX) @@ -1010,9 +1011,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; + if (sun_path[0]) { + umode_t mode = S_IFSOCK | + (SOCK_INODE(sock)->i_mode & ~current_umask()); + err = unix_mknod(sun_path, mode, &path); + if (err) { + if (err == -EEXIST) + err = -EADDRINUSE; + goto out; + } + } + err = mutex_lock_interruptible(&u->bindlock); if (err) - goto out; + goto out_put; err = -EINVAL; if (u->addr) @@ -1029,16 +1041,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) atomic_set(&addr->refcnt, 1); if (sun_path[0]) { - struct path path; - umode_t mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); - if (err) { - if (err == -EEXIST) - err = -EADDRINUSE; - unix_release_addr(addr); - goto out_up; - } addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); @@ -1065,6 +1067,9 @@ out_unlock: spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->bindlock); +out_put: + if (err) + path_put(&path); out: return err; } -- cgit v1.2.3 From 21b995a9cb093fff33ec91d7cb3822b882a90a1e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Jan 2017 16:43:05 -0800 Subject: ip6_tunnel: must reload ipv6h in ip6ip6_tnl_xmit() Since ip6_tnl_parse_tlv_enc_lim() can call pskb_may_pull(), we must reload any pointer that was related to skb->head (or skb->data), or risk use after free. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Signed-off-by: Eric Dumazet Cc: Dmitry Kozlov Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 3 +++ net/ipv6/ip6_tunnel.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 75b6108234dd..558631860d91 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -582,6 +582,9 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) return -1; offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + ipv6h = ipv6_hdr(skb); + if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 753d6d0860fb..02923f956ac8 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1303,6 +1303,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowlabel = key->label; } else { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ + ipv6h = ipv6_hdr(skb); if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; -- cgit v1.2.3 From fbfa743a9d2a0ffa24251764f10afc13eb21e739 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Jan 2017 16:43:06 -0800 Subject: ipv6: fix ip6_tnl_parse_tlv_enc_lim() This function suffers from multiple issues. First one is that pskb_may_pull() may reallocate skb->head, so the 'raw' pointer needs either to be reloaded or not used at all. Second issue is that NEXTHDR_DEST handling does not validate that the options are present in skb->data, so we might read garbage or access non existent memory. With help from Willem de Bruijn. Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 02923f956ac8..ff8ee06491c3 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -400,18 +400,19 @@ ip6_tnl_dev_uninit(struct net_device *dev) __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { - const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; - __u8 nexthdr = ipv6h->nexthdr; - __u16 off = sizeof(*ipv6h); + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; + unsigned int nhoff = raw - skb->data; + unsigned int off = nhoff + sizeof(*ipv6h); + u8 next, nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { - __u16 optlen = 0; struct ipv6_opt_hdr *hdr; - if (raw + off + sizeof(*hdr) > skb->data && - !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) + u16 optlen; + + if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; - hdr = (struct ipv6_opt_hdr *) (raw + off); + hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; if (frag_hdr->frag_off) @@ -422,20 +423,29 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) } else { optlen = ipv6_optlen(hdr); } + /* cache hdr->nexthdr, since pskb_may_pull() might + * invalidate hdr + */ + next = hdr->nexthdr; if (nexthdr == NEXTHDR_DEST) { - __u16 i = off + 2; + u16 i = 2; + + /* Remember : hdr is no longer valid at this point. */ + if (!pskb_may_pull(skb, off + optlen)) + break; + while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ - if (i + sizeof (*tel) > off + optlen) + if (i + sizeof(*tel) > optlen) break; - tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i]; + tel = (struct ipv6_tlv_tnl_enc_lim *) skb->data + off + i; /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) - return i; + return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; @@ -443,7 +453,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) i++; } } - nexthdr = hdr->nexthdr; + nexthdr = next; off += optlen; } return 0; -- cgit v1.2.3 From 5ce6b04ce96896e8a79e6f60740ced911eaac7a4 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 22 Jan 2017 22:10:32 +0800 Subject: netfilter: nft_log: restrict the log prefix length to 127 First, log prefix will be truncated to NF_LOG_PREFIXLEN-1, i.e. 127, at nf_log_packet(), so the extra part is useless. Second, after adding a log rule with a very very long prefix, we will fail to dump the nft rules after this _special_ one, but acctually, they do exist. For example: # name_65000=$(printf "%0.sQ" {1..65000}) # nft add rule filter output log prefix "$name_65000" # nft add rule filter output counter # nft add rule filter output counter # nft list chain filter output table ip filter { chain output { type filter hook output priority 0; policy accept; } } So now, restrict the log prefix length to NF_LOG_PREFIXLEN-1. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 1 - net/netfilter/nft_log.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 3dca90dc24ad..ffb9e8ada899 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -13,7 +13,6 @@ /* Internal logging interface, which relies on the real LOG target modules */ -#define NF_LOG_PREFIXLEN 128 #define NFLOGGER_NAME_LEN 64 static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 6271e40a3dd6..6f6e64423643 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -39,7 +39,8 @@ static void nft_log_eval(const struct nft_expr *expr, static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_GROUP] = { .type = NLA_U16 }, - [NFTA_LOG_PREFIX] = { .type = NLA_STRING }, + [NFTA_LOG_PREFIX] = { .type = NLA_STRING, + .len = NF_LOG_PREFIXLEN - 1 }, [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, [NFTA_LOG_LEVEL] = { .type = NLA_U32 }, -- cgit v1.2.3 From 35d0ac9070ef619e3bf44324375878a1c540387b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Jan 2017 00:51:32 +0100 Subject: netfilter: nf_tables: fix set->nelems counting with no NLM_F_EXCL If the element exists and no NLM_F_EXCL is specified, do not bump set->nelems, otherwise we leak one set element slot. This problem amplifies if the set is full since the abort path always decrements the counter for the -ENFILE case too, giving one spare extra slot. Fix this by moving set->nelems update to nft_add_set_elem() after successful element insertion. Moreover, remove the element if the set is full so there is no need to rely on the abort path to undo things anymore. Fixes: c016c7e45ddf ("netfilter: nf_tables: honor NLM_F_EXCL flag in set element insertion") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b84c7b25219b..831a9a16f563 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3745,10 +3745,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err5; } + if (set->size && + !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { + err = -ENFILE; + goto err6; + } + nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; +err6: + set->ops->remove(set, &elem); err5: kfree(trans); err4: @@ -3795,15 +3803,9 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - if (set->size && - !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) - return -ENFILE; - err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); - if (err < 0) { - atomic_dec(&set->nelems); + if (err < 0) break; - } } return err; } -- cgit v1.2.3 From de70185de0333783154863278ac87bfbbc54e384 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Jan 2017 00:51:41 +0100 Subject: netfilter: nf_tables: deconstify walk callback function The flush operation needs to modify set and element objects, so let's deconstify this. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 24 ++++++++++++------------ net/netfilter/nft_set_hash.c | 2 +- net/netfilter/nft_set_rbtree.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 831a9a16f563..5bd0068320fb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3087,9 +3087,9 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, } static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, const struct nft_set_iter *iter, - const struct nft_set_elem *elem) + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); enum nft_registers dreg; @@ -3308,9 +3308,9 @@ struct nft_set_dump_args { }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, const struct nft_set_iter *iter, - const struct nft_set_elem *elem) + struct nft_set_elem *elem) { struct nft_set_dump_args *args; @@ -3322,7 +3322,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u8 genmask = nft_genmask_cur(net); - const struct nft_set *set; + struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; @@ -3890,9 +3890,9 @@ err1: } static int nft_flush_set(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, const struct nft_set_iter *iter, - const struct nft_set_elem *elem) + struct nft_set_elem *elem) { struct nft_trans *trans; int err; @@ -3907,8 +3907,8 @@ static int nft_flush_set(const struct nft_ctx *ctx, goto err1; } - nft_trans_elem_set(trans) = (struct nft_set *)set; - nft_trans_elem(trans) = *((struct nft_set_elem *)elem); + nft_trans_elem_set(trans) = set; + nft_trans_elem(trans) = *elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -5019,9 +5019,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, const struct nft_chain *chain); static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, const struct nft_set_iter *iter, - const struct nft_set_elem *elem) + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); const struct nft_data *data; @@ -5045,7 +5045,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, { const struct nft_rule *rule; const struct nft_expr *expr, *last; - const struct nft_set *set; + struct nft_set *set; struct nft_set_binding *binding; struct nft_set_iter iter; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 1e20e2bbb6d9..e36069fb76ae 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -212,7 +212,7 @@ static void nft_hash_remove(const struct nft_set *set, rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); } -static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, +static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_hash *priv = nft_set_priv(set); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 08376e50f6cd..f06f55ee516d 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -221,7 +221,7 @@ static void *nft_rbtree_deactivate(const struct net *net, } static void nft_rbtree_walk(const struct nft_ctx *ctx, - const struct nft_set *set, + struct nft_set *set, struct nft_set_iter *iter) { const struct nft_rbtree *priv = nft_set_priv(set); -- cgit v1.2.3 From b2c11e4b9536ebab6b39929e1fe15f57039ab445 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Jan 2017 00:51:48 +0100 Subject: netfilter: nf_tables: bump set->ndeact on set flush Add missing set->ndeact update on each deactivated element from the set flush path. Otherwise, sets with fixed size break after flush since accounting breaks. # nft add set x y { type ipv4_addr\; size 2\; } # nft add element x y { 1.1.1.1 } # nft add element x y { 1.1.1.2 } # nft flush set x y # nft add element x y { 1.1.1.1 } :1:1-28: Error: Could not process rule: Too many open files in system Fixes: 8411b6442e59 ("netfilter: nf_tables: support for set flushing") Reported-by: Elise Lennion Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5bd0068320fb..1b913760f205 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3906,6 +3906,7 @@ static int nft_flush_set(const struct nft_ctx *ctx, err = -ENOENT; goto err1; } + set->ndeact++; nft_trans_elem_set(trans) = set; nft_trans_elem(trans) = *elem; -- cgit v1.2.3 From 93f955aad4bacee5acebad141d1a03cd51f27b4e Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 24 Jan 2017 13:00:43 +0100 Subject: tipc: fix nametbl_lock soft lockup at node/link events We trigger a soft lockup as we grab nametbl_lock twice if the node has a pending node up/down or link up/down event while: - we process an incoming named message in tipc_named_rcv() and perform an tipc_update_nametbl(). - we have pending backlog items in the name distributor queue during a nametable update using tipc_nametbl_publish() or tipc_nametbl_withdraw(). The following are the call chain associated: tipc_named_rcv() Grabs nametbl_lock tipc_update_nametbl() (publish/withdraw) tipc_node_subscribe()/unsubscribe() tipc_node_write_unlock() << lockup occurs if an outstanding node/link event exits, as we grabs nametbl_lock again >> tipc_nametbl_withdraw() Grab nametbl_lock tipc_named_process_backlog() tipc_update_nametbl() << rest as above >> The function tipc_node_write_unlock(), in addition to releasing the lock processes the outstanding node/link up/down events. To do this, we need to grab the nametbl_lock again leading to the lockup. In this commit we fix the soft lockup by introducing a fast variant of node_unlock(), where we just release the lock. We adapt the node_subscribe()/node_unsubscribe() to use the fast variants. Reported-and-Tested-by: John Thompson Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/node.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 9d2f4c2b08ab..27753325e06e 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -263,6 +263,11 @@ static void tipc_node_write_lock(struct tipc_node *n) write_lock_bh(&n->lock); } +static void tipc_node_write_unlock_fast(struct tipc_node *n) +{ + write_unlock_bh(&n->lock); +} + static void tipc_node_write_unlock(struct tipc_node *n) { struct net *net = n->net; @@ -417,7 +422,7 @@ void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); - tipc_node_write_unlock(n); + tipc_node_write_unlock_fast(n); tipc_node_put(n); } @@ -435,7 +440,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) } tipc_node_write_lock(n); list_del_init(subscr); - tipc_node_write_unlock(n); + tipc_node_write_unlock_fast(n); tipc_node_put(n); } -- cgit v1.2.3 From d094c4d5f5c7e1b225e94227ca3f007be3adc4e8 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 24 Jan 2017 13:00:44 +0100 Subject: tipc: add subscription refcount to avoid invalid delete Until now, the subscribers keep track of the subscriptions using reference count at subscriber level. At subscription cancel or subscriber delete, we delete the subscription only if the timer was pending for the subscription. This approach is incorrect as: 1. del_timer() is not SMP safe, if on CPU0 the check for pending timer returns true but CPU1 might schedule the timer callback thereby deleting the subscription. Thus when CPU0 is scheduled, it deletes an invalid subscription. 2. We export tipc_subscrp_report_overlap(), which accesses the subscription pointer multiple times. Meanwhile the subscription timer can expire thereby freeing the subscription and we might continue to access the subscription pointer leading to memory violations. In this commit, we introduce subscription refcount to avoid deleting an invalid subscription. Reported-and-Tested-by: John Thompson Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/subscr.c | 124 ++++++++++++++++++++++++++++++------------------------ net/tipc/subscr.h | 1 + 2 files changed, 71 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 0dd02244e21d..9d94e65d0894 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -54,6 +54,8 @@ struct tipc_subscriber { static void tipc_subscrp_delete(struct tipc_subscription *sub); static void tipc_subscrb_put(struct tipc_subscriber *subscriber); +static void tipc_subscrp_put(struct tipc_subscription *subscription); +static void tipc_subscrp_get(struct tipc_subscription *subscription); /** * htohl - convert value to endianness used by destination @@ -123,6 +125,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, { struct tipc_name_seq seq; + tipc_subscrp_get(sub); tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; @@ -132,30 +135,23 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, node); + tipc_subscrp_put(sub); } static void tipc_subscrp_timeout(unsigned long data) { struct tipc_subscription *sub = (struct tipc_subscription *)data; - struct tipc_subscriber *subscriber = sub->subscriber; /* Notify subscriber of timeout */ tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, TIPC_SUBSCR_TIMEOUT, 0, 0); - spin_lock_bh(&subscriber->lock); - tipc_subscrp_delete(sub); - spin_unlock_bh(&subscriber->lock); - - tipc_subscrb_put(subscriber); + tipc_subscrp_put(sub); } static void tipc_subscrb_kref_release(struct kref *kref) { - struct tipc_subscriber *subcriber = container_of(kref, - struct tipc_subscriber, kref); - - kfree(subcriber); + kfree(container_of(kref,struct tipc_subscriber, kref)); } static void tipc_subscrb_put(struct tipc_subscriber *subscriber) @@ -168,6 +164,59 @@ static void tipc_subscrb_get(struct tipc_subscriber *subscriber) kref_get(&subscriber->kref); } +static void tipc_subscrp_kref_release(struct kref *kref) +{ + struct tipc_subscription *sub = container_of(kref, + struct tipc_subscription, + kref); + struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + struct tipc_subscriber *subscriber = sub->subscriber; + + spin_lock_bh(&subscriber->lock); + tipc_nametbl_unsubscribe(sub); + list_del(&sub->subscrp_list); + atomic_dec(&tn->subscription_count); + spin_unlock_bh(&subscriber->lock); + kfree(sub); + tipc_subscrb_put(subscriber); +} + +static void tipc_subscrp_put(struct tipc_subscription *subscription) +{ + kref_put(&subscription->kref, tipc_subscrp_kref_release); +} + +static void tipc_subscrp_get(struct tipc_subscription *subscription) +{ + kref_get(&subscription->kref); +} + +/* tipc_subscrb_subscrp_delete - delete a specific subscription or all + * subscriptions for a given subscriber. + */ +static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, + struct tipc_subscr *s) +{ + struct list_head *subscription_list = &subscriber->subscrp_list; + struct tipc_subscription *sub, *temp; + + spin_lock_bh(&subscriber->lock); + list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) { + if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) + continue; + + tipc_subscrp_get(sub); + spin_unlock_bh(&subscriber->lock); + tipc_subscrp_delete(sub); + tipc_subscrp_put(sub); + spin_lock_bh(&subscriber->lock); + + if (s) + break; + } + spin_unlock_bh(&subscriber->lock); +} + static struct tipc_subscriber *tipc_subscrb_create(int conid) { struct tipc_subscriber *subscriber; @@ -177,8 +226,8 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) pr_warn("Subscriber rejected, no memory\n"); return NULL; } - kref_init(&subscriber->kref); INIT_LIST_HEAD(&subscriber->subscrp_list); + kref_init(&subscriber->kref); subscriber->conid = conid; spin_lock_init(&subscriber->lock); @@ -187,55 +236,22 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { - struct tipc_subscription *sub, *temp; - u32 timeout; - - spin_lock_bh(&subscriber->lock); - /* Destroy any existing subscriptions for subscriber */ - list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, - subscrp_list) { - timeout = htohl(sub->evt.s.timeout, sub->swap); - if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) { - tipc_subscrp_delete(sub); - tipc_subscrb_put(subscriber); - } - } - spin_unlock_bh(&subscriber->lock); - + tipc_subscrb_subscrp_delete(subscriber, NULL); tipc_subscrb_put(subscriber); } static void tipc_subscrp_delete(struct tipc_subscription *sub) { - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + u32 timeout = htohl(sub->evt.s.timeout, sub->swap); - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); - kfree(sub); - atomic_dec(&tn->subscription_count); + if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) + tipc_subscrp_put(sub); } static void tipc_subscrp_cancel(struct tipc_subscr *s, struct tipc_subscriber *subscriber) { - struct tipc_subscription *sub, *temp; - u32 timeout; - - spin_lock_bh(&subscriber->lock); - /* Find first matching subscription, exit if not found */ - list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list, - subscrp_list) { - if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { - timeout = htohl(sub->evt.s.timeout, sub->swap); - if ((timeout == TIPC_WAIT_FOREVER) || - del_timer(&sub->timer)) { - tipc_subscrp_delete(sub); - tipc_subscrb_put(subscriber); - } - break; - } - } - spin_unlock_bh(&subscriber->lock); + tipc_subscrb_subscrp_delete(subscriber, s); } static struct tipc_subscription *tipc_subscrp_create(struct net *net, @@ -272,6 +288,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net, sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); atomic_inc(&tn->subscription_count); + kref_init(&sub->kref); return sub; } @@ -288,17 +305,16 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, spin_lock_bh(&subscriber->lock); list_add(&sub->subscrp_list, &subscriber->subscrp_list); - tipc_subscrb_get(subscriber); sub->subscriber = subscriber; tipc_nametbl_subscribe(sub); + tipc_subscrb_get(subscriber); spin_unlock_bh(&subscriber->lock); + setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); timeout = htohl(sub->evt.s.timeout, swap); - if (timeout == TIPC_WAIT_FOREVER) - return; - setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); - mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); + if (timeout != TIPC_WAIT_FOREVER) + mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); } /* Handle one termination request for the subscriber */ diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index be60103082c9..ffdc214c117a 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -57,6 +57,7 @@ struct tipc_subscriber; * @evt: template for events generated by subscription */ struct tipc_subscription { + struct kref kref; struct tipc_subscriber *subscriber; struct net *net; struct timer_list timer; -- cgit v1.2.3 From fc0adfc8fd18b61b6f7a3f28b429e134d6f3a008 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 24 Jan 2017 13:00:45 +0100 Subject: tipc: fix connection refcount error Until now, the generic server framework maintains the connection id's per subscriber in server's conn_idr. At tipc_close_conn, we remove the connection id from the server list, but the connection is valid until we call the refcount cleanup. Hence we have a window where the server allocates the same connection to an new subscriber leading to inconsistent reference count. We have another refcount warning we grab the refcount in tipc_conn_lookup() for connections with flag with CF_CONNECTED not set. This usually occurs at shutdown when the we stop the topology server and withdraw TIPC_CFG_SRV publication thereby triggering a withdraw message to subscribers. In this commit, we: 1. remove the connection from the server list at recount cleanup. 2. grab the refcount for a connection only if CF_CONNECTED is set. Tested-by: John Thompson Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/server.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 215849ce453d..2e803601aa99 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -91,7 +91,8 @@ static void tipc_sock_release(struct tipc_conn *con); static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); - struct sockaddr_tipc *saddr = con->server->saddr; + struct tipc_server *s = con->server; + struct sockaddr_tipc *saddr = s->saddr; struct socket *sock = con->sock; struct sock *sk; @@ -106,6 +107,11 @@ static void tipc_conn_kref_release(struct kref *kref) tipc_sock_release(con); sock_release(sock); con->sock = NULL; + + spin_lock_bh(&s->idr_lock); + idr_remove(&s->conn_idr, con->conid); + s->idr_in_use--; + spin_unlock_bh(&s->idr_lock); } tipc_clean_outqueues(con); @@ -128,8 +134,10 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) spin_lock_bh(&s->idr_lock); con = idr_find(&s->conn_idr, conid); - if (con) + if (con && test_bit(CF_CONNECTED, &con->flags)) conn_get(con); + else + con = NULL; spin_unlock_bh(&s->idr_lock); return con; } @@ -198,15 +206,8 @@ static void tipc_sock_release(struct tipc_conn *con) static void tipc_close_conn(struct tipc_conn *con) { - struct tipc_server *s = con->server; - if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { - spin_lock_bh(&s->idr_lock); - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - spin_unlock_bh(&s->idr_lock); - /* We shouldn't flush pending works as we may be in the * thread. In fact the races with pending rx/tx work structs * are harmless for us here as we have already deleted this -- cgit v1.2.3 From 9dc3abdd1f7ea524e8552e0a3ef01219892ed1f4 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 24 Jan 2017 13:00:46 +0100 Subject: tipc: fix nametbl_lock soft lockup at module exit Commit 333f796235a527 ("tipc: fix a race condition leading to subscriber refcnt bug") reveals a soft lockup while acquiring nametbl_lock. Before commit 333f796235a527, we call tipc_conn_shutdown() from tipc_close_conn() in the context of tipc_topsrv_stop(). In that context, we are allowed to grab the nametbl_lock. Commit 333f796235a527, moved tipc_conn_release (renamed from tipc_conn_shutdown) to the connection refcount cleanup. This allows either tipc_nametbl_withdraw() or tipc_topsrv_stop() to the cleanup. Since tipc_exit_net() first calls tipc_topsrv_stop() and then tipc_nametble_withdraw() increases the chances for the later to perform the connection cleanup. The soft lockup occurs in the call chain of tipc_nametbl_withdraw(), when it performs the tipc_conn_kref_release() as it tries to grab nametbl_lock again while holding it already. tipc_nametbl_withdraw() grabs nametbl_lock tipc_nametbl_remove_publ() tipc_subscrp_report_overlap() tipc_subscrp_send_event() tipc_conn_sendmsg() << if (con->flags != CF_CONNECTED) we do conn_put(), triggering the cleanup as refcount=0. >> tipc_conn_kref_release tipc_sock_release tipc_conn_release tipc_subscrb_delete tipc_subscrp_delete tipc_nametbl_unsubscribe << Soft Lockup >> The previous changes in this series fixes the race conditions fixed by commit 333f796235a527. Hence we can now revert the commit. Fixes: 333f796235a52727 ("tipc: fix a race condition leading to subscriber refcnt bug") Reported-and-Tested-by: John Thompson Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/server.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 2e803601aa99..826cde2c401e 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -86,7 +86,6 @@ struct outqueue_entry { static void tipc_recv_work(struct work_struct *work); static void tipc_send_work(struct work_struct *work); static void tipc_clean_outqueues(struct tipc_conn *con); -static void tipc_sock_release(struct tipc_conn *con); static void tipc_conn_kref_release(struct kref *kref) { @@ -104,7 +103,6 @@ static void tipc_conn_kref_release(struct kref *kref) } saddr->scope = -TIPC_NODE_SCOPE; kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); - tipc_sock_release(con); sock_release(sock); con->sock = NULL; @@ -194,19 +192,15 @@ static void tipc_unregister_callbacks(struct tipc_conn *con) write_unlock_bh(&sk->sk_callback_lock); } -static void tipc_sock_release(struct tipc_conn *con) +static void tipc_close_conn(struct tipc_conn *con) { struct tipc_server *s = con->server; - if (con->conid) - s->tipc_conn_release(con->conid, con->usr_data); - - tipc_unregister_callbacks(con); -} - -static void tipc_close_conn(struct tipc_conn *con) -{ if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { + tipc_unregister_callbacks(con); + + if (con->conid) + s->tipc_conn_release(con->conid, con->usr_data); /* We shouldn't flush pending works as we may be in the * thread. In fact the races with pending rx/tx work structs -- cgit v1.2.3 From 4c887aa65d38633885010277f3482400681be719 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 24 Jan 2017 13:00:47 +0100 Subject: tipc: ignore requests when the connection state is not CONNECTED In tipc_conn_sendmsg(), we first queue the request to the outqueue followed by the connection state check. If the connection is not connected, we should not queue this message. In this commit, we reject the messages if the connection state is not CF_CONNECTED. Acked-by: Ying Xue Acked-by: Jon Maloy Tested-by: John Thompson Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/server.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 826cde2c401e..04ff441b8065 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -453,6 +453,11 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, if (!con) return -EINVAL; + if (!test_bit(CF_CONNECTED, &con->flags)) { + conn_put(con); + return 0; + } + e = tipc_alloc_entry(data, len); if (!e) { conn_put(con); @@ -466,12 +471,8 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, list_add_tail(&e->list, &con->outqueue); spin_unlock_bh(&con->outqueue_lock); - if (test_bit(CF_CONNECTED, &con->flags)) { - if (!queue_work(s->send_wq, &con->swork)) - conn_put(con); - } else { + if (!queue_work(s->send_wq, &con->swork)) conn_put(con); - } return 0; } @@ -495,7 +496,7 @@ static void tipc_send_to_sock(struct tipc_conn *con) int ret; spin_lock_bh(&con->outqueue_lock); - while (1) { + while (test_bit(CF_CONNECTED, &con->flags)) { e = list_entry(con->outqueue.next, struct outqueue_entry, list); if ((struct list_head *) e == &con->outqueue) -- cgit v1.2.3 From 35e22e49a5d6a741ebe7f2dd280b2052c3003ef7 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 24 Jan 2017 13:00:48 +0100 Subject: tipc: fix cleanup at module unload In tipc_server_stop(), we iterate over the connections with limiting factor as server's idr_in_use. We ignore the fact that this variable is decremented in tipc_close_conn(), leading to premature exit. In this commit, we iterate until the we have no connections left. Acked-by: Ying Xue Acked-by: Jon Maloy Tested-by: John Thompson Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/server.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 04ff441b8065..3cd6402e812c 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -619,14 +619,12 @@ int tipc_server_start(struct tipc_server *s) void tipc_server_stop(struct tipc_server *s) { struct tipc_conn *con; - int total = 0; int id; spin_lock_bh(&s->idr_lock); - for (id = 0; total < s->idr_in_use; id++) { + for (id = 0; s->idr_in_use; id++) { con = idr_find(&s->conn_idr, id); if (con) { - total++; spin_unlock_bh(&s->idr_lock); tipc_close_conn(con); spin_lock_bh(&s->idr_lock); -- cgit v1.2.3 From 88ff7334f25909802140e690c0e16433e485b0a0 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Tue, 24 Jan 2017 16:26:47 +0000 Subject: net: Specify the owning module for lwtunnel ops Modules implementing lwtunnel ops should not be allowed to unload while there is state alive using those ops, so specify the owning module for all lwtunnel ops. Signed-off-by: Robert Shearman Signed-off-by: David S. Miller --- net/core/lwt_bpf.c | 1 + net/ipv4/ip_tunnel_core.c | 2 ++ net/ipv6/ila/ila_lwt.c | 1 + net/ipv6/seg6_iptunnel.c | 1 + net/mpls/mpls_iptunnel.c | 1 + 5 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 71bb3e2eca08..b3eef90b2df9 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -386,6 +386,7 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = { .fill_encap = bpf_fill_encap_info, .get_encap_size = bpf_encap_nlsize, .cmp_encap = bpf_encap_cmp, + .owner = THIS_MODULE, }; static int __init bpf_lwt_init(void) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index fed3d29f9eb3..0fd1976ab63b 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -313,6 +313,7 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .fill_encap = ip_tun_fill_encap_info, .get_encap_size = ip_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, + .owner = THIS_MODULE, }; static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { @@ -403,6 +404,7 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { .fill_encap = ip6_tun_fill_encap_info, .get_encap_size = ip6_tun_encap_nlsize, .cmp_encap = ip_tun_cmp_encap, + .owner = THIS_MODULE, }; void __init ip_tunnel_core_init(void) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index a7bc54ab46e2..13b5e85fe0d5 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -238,6 +238,7 @@ static const struct lwtunnel_encap_ops ila_encap_ops = { .fill_encap = ila_fill_encap_info, .get_encap_size = ila_encap_nlsize, .cmp_encap = ila_encap_cmp, + .owner = THIS_MODULE, }; int ila_lwt_init(void) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 1d60cb132835..c46f8cbf5ab5 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -422,6 +422,7 @@ static const struct lwtunnel_encap_ops seg6_iptun_ops = { .fill_encap = seg6_fill_encap_info, .get_encap_size = seg6_encap_nlsize, .cmp_encap = seg6_encap_cmp, + .owner = THIS_MODULE, }; int __init seg6_iptunnel_init(void) diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 2f7ccd934416..1d281c1ff7c1 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -215,6 +215,7 @@ static const struct lwtunnel_encap_ops mpls_iptun_ops = { .fill_encap = mpls_fill_encap_info, .get_encap_size = mpls_encap_nlsize, .cmp_encap = mpls_encap_cmp, + .owner = THIS_MODULE, }; static int __init mpls_iptunnel_init(void) -- cgit v1.2.3 From 85c814016ce3b371016c2c054a905fa2492f5a65 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Tue, 24 Jan 2017 16:26:48 +0000 Subject: lwtunnel: Fix oops on state free after encap module unload When attempting to free lwtunnel state after the module for the encap has been unloaded an oops occurs: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: lwtstate_free+0x18/0x40 [..] task: ffff88003e372380 task.stack: ffffc900001fc000 RIP: 0010:lwtstate_free+0x18/0x40 RSP: 0018:ffff88003fd83e88 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88002bbb3380 RCX: ffff88000c91a300 [..] Call Trace: free_fib_info_rcu+0x195/0x1a0 ? rt_fibinfo_free+0x50/0x50 rcu_process_callbacks+0x2d3/0x850 ? rcu_process_callbacks+0x296/0x850 __do_softirq+0xe4/0x4cb irq_exit+0xb0/0xc0 smp_apic_timer_interrupt+0x3d/0x50 apic_timer_interrupt+0x93/0xa0 [..] Code: e8 6e c6 fc ff 89 d8 5b 5d c3 bb de ff ff ff eb f4 66 90 66 66 66 66 90 55 48 89 e5 53 0f b7 07 48 89 fb 48 8b 04 c5 00 81 d5 81 <48> 8b 40 08 48 85 c0 74 13 ff d0 48 8d 7b 20 be 20 00 00 00 e8 The problem is after the module for the encap can be unloaded the corresponding ops is removed and is thus NULL here. Modules implementing lwtunnel ops should not be allowed to unload while there is state alive using those ops, so grab the module reference for the ops on creating lwtunnel state and of course release the reference when freeing the state. Fixes: 1104d9ba443a ("lwtunnel: Add destroy state operation") Signed-off-by: Robert Shearman Signed-off-by: David S. Miller --- net/core/lwtunnel.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 47b1dd65947b..c23465005f2f 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -115,8 +115,11 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); - if (likely(ops && ops->build_state)) + if (likely(ops && ops->build_state && try_module_get(ops->owner))) { ret = ops->build_state(dev, encap, family, cfg, lws); + if (ret) + module_put(ops->owner); + } rcu_read_unlock(); return ret; @@ -194,6 +197,7 @@ void lwtstate_free(struct lwtunnel_state *lws) } else { kfree(lws); } + module_put(ops->owner); } EXPORT_SYMBOL(lwtstate_free); -- cgit v1.2.3 From 6f29a130613191d3c6335169febe002cba00edf5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Jan 2017 14:01:53 +0800 Subject: sctp: sctp_addr_id2transport should verify the addr before looking up assoc sctp_addr_id2transport is a function for sockopt to look up assoc by address. As the address is from userspace, it can be a v4-mapped v6 address. But in sctp protocol stack, it always handles a v4-mapped v6 address as a v4 address. So it's necessary to convert it to a v4 address before looking up assoc by address. This patch is to fix it by calling sctp_verify_addr in which it can do this conversion before calling sctp_endpoint_lookup_assoc, just like what sctp_sendmsg and __sctp_connect do for the address from users. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 318c6786d653..37eeab7899fc 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -235,8 +235,12 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk, sctp_assoc_t id) { struct sctp_association *addr_asoc = NULL, *id_asoc = NULL; - struct sctp_transport *transport; + struct sctp_af *af = sctp_get_af_specific(addr->ss_family); union sctp_addr *laddr = (union sctp_addr *)addr; + struct sctp_transport *transport; + + if (sctp_verify_addr(sk, laddr, af->sockaddr_len)) + return NULL; addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep, laddr, -- cgit v1.2.3 From 5207f3996338e1db71363fe381c81aaf1e54e4e3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Jan 2017 14:05:16 +0800 Subject: sctp: sctp gso should set feature with NETIF_F_SG when calling skb_segment Now sctp gso puts segments into skb's frag_list, then processes these segments in skb_segment. But skb_segment handles them only when gs is enabled, as it's in the same branch with skb's frags. Although almost all the NICs support sg other than some old ones, but since commit 1e16aa3ddf86 ("net: gso: use feature flag argument in all protocol gso handlers"), features &= skb->dev->hw_enc_features, and xfrm_output_gso call skb_segment with features = 0, which means sctp gso would call skb_segment with sg = 0, and skb_segment would not work as expected. This patch is to fix it by setting features param with NETIF_F_SG when calling skb_segment so that it can go the right branch to process the skb's frag_list. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 7e869d0cca69..4f5a2b580aa5 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -68,7 +68,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, goto out; } - segs = skb_segment(skb, features | NETIF_F_HW_CSUM); + segs = skb_segment(skb, features | NETIF_F_HW_CSUM | NETIF_F_SG); if (IS_ERR(segs)) goto out; -- cgit v1.2.3 From 56d806222ace4c3aeae516cd7a855340fb2839d8 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 24 Jan 2017 21:49:41 -0500 Subject: tcp: correct memory barrier usage in tcp_check_space() sock_reset_flag() maps to __clear_bit() not the atomic version clear_bit(). Thus, we need smp_mb(), smp_mb__after_atomic() is not sufficient. Fixes: 3c7151275c0c ("tcp: add memory barriers to write space paths") Cc: Eric Dumazet Cc: Oleg Nesterov Signed-off-by: Jason Baron Acked-by: Eric Dumazet Reported-by: Oleg Nesterov Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6c790754ae3e..41dcbd568cbe 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5078,7 +5078,7 @@ static void tcp_check_space(struct sock *sk) if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); /* pairs with tcp_poll() */ - smp_mb__after_atomic(); + smp_mb(); if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { tcp_new_space(sk); -- cgit v1.2.3 From f154be241d22298d2b63c9b613f619fa1086ea75 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 25 Jan 2017 09:10:41 -0800 Subject: net: dsa: Bring back device detaching in dsa_slave_suspend() Commit 448b4482c671 ("net: dsa: Add lockdep class to tx queues to avoid lockdep splat") removed the netif_device_detach() call done in dsa_slave_suspend() which is necessary, and paired with a corresponding netif_device_attach(), bring it back. Fixes: 448b4482c671 ("net: dsa: Add lockdep class to tx queues to avoid lockdep splat") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ba1b6b9630d2..7d4596110851 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1201,6 +1201,8 @@ int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); + netif_device_detach(slave_dev); + if (p->phy) { phy_stop(p->phy); p->old_pause = -1; -- cgit v1.2.3 From 92e55f412cffd016cc245a74278cb4d7b89bb3bc Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Thu, 26 Jan 2017 22:56:21 +0100 Subject: tcp: don't annotate mark on control socket from tcp_v6_send_response() Unlike ipv4, this control socket is shared by all cpus so we cannot use it as scratchpad area to annotate the mark that we pass to ip6_xmit(). Add a new parameter to ip6_xmit() to indicate the mark. The SCTP socket family caches the flowi6 structure in the sctp_transport structure, so we cannot use to carry the mark unless we later on reset it back, which I discarded since it looks ugly to me. Fixes: bf99b4ded5f8 ("tcp: fix mark propagation with fwmark_reflect enabled") Suggested-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 4 ++-- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/tcp_ipv6.c | 5 ++--- net/sctp/ipv6.c | 3 ++- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index adfc790f7193..c4e879c02186 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -227,7 +227,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); + err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -281,7 +281,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, NULL, 0); + ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 7396e75e161b..75c308239243 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -176,7 +176,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused /* Restore final destination back after routing done */ fl6.daddr = sk->sk_v6_daddr; - res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), + res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), np->tclass); rcu_read_unlock(); return res; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 38122d04fadc..2c0df09e9036 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -172,7 +172,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) * which are using proper atomic operations or spinlocks. */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - struct ipv6_txoptions *opt, int tclass) + __u32 mark, struct ipv6_txoptions *opt, int tclass) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); @@ -240,7 +240,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = mark; mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2b20622a5824..cb8929681dc7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -469,7 +469,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, opt, np->tclass); + err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -840,8 +840,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); - ctl_sk->sk_mark = fl6.flowi6_mark; - ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); + ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 5ed8e79bf102..64dfd35ccdcc 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -222,7 +222,8 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); rcu_read_lock(); - res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass); + res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), + np->tclass); rcu_read_unlock(); return res; } -- cgit v1.2.3