diff options
Diffstat (limited to 'net/ipv6')
27 files changed, 758 insertions, 310 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ed06b1190f05..2435f7ab070b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3438,6 +3438,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } else if (event == NETDEV_CHANGE) { if (!addrconf_link_ready(dev)) { /* device is still not ready. */ + rt6_sync_down_dev(dev, event); break; } @@ -3449,6 +3450,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, * multicast snooping switches */ ipv6_mc_up(idev); + rt6_sync_up(dev, RTNH_F_LINKDOWN); break; } idev->if_flags |= IF_READY; @@ -3484,6 +3486,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (run_pending) addrconf_dad_run(idev); + /* Device has an address by now */ + rt6_sync_up(dev, RTNH_F_DEAD); + /* * If the MTU changed during the interface down, * when the interface up, the changed MTU must be @@ -3577,6 +3582,7 @@ static bool addr_is_local(const struct in6_addr *addr) static int addrconf_ifdown(struct net_device *dev, int how) { + unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN; struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; @@ -3586,8 +3592,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) ASSERT_RTNL(); - rt6_ifdown(net, dev); - neigh_ifdown(&nd_tbl, dev); + rt6_disable_ip(dev, event); idev = __in6_dev_get(dev); if (!idev) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index a1f918713006..fbf08ce3f5ab 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -221,8 +221,7 @@ ipv4_connected: if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) { + if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) { err = -EINVAL; goto out; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a902ff8f59be..7c888c6e53a9 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -141,14 +141,32 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; + struct xfrm_offload *xo = xfrm_offload(skb); void *tmp; - struct dst_entry *dst = skb_dst(skb); - struct xfrm_state *x = dst->xfrm; + struct xfrm_state *x; + + if (xo && (xo->flags & XFRM_DEV_RESUME)) + x = skb->sp->xvec[skb->sp->len - 1]; + else + x = skb_dst(skb)->xfrm; tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); kfree(tmp); - xfrm_output_resume(skb, err); + + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + if (err) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); + kfree_skb(skb); + return; + } + + skb_push(skb, skb->data - skb_mac_header(skb)); + secpath_reset(skb); + xfrm_dev_resume(skb); + } else { + xfrm_output_resume(skb, err); + } } /* Move ESP header back into place. */ @@ -734,17 +752,13 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(aead_name, 0, mask); + aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -774,7 +788,6 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; - u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -800,10 +813,7 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - if (x->xso.offload_handle) - mask |= CRYPTO_ALG_ASYNC; - - aead = crypto_alloc_aead(authenc_name, 0, mask); + aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 333a478aa161..0bb7d54cf2cb 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -135,75 +135,36 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, netdev_features_t features) { - __u32 seq; - int err = 0; - struct sk_buff *skb2; struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; - struct sk_buff *segs = ERR_PTR(-EINVAL); netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) - goto out; - - seq = xo->seq.low; + return ERR_PTR(-EINVAL); x = skb->sp->xvec[skb->sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) - goto out; + return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) - goto out; + return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; - if (!(features & NETIF_F_HW_ESP)) + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - segs = x->outer_mode->gso_segment(x, skb, esp_features); - if (IS_ERR_OR_NULL(segs)) - goto out; - - __skb_pull(skb, skb->data - skb_mac_header(skb)); - - skb2 = segs; - do { - struct sk_buff *nskb = skb2->next; - - xo = xfrm_offload(skb2); - xo->flags |= XFRM_GSO_SEGMENT; - xo->seq.low = seq; - xo->seq.hi = xfrm_replay_seqhi(x, seq); - - if(!(features & NETIF_F_HW_ESP)) - xo->flags |= CRYPTO_FALLBACK; - - x->outer_mode->xmit(x, skb2); - - err = x->type_offload->xmit(x, skb2, esp_features); - if (err) { - kfree_skb_list(segs); - return ERR_PTR(err); - } - - if (!skb_is_gso(skb2)) - seq++; - else - seq += skb_shinfo(skb2)->gso_segs; - - skb_push(skb2, skb2->mac_len); - skb2 = nskb; - } while (skb2); + xo->flags |= XFRM_GSO_SEGMENT; -out: - return segs; + return x->outer_mode->gso_segment(x, skb, esp_features); } static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) @@ -222,6 +183,7 @@ static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { + int len; int err; int alen; int blksize; @@ -230,6 +192,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; + __u32 seq; esp.inplace = true; @@ -265,28 +228,33 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features return esp.nfrags; } + seq = xo->seq.low; + esph = ip_esp_hdr(skb); esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(xo->seq.low); - } else { - int len; - - len = skb->len - sizeof(struct ipv6hdr); - if (len > IPV6_MAXPLEN) - len = 0; + esph->seq_no = htonl(seq); - ipv6_hdr(skb)->payload_len = htons(len); + if (!skb_is_gso(skb)) + xo->seq.low++; + else + xo->seq.low += skb_shinfo(skb)->gso_segs; } + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + + ipv6_hdr(skb)->payload_len = htons(len); + if (hw_offload) return 0; - esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); - err = esp6_output_tail(x, skb, &esp); if (err) return err; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a64d559fa513..e31118f417b4 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -107,16 +107,13 @@ enum { void fib6_update_sernum(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - spin_lock_bh(&table->tb6_lock); fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&table->tb6_lock)); + lockdep_is_held(&rt->rt6i_table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); - spin_unlock_bh(&table->tb6_lock); } /* @@ -799,12 +796,6 @@ insert_above: return ln; } -static bool rt6_qualify_for_ecmp(struct rt6_info *rt) -{ - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; -} - static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) { int i; @@ -994,6 +985,7 @@ next_iter: rt6i_nsiblings++; } BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1102,8 +1094,8 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -static void fib6_update_sernum_upto_root(struct rt6_info *rt, - int sernum) +static void __fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); @@ -1117,6 +1109,11 @@ static void fib6_update_sernum_upto_root(struct rt6_info *rt, } } +void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) +{ + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> @@ -1230,7 +1227,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, err = fib6_add_rt2node(fn, rt, info, mxc, extack); if (!err) { - fib6_update_sernum_upto_root(rt, sernum); + __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } @@ -1241,23 +1238,28 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, - lockdep_is_held(&table->tb6_lock)); - if (pn != fn && pn_leaf == rt) { - pn_leaf = NULL; - RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); - } - if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn_leaf = fib6_find_prefix(info->nl_net, table, pn); -#if RT6_DEBUG >= 2 - if (!pn_leaf) { - WARN_ON(!pn_leaf); - pn_leaf = info->nl_net->ipv6.ip6_null_entry; + if (pn != fn) { + struct rt6_info *pn_leaf = + rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + atomic_dec(&rt->rt6i_ref); } + if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, + pn); +#if RT6_DEBUG >= 2 + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = + info->nl_net->ipv6.ip6_null_entry; + } #endif - atomic_inc(&pn_leaf->rt6i_ref); - rcu_assign_pointer(pn->leaf, pn_leaf); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); + } } #endif goto failure; @@ -1665,6 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, sibling->rt6i_nsiblings--; rt->rt6i_nsiblings = 0; list_del_init(&rt->rt6i_siblings); + rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ @@ -1887,7 +1890,7 @@ static int fib6_clean_node(struct fib6_walker *w) for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); - if (res < 0) { + if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { @@ -1900,6 +1903,12 @@ static int fib6_clean_node(struct fib6_walker *w) continue; } return 0; + } else if (res == -2) { + if (WARN_ON(!rt->rt6i_nsiblings)) + continue; + rt = list_last_entry(&rt->rt6i_siblings, + struct rt6_info, rt6i_siblings); + continue; } WARN_ON(res != 0); } @@ -1911,7 +1920,8 @@ static int fib6_clean_node(struct fib6_walker *w) * Convenient frontend to tree walker. * * func is called on each route. - * It may return -1 -> delete this route. + * It may return -2 -> skip multipath route. + * -1 -> delete this route. * 0 -> continue walking */ @@ -2103,7 +2113,6 @@ static void fib6_net_exit(struct net *net) { unsigned int i; - rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 97f148f15429..db99446e0276 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -550,10 +550,6 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len, info = &tun_dst->u.tun_info; md = ip_tunnel_info_opts(info); - if (!md) { - dst_release((struct dst_entry *)tun_dst); - return PACKET_REJECT; - } memcpy(md, pkt_md, sizeof(*md)); md->version = ver; @@ -1337,6 +1333,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev) eth_random_addr(dev->perm_addr); } +#define GRE6_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM) + +static void ip6gre_tnl_init_features(struct net_device *dev) +{ + struct ip6_tnl *nt = netdev_priv(dev); + + dev->features |= GRE6_FEATURES; + dev->hw_features |= GRE6_FEATURES; + + if (!(nt->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(nt->parms.o_flags & TUNNEL_CSUM) || + nt->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } +} + static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1375,6 +1401,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; netif_keep_dst(dev); } + ip6gre_tnl_init_features(dev); return 0; } @@ -1709,11 +1736,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_get_iflink = ip6_tnl_get_iflink, }; -#define GRE6_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_HW_CSUM) - static int ip6erspan_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1852,26 +1874,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - dev->features |= GRE6_FEATURES; - dev->hw_features |= GRE6_FEATURES; - - if (!(nt->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported, nor - * can we support 2 levels of outer headers requiring - * an update. - */ - if (!(nt->parms.o_flags & TUNNEL_CSUM) || - (nt->encap.type == TUNNEL_ENCAP_NONE)) { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } - - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } - err = register_netdevice(dev); if (err) goto out; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ece2781a31b2..19adad6d90bc 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -138,6 +138,14 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ret; } +#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) + /* Policy lookup after SNAT yielded a new policy */ + if (skb_dst(skb)->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(net, sk, skb); + } +#endif + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) @@ -370,7 +378,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) +unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) { unsigned int mtu; struct inet6_dev *idev; @@ -390,6 +398,7 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward); static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 8a4610e84e58..8071f42cd8a0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1077,10 +1077,11 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } - } else if (!(t->parms.flags & - (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { - /* enable the cache only only if the routing decision does - * not depend on the current inner header value + } else if (t->parms.proto != 0 && !(t->parms.flags & + (IP6_TNL_F_USE_ORIG_TCLASS | + IP6_TNL_F_USE_ORIG_FWMARK))) { + /* enable the cache only if neither the outer protocol nor the + * routing decision depends on the current inner header value */ use_cache = true; } @@ -1679,11 +1680,11 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); - if (tnl->parms.proto == IPPROTO_IPIP) { - if (new_mtu < ETH_MIN_MTU) + if (tnl->parms.proto == IPPROTO_IPV6) { + if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { - if (new_mtu < IPV6_MIN_MTU) + if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (new_mtu > 0xFFF8 - dev->hard_header_len) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 39970e212ad5..d95ceca7ff8f 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -68,32 +68,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) } EXPORT_SYMBOL(ip6_route_me_harder); -/* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - -struct ip6_rt_info { - struct in6_addr daddr; - struct in6_addr saddr; - u_int32_t mark; -}; - -static void nf_ip6_saveroute(const struct sk_buff *skb, - struct nf_queue_entry *entry) -{ - struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); - - if (entry->state.hook == NF_INET_LOCAL_OUT) { - const struct ipv6hdr *iph = ipv6_hdr(skb); - - rt_info->daddr = iph->daddr; - rt_info->saddr = iph->saddr; - rt_info->mark = skb->mark; - } -} - -static int nf_ip6_reroute(struct net *net, struct sk_buff *skb, +static int nf_ip6_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); @@ -103,7 +78,7 @@ static int nf_ip6_reroute(struct net *net, struct sk_buff *skb, if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || skb->mark != rt_info->mark) - return ip6_route_me_harder(net, skb); + return ip6_route_me_harder(entry->state.net, skb); } return 0; } @@ -190,25 +165,19 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, }; static const struct nf_ipv6_ops ipv6ops = { - .chk_addr = ipv6_chk_addr, - .route_input = ip6_route_input, - .fragment = ip6_fragment -}; - -static const struct nf_afinfo nf_ip6_afinfo = { - .family = AF_INET6, + .chk_addr = ipv6_chk_addr, + .route_input = ip6_route_input, + .fragment = ip6_fragment, .checksum = nf_ip6_checksum, .checksum_partial = nf_ip6_checksum_partial, .route = nf_ip6_route, - .saveroute = nf_ip6_saveroute, .reroute = nf_ip6_reroute, - .route_key_size = sizeof(struct ip6_rt_info), }; int __init ipv6_netfilter_init(void) { RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); - return nf_register_afinfo(&nf_ip6_afinfo); + return 0; } /* This can be called from inet6_init() on errors, so it cannot @@ -217,5 +186,4 @@ int __init ipv6_netfilter_init(void) void ipv6_netfilter_fini(void) { RCU_INIT_POINTER(nf_ipv6_ops, NULL); - nf_unregister_afinfo(&nf_ip6_afinfo); } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6acb2eecd986..806e95375ec8 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -71,6 +71,14 @@ config NFT_FIB_IPV6 endif # NF_TABLES_IPV6 endif # NF_TABLES +config NF_FLOW_TABLE_IPV6 + select NF_FLOW_TABLE + tristate "Netfilter flow table IPv6 module" + help + This option adds the flow table IPv6 support. + + To compile it as a module, choose M here. + config NF_DUP_IPV6 tristate "Netfilter IPv6 packet duplication to alternate destination" depends on !NF_CONNTRACK || NF_CONNTRACK diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index c6ee0cdd0ba9..95611c4b39b0 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -45,6 +45,9 @@ obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o +# flow table support +obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o + # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1d7ae9366335..6ebbef2dfb60 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -991,9 +991,8 @@ static int get_info(struct net *net, void __user *user, if (compat) xt_compat_lock(AF_INET6); #endif - t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), - "ip6table_%s", name); - if (t) { + t = xt_request_find_table_lock(net, AF_INET6, name); + if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT @@ -1023,7 +1022,7 @@ static int get_info(struct net *net, void __user *user, xt_table_unlock(t); module_put(t->me); } else - ret = -ENOENT; + ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(AF_INET6); @@ -1049,7 +1048,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); - if (t) { + if (!IS_ERR(t)) { struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1060,7 +1059,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); return ret; } @@ -1083,10 +1082,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto out; } - t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), - "ip6table_%s", name); - if (!t) { - ret = -ENOENT; + t = xt_request_find_table_lock(net, AF_INET6, name); + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } @@ -1199,8 +1197,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); - if (!t) { - ret = -ENOENT; + if (IS_ERR(t)) { + ret = PTR_ERR(t); goto free; } @@ -1636,7 +1634,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); - if (t) { + if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); @@ -1650,7 +1648,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, module_put(t->me); xt_table_unlock(t); } else - ret = -ENOENT; + ret = PTR_ERR(t); xt_compat_unlock(AF_INET6); return ret; diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 2b1a9dcdbcb3..b0524b18c4fb 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -42,14 +42,6 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) u_int8_t hop_limit; u_int32_t flowlabel, mark; int err; -#if 0 - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) { - net_warn_ratelimited("ip6t_hook: happy cracking\n"); - return NF_ACCEPT; - } -#endif /* save source/dest address, mark, hoplimit, flowlabel, priority, */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 991512576c8c..47306e45a80a 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -74,6 +74,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_in, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, @@ -81,6 +82,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_out, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, @@ -88,12 +90,14 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { .hook = ip6table_nat_local_fn, .pf = NFPROTO_IPV6, + .nat_hook = true, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = ip6table_nat_fn, + .nat_hook = true, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 3b80a38f62b8..11a313fd9273 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -176,11 +176,6 @@ static unsigned int ipv6_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) { - net_notice_ratelimited("ipv6_conntrack_local: packet too short\n"); - return NF_ACCEPT; - } return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); } @@ -368,7 +363,7 @@ static struct nf_sockopt_ops so_getorigdst6 = { .owner = THIS_MODULE, }; -static struct nf_conntrack_l4proto *builtin_l4proto6[] = { +static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = { &nf_conntrack_l4proto_tcp6, &nf_conntrack_l4proto_udp6, &nf_conntrack_l4proto_icmpv6, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 3ac0d826afc4..2548e2c8aedd 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -27,7 +27,7 @@ #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> #include <net/netfilter/nf_log.h> -static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ; +static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) { @@ -352,7 +352,7 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.icmpv6.pn; } -struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_ICMPV6, diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c new file mode 100644 index 000000000000..0c3b9d32f64f --- /dev/null +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -0,0 +1,278 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/ipv6.h> +#include <linux/netdevice.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/neighbour.h> +#include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> +/* For layer 4 checksum field offset. */ +#include <linux/tcp.h> +#include <linux/udp.h> + +static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + + return 0; +} + +static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, struct in6_addr *addr, + struct in6_addr *new_addr) +{ + switch (ip6h->nexthdr) { + case IPPROTO_TCP: + if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; + ip6h->daddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_dnat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6; + ip6h->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_nat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, + enum flow_offload_tuple_dir dir) +{ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + unsigned int thoff = sizeof(*ip6h); + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + + return 0; +} + +static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + struct ipv6hdr *ip6h; + unsigned int thoff; + + if (!pskb_may_pull(skb, sizeof(*ip6h))) + return -1; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_TCP && + ip6h->nexthdr != IPPROTO_UDP) + return -1; + + thoff = sizeof(*ip6h); + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v6 = ip6h->saddr; + tuple->dst_v6 = ip6h->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET6; + tuple->l4proto = ip6h->nexthdr; + tuple->iifidx = dev->ifindex; + + return 0; +} + +/* Based on ip_exceeds_mtu(). */ +static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + return false; + + return true; +} + +static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt) +{ + u32 mtu; + + mtu = ip6_dst_mtu_forward(&rt->dst); + if (__nf_flow_exceeds_mtu(skb, mtu)) + return true; + + return false; +} + +unsigned int +nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + struct in6_addr *nexthop; + struct ipv6hdr *ip6h; + struct rt6_info *rt; + + if (skb->protocol != htons(ETH_P_IPV6)) + return NF_ACCEPT; + + if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + + rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; + if (unlikely(nf_flow_exceeds_mtu(skb, rt))) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*ip6h))) + return NF_DROP; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ipv6(flow, skb, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + ip6h = ipv6_hdr(skb); + ip6h->hop_limit--; + + skb->dev = outdev; + nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); + +static struct nf_flowtable_type flowtable_ipv6 = { + .family = NFPROTO_IPV6, + .params = &nf_flow_offload_rhash_params, + .gc = nf_flow_offload_work_gc, + .hook = nf_flow_offload_ipv6_hook, + .owner = THIS_MODULE, +}; + +static int __init nf_flow_ipv6_module_init(void) +{ + nft_register_flowtable_type(&flowtable_ipv6); + + return 0; +} + +static void __exit nf_flow_ipv6_module_exit(void) +{ + nft_unregister_flowtable_type(&flowtable_ipv6); +} + +module_init(nf_flow_ipv6_module_init); +module_exit(nf_flow_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NF_FLOWTABLE(AF_INET6); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 1d2fb9267d6f..bed57ee65f7b 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -369,10 +369,6 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, #endif unsigned int ret; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && @@ -408,10 +404,6 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, unsigned int ret; int err; - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct ipv6hdr)) - return NF_ACCEPT; - ret = nf_nat_ipv6_fn(priv, skb, state, do_chain); if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index d6e4ba5de916..9cd45b964123 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -22,39 +22,17 @@ static unsigned int nft_do_chain_ipv6(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); return nft_do_chain(&pkt, priv); } -static unsigned int nft_ipv6_output(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (unlikely(skb->len < sizeof(struct ipv6hdr))) { - if (net_ratelimit()) - pr_info("nf_tables_ipv6: ignoring short SOCK_RAW " - "packet\n"); - return NF_ACCEPT; - } - - return nft_do_chain_ipv6(priv, skb, state); -} - -struct nft_af_info nft_af_ipv6 __read_mostly = { +static struct nft_af_info nft_af_ipv6 __read_mostly = { .family = NFPROTO_IPV6, .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, - .nops = 1, - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, - [NF_INET_LOCAL_OUT] = nft_ipv6_output, - [NF_INET_FORWARD] = nft_do_chain_ipv6, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, - }, }; -EXPORT_SYMBOL_GPL(nft_af_ipv6); static int nf_tables_ipv6_init_net(struct net *net) { @@ -94,6 +72,13 @@ static const struct nf_chain_type filter_ipv6 = { (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, + [NF_INET_FORWARD] = nft_do_chain_ipv6, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, + }, }; static int __init nf_tables_ipv6_init(void) diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 443cd306c0b0..73fe2bd13fcf 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -31,7 +31,8 @@ static unsigned int nft_nat_do_chain(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); return nft_do_chain(&pkt, priv); } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index f2727475895e..11d3c3b9aa18 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -33,7 +33,8 @@ static unsigned int nf_route_table_hook(void *priv, u32 mark, flowlabel; int err; - nft_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 54b5899543ef..cc5174c7254c 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -60,7 +60,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, { const struct net_device *dev = NULL; const struct nf_ipv6_ops *v6ops; - const struct nf_afinfo *afinfo; int route_err, addrtype; struct rt6_info *rt; struct flowi6 fl6 = { @@ -69,8 +68,8 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, }; u32 ret = 0; - afinfo = nf_get_afinfo(NFPROTO_IPV6); - if (!afinfo) + v6ops = nf_get_ipv6_ops(); + if (!v6ops) return RTN_UNREACHABLE; if (priv->flags & NFTA_FIB_F_IIF) @@ -80,12 +79,11 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); - v6ops = nf_get_ipv6_ops(); - if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) + if (dev && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) ret = RTN_LOCAL; - route_err = afinfo->route(nft_net(pkt), (struct dst_entry **)&rt, - flowi6_to_flowi(&fl6), false); + route_err = v6ops->route(nft_net(pkt), (struct dst_entry **)&rt, + flowi6_to_flowi(&fl6), false); if (route_err) goto err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2490280b3394..1076ae0ea9d5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, int strict) { struct rt6_info *sibling, *next_sibling; - int route_choosen; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. @@ -463,26 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(fl6, NULL); - route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1); - /* Don't change the route, if route_choosen == 0 - * (siblings does not include ourself) - */ - if (route_choosen) - list_for_each_entry_safe(sibling, next_sibling, - &match->rt6i_siblings, rt6i_siblings) { - route_choosen--; - if (route_choosen == 0) { - struct inet6_dev *idev = sibling->rt6i_idev; - - if (!netif_carrier_ok(sibling->dst.dev) && - idev->cnf.ignore_routes_with_linkdown) - break; - if (rt6_score_route(sibling, oif, strict) < 0) - break; - match = sibling; - break; - } - } + if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + return match; + + list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, + rt6i_siblings) { + if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + continue; + if (rt6_score_route(sibling, oif, strict) < 0) + break; + match = sibling; + break; + } + return match; } @@ -499,12 +491,15 @@ static inline struct rt6_info *rt6_device_match(struct net *net, struct rt6_info *local = NULL; struct rt6_info *sprt; - if (!oif && ipv6_addr_any(saddr)) - goto out; + if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) + return rt; for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { struct net_device *dev = sprt->dst.dev; + if (sprt->rt6i_nh_flags & RTNH_F_DEAD) + continue; + if (oif) { if (dev->ifindex == oif) return sprt; @@ -533,8 +528,8 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (flags & RT6_LOOKUP_F_IFACE) return net->ipv6.ip6_null_entry; } -out: - return rt; + + return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -679,10 +674,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, int m; bool match_do_rr = false; struct inet6_dev *idev = rt->rt6i_idev; - struct net_device *dev = rt->dst.dev; - if (dev && !netif_carrier_ok(dev) && - idev->cnf.ignore_routes_with_linkdown && + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + goto out; + + if (idev->cnf.ignore_routes_with_linkdown && + rt->rt6i_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; @@ -1346,7 +1343,9 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { + spin_lock_bh(&ort->rt6i_table->tb6_lock); fib6_update_sernum(ort); + spin_unlock_bh(&ort->rt6i_table->tb6_lock); fib6_force_start_gc(net); } @@ -1824,10 +1823,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) if (skb) { ip6_multipath_l3_keys(skb, &hash_keys); - return flow_hash_from_keys(&hash_keys); + return flow_hash_from_keys(&hash_keys) >> 1; } - return get_hash_from_flowi6(fl6); + return get_hash_from_flowi6(fl6) >> 1; } void ip6_route_input(struct sk_buff *skb) @@ -2154,6 +2153,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + continue; if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -2344,7 +2345,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); - /* Add this dst into uncached_list so that rt6_ifdown() can + /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); @@ -2593,6 +2594,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, #endif rt->rt6i_metric = cfg->fc_metric; + rt->rt6i_nh_weight = 1; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes @@ -2746,6 +2748,9 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, rt->rt6i_flags = cfg->fc_flags; install_route: + if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && + !netif_carrier_ok(dev)) + rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -3459,37 +3464,245 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) fib6_clean_all(net, fib6_clean_tohost, gateway); } -struct arg_dev_net { - struct net_device *dev; - struct net *net; +struct arg_netdev_event { + const struct net_device *dev; + union { + unsigned int nh_flags; + unsigned long event; + }; }; +static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +{ + struct rt6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + iter = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + while (iter) { + if (iter->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference_protected(iter->rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + } + + return NULL; +} + +static bool rt6_is_dead(const struct rt6_info *rt) +{ + if (rt->rt6i_nh_flags & RTNH_F_DEAD || + (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + return true; + + return false; +} + +static int rt6_multipath_total_weight(const struct rt6_info *rt) +{ + struct rt6_info *iter; + int total = 0; + + if (!rt6_is_dead(rt)) + total += rt->rt6i_nh_weight; + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + if (!rt6_is_dead(iter)) + total += iter->rt6i_nh_weight; + } + + return total; +} + +static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +{ + int upper_bound = -1; + + if (!rt6_is_dead(rt)) { + *weight += rt->rt6i_nh_weight; + upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, + total) - 1; + } + atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); +} + +static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +{ + struct rt6_info *iter; + int weight = 0; + + rt6_upper_bound_set(rt, &weight, total); + + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + rt6_upper_bound_set(iter, &weight, total); +} + +void rt6_multipath_rebalance(struct rt6_info *rt) +{ + struct rt6_info *first; + int total; + + /* In case the entire multipath route was marked for flushing, + * then there is no need to rebalance upon the removal of every + * sibling route. + */ + if (!rt->rt6i_nsiblings || rt->should_flush) + return; + + /* During lookup routes are evaluated in order, so we need to + * make sure upper bounds are assigned from the first sibling + * onwards. + */ + first = rt6_multipath_first_sibling(rt); + if (WARN_ON_ONCE(!first)) + return; + + total = rt6_multipath_total_weight(first); + rt6_multipath_upper_bound_set(first, total); +} + +static int fib6_ifup(struct rt6_info *rt, void *p_arg) +{ + const struct arg_netdev_event *arg = p_arg; + const struct net *net = dev_net(arg->dev); + + if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { + rt->rt6i_nh_flags &= ~arg->nh_flags; + fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + rt6_multipath_rebalance(rt); + } + + return 0; +} + +void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) +{ + struct arg_netdev_event arg = { + .dev = dev, + .nh_flags = nh_flags, + }; + + if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) + arg.nh_flags |= RTNH_F_LINKDOWN; + + fib6_clean_all(dev_net(dev), fib6_ifup, &arg); +} + +static bool rt6_multipath_uses_dev(const struct rt6_info *rt, + const struct net_device *dev) +{ + struct rt6_info *iter; + + if (rt->dst.dev == dev) + return true; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == dev) + return true; + + return false; +} + +static void rt6_multipath_flush(struct rt6_info *rt) +{ + struct rt6_info *iter; + + rt->should_flush = 1; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + iter->should_flush = 1; +} + +static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, + const struct net_device *down_dev) +{ + struct rt6_info *iter; + unsigned int dead = 0; + + if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) + dead++; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == down_dev || + iter->rt6i_nh_flags & RTNH_F_DEAD) + dead++; + + return dead; +} + +static void rt6_multipath_nh_flags_set(struct rt6_info *rt, + const struct net_device *dev, + unsigned int nh_flags) +{ + struct rt6_info *iter; + + if (rt->dst.dev == dev) + rt->rt6i_nh_flags |= nh_flags; + list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + if (iter->dst.dev == dev) + iter->rt6i_nh_flags |= nh_flags; +} + /* called with write lock held for table with rt */ -static int fib6_ifdown(struct rt6_info *rt, void *arg) +static int fib6_ifdown(struct rt6_info *rt, void *p_arg) { - const struct arg_dev_net *adn = arg; - const struct net_device *dev = adn->dev; + const struct arg_netdev_event *arg = p_arg; + const struct net_device *dev = arg->dev; + const struct net *net = dev_net(dev); - if ((rt->dst.dev == dev || !dev) && - rt != adn->net->ipv6.ip6_null_entry && - (rt->rt6i_nsiblings == 0 || - (dev && netdev_unregistering(dev)) || - !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) - return -1; + if (rt == net->ipv6.ip6_null_entry) + return 0; + + switch (arg->event) { + case NETDEV_UNREGISTER: + return rt->dst.dev == dev ? -1 : 0; + case NETDEV_DOWN: + if (rt->should_flush) + return -1; + if (!rt->rt6i_nsiblings) + return rt->dst.dev == dev ? -1 : 0; + if (rt6_multipath_uses_dev(rt, dev)) { + unsigned int count; + + count = rt6_multipath_dead_count(rt, dev); + if (rt->rt6i_nsiblings + 1 == count) { + rt6_multipath_flush(rt); + return -1; + } + rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | + RTNH_F_LINKDOWN); + fib6_update_sernum(rt); + rt6_multipath_rebalance(rt); + } + return -2; + case NETDEV_CHANGE: + if (rt->dst.dev != dev || + rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + break; + rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt6_multipath_rebalance(rt); + break; + } return 0; } -void rt6_ifdown(struct net *net, struct net_device *dev) +void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { - struct arg_dev_net adn = { + struct arg_netdev_event arg = { .dev = dev, - .net = net, + .event = event, }; - fib6_clean_all(net, fib6_ifdown, &adn); - if (dev) - rt6_uncached_list_flush_dev(net, dev); + fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); +} + +void rt6_disable_ip(struct net_device *dev, unsigned long event) +{ + rt6_sync_down_dev(dev, event); + rt6_uncached_list_flush_dev(dev_net(dev), dev); + neigh_ifdown(&nd_tbl, dev); } struct rt6_mtu_change_arg { @@ -3812,6 +4025,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { dst_release_immediate(&rt->dst); @@ -3992,7 +4207,10 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, unsigned int *flags, bool skip_oif) { - if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { + if (rt->rt6i_nh_flags & RTNH_F_DEAD) + *flags |= RTNH_F_DEAD; + + if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) *flags |= RTNH_F_DEAD; @@ -4031,7 +4249,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = 0; + rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 825b8e01f947..ba3767ef5e93 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -501,7 +501,7 @@ static struct seg6_action_desc *__get_action_desc(int action) struct seg6_action_desc *desc; int i, count; - count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc); + count = ARRAY_SIZE(seg6_action_table); for (i = 0; i < count; i++) { desc = &seg6_action_table[i]; if (desc->action == action) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index aa12a26a96c6..c0f7e69f2e6c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -176,8 +176,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, /* If interface is set while binding, indices * must coincide. */ - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) + if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index fe04e23af986..841f4a07438e 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -32,6 +32,14 @@ int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, } EXPORT_SYMBOL(xfrm6_rcv_spi); +static int xfrm6_transport_finish2(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + if (xfrm_trans_queue(skb, ip6_rcv_finish)) + __kfree_skb(skb); + return -1; +} + int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); @@ -56,7 +64,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_rcv_finish); + xfrm6_transport_finish2); return -1; } diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index e66b94f46532..4e12859bc2ee 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -105,17 +105,14 @@ static struct sk_buff *xfrm6_mode_tunnel_gso_segment(struct xfrm_state *x, { __skb_push(skb, skb->mac_len); return skb_mac_gso_segment(skb, features); - } static void xfrm6_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); - if (xo->flags & XFRM_GSO_SEGMENT) { - skb->network_header = skb->network_header - x->props.header_len; + if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); - } skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); |