diff options
author | David S. Miller <davem@davemloft.net> | 2024-11-18 11:56:21 +0000 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2024-11-18 11:56:21 +0000 |
commit | ac60031f7988a9ff1a977afccb7f5e01da1bbc09 (patch) | |
tree | 50b3a489273263957582fcbbd2ce04f513ffe405 | |
parent | 296a681def3e105f8535c8b3cb0f37f22f710e62 (diff) | |
parent | 1b29a730ef8b6fd3aa3e11c2f6d409cf201cd913 (diff) | |
download | linux-ac60031f7988a9ff1a977afccb7f5e01da1bbc09.tar.gz linux-ac60031f7988a9ff1a977afccb7f5e01da1bbc09.tar.bz2 linux-ac60031f7988a9ff1a977afccb7f5e01da1bbc09.zip |
Merge branch 'udp-4tuple-hash'
Philo Lu says:
====================
udp: Add 4-tuple hash for connected sockets
This patchset introduces 4-tuple hash for connected udp sockets, to make
connected udp lookup faster.
Stress test results (with 1 cpu fully used) are shown below, in pps:
(1) _un-connected_ socket as server
[a] w/o hash4: 1,825176
[b] w/ hash4: 1,831750 (+0.36%)
(2) 500 _connected_ sockets as server
[c] w/o hash4: 290860 (only 16% of [a])
[d] w/ hash4: 1,889658 (+3.1% compared with [b])
With hash4, compute_score is skipped when lookup, so [d] is slightly
better than [b].
Patch1: Add a new counter for hslot2 named hash4_cnt, to avoid cache line
miss when lookup.
Patch2: Add hslot/hlist_nulls for 4-tuple hash.
Patch3 and 4: Implement 4-tuple hash for ipv4 and ipv6.
The detailed motivation is described in Patch 3.
The 4-tuple hash increases the size of udp_sock and udp_hslot. Thus add it
with CONFIG_BASE_SMALL, i.e., it's a no op with CONFIG_BASE_SMALL.
Intentionally, the feature is not available for udplite. Though udplite
shares some structs and functions with udp, its connect() keeps unchanged.
So all udplite sockets perform the same as un-connected udp sockets.
Besides, udplite also shares the additional memory consumption in udp_sock
and udptable.
changelogs:
v8 -> v9 (Paolo Abeni):
- Add explanation about udplite in cover letter
- Update tags for co-developers
- Add acked-by tags of Paolo and Willem
v7 -> v8:
- add EXPORT_SYMBOL for ipv6.ko build
v6 -> v7 (Kuniyuki Iwashima):
- export udp_ehashfn to be used by udpv6 rehash
v5 -> v6 (Paolo Abeni):
- move udp_table_hash4_init from patch2 to patch1
- use hlist_nulls for lookup-rehash race
- add test results in commit log
- add more comment, e.g., for rehash4 used in hash4
- add ipv6 support (Patch4), and refactor some functions for better
sharing, without functionality change
v4 -> v5 (Paolo Abeni):
- add CONFIG_BASE_SMALL with which udp hash4 does nothing
v3 -> v4 (Willem de Bruijn):
- fix mistakes in udp_pernet_table_alloc()
RFCv2 -> v3 (Gur Stavi):
- minor fix in udp_hashslot2() and udp_table_init()
- add rcu sync in rehash4()
RFCv1 -> RFCv2:
- add a new struct for hslot2
- remove the sockopt UDP_HASH4 because it has little side effect for
unconnected sockets
- add rehash in connect()
- re-organize the patch into 3 smaller ones
- other minor fix
v8:
https://lore.kernel.org/all/20241108054836.123484-1-lulie@linux.alibaba.com/
v7:
https://lore.kernel.org/all/20241105121225.12513-1-lulie@linux.alibaba.com/
v6:
https://lore.kernel.org/all/20241031124550.20227-1-lulie@linux.alibaba.com/
v5:
https://lore.kernel.org/all/20241018114535.35712-1-lulie@linux.alibaba.com/
v4:
https://lore.kernel.org/all/20241012012918.70888-1-lulie@linux.alibaba.com/
v3:
https://lore.kernel.org/all/20241010090351.79698-1-lulie@linux.alibaba.com/
RFCv2:
https://lore.kernel.org/all/20240924110414.52618-1-lulie@linux.alibaba.com/
RFCv1:
https://lore.kernel.org/all/20240913100941.8565-1-lulie@linux.alibaba.com/
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/udp.h | 11 | ||||
-rw-r--r-- | include/net/udp.h | 137 | ||||
-rw-r--r-- | net/ipv4/udp.c | 245 | ||||
-rw-r--r-- | net/ipv6/udp.c | 117 |
4 files changed, 468 insertions, 42 deletions
diff --git a/include/linux/udp.h b/include/linux/udp.h index 3eb3f2b9a2a0..0807e21cfec9 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -56,6 +56,12 @@ struct udp_sock { int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + /* For UDP 4-tuple hash */ + __u16 udp_lrpa_hash; + struct hlist_nulls_node udp_lrpa_node; +#endif + /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -206,6 +212,11 @@ static inline void udp_allow_gso(struct sock *sk) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) +#if !IS_ENABLED(CONFIG_BASE_SMALL) +#define udp_lrpa_for_each_entry_rcu(__up, node, list) \ + hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node) +#endif + #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) #endif /* _LINUX_UDP_H */ diff --git a/include/net/udp.h b/include/net/udp.h index 61222545ab1c..6e89520e100d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -50,29 +50,56 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot + * struct udp_hslot - UDP hash slot used by udp_table.hash/hash4 * * @head: head of list of sockets + * @nulls_head: head of list of sockets, only used by hash4 * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { - struct hlist_head head; + union { + struct hlist_head head; + /* hash4 uses hlist_nulls to avoid moving wrongly onto another + * hlist, because rehash() can happen with lookup(). + */ + struct hlist_nulls_head nulls_head; + }; int count; spinlock_t lock; -} __attribute__((aligned(2 * sizeof(long)))); +} __aligned(2 * sizeof(long)); + +/** + * struct udp_hslot_main - UDP hash slot used by udp_table.hash2 + * + * @hslot: basic hash slot + * @hash4_cnt: number of sockets in hslot4 of the same + * (local port, local address) + */ +struct udp_hslot_main { + struct udp_hslot hslot; /* must be the first member */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + u32 hash4_cnt; +#endif +} __aligned(2 * sizeof(long)); +#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot)) /** * struct udp_table - UDP table * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) + * @hash4: hash table, connected sockets are hashed on + * (local port, local address, remote port, remote address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; - struct udp_hslot *hash2; + struct udp_hslot_main *hash2; +#if !IS_ENABLED(CONFIG_BASE_SMALL) + struct udp_hslot *hash4; +#endif unsigned int mask; unsigned int log; }; @@ -84,6 +111,7 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, { return &table->hash[udp_hashfn(net, num, table->mask)]; } + /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() @@ -91,8 +119,89 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { - return &table->hash2[hash & table->mask]; + return &table->hash2[hash & table->mask].hslot; +} + +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_table_hash4_init(struct udp_table *table) +{ +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + BUILD_BUG(); + return NULL; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return false; +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return 0; +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return false; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ } +#else /* !CONFIG_BASE_SMALL */ + +/* Must be called with table->hash2 initialized */ +static inline void udp_table_hash4_init(struct udp_table *table) +{ + table->hash4 = (void *)(table->hash2 + (table->mask + 1)); + for (int i = 0; i <= table->mask; i++) { + table->hash2[i].hash4_cnt = 0; + + INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i); + table->hash4[i].count = 0; + spin_lock_init(&table->hash4[i].lock); + } +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + return &table->hash4[hash & table->mask]; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node); +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return sizeof(struct udp_hslot); +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return UDP_HSLOT_MAIN(hslot2)->hash4_cnt; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt++; +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt--; +} +#endif /* CONFIG_BASE_SMALL */ extern struct proto udp_prot; @@ -193,13 +302,29 @@ static inline int udp_lib_hash(struct sock *sk) } void udp_lib_unhash(struct sock *sk); -void udp_lib_rehash(struct sock *sk, u16 new_hash); +void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } +/* hash4 routines shared between UDPv4/6 */ +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_lib_hash4(struct sock *sk, u16 hash) +{ +} + +static inline void udp4_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +void udp_lib_hash4(struct sock *sk, u16 hash); +void udp4_hash4(struct sock *sk); +#endif /* CONFIG_BASE_SMALL */ + int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0e24916b39d4..6a01905d379f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -410,7 +410,6 @@ static int compute_score(struct sock *sk, const struct net *net, return score; } -INDIRECT_CALLABLE_SCOPE u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { @@ -419,6 +418,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL(udp_ehashfn); /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, @@ -478,6 +478,159 @@ rescore: return result; } +#if IS_ENABLED(CONFIG_BASE_SMALL) +static struct sock *udp4_lib_lookup4(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + struct udp_table *udptable) +{ + return NULL; +} + +static void udp_rehash4(struct udp_table *udptable, struct sock *sk, + u16 newhash4) +{ +} + +static void udp_unhash4(struct udp_table *udptable, struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +static struct sock *udp4_lib_lookup4(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + struct udp_table *udptable) +{ + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + const struct hlist_nulls_node *node; + struct udp_hslot *hslot4; + unsigned int hash4, slot; + struct udp_sock *up; + struct sock *sk; + + hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash4 & udptable->mask; + hslot4 = &udptable->hash4[slot]; + INET_ADDR_COOKIE(acookie, saddr, daddr); + +begin: + /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */ + udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { + sk = (struct sock *)up; + if (inet_match(net, sk, acookie, ports, dif, sdif)) + return sk; + } + + /* if the nulls value we got at the end of this lookup is not the + * expected one, we must restart lookup. We probably met an item that + * was moved to another chain due to rehash. + */ + if (get_nulls_value(node) != slot) + goto begin; + + return NULL; +} + +/* In hash4, rehash can happen in connect(), where hash4_cnt keeps unchanged. */ +static void udp_rehash4(struct udp_table *udptable, struct sock *sk, + u16 newhash4) +{ + struct udp_hslot *hslot4, *nhslot4; + + hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); + nhslot4 = udp_hashslot4(udptable, newhash4); + udp_sk(sk)->udp_lrpa_hash = newhash4; + + if (hslot4 != nhslot4) { + spin_lock_bh(&hslot4->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); + hslot4->count--; + spin_unlock_bh(&hslot4->lock); + + spin_lock_bh(&nhslot4->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, + &nhslot4->nulls_head); + nhslot4->count++; + spin_unlock_bh(&nhslot4->lock); + } +} + +static void udp_unhash4(struct udp_table *udptable, struct sock *sk) +{ + struct udp_hslot *hslot2, *hslot4; + + if (udp_hashed4(sk)) { + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); + + spin_lock(&hslot4->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); + hslot4->count--; + spin_unlock(&hslot4->lock); + + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + } +} + +void udp_lib_hash4(struct sock *sk, u16 hash) +{ + struct udp_hslot *hslot, *hslot2, *hslot4; + struct net *net = sock_net(sk); + struct udp_table *udptable; + + /* Connected udp socket can re-connect to another remote address, + * so rehash4 is needed. + */ + udptable = net->ipv4.udp_table; + if (udp_hashed4(sk)) { + udp_rehash4(udptable, sk, hash); + return; + } + + hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + hslot4 = udp_hashslot4(udptable, hash); + udp_sk(sk)->udp_lrpa_hash = hash; + + spin_lock_bh(&hslot->lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + + spin_lock(&hslot4->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, + &hslot4->nulls_head); + hslot4->count++; + spin_unlock(&hslot4->lock); + + spin_lock(&hslot2->lock); + udp_hash4_inc(hslot2); + spin_unlock(&hslot2->lock); + + spin_unlock_bh(&hslot->lock); +} +EXPORT_SYMBOL(udp_lib_hash4); + +/* call with sock lock */ +void udp4_hash4(struct sock *sk) +{ + struct net *net = sock_net(sk); + unsigned int hash; + + if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY)) + return; + + hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + + udp_lib_hash4(sk, hash); +} +EXPORT_SYMBOL(udp4_hash4); +#endif /* CONFIG_BASE_SMALL */ + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -486,13 +639,19 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; + unsigned int hash2; hash2 = ipv4_portaddr_hash(net, daddr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); + + if (udp_has_hash4(hslot2)) { + result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum, + dif, sdif, udptable); + if (result) /* udp4_lib_lookup4 return sk or NULL */ + return result; + } /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, @@ -519,8 +678,7 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, /* Lookup wildcard sockets */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, @@ -1935,6 +2093,18 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } EXPORT_SYMBOL(udp_pre_connect); +static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip4_datagram_connect(sk, uaddr, addr_len); + if (!res) + udp4_hash4(sk); + release_sock(sk); + return res; +} + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -1994,6 +2164,8 @@ void udp_lib_unhash(struct sock *sk) hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); + + udp_unhash4(udptable, sk); } spin_unlock_bh(&hslot->lock); } @@ -2003,7 +2175,7 @@ EXPORT_SYMBOL(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash */ -void udp_lib_rehash(struct sock *sk, u16 newhash) +void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); @@ -2035,6 +2207,19 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) spin_unlock(&nhslot2->lock); } + if (udp_hashed4(sk)) { + udp_rehash4(udptable, sk, newhash4); + + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + udp_hash4_inc(nhslot2); + spin_unlock(&nhslot2->lock); + } + } spin_unlock_bh(&hslot->lock); } } @@ -2046,7 +2231,11 @@ void udp_v4_rehash(struct sock *sk) u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); - udp_lib_rehash(sk, new_hash); + u16 new_hash4 = udp_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + + udp_lib_rehash(sk, new_hash, new_hash4); } static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -2268,7 +2457,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, udptable->mask; hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udptable->hash2[hash2]; + hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -2539,14 +2728,13 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, struct udp_table *udptable = net->ipv4.udp_table; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsigned short hnum = ntohs(loc_port); - unsigned int hash2, slot2; struct udp_hslot *hslot2; + unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { @@ -2940,7 +3128,7 @@ struct proto udp_prot = { .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udp_pre_connect, - .connect = ip4_datagram_connect, + .connect = udp_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udp_init_sock, @@ -3187,7 +3375,7 @@ again: batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { - struct udp_hslot *hslot2 = &udptable->hash2[state->bucket]; + struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) continue; @@ -3428,10 +3616,12 @@ __setup("uhash_entries=", set_uhash_entries); void __init udp_table_init(struct udp_table *table, const char *name) { - unsigned int i; + unsigned int i, slot_size; + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + + udp_hash4_slot_size(); table->hash = alloc_large_system_hash(name, - 2 * sizeof(struct udp_hslot), + slot_size, uhash_entries, 21, /* one slot per 2 MB */ 0, @@ -3440,17 +3630,18 @@ void __init udp_table_init(struct udp_table *table, const char *name) UDP_HTABLE_SIZE_MIN, UDP_HTABLE_SIZE_MAX); - table->hash2 = table->hash + (table->mask + 1); + table->hash2 = (void *)(table->hash + (table->mask + 1)); for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { - INIT_HLIST_HEAD(&table->hash2[i].head); - table->hash2[i].count = 0; - spin_lock_init(&table->hash2[i].lock); + INIT_HLIST_HEAD(&table->hash2[i].hslot.head); + table->hash2[i].hslot.count = 0; + spin_lock_init(&table->hash2[i].hslot.lock); } + udp_table_hash4_init(table); } u32 udp_flow_hashrnd(void) @@ -3476,18 +3667,21 @@ static void __net_init udp_sysctl_init(struct net *net) static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) { struct udp_table *udptable; + unsigned int slot_size; int i; udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); if (!udptable) goto out; - udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot), + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + + udp_hash4_slot_size(); + udptable->hash = vmalloc_huge(hash_entries * slot_size, GFP_KERNEL_ACCOUNT); if (!udptable->hash) goto free_table; - udptable->hash2 = udptable->hash + hash_entries; + udptable->hash2 = (void *)(udptable->hash + hash_entries); udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); @@ -3496,10 +3690,11 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent udptable->hash[i].count = 0; spin_lock_init(&udptable->hash[i].lock); - INIT_HLIST_HEAD(&udptable->hash2[i].head); - udptable->hash2[i].count = 0; - spin_lock_init(&udptable->hash2[i].lock); + INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head); + udptable->hash2[i].hslot.count = 0; + spin_lock_init(&udptable->hash2[i].hslot.lock); } + udp_table_hash4_init(udptable); return udptable; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0cef8ae5d1ea..d766fd798ecf 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -110,8 +110,19 @@ void udp_v6_rehash(struct sock *sk) u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); + u16 new_hash4; - udp_lib_rehash(sk, new_hash); + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + new_hash4 = udp_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + } else { + new_hash4 = udp6_ehashfn(sock_net(sk), + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + } + + udp_lib_rehash(sk, new_hash, new_hash4); } static int compute_score(struct sock *sk, const struct net *net, @@ -216,6 +227,74 @@ rescore: return result; } +#if IS_ENABLED(CONFIG_BASE_SMALL) +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + const struct hlist_nulls_node *node; + struct udp_hslot *hslot4; + unsigned int hash4, slot; + struct udp_sock *up; + struct sock *sk; + + hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash4 & udptable->mask; + hslot4 = &udptable->hash4[slot]; + +begin: + udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { + sk = (struct sock *)up; + if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) + return sk; + } + + /* if the nulls value we got at the end of this lookup is not the + * expected one, we must restart lookup. We probably met an item that + * was moved to another chain due to rehash. + */ + if (get_nulls_value(node) != slot) + goto begin; + + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ + struct net *net = sock_net(sk); + unsigned int hash; + + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + udp4_hash4(sk); + return; + } + + if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + return; + + hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + + udp_lib_hash4(sk, hash); +} +#endif /* CONFIG_BASE_SMALL */ + /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -224,13 +303,19 @@ struct sock *__udp6_lib_lookup(const struct net *net, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; + unsigned int hash2; hash2 = ipv6_portaddr_hash(net, daddr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); + + if (udp_has_hash4(hslot2)) { + result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum, + dif, sdif, udptable); + if (result) /* udp6_lib_lookup4 return sk or NULL */ + return result; + } /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, @@ -257,8 +342,7 @@ struct sock *__udp6_lib_lookup(const struct net *net, /* Lookup wildcard sockets */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, @@ -859,7 +943,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, udptable->mask; hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udptable->hash2[hash2]; + hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -1065,14 +1149,13 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); - unsigned int hash2, slot2; struct udp_hslot *hslot2; + unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { @@ -1169,6 +1252,18 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } +static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip6_datagram_connect(sk, uaddr, addr_len); + if (!res) + udp6_hash4(sk); + release_sock(sk); + return res; +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1764,7 +1859,7 @@ struct proto udpv6_prot = { .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udpv6_pre_connect, - .connect = ip6_datagram_connect, + .connect = udpv6_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udpv6_init_sock, |