summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2024-11-18 11:56:21 +0000
committerDavid S. Miller <davem@davemloft.net>2024-11-18 11:56:21 +0000
commitac60031f7988a9ff1a977afccb7f5e01da1bbc09 (patch)
tree50b3a489273263957582fcbbd2ce04f513ffe405
parent296a681def3e105f8535c8b3cb0f37f22f710e62 (diff)
parent1b29a730ef8b6fd3aa3e11c2f6d409cf201cd913 (diff)
downloadlinux-ac60031f7988a9ff1a977afccb7f5e01da1bbc09.tar.gz
linux-ac60031f7988a9ff1a977afccb7f5e01da1bbc09.tar.bz2
linux-ac60031f7988a9ff1a977afccb7f5e01da1bbc09.zip
Merge branch 'udp-4tuple-hash'
Philo Lu says: ==================== udp: Add 4-tuple hash for connected sockets This patchset introduces a 4-tuple hash for connected udp sockets, to make connected udp lookup faster. Stress test results (with 1 cpu fully used) are shown below, in pps: (1) _un-connected_ socket as server [a] w/o hash4: 1,825,176 [b] w/ hash4: 1,831,750 (+0.36%) (2) 500 _connected_ sockets as server [c] w/o hash4: 290,860 (only 16% of [a]) [d] w/ hash4: 1,889,658 (+3.1% compared with [b]) With hash4, compute_score is skipped during lookup, so [d] is slightly better than [b]. Patch1: Add a new counter for hslot2 named hash4_cnt, to avoid a cache line miss during lookup. Patch2: Add hslot/hlist_nulls for 4-tuple hash. Patch3 and 4: Implement 4-tuple hash for ipv4 and ipv6. The detailed motivation is described in Patch 3. The 4-tuple hash increases the size of udp_sock and udp_hslot. It is therefore compiled in only when CONFIG_BASE_SMALL is disabled, i.e., it is a no-op with CONFIG_BASE_SMALL. Intentionally, the feature is not available for udplite. Though udplite shares some structs and functions with udp, its connect() remains unchanged. So all udplite sockets perform the same as un-connected udp sockets. Besides, udplite also shares the additional memory consumption in udp_sock and udptable.
changelogs: v8 -> v9 (Paolo Abeni): - Add explanation about udplite in cover letter - Update tags for co-developers - Add acked-by tags of Paolo and Willem v7 -> v8: - add EXPORT_SYMBOL for ipv6.ko build v6 -> v7 (Kuniyuki Iwashima): - export udp_ehashfn to be used by udpv6 rehash v5 -> v6 (Paolo Abeni): - move udp_table_hash4_init from patch2 to patch1 - use hlist_nulls for lookup-rehash race - add test results in commit log - add more comment, e.g., for rehash4 used in hash4 - add ipv6 support (Patch4), and refactor some functions for better sharing, without functionality change v4 -> v5 (Paolo Abeni): - add CONFIG_BASE_SMALL with which udp hash4 does nothing v3 -> v4 (Willem de Bruijn): - fix mistakes in udp_pernet_table_alloc() RFCv2 -> v3 (Gur Stavi): - minor fix in udp_hashslot2() and udp_table_init() - add rcu sync in rehash4() RFCv1 -> RFCv2: - add a new struct for hslot2 - remove the sockopt UDP_HASH4 because it has little side effect for unconnected sockets - add rehash in connect() - re-organize the patch into 3 smaller ones - other minor fix v8: https://lore.kernel.org/all/20241108054836.123484-1-lulie@linux.alibaba.com/ v7: https://lore.kernel.org/all/20241105121225.12513-1-lulie@linux.alibaba.com/ v6: https://lore.kernel.org/all/20241031124550.20227-1-lulie@linux.alibaba.com/ v5: https://lore.kernel.org/all/20241018114535.35712-1-lulie@linux.alibaba.com/ v4: https://lore.kernel.org/all/20241012012918.70888-1-lulie@linux.alibaba.com/ v3: https://lore.kernel.org/all/20241010090351.79698-1-lulie@linux.alibaba.com/ RFCv2: https://lore.kernel.org/all/20240924110414.52618-1-lulie@linux.alibaba.com/ RFCv1: https://lore.kernel.org/all/20240913100941.8565-1-lulie@linux.alibaba.com/ ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/udp.h11
-rw-r--r--include/net/udp.h137
-rw-r--r--net/ipv4/udp.c245
-rw-r--r--net/ipv6/udp.c117
4 files changed, 468 insertions, 42 deletions
diff --git a/include/linux/udp.h b/include/linux/udp.h
index 3eb3f2b9a2a0..0807e21cfec9 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -56,6 +56,12 @@ struct udp_sock {
int pending; /* Any pending frames ? */
__u8 encap_type; /* Is this an Encapsulation socket? */
+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+ /* For UDP 4-tuple hash */
+ __u16 udp_lrpa_hash;
+ struct hlist_nulls_node udp_lrpa_node;
+#endif
+
/*
* Following member retains the information to create a UDP header
* when the socket is uncorked.
@@ -206,6 +212,11 @@ static inline void udp_allow_gso(struct sock *sk)
#define udp_portaddr_for_each_entry_rcu(__sk, list) \
hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+#define udp_lrpa_for_each_entry_rcu(__up, node, list) \
+ hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node)
+#endif
+
#define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE)
#endif /* _LINUX_UDP_H */
diff --git a/include/net/udp.h b/include/net/udp.h
index 61222545ab1c..6e89520e100d 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -50,29 +50,56 @@ struct udp_skb_cb {
#define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
/**
- * struct udp_hslot - UDP hash slot
+ * struct udp_hslot - UDP hash slot used by udp_table.hash/hash4
*
* @head: head of list of sockets
+ * @nulls_head: head of list of sockets, only used by hash4
* @count: number of sockets in 'head' list
* @lock: spinlock protecting changes to head/count
*/
struct udp_hslot {
- struct hlist_head head;
+ union {
+ struct hlist_head head;
+ /* hash4 uses hlist_nulls so that a lookup can detect when it has
+ * ended up on another hlist, because rehash() can run concurrently
+ * with lookup().
+ */
+ struct hlist_nulls_head nulls_head;
+ };
int count;
spinlock_t lock;
-} __attribute__((aligned(2 * sizeof(long))));
+} __aligned(2 * sizeof(long));
+
+/**
+ * struct udp_hslot_main - UDP hash slot used by udp_table.hash2
+ *
+ * @hslot: basic hash slot
+ * @hash4_cnt: number of sockets in hslot4 of the same
+ * (local port, local address); the field only exists when
+ * CONFIG_BASE_SMALL is disabled
+ *
+ * UDP_HSLOT_MAIN() turns a pointer to the embedded @hslot back into a
+ * pointer to the containing struct udp_hslot_main with a plain cast, which
+ * is why @hslot has to stay the first member. The cast is only meaningful
+ * for slots that were taken from udp_table.hash2.
+ */
+struct udp_hslot_main {
+ struct udp_hslot hslot; /* must be the first member */
+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+ u32 hash4_cnt;
+#endif
+} __aligned(2 * sizeof(long));
+#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot))
/**
* struct udp_table - UDP table
*
* @hash: hash table, sockets are hashed on (local port)
* @hash2: hash table, sockets are hashed on (local port, local address)
+ * @hash4: hash table, connected sockets are hashed on
+ * (local port, local address, remote port, remote address)
* @mask: number of slots in hash tables, minus 1
* @log: log2(number of slots in hash table)
*/
struct udp_table {
struct udp_hslot *hash;
- struct udp_hslot *hash2;
+ struct udp_hslot_main *hash2;
+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+ struct udp_hslot *hash4;
+#endif
unsigned int mask;
unsigned int log;
};
@@ -84,6 +111,7 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
{
return &table->hash[udp_hashfn(net, num, table->mask)];
}
+
/*
* For secondary hash, net_hash_mix() is performed before calling
* udp_hashslot2(), this explains difference with udp_hashslot()
@@ -91,8 +119,89 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
unsigned int hash)
{
- return &table->hash2[hash & table->mask];
+ return &table->hash2[hash & table->mask].hslot;
+}
+/* hash4 helpers. With CONFIG_BASE_SMALL there is no hash4 member in
+ * struct udp_table, so the variants in this branch are empty stubs that
+ * report "nothing is hashed"; the real implementations are in the #else
+ * branch.
+ */
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static inline void udp_table_hash4_init(struct udp_table *table)
+{
+}
+/* No hash4 slot can be returned in this configuration. BUILD_BUG() is
+ * presumably there to fail the build if a call to this stub is not
+ * compiled out -- TODO confirm against include/linux/build_bug.h.
+ */
+static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
+ unsigned int hash)
+{
+ BUILD_BUG();
+ return NULL;
+}
+ /* No socket is ever on a hash4 chain in this configuration. */
+static inline bool udp_hashed4(const struct sock *sk)
+{
+ return false;
+}
+/* Zero extra bytes per table entry: the table allocations add this value
+ * to their per-entry slot size.
+ */
+static inline unsigned int udp_hash4_slot_size(void)
+{
+ return 0;
+}
+/* Always false, so callers that test it before a hash4 lookup never take
+ * that path.
+ */
+static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
+{
+ return false;
+}
+ /* There is no hash4_cnt to maintain: inc/dec are no-ops. */
+static inline void udp_hash4_inc(struct udp_hslot *hslot2)
+{
+}
+
+static inline void udp_hash4_dec(struct udp_hslot *hslot2)
+{
}
+#else /* !CONFIG_BASE_SMALL */
+
+/* Must be called with table->hash2 initialized.
+ *
+ * Points table->hash4 at the memory that immediately follows the
+ * (mask + 1) entries of table->hash2, then, for every slot index, clears
+ * the hash2 slot's hash4_cnt and initialises the hash4 slot's nulls list
+ * head, count and lock.
+ *
+ * NOTE(review): this relies on the allocation behind table->hash2 having
+ * room for (mask + 1) further struct udp_hslot entries -- confirm at every
+ * allocation site.
+ */
+static inline void udp_table_hash4_init(struct udp_table *table)
+{
+ table->hash4 = (void *)(table->hash2 + (table->mask + 1));
+ for (int i = 0; i <= table->mask; i++) {
+ table->hash2[i].hash4_cnt = 0;
+ /* The slot index doubles as the list's nulls value; the hash4
+ * lookups compare it with their own slot to notice that a walk
+ * finished on a different chain.
+ */
+ INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i);
+ table->hash4[i].count = 0;
+ spin_lock_init(&table->hash4[i].lock);
+ }
+}
+/* Return the hash4 slot selected by (hash & table->mask). */
+static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
+ unsigned int hash)
+{
+ return &table->hash4[hash & table->mask];
+}
+/* True when the socket's udp_lrpa_node is not unhashed, i.e. the socket is
+ * currently linked on a hash4 chain.
+ */
+static inline bool udp_hashed4(const struct sock *sk)
+{
+ return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node);
+}
+/* Bytes needed per table entry for hash4: one struct udp_hslot. The table
+ * allocations add this value to their per-entry slot size.
+ */
+static inline unsigned int udp_hash4_slot_size(void)
+{
+ return sizeof(struct udp_hslot);
+}
+/* True when hash4_cnt of the udp_hslot_main that contains @hslot2 is
+ * non-zero. @hslot2 must point into udp_table.hash2, because
+ * UDP_HSLOT_MAIN() simply casts it.
+ *
+ * NOTE(review): the lookup callers visible in this patch read the counter
+ * without taking hslot2->lock, while udp_hash4_inc()/udp_hash4_dec() are
+ * called with that lock held -- confirm whether the plain load should be
+ * annotated with READ_ONCE() (and the updates with WRITE_ONCE()).
+ */
+static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
+{
+ return UDP_HSLOT_MAIN(hslot2)->hash4_cnt;
+}
+/* Increment hash4_cnt of the udp_hslot_main that contains @hslot2. The
+ * increment is a plain ++; the callers in net/ipv4/udp.c hold
+ * hslot2->lock around it.
+ */
+static inline void udp_hash4_inc(struct udp_hslot *hslot2)
+{
+ UDP_HSLOT_MAIN(hslot2)->hash4_cnt++;
+}
+/* Decrement hash4_cnt of the udp_hslot_main that contains @hslot2. The
+ * decrement is a plain --; the callers in net/ipv4/udp.c hold
+ * hslot2->lock around it.
+ */
+static inline void udp_hash4_dec(struct udp_hslot *hslot2)
+{
+ UDP_HSLOT_MAIN(hslot2)->hash4_cnt--;
+}
+#endif /* CONFIG_BASE_SMALL */
extern struct proto udp_prot;
@@ -193,13 +302,29 @@ static inline int udp_lib_hash(struct sock *sk)
}
void udp_lib_unhash(struct sock *sk);
-void udp_lib_rehash(struct sock *sk, u16 new_hash);
+void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4);
+u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
+ const __be32 faddr, const __be16 fport);
static inline void udp_lib_close(struct sock *sk, long timeout)
{
sk_common_release(sk);
}
+/* hash4 routines shared between UDPv4/6 */
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static inline void udp_lib_hash4(struct sock *sk, u16 hash)
+{
+}
+
+static inline void udp4_hash4(struct sock *sk)
+{
+}
+#else /* !CONFIG_BASE_SMALL */
+void udp_lib_hash4(struct sock *sk, u16 hash);
+void udp4_hash4(struct sock *sk);
+#endif /* CONFIG_BASE_SMALL */
+
int udp_lib_get_port(struct sock *sk, unsigned short snum,
unsigned int hash2_nulladdr);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0e24916b39d4..6a01905d379f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -410,7 +410,6 @@ static int compute_score(struct sock *sk, const struct net *net,
return score;
}
-INDIRECT_CALLABLE_SCOPE
u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
const __be32 faddr, const __be16 fport)
{
@@ -419,6 +418,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
return __inet_ehashfn(laddr, lport, faddr, fport,
udp_ehash_secret + net_hash_mix(net));
}
+EXPORT_SYMBOL(udp_ehashfn);
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(const struct net *net,
@@ -478,6 +478,159 @@ rescore:
return result;
}
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static struct sock *udp4_lib_lookup4(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ struct udp_table *udptable)
+{
+ return NULL; /* CONFIG_BASE_SMALL stub: there is no hash4 to search */
+}
+ /* CONFIG_BASE_SMALL stub: nothing is kept in hash4, so nothing moves. */
+static void udp_rehash4(struct udp_table *udptable, struct sock *sk,
+ u16 newhash4)
+{
+}
+ /* CONFIG_BASE_SMALL stub: nothing is kept in hash4, so nothing to unlink. */
+static void udp_unhash4(struct udp_table *udptable, struct sock *sk)
+{
+}
+#else /* !CONFIG_BASE_SMALL */
+static struct sock *udp4_lib_lookup4(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ struct udp_table *udptable)
+{
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ const struct hlist_nulls_node *node;
+ struct udp_hslot *hslot4;
+ unsigned int hash4, slot;
+ struct udp_sock *up;
+ struct sock *sk;
+ /* 4-tuple lookup for connected sockets: return the first socket on
+ * the selected hash4 chain that inet_match() accepts, or NULL.
+ *
+ * The hash is computed with the packet's destination (daddr, hnum)
+ * first and its source (saddr, sport) second, which mirrors the
+ * (local, remote) argument order udp4_hash4() uses when it inserts
+ * a socket.
+ */
+ hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash4 & udptable->mask;
+ hslot4 = &udptable->hash4[slot];
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+
+begin:
+ /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt:
+ * the matching socket is returned as-is, without taking a reference
+ * here.
+ */
+ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) {
+ sk = (struct sock *)up;
+ if (inet_match(net, sk, acookie, ports, dif, sdif))
+ return sk;
+ }
+
+ /* If the nulls value we got at the end of this lookup is not the
+ * expected one, we must restart the lookup. We probably met an item
+ * that was moved to another chain due to rehash.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ return NULL;
+}
+
+/* Move sk from the hash4 slot selected by its stored udp_lrpa_hash to the
+ * slot selected by @newhash4, and record @newhash4 in the socket.
+ *
+ * In hash4, rehash can happen in connect(), where hash4_cnt remains
+ * unchanged; this function never touches hash4_cnt.
+ *
+ * The stored hash is updated even when both hashes select the same slot;
+ * the list itself is only touched when the slot changes. The old and new
+ * hash4 slot locks are taken one after the other, not nested in each other.
+ */
+static void udp_rehash4(struct udp_table *udptable, struct sock *sk,
+ u16 newhash4)
+{
+ struct udp_hslot *hslot4, *nhslot4;
+
+ hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
+ nhslot4 = udp_hashslot4(udptable, newhash4);
+ udp_sk(sk)->udp_lrpa_hash = newhash4;
+
+ if (hslot4 != nhslot4) {
+ spin_lock_bh(&hslot4->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
+ hslot4->count--;
+ spin_unlock_bh(&hslot4->lock);
+ /* NOTE(review): between the unlock above and the lock below the
+ * socket is linked on neither hash4 chain, so a concurrent hash4
+ * lookup can miss it -- confirm the callers' fallback lookup
+ * covers that window.
+ */
+ spin_lock_bh(&nhslot4->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node,
+ &nhslot4->nulls_head);
+ nhslot4->count++;
+ spin_unlock_bh(&nhslot4->lock);
+ }
+}
+/* Remove sk from hash4 if it is hashed there: unlink it from its hash4
+ * chain, decrement that slot's count, then decrement hash4_cnt of the
+ * hash2 slot selected by the socket's udp_portaddr_hash. Does nothing for
+ * a socket that is not on a hash4 chain.
+ *
+ * Plain spin_lock() is used here; the visible caller, udp_lib_unhash(),
+ * invokes this before it drops hslot->lock with spin_unlock_bh().
+ */
+static void udp_unhash4(struct udp_table *udptable, struct sock *sk)
+{
+ struct udp_hslot *hslot2, *hslot4;
+
+ if (udp_hashed4(sk)) {
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
+
+ spin_lock(&hslot4->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
+ hslot4->count--;
+ spin_unlock(&hslot4->lock);
+
+ spin_lock(&hslot2->lock);
+ udp_hash4_dec(hslot2);
+ spin_unlock(&hslot2->lock);
+ }
+}
+/* Put sk into the 4-tuple hash under @hash, or move it there if it is
+ * already on a hash4 chain.
+ *
+ * A first insertion runs under the port slot lock (hslot->lock, taken with
+ * spin_lock_bh()). Inside that section the hash4 slot lock and then the
+ * hash2 slot lock are each taken and released in turn: the socket is linked
+ * at the head of the hash4 chain, the slot's count is incremented, and
+ * hash4_cnt of the socket's hash2 slot is incremented.
+ *
+ * The table is always net->ipv4.udp_table.
+ */
+void udp_lib_hash4(struct sock *sk, u16 hash)
+{
+ struct udp_hslot *hslot, *hslot2, *hslot4;
+ struct net *net = sock_net(sk);
+ struct udp_table *udptable;
+
+ /* A connected udp socket can re-connect to another remote address,
+ * so an already-hashed socket is rehashed instead of inserted again.
+ */
+ udptable = net->ipv4.udp_table;
+ if (udp_hashed4(sk)) {
+ udp_rehash4(udptable, sk, hash);
+ return;
+ }
+
+ hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ hslot4 = udp_hashslot4(udptable, hash);
+ udp_sk(sk)->udp_lrpa_hash = hash;
+
+ spin_lock_bh(&hslot->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
+ /* NOTE(review): the reason for detaching the socket from its reuseport
+ * group before hashing it is not visible in this patch -- confirm.
+ */
+ spin_lock(&hslot4->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node,
+ &hslot4->nulls_head);
+ hslot4->count++;
+ spin_unlock(&hslot4->lock);
+
+ spin_lock(&hslot2->lock);
+ udp_hash4_inc(hslot2);
+ spin_unlock(&hslot2->lock);
+
+ spin_unlock_bh(&hslot->lock);
+}
+EXPORT_SYMBOL(udp_lib_hash4);
+
+/* Hash an IPv4 UDP socket into hash4. Call with the sock lock held.
+ *
+ * Does nothing when the socket is unhashed or its sk_rcv_saddr is
+ * INADDR_ANY. Otherwise the hash is udp_ehashfn() over the socket's
+ * (sk_rcv_saddr, sk_num, sk_daddr, sk_dport).
+ */
+void udp4_hash4(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ unsigned int hash;
+
+ if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY))
+ return;
+
+ hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+ /* udp_lib_hash4() takes a u16, so only the low 16 bits are passed on. */
+ udp_lib_hash4(sk, hash);
+}
+EXPORT_SYMBOL(udp4_hash4);
+#endif /* CONFIG_BASE_SMALL */
+
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
* harder than this. -DaveM
*/
@@ -486,13 +639,19 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
unsigned short hnum = ntohs(dport);
- unsigned int hash2, slot2;
struct udp_hslot *hslot2;
struct sock *result, *sk;
+ unsigned int hash2;
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
- slot2 = hash2 & udptable->mask;
- hslot2 = &udptable->hash2[slot2];
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ if (udp_has_hash4(hslot2)) {
+ result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum,
+ dif, sdif, udptable);
+ if (result) /* udp4_lib_lookup4 return sk or NULL */
+ return result;
+ }
/* Lookup connected or non-wildcard socket */
result = udp4_lib_lookup2(net, saddr, sport,
@@ -519,8 +678,7 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
/* Lookup wildcard sockets */
hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
- slot2 = hash2 & udptable->mask;
- hslot2 = &udptable->hash2[slot2];
+ hslot2 = udp_hashslot2(udptable, hash2);
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif, sdif,
@@ -1935,6 +2093,18 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
EXPORT_SYMBOL(udp_pre_connect);
+static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ int res;
+ /* Run __ip4_datagram_connect() and, only if it returned 0, the hash4
+ * insertion, inside one lock_sock()/release_sock() section. The
+ * connect result is returned unchanged.
+ */
+ lock_sock(sk);
+ res = __ip4_datagram_connect(sk, uaddr, addr_len);
+ if (!res)
+ udp4_hash4(sk);
+ release_sock(sk);
+ return res;
+}
+
int __udp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -1994,6 +2164,8 @@ void udp_lib_unhash(struct sock *sk)
hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
+
+ udp_unhash4(udptable, sk);
}
spin_unlock_bh(&hslot->lock);
}
@@ -2003,7 +2175,7 @@ EXPORT_SYMBOL(udp_lib_unhash);
/*
* inet_rcv_saddr was changed, we must rehash secondary hash
*/
-void udp_lib_rehash(struct sock *sk, u16 newhash)
+void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4)
{
if (sk_hashed(sk)) {
struct udp_table *udptable = udp_get_table_prot(sk);
@@ -2035,6 +2207,19 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
spin_unlock(&nhslot2->lock);
}
+ if (udp_hashed4(sk)) {
+ udp_rehash4(udptable, sk, newhash4);
+
+ if (hslot2 != nhslot2) {
+ spin_lock(&hslot2->lock);
+ udp_hash4_dec(hslot2);
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ udp_hash4_inc(nhslot2);
+ spin_unlock(&nhslot2->lock);
+ }
+ }
spin_unlock_bh(&hslot->lock);
}
}
@@ -2046,7 +2231,11 @@ void udp_v4_rehash(struct sock *sk)
u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
inet_sk(sk)->inet_rcv_saddr,
inet_sk(sk)->inet_num);
- udp_lib_rehash(sk, new_hash);
+ u16 new_hash4 = udp_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+
+ udp_lib_rehash(sk, new_hash, new_hash4);
}
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -2268,7 +2457,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
udptable->mask;
hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
- hslot = &udptable->hash2[hash2];
+ hslot = &udptable->hash2[hash2].hslot;
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
@@ -2539,14 +2728,13 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
struct udp_table *udptable = net->ipv4.udp_table;
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2, slot2;
struct udp_hslot *hslot2;
+ unsigned int hash2;
__portpair ports;
struct sock *sk;
hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
- slot2 = hash2 & udptable->mask;
- hslot2 = &udptable->hash2[slot2];
+ hslot2 = udp_hashslot2(udptable, hash2);
ports = INET_COMBINED_PORTS(rmt_port, hnum);
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
@@ -2940,7 +3128,7 @@ struct proto udp_prot = {
.owner = THIS_MODULE,
.close = udp_lib_close,
.pre_connect = udp_pre_connect,
- .connect = ip4_datagram_connect,
+ .connect = udp_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.init = udp_init_sock,
@@ -3187,7 +3375,7 @@ again:
batch_sks = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
- struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];
+ struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
if (hlist_empty(&hslot2->head))
continue;
@@ -3428,10 +3616,12 @@ __setup("uhash_entries=", set_uhash_entries);
void __init udp_table_init(struct udp_table *table, const char *name)
{
- unsigned int i;
+ unsigned int i, slot_size;
+ slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) +
+ udp_hash4_slot_size();
table->hash = alloc_large_system_hash(name,
- 2 * sizeof(struct udp_hslot),
+ slot_size,
uhash_entries,
21, /* one slot per 2 MB */
0,
@@ -3440,17 +3630,18 @@ void __init udp_table_init(struct udp_table *table, const char *name)
UDP_HTABLE_SIZE_MIN,
UDP_HTABLE_SIZE_MAX);
- table->hash2 = table->hash + (table->mask + 1);
+ table->hash2 = (void *)(table->hash + (table->mask + 1));
for (i = 0; i <= table->mask; i++) {
INIT_HLIST_HEAD(&table->hash[i].head);
table->hash[i].count = 0;
spin_lock_init(&table->hash[i].lock);
}
for (i = 0; i <= table->mask; i++) {
- INIT_HLIST_HEAD(&table->hash2[i].head);
- table->hash2[i].count = 0;
- spin_lock_init(&table->hash2[i].lock);
+ INIT_HLIST_HEAD(&table->hash2[i].hslot.head);
+ table->hash2[i].hslot.count = 0;
+ spin_lock_init(&table->hash2[i].hslot.lock);
}
+ udp_table_hash4_init(table);
}
u32 udp_flow_hashrnd(void)
@@ -3476,18 +3667,21 @@ static void __net_init udp_sysctl_init(struct net *net)
static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
{
struct udp_table *udptable;
+ unsigned int slot_size;
int i;
udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
if (!udptable)
goto out;
- udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot),
+ slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) +
+ udp_hash4_slot_size();
+ udptable->hash = vmalloc_huge(hash_entries * slot_size,
GFP_KERNEL_ACCOUNT);
if (!udptable->hash)
goto free_table;
- udptable->hash2 = udptable->hash + hash_entries;
+ udptable->hash2 = (void *)(udptable->hash + hash_entries);
udptable->mask = hash_entries - 1;
udptable->log = ilog2(hash_entries);
@@ -3496,10 +3690,11 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent
udptable->hash[i].count = 0;
spin_lock_init(&udptable->hash[i].lock);
- INIT_HLIST_HEAD(&udptable->hash2[i].head);
- udptable->hash2[i].count = 0;
- spin_lock_init(&udptable->hash2[i].lock);
+ INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head);
+ udptable->hash2[i].hslot.count = 0;
+ spin_lock_init(&udptable->hash2[i].hslot.lock);
}
+ udp_table_hash4_init(udptable);
return udptable;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0cef8ae5d1ea..d766fd798ecf 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -110,8 +110,19 @@ void udp_v6_rehash(struct sock *sk)
u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
&sk->sk_v6_rcv_saddr,
inet_sk(sk)->inet_num);
+ u16 new_hash4;
- udp_lib_rehash(sk, new_hash);
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) {
+ new_hash4 = udp_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+ } else {
+ new_hash4 = udp6_ehashfn(sock_net(sk),
+ &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+ }
+
+ udp_lib_rehash(sk, new_hash, new_hash4);
}
static int compute_score(struct sock *sk, const struct net *net,
@@ -216,6 +227,74 @@ rescore:
return result;
}
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static struct sock *udp6_lib_lookup4(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum, int dif, int sdif,
+ struct udp_table *udptable)
+{
+ return NULL; /* CONFIG_BASE_SMALL stub: there is no hash4 to search */
+}
+ /* CONFIG_BASE_SMALL stub: sockets are never inserted into hash4. */
+static void udp6_hash4(struct sock *sk)
+{
+}
+#else /* !CONFIG_BASE_SMALL */
+static struct sock *udp6_lib_lookup4(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum, int dif, int sdif,
+ struct udp_table *udptable)
+{
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ const struct hlist_nulls_node *node;
+ struct udp_hslot *hslot4;
+ unsigned int hash4, slot;
+ struct udp_sock *up;
+ struct sock *sk;
+ /* 4-tuple lookup for connected sockets: return the first socket on
+ * the selected hash4 chain that inet6_match() accepts, or NULL.
+ *
+ * The hash is computed with the packet's destination (daddr, hnum)
+ * first and its source (saddr, sport) second, which mirrors the
+ * (local, remote) argument order udp6_hash4() uses when it inserts
+ * a socket.
+ */
+ hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash4 & udptable->mask;
+ hslot4 = &udptable->hash4[slot];
+
+begin:
+ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) {
+ sk = (struct sock *)up;
+ if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
+ return sk;
+ }
+
+ /* If the nulls value we got at the end of this lookup is not the
+ * expected one, we must restart the lookup. We probably met an item
+ * that was moved to another chain due to rehash.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ return NULL;
+}
+/* Hash an IPv6 UDP socket into hash4.
+ *
+ * A socket whose sk_v6_rcv_saddr is v4-mapped is handed to udp4_hash4()
+ * instead. Otherwise nothing is done when the socket is unhashed or its
+ * local address is the unspecified address; else the hash is
+ * udp6_ehashfn() over (sk_v6_rcv_saddr, sk_num, sk_v6_daddr, sk_dport).
+ *
+ * NOTE(review): the only caller visible here, udpv6_connect(), calls this
+ * between lock_sock() and release_sock() -- presumably a requirement, as
+ * for udp4_hash4(); confirm.
+ */
+static void udp6_hash4(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ unsigned int hash;
+
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) {
+ udp4_hash4(sk);
+ return;
+ }
+
+ if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ return;
+
+ hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+ /* udp_lib_hash4() takes a u16, so only the low 16 bits are passed on. */
+ udp_lib_hash4(sk, hash);
+}
+#endif /* CONFIG_BASE_SMALL */
+
/* rcu_read_lock() must be held */
struct sock *__udp6_lib_lookup(const struct net *net,
const struct in6_addr *saddr, __be16 sport,
@@ -224,13 +303,19 @@ struct sock *__udp6_lib_lookup(const struct net *net,
struct sk_buff *skb)
{
unsigned short hnum = ntohs(dport);
- unsigned int hash2, slot2;
struct udp_hslot *hslot2;
struct sock *result, *sk;
+ unsigned int hash2;
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
- slot2 = hash2 & udptable->mask;
- hslot2 = &udptable->hash2[slot2];
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ if (udp_has_hash4(hslot2)) {
+ result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum,
+ dif, sdif, udptable);
+ if (result) /* udp6_lib_lookup4 return sk or NULL */
+ return result;
+ }
/* Lookup connected or non-wildcard sockets */
result = udp6_lib_lookup2(net, saddr, sport,
@@ -257,8 +342,7 @@ struct sock *__udp6_lib_lookup(const struct net *net,
/* Lookup wildcard sockets */
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
- slot2 = hash2 & udptable->mask;
- hslot2 = &udptable->hash2[slot2];
+ hslot2 = udp_hashslot2(udptable, hash2);
result = udp6_lib_lookup2(net, saddr, sport,
&in6addr_any, hnum, dif, sdif,
@@ -859,7 +943,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
udptable->mask;
hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
- hslot = &udptable->hash2[hash2];
+ hslot = &udptable->hash2[hash2].hslot;
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
@@ -1065,14 +1149,13 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
{
struct udp_table *udptable = net->ipv4.udp_table;
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2, slot2;
struct udp_hslot *hslot2;
+ unsigned int hash2;
__portpair ports;
struct sock *sk;
hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
- slot2 = hash2 & udptable->mask;
- hslot2 = &udptable->hash2[slot2];
+ hslot2 = udp_hashslot2(udptable, hash2);
ports = INET_COMBINED_PORTS(rmt_port, hnum);
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
@@ -1169,6 +1252,18 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
}
+static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+ int res;
+ /* Run __ip6_datagram_connect() and, only if it returned 0, the hash4
+ * insertion, inside one lock_sock()/release_sock() section. The
+ * connect result is returned unchanged.
+ */
+ lock_sock(sk);
+ res = __ip6_datagram_connect(sk, uaddr, addr_len);
+ if (!res)
+ udp6_hash4(sk);
+ release_sock(sk);
+ return res;
+}
+
/**
* udp6_hwcsum_outgoing - handle outgoing HW checksumming
* @sk: socket we are sending on
@@ -1764,7 +1859,7 @@ struct proto udpv6_prot = {
.owner = THIS_MODULE,
.close = udp_lib_close,
.pre_connect = udpv6_pre_connect,
- .connect = ip6_datagram_connect,
+ .connect = udpv6_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.init = udpv6_init_sock,