From 4b272750dbe6f92a8d39a0ee1c7bd50d6cc1a2c8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 8 Dec 2016 11:41:54 -0800
Subject: udp: add busylocks in RX path

The idea of busylocks is to let producers grab an extra spinlock to
relieve pressure on the receive_queue spinlock shared by the consumer.
This behavior is requested only once the socket receive queue is above
half occupancy.

Under flood, this means that only one producer can be in line trying
to acquire the receive_queue spinlock.

These busylocks can be allocated in a per-cpu manner, instead of per
socket (which would consume a cache line per socket).

This patch considerably improves UDP behavior under stress, depending
on the number of NIC RX queues and/or RPS spread.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/udp.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f5628ada47b5..e6a68d66f3b2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1195,10 +1195,36 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(udp_skb_destructor);
 
+/* Idea of busylocks is to let producers grab an extra spinlock
+ * to relieve pressure on the receive_queue spinlock shared by consumer.
+ * Under flood, this means that only one producer can be in line
+ * trying to acquire the receive_queue spinlock.
+ * These busylock can be allocated on a per cpu manner, instead of a
+ * per socket one (that would consume a cache line per socket)
+ */
+static int udp_busylocks_log __read_mostly;
+static spinlock_t *udp_busylocks __read_mostly;
+
+static spinlock_t *busylock_acquire(void *ptr)
+{
+	spinlock_t *busy;
+
+	busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
+	spin_lock(busy);
+	return busy;
+}
+
+static void busylock_release(spinlock_t *busy)
+{
+	if (busy)
+		spin_unlock(busy);
+}
+
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 	int rmem, delta, amt, err = -ENOMEM;
+	spinlock_t *busy = NULL;
 	int size;
 
 	/* try to avoid the costly atomic add/sub pair when the receive
@@ -1214,8 +1240,11 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	 * - Less cache line misses at copyout() time
 	 * - Less work at consume_skb() (less alien page frag freeing)
 	 */
-	if (rmem > (sk->sk_rcvbuf >> 1))
+	if (rmem > (sk->sk_rcvbuf >> 1)) {
 		skb_condense(skb);
+
+		busy = busylock_acquire(sk);
+	}
 	size = skb->truesize;
 
 	/* we drop only if the receive buf is full and the receive
@@ -1252,6 +1281,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_data_ready(sk);
 
+	busylock_release(busy);
 	return 0;
 
 uncharge_drop:
@@ -1259,6 +1289,7 @@ uncharge_drop:
 
 drop:
 	atomic_inc(&sk->sk_drops);
+	busylock_release(busy);
 	return err;
 }
 EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
@@ -2613,6 +2644,7 @@ EXPORT_SYMBOL(udp_flow_hashrnd);
 void __init udp_init(void)
 {
 	unsigned long limit;
+	unsigned int i;
 
 	udp_table_init(&udp_table, "UDP");
 	limit = nr_free_buffer_pages() / 8;
@@ -2623,4 +2655,13 @@ void __init udp_init(void)
 
 	sysctl_udp_rmem_min = SK_MEM_QUANTUM;
 	sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+
+	/* 16 spinlocks per cpu */
+	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
+	udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
+				GFP_KERNEL);
+	if (!udp_busylocks)
+		panic("UDP: failed to alloc udp_busylocks\n");
+	for (i = 0; i < (1U << udp_busylocks_log); i++)
+		spin_lock_init(udp_busylocks + i);
 }
--
cgit v1.2.3-55-g7522
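The hashed-lock pool used above can be sketched outside the kernel. The
following is a minimal user-space analogue, assuming pthread spinlocks;
the pool size, the hash function and all names other than the
busylock_acquire()/busylock_release() shape are illustrative
assumptions, not kernel code.

#include <pthread.h>
#include <stdint.h>

#define BUSYLOCKS_LOG	6			/* 64 locks, e.g. 16 per cpu on a 4-cpu box */
#define NR_BUSYLOCKS	(1U << BUSYLOCKS_LOG)

static pthread_spinlock_t busylocks[NR_BUSYLOCKS];

static void busylocks_init(void)
{
	for (unsigned int i = 0; i < NR_BUSYLOCKS; i++)
		pthread_spin_init(&busylocks[i], PTHREAD_PROCESS_PRIVATE);
}

/* cheap multiplicative pointer hash; the kernel uses hash_ptr() instead */
static unsigned int busylock_hash(const void *ptr)
{
	uint32_t v = (uint32_t)(uintptr_t)ptr;

	return (v * 0x9E3779B1u) >> (32 - BUSYLOCKS_LOG);
}

static pthread_spinlock_t *busylock_acquire(const void *sk)
{
	pthread_spinlock_t *busy = &busylocks[busylock_hash(sk)];

	pthread_spin_lock(busy);	/* serialize producers for this socket */
	return busy;
}

static void busylock_release(pthread_spinlock_t *busy)
{
	if (busy)
		pthread_spin_unlock(busy);
}

A producer would call busylock_acquire() only once the queue is above
half occupancy, so that under flood at most one producer per hash bucket
spins on the real queue lock while the others wait on the small,
per-cpu-sized lock pool.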
From c84d949057cab262b4d3110ead9a42a58c2958f7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 8 Dec 2016 11:41:55 -0800
Subject: udp: copy skb->truesize in the first cache line

In the UDP RX handler, we currently clear skb->dev before the skb is
added to the receive queue, because the device pointer is no longer
available once we exit the RCU section.

Since this first cache line is always hot, let's reuse this space to
store skb->truesize, and thus avoid a cache line miss at
udp_recvmsg()/udp_skb_destructor() time while the receive queue
spinlock is held.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/skbuff.h |  9 ++++++++-
 net/ipv4/udp.c         | 13 ++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0cd92b0f2af5..332e76756f54 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -645,8 +645,15 @@ struct sk_buff {
 		struct rb_node	rbnode; /* used in netem & tcp stack */
 	};
 	struct sock		*sk;
-	struct net_device	*dev;
 
+	union {
+		struct net_device	*dev;
+		/* Some protocols might use this space to store information,
+		 * while device pointer would be NULL.
+		 * UDP receive path is one user.
+		 */
+		unsigned long		dev_scratch;
+	};
 	/*
 	 * This is the control buffer. It is free to use for every
 	 * layer. Please put your private variables there. If you
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e6a68d66f3b2..c608334d99aa 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1188,10 +1188,14 @@ static void udp_rmem_release(struct sock *sk, int size, int partial)
 	__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
 }
 
-/* Note: called with sk_receive_queue.lock held */
+/* Note: called with sk_receive_queue.lock held.
+ * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
+ * This avoids a cache line miss while receive_queue lock is held.
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
+ */
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
 {
-	udp_rmem_release(sk, skb->truesize, 1);
+	udp_rmem_release(sk, skb->dev_scratch, 1);
 }
 EXPORT_SYMBOL(udp_skb_destructor);
 
@@ -1246,6 +1250,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 		busy = busylock_acquire(sk);
 	}
 	size = skb->truesize;
+	/* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
+	 * in udp_skb_destructor()
+	 */
+	skb->dev_scratch = size;
 
 	/* we drop only if the receive buf is full and the receive
 	 * queue contains some other skb
@@ -1272,7 +1280,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	/* no need to setup a destructor, we will explicitly release the
 	 * forward allocated memory on dequeue
 	 */
-	skb->dev = NULL;
 	sock_skb_set_dropcount(sk, skb);
 
 	__skb_queue_tail(list, skb);
--
cgit v1.2.3-55-g7522
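The space-reuse trick in this patch (a union over the dead dev pointer
slot) can be shown in isolation. The stand-alone C sketch below only
mimics the idea; struct buf and its fields are invented for illustration
and are not the real sk_buff layout.

#include <stdio.h>

struct buf {
	union {
		void		*dev;		/* valid while a device still owns the buffer */
		unsigned long	scratch;	/* reused once dev would have been cleared */
	};
	/* many other hot fields would live here ... */
	unsigned int	truesize;		/* pretend this sits in a colder cache line */
};

static void enqueue(struct buf *b)
{
	/* instead of "b->dev = NULL;", cache the value needed at dequeue time */
	b->scratch = b->truesize;
}

static unsigned long dequeue_charge(const struct buf *b)
{
	/* read the cached copy; no need to touch the cold truesize field */
	return b->scratch;
}

int main(void)
{
	struct buf b = { .truesize = 1500 };

	enqueue(&b);
	printf("released %lu bytes\n", dequeue_charge(&b));
	return 0;
}

The point, as in the patch, is that the pointer slot is already dead on
this path, so storing the copy is free and the later read hits a cache
line that is hot anyway.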
From 6b229cf77d683f634f0edd876c6d1015402303ad Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 8 Dec 2016 11:41:56 -0800
Subject: udp: add batching to udp_rmem_release()

If udp_recvmsg() releases sk_rmem_alloc for every packet it reads, it
gives producers the opportunity to immediately grab the spinlocks and
desperately try to add another packet, causing false sharing.

We can add a simple heuristic to release memory in batches of roughly
25% of the queue capacity.

This patch considerably increases performance under flood, by about
50%, since the thread draining the queue is no longer slowed down by
false sharing.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/udp.h |  3 +++
 net/ipv4/udp.c      | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index d1fd8cd39478..c0f530809d1f 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -79,6 +79,9 @@ struct udp_sock {
 	int			(*gro_complete)(struct sock *sk,
 						struct sk_buff *skb,
 						int nhoff);
+
+	/* This field is dirtied by udp_recvmsg() */
+	int		forward_deficit;
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c608334d99aa..5a38faa12cde 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1177,8 +1177,20 @@ out:
 /* fully reclaim rmem/fwd memory allocated for skb */
 static void udp_rmem_release(struct sock *sk, int size, int partial)
 {
+	struct udp_sock *up = udp_sk(sk);
 	int amt;
 
+	if (likely(partial)) {
+		up->forward_deficit += size;
+		size = up->forward_deficit;
+		if (size < (sk->sk_rcvbuf >> 2) &&
+		    !skb_queue_empty(&sk->sk_receive_queue))
+			return;
+	} else {
+		size += up->forward_deficit;
+	}
+	up->forward_deficit = 0;
+
 	atomic_sub(size, &sk->sk_rmem_alloc);
 	sk->sk_forward_alloc += size;
 	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
--
cgit v1.2.3-55-g7522

From 02ab0d139cff1efc5aa1fb4378c727668334fe97 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 8 Dec 2016 11:41:57 -0800
Subject: udp: udp_rmem_release() should touch sk_rmem_alloc later

In flood situations, keeping sk_rmem_alloc at a high value prevents
producers from touching the socket.

It makes sense to lower sk_rmem_alloc only at the end of
udp_rmem_release(), after the thread draining the receive queue in
udp_recvmsg() has finished its writes to sk_forward_alloc.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/udp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5a38faa12cde..9ca279b130d5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1191,13 +1191,14 @@ static void udp_rmem_release(struct sock *sk, int size, int partial)
 	}
 	up->forward_deficit = 0;
 
-	atomic_sub(size, &sk->sk_rmem_alloc);
 	sk->sk_forward_alloc += size;
 	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
 	sk->sk_forward_alloc -= amt;
 
 	if (amt)
 		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
+
+	atomic_sub(size, &sk->sk_rmem_alloc);
 }
 
 /* Note: called with sk_receive_queue.lock held.
--
cgit v1.2.3-55-g7522
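Taken together, the last two patches amount to: batch the reclaim, then
publish it last. The user-space model below is a sketch under those
assumptions only; the struct, field names and the 25% threshold mirror
the patches but are not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>

struct sock_model {
	atomic_int	rmem_alloc;		/* shared with producers */
	int		rcvbuf;			/* receive buffer limit */
	int		forward_alloc;		/* consumer-private accounting */
	int		forward_deficit;	/* freed bytes not yet published */
};

static void rmem_release(struct sock_model *sk, int size, bool partial,
			 bool queue_empty)
{
	if (partial) {
		/* keep batching until ~25% of rcvbuf is pending or the queue drains */
		sk->forward_deficit += size;
		size = sk->forward_deficit;
		if (size < (sk->rcvbuf >> 2) && !queue_empty)
			return;
	} else {
		size += sk->forward_deficit;
	}
	sk->forward_deficit = 0;

	sk->forward_alloc += size;

	/* publish to producers last: they only see rmem_alloc drop once all
	 * consumer-side bookkeeping above is finished
	 */
	atomic_fetch_sub(&sk->rmem_alloc, size);
}

Under flood, producers keep seeing a nearly full rmem_alloc until the
consumer has amortized one cache-line ping-pong over roughly a quarter
of the queue, which is consistent with the ~50% improvement reported
above.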