From 2ae928a9441a3b5f13952e1e8a97d03cb23ea603 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Fri, 17 Nov 2017 15:28:59 -0800
Subject: epoll: account epitem and eppoll_entry to kmemcg

A userspace application can directly trigger the allocations from
eventpoll_epi and eventpoll_pwq slabs.  A buggy or malicious application
can consume a significant amount of system memory by triggering such
allocations.  Indeed we have seen in production where a buggy
application was leaking the epoll references and causing a burst of
eventpoll_epi and eventpoll_pwq slab allocations.  This patch opt-in the
charging of eventpoll_epi and eventpoll_pwq slabs.

There is a per-user limit (~4% of total memory if no highmem) on these
caches.  I think it is too generous particularly in the scenario where
jobs of multiple users are running on the system and the administrator
is reducing cost by overcomitting the memory.  This is unaccounted
kernel memory and will not be considered by the oom-killer.  I think by
accounting it to kmemcg, for systems with kmem accounting enabled, we
can provide better isolation between jobs of different users.

Link: http://lkml.kernel.org/r/20171003021519.23907-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/eventpoll.c')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..a45360444895 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2329,11 +2329,11 @@ static int __init eventpoll_init(void)
 
 	/* Allocates slab cache used to allocate "struct epitem" items */
 	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
-			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
 	/* Allocates slab cache used to allocate "struct eppoll_entry" */
 	pwq_cache = kmem_cache_create("eventpoll_pwq",
-			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
+		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
 	return 0;
 }
-- 
cgit v1.2.3-55-g7522


From 57a173bdf5baab48e8e78825c7366c634acd087c Mon Sep 17 00:00:00 2001
From: Jason Baron
Date: Fri, 17 Nov 2017 15:29:02 -0800
Subject: epoll: avoid calling ep_call_nested() from ep_poll_safewake()

ep_poll_safewake() is used to wakeup potentially nested epoll file
descriptors.  The function uses ep_call_nested() to prevent entering the
same wake up queue more than once, and to prevent excessively deep
wakeup paths (deeper than EP_MAX_NESTS).  However, this is not necessary
since we are already preventing these conditions during EPOLL_CTL_ADD.
This saves extra function calls, and avoids taking a global lock during
the ep_call_nested() calls.

I have, however, left ep_call_nested() for the CONFIG_DEBUG_LOCK_ALLOC
case, since ep_call_nested() keeps track of the nesting level, and this
is required by the call to spin_lock_irqsave_nested().  It would be nice
to remove the ep_call_nested() calls for the CONFIG_DEBUG_LOCK_ALLOC
case as well, however its not clear how to simply pass the nesting level
through multiple wake_up() levels without more surgery.  In any case, I
don't think CONFIG_DEBUG_LOCK_ALLOC is generally used for production.
This patch, also apparently fixes a workload at Google that Salman Qazi
reported by completely removing the poll_safewake_ncalls->lock from
wakeup paths.

Link: http://lkml.kernel.org/r/1507920533-8812-1-git-send-email-jbaron@akamai.com
Signed-off-by: Jason Baron <jbaron@akamai.com>
Acked-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Salman Qazi <sqazi@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 47 ++++++++++++++++++-----------------------------
 1 file changed, 18 insertions(+), 29 deletions(-)

(limited to 'fs/eventpoll.c')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a45360444895..dc15bb02ee2a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex);
 /* Used to check for epoll file descriptor inclusion loops */
 static struct nested_calls poll_loop_ncalls;
 
-/* Used for safe wake up implementation */
-static struct nested_calls poll_safewake_ncalls;
-
 /* Used to call file's f_op->poll() under the nested calls boundaries */
 static struct nested_calls poll_readywalk_ncalls;
 
@@ -551,40 +548,21 @@ out_unlock:
  * this special case of epoll.
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
-				     unsigned long events, int subclass)
+
+static struct nested_calls poll_safewake_ncalls;
+
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
 {
 	unsigned long flags;
+	wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
 
-	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
-	wake_up_locked_poll(wqueue, events);
+	spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
+	wake_up_locked_poll(wqueue, POLLIN);
 	spin_unlock_irqrestore(&wqueue->lock, flags);
-}
-#else
-static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
-				     unsigned long events, int subclass)
-{
-	wake_up_poll(wqueue, events);
-}
-#endif
 
-static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
-{
-	ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
-			  1 + call_nests);
 	return 0;
 }
 
-/*
- * Perform a safe wake up of the poll wait list. The problem is that
- * with the new callback'd wake up system, it is possible that the
- * poll callback is reentered from inside the call to wake_up() done
- * on the poll wait queue head. The rule is that we cannot reenter the
- * wake up code from the same task more than EP_MAX_NESTS times,
- * and we cannot reenter the same wait queue head at all. This will
- * enable to have a hierarchy of epoll file descriptor of no more than
- * EP_MAX_NESTS deep.
- */
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
 	int this_cpu = get_cpu();
@@ -595,6 +573,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
 	put_cpu();
 }
 
+#else
+
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+	wake_up_poll(wq, POLLIN);
+}
+
+#endif
+
 static void ep_remove_wait_queue(struct eppoll_entry *pwq)
 {
 	wait_queue_head_t *whead;
@@ -2315,8 +2302,10 @@ static int __init eventpoll_init(void)
 	 */
 	ep_nested_calls_init(&poll_loop_ncalls);
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
+#endif
 
 	/* Initialize the structure used to perform file's f_op->poll() calls */
 	ep_nested_calls_init(&poll_readywalk_ncalls);
-- 
cgit v1.2.3-55-g7522


From 37b5e5212a448bac0fe29d2a51f088014fbaaa41 Mon Sep 17 00:00:00 2001
From: Jason Baron
Date: Fri, 17 Nov 2017 15:29:06 -0800
Subject: epoll: remove ep_call_nested() from ep_eventpoll_poll()

The use of ep_call_nested() in ep_eventpoll_poll(), which is the .poll
routine for an epoll fd, is used to prevent excessively deep epoll
nesting, and to prevent circular paths.

However, we are already preventing these conditions during
EPOLL_CTL_ADD.  In terms of too deep epoll chains, we do in fact allow
deep nesting of the epoll fds themselves (deeper than EP_MAX_NESTS),
however we don't allow more than EP_MAX_NESTS when an epoll file
descriptor is actually connected to a wakeup source.  Thus, we do not
require the use of ep_call_nested(), since ep_eventpoll_poll(), which is
called via ep_scan_ready_list() only continues nesting if there are
events available.

Since ep_call_nested() is implemented using a global lock, applications
that make use of nested epoll can see large performance improvements
with this change.

Davidlohr said:

: Improvements are quite obscene actually, such as for the following
: epoll_wait() benchmark with 2 level nesting on a 80 core IvyBridge:
:
: ncpus  vanilla     dirty     delta
: 1      2447092     3028315   +23.75%
: 4      231265      2986954   +1191.57%
: 8      121631      2898796   +2283.27%
: 16     59749       2902056   +4757.07%
: 32     26837	     2326314   +8568.30%
: 64     12926       1341281   +10276.61%
:
: (http://linux-scalability.org/epoll/epoll-test.c)

Link: http://lkml.kernel.org/r/1509430214-5599-1-git-send-email-jbaron@akamai.com
Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Salman Qazi <sqazi@google.com>
Cc: Hou Tao <houtao1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 80 +++++++++++++++++++++++++---------------------------------
 1 file changed, 35 insertions(+), 45 deletions(-)

(limited to 'fs/eventpoll.c')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index dc15bb02ee2a..1e048144f17c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex);
 /* Used to check for epoll file descriptor inclusion loops */
 static struct nested_calls poll_loop_ncalls;
 
-/* Used to call file's f_op->poll() under the nested calls boundaries */
-static struct nested_calls poll_readywalk_ncalls;
-
 /* Slab cache used to allocate "struct epitem" */
 static struct kmem_cache *epi_cache __read_mostly;
 
@@ -867,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+			       void *priv);
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+				 poll_table *pt);
+
+/*
+ * Differs from ep_eventpoll_poll() in that internal callers already have
+ * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
+ * is correctly annotated.
+ */
+static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth)
 {
+	struct eventpoll *ep;
+	bool locked;
+
 	pt->_key = epi->event.events;
+	if (!is_file_epoll(epi->ffd.file))
+		return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
+		       epi->event.events;
 
-	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+	ep = epi->ffd.file->private_data;
+	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
+	locked = pt && (pt->_qproc == ep_ptable_queue_proc);
+
+	return ep_scan_ready_list(epi->ffd.file->private_data,
+				  ep_read_events_proc, &depth, depth,
+				  locked) & epi->event.events;
 }
 
 static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
@@ -879,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 {
 	struct epitem *epi, *tmp;
 	poll_table pt;
+	int depth = *(int *)priv;
 
 	init_poll_funcptr(&pt, NULL);
+	depth++;
 
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		if (ep_item_poll(epi, &pt))
+		if (ep_item_poll(epi, &pt, depth)) {
 			return POLLIN | POLLRDNORM;
-		else {
+		} else {
 			/*
 			 * Item has been dropped into the ready list by the poll
 			 * callback, but it's not actually ready, as far as
@@ -899,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 	return 0;
 }
 
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
-				 poll_table *pt);
-
-struct readyevents_arg {
-	struct eventpoll *ep;
-	bool locked;
-};
-
-static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
-{
-	struct readyevents_arg *arg = priv;
-
-	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
-				  call_nests + 1, arg->locked);
-}
-
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
-	int pollflags;
 	struct eventpoll *ep = file->private_data;
-	struct readyevents_arg arg;
-
-	/*
-	 * During ep_insert() we already hold the ep->mtx for the tfile.
-	 * Prevent re-aquisition.
-	 */
-	arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
-	arg.ep = ep;
+	int depth = 0;
 
 	/* Insert inside our poll wait queue */
 	poll_wait(file, &ep->poll_wait, wait);
 
 	/*
 	 * Proceed to find out if wanted events are really available inside
-	 * the ready list. This need to be done under ep_call_nested()
-	 * supervision, since the call to f_op->poll() done on listed files
-	 * could re-enter here.
+	 * the ready list.
 	 */
-	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, &arg, ep, current);
-
-	return pollflags != -1 ? pollflags : 0;
+	return ep_scan_ready_list(ep, ep_read_events_proc,
+				  &depth, depth, false);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -1459,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 * this operation completes, the poll callback can start hitting
 	 * the new item.
 	 */
-	revents = ep_item_poll(epi, &epq.pt);
+	revents = ep_item_poll(epi, &epq.pt, 1);
 
 	/*
 	 * We have to check if something went wrong during the poll wait queue
@@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-	revents = ep_item_poll(epi, &pt);
+	revents = ep_item_poll(epi, &pt, 1);
 
 	/*
 	 * If the item is "hot" and it is not registered inside the ready
@@ -1661,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 
 		list_del_init(&epi->rdllink);
 
-		revents = ep_item_poll(epi, &pt);
+		revents = ep_item_poll(epi, &pt, 1);
 
 		/*
 		 * If the event mask intersect the caller-requested one,
@@ -2307,9 +2300,6 @@ static int __init eventpoll_init(void)
 	ep_nested_calls_init(&poll_safewake_ncalls);
 #endif
 
-	/* Initialize the structure used to perform file's f_op->poll() calls */
-	ep_nested_calls_init(&poll_readywalk_ncalls);
-
 	/*
 	 * We can have many thousands of epitems, so prevent this from
 	 * using an extra cache line on 64-bit (and smaller) CPUs
-- 
cgit v1.2.3-55-g7522