From e566b76ed30768140df8f0023904aed5a41244f7 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Wed, 6 Apr 2011 02:54:54 +0200
Subject: perf_event: Fix cgrp event scheduling bug in perf_enable_on_exec()

There is a bug in perf_event_enable_on_exec() when cgroup events are
active on a CPU: the cgroup events may be scheduled twice causing event
state corruptions which eventually may lead to kernel panics.

The reason is that the function needs to first schedule out the cgroup
events, just like for the per-thread events. The cgroup event are
scheduled back in automatically from the perf_event_context_sched_in()
function.

The patch also adds a WARN_ON_ONCE() is perf_cgroup_switch() to catch any
bogus state.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110406005454.GA1062@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 27960f114efd..8e81a9860a0d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -364,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
 			}
 
 			if (mode & PERF_CGROUP_SWIN) {
+				WARN_ON_ONCE(cpuctx->cgrp);
 				/* set cgrp before ctxsw in to
 				 * allow event_filter_match() to not
 				 * have to pass task around
@@ -2423,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 	if (!ctx || !ctx->nr_events)
 		goto out;
 
+	/*
+	 * We must ctxsw out cgroup events to avoid conflict
+	 * when invoking perf_task_event_sched_in() later on
+	 * in this function. Otherwise we end up trying to
+	 * ctxswin cgroup events which are already scheduled
+	 * in.
+	 */
+	perf_cgroup_sched_out(current);
 	task_ctx_sched_out(ctx, EVENT_ALL);
 
 	raw_spin_lock(&ctx->lock);
@@ -2447,6 +2456,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	raw_spin_unlock(&ctx->lock);
 
+	/*
+	 * Also calls ctxswin for cgroup events, if any:
+	 */
 	perf_event_context_sched_in(ctx, ctx->task);
 out:
 	local_irq_restore(flags);
-- 
cgit v1.2.3-55-g7522


From d9c97833179036408e53ef5f3f5c7eaf781769bc Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 12 Apr 2011 10:06:33 +0200
Subject: block: remove block_unplug_timer() trace point

We no longer have an unplug timer running, so no point in keeping
the trace point.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/trace/events/block.h | 14 --------------
 kernel/trace/blktrace.c      | 17 -----------------
 2 files changed, 31 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 78f18adb49c8..43a985390bb6 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -418,20 +418,6 @@ DECLARE_EVENT_CLASS(block_unplug,
 	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
 );
 
-/**
- * block_unplug_timer - timed release of operations requests in queue to device driver
- * @q: request queue to unplug
- *
- * Unplug the request queue @q because a timer expired and allow block
- * operation requests to be sent to the device driver.
- */
-DEFINE_EVENT(block_unplug, block_unplug_timer,
-
-	TP_PROTO(struct request_queue *q),
-
-	TP_ARGS(q)
-);
-
 /**
  * block_unplug_io - release of operations requests in request queue
  * @q: request queue to unplug
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7aa40f8e182d..824708cbfb7b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -863,19 +863,6 @@ static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
 	}
 }
 
-static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (bt) {
-		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
-		__be64 rpdu = cpu_to_be64(pdu);
-
-		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
-				sizeof(rpdu), &rpdu);
-	}
-}
-
 static void blk_add_trace_split(void *ignore,
 				struct request_queue *q, struct bio *bio,
 				unsigned int pdu)
@@ -1015,8 +1002,6 @@ static void blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_plug(blk_add_trace_plug, NULL);
 	WARN_ON(ret);
-	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
-	WARN_ON(ret);
 	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
 	WARN_ON(ret);
 	ret = register_trace_block_split(blk_add_trace_split, NULL);
@@ -1033,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
 	unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
 	unregister_trace_block_split(blk_add_trace_split, NULL);
 	unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
-	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
 	unregister_trace_block_plug(blk_add_trace_plug, NULL);
 	unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
 	unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
@@ -1348,7 +1332,6 @@ static const struct {
 	[__BLK_TA_COMPLETE]	= {{  "C", "complete" },   blk_log_with_error },
 	[__BLK_TA_PLUG]		= {{  "P", "plug" },	   blk_log_plug },
 	[__BLK_TA_UNPLUG_IO]	= {{  "U", "unplug_io" },  blk_log_unplug },
-	[__BLK_TA_UNPLUG_TIMER]	= {{ "UT", "unplug_timer" }, blk_log_unplug },
 	[__BLK_TA_INSERT]	= {{  "I", "insert" },	   blk_log_generic },
 	[__BLK_TA_SPLIT]	= {{  "X", "split" },	   blk_log_split },
 	[__BLK_TA_BOUNCE]	= {{  "B", "bounce" },	   blk_log_generic },
-- 
cgit v1.2.3-55-g7522


From 94b5eb28b41cc79d9713696e0005ae167b5afd1b Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 12 Apr 2011 10:12:19 +0200
Subject: block: fixup block IO unplug trace call

It was removed with the on-stack plugging, readd it and track the
depth of requests added when flushing the plug.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c             | 15 +++++++++++++--
 include/trace/events/block.h | 11 ++++++-----
 kernel/trace/blktrace.c      |  6 +++---
 3 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-core.c b/block/blk-core.c
index eeaca0998df5..d20ce1e849c8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2668,12 +2668,19 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
 	return !(rqa->q <= rqb->q);
 }
 
+static void queue_unplugged(struct request_queue *q, unsigned int depth)
+{
+	trace_block_unplug_io(q, depth);
+	__blk_run_queue(q, false);
+}
+
 static void flush_plug_list(struct blk_plug *plug)
 {
 	struct request_queue *q;
 	unsigned long flags;
 	struct request *rq;
 	LIST_HEAD(list);
+	unsigned int depth;
 
 	BUG_ON(plug->magic != PLUG_MAGIC);
 
@@ -2688,6 +2695,7 @@ static void flush_plug_list(struct blk_plug *plug)
 	}
 
 	q = NULL;
+	depth = 0;
 	local_irq_save(flags);
 	while (!list_empty(&list)) {
 		rq = list_entry_rq(list.next);
@@ -2696,10 +2704,11 @@ static void flush_plug_list(struct blk_plug *plug)
 		BUG_ON(!rq->q);
 		if (rq->q != q) {
 			if (q) {
-				__blk_run_queue(q, false);
+				queue_unplugged(q, depth);
 				spin_unlock(q->queue_lock);
 			}
 			q = rq->q;
+			depth = 0;
 			spin_lock(q->queue_lock);
 		}
 		rq->cmd_flags &= ~REQ_ON_PLUG;
@@ -2711,10 +2720,12 @@ static void flush_plug_list(struct blk_plug *plug)
 			__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
 		else
 			__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
+
+		depth++;
 	}
 
 	if (q) {
-		__blk_run_queue(q, false);
+		queue_unplugged(q, depth);
 		spin_unlock(q->queue_lock);
 	}
 
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 43a985390bb6..006e60b58306 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -401,9 +401,9 @@ TRACE_EVENT(block_plug,
 
 DECLARE_EVENT_CLASS(block_unplug,
 
-	TP_PROTO(struct request_queue *q),
+	TP_PROTO(struct request_queue *q, unsigned int depth),
 
-	TP_ARGS(q),
+	TP_ARGS(q, depth),
 
 	TP_STRUCT__entry(
 		__field( int,		nr_rq			)
@@ -411,7 +411,7 @@ DECLARE_EVENT_CLASS(block_unplug,
 	),
 
 	TP_fast_assign(
-		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		__entry->nr_rq = depth;
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -421,15 +421,16 @@ DECLARE_EVENT_CLASS(block_unplug,
 /**
  * block_unplug_io - release of operations requests in request queue
  * @q: request queue to unplug
+ * @depth: number of requests just added to the queue
  *
  * Unplug request queue @q because device driver is scheduled to work
  * on elements in the request queue.
  */
 DEFINE_EVENT(block_unplug, block_unplug_io,
 
-	TP_PROTO(struct request_queue *q),
+	TP_PROTO(struct request_queue *q, unsigned int depth),
 
-	TP_ARGS(q)
+	TP_ARGS(q, depth)
 );
 
 /**
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 824708cbfb7b..3e3970d53d14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -850,13 +850,13 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
 }
 
-static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
+static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q,
+				    unsigned int depth)
 {
 	struct blk_trace *bt = q->blk_trace;
 
 	if (bt) {
-		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
-		__be64 rpdu = cpu_to_be64(pdu);
+		__be64 rpdu = cpu_to_be64(depth);
 
 		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
 				sizeof(rpdu), &rpdu);
-- 
cgit v1.2.3-55-g7522


From 0cd9c6494ee5c19aef085152bc37f3a4e774a9e1 Mon Sep 17 00:00:00 2001
From: Darren Hart
Date: Thu, 14 Apr 2011 15:41:57 -0700
Subject: futex: Set FLAGS_HAS_TIMEOUT during futex_wait restart setup

The FLAGS_HAS_TIMEOUT flag was not getting set, causing the restart_block to
restart futex_wait() without a timeout after a signal.

Commit b41277dc7a18ee332d in 2.6.38 introduced the regression by accidentally
removing the the FLAGS_HAS_TIMEOUT assignment from futex_wait() during the setup
of the restart block. Restore the originaly behavior.

Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=32922

Reported-by: Tim Smith <tsmith201104@yahoo.com>
Reported-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
Signed-off-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Kacur <jkacur@redhat.com>
Cc: stable@kernel.org
Link: http://lkml.kernel.org/r/%3Cdaac0eb3af607f72b9a4d3126b2ba8fb5ed3b883.1302820917.git.dvhart%40linux.intel.com%3E
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index dfb924ffe65b..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1886,7 +1886,7 @@ retry:
 	restart->futex.val = val;
 	restart->futex.time = abs_time->tv64;
 	restart->futex.bitset = bitset;
-	restart->futex.flags = flags;
+	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
 	ret = -ERESTART_RESTARTBLOCK;
 
-- 
cgit v1.2.3-55-g7522


From a237c1c5bc5dc5c76a21be922dca4826f3eca8ca Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 16 Apr 2011 13:27:55 +0200
Subject: block: let io_schedule() flush the plug inline

Linus correctly observes that the most important dispatch cases
are now done from kblockd, this isn't ideal for latency reasons.
The original reason for switching dispatches out-of-line was to
avoid too deep a stack, so by _only_ letting the "accidental"
flush directly in schedule() be guarded by offload to kblockd,
we should be able to get the best of both worlds.

So add a blk_schedule_flush_plug() that offloads to kblockd,
and only use that from the schedule() path.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 13 +++++++++++++
 kernel/sched.c         |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1c76506fcf11..ec0357d8c4a5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -871,6 +871,14 @@ static inline void blk_flush_plug(struct task_struct *tsk)
 {
 	struct blk_plug *plug = tsk->plug;
 
+	if (plug)
+		blk_flush_plug_list(plug, false);
+}
+
+static inline void blk_schedule_flush_plug(struct task_struct *tsk)
+{
+	struct blk_plug *plug = tsk->plug;
+
 	if (plug)
 		blk_flush_plug_list(plug, true);
 }
@@ -1317,6 +1325,11 @@ static inline void blk_flush_plug(struct task_struct *task)
 {
 }
 
+static inline void blk_schedule_flush_plug(struct task_struct *task)
+{
+}
+
+
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
 	return false;
diff --git a/kernel/sched.c b/kernel/sched.c
index a187c3fe027b..312f8b95c2d4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4118,7 +4118,7 @@ need_resched:
 			 */
 			if (blk_needs_flush_plug(prev)) {
 				raw_spin_unlock(&rq->lock);
-				blk_flush_plug(prev);
+				blk_schedule_flush_plug(prev);
 				raw_spin_lock(&rq->lock);
 			}
 		}
-- 
cgit v1.2.3-55-g7522


From 49cac01e1fa74174d72adb0e872504a7fefd7c01 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 16 Apr 2011 13:51:05 +0200
Subject: block: make unplug timer trace event correspond to the schedule()
 unplug

It's a pretty close match to what we had before - the timer triggering
would mean that nobody unplugged the plug in due time, in the new
scheme this matches very closely what the schedule() unplug now is.
It's essentially the difference between an explicit unplug (IO unplug)
or an implicit unplug (timer unplug, we scheduled with pending IO
queued).

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c             | 18 ++++++++++++------
 include/trace/events/block.h | 13 +++++++------
 kernel/trace/blktrace.c      | 18 ++++++++++++------
 3 files changed, 31 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-core.c b/block/blk-core.c
index 3c8121072507..78b7b0cb7216 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2662,17 +2662,23 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
 	return !(rqa->q <= rqb->q);
 }
 
+/*
+ * If 'from_schedule' is true, then postpone the dispatch of requests
+ * until a safe kblockd context. We due this to avoid accidental big
+ * additional stack usage in driver dispatch, in places where the originally
+ * plugger did not intend it.
+ */
 static void queue_unplugged(struct request_queue *q, unsigned int depth,
-			    bool force_kblockd)
+			    bool from_schedule)
 {
-	trace_block_unplug_io(q, depth);
-	__blk_run_queue(q, force_kblockd);
+	trace_block_unplug(q, depth, !from_schedule);
+	__blk_run_queue(q, from_schedule);
 
 	if (q->unplugged_fn)
 		q->unplugged_fn(q);
 }
 
-void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd)
+void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
 	struct request_queue *q;
 	unsigned long flags;
@@ -2707,7 +2713,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd)
 		BUG_ON(!rq->q);
 		if (rq->q != q) {
 			if (q) {
-				queue_unplugged(q, depth, force_kblockd);
+				queue_unplugged(q, depth, from_schedule);
 				spin_unlock(q->queue_lock);
 			}
 			q = rq->q;
@@ -2728,7 +2734,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd)
 	}
 
 	if (q) {
-		queue_unplugged(q, depth, force_kblockd);
+		queue_unplugged(q, depth, from_schedule);
 		spin_unlock(q->queue_lock);
 	}
 
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 006e60b58306..bf366547da25 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -401,9 +401,9 @@ TRACE_EVENT(block_plug,
 
 DECLARE_EVENT_CLASS(block_unplug,
 
-	TP_PROTO(struct request_queue *q, unsigned int depth),
+	TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),
 
-	TP_ARGS(q, depth),
+	TP_ARGS(q, depth, explicit),
 
 	TP_STRUCT__entry(
 		__field( int,		nr_rq			)
@@ -419,18 +419,19 @@ DECLARE_EVENT_CLASS(block_unplug,
 );
 
 /**
- * block_unplug_io - release of operations requests in request queue
+ * block_unplug - release of operations requests in request queue
  * @q: request queue to unplug
  * @depth: number of requests just added to the queue
+ * @explicit: whether this was an explicit unplug, or one from schedule()
  *
  * Unplug request queue @q because device driver is scheduled to work
  * on elements in the request queue.
  */
-DEFINE_EVENT(block_unplug, block_unplug_io,
+DEFINE_EVENT(block_unplug, block_unplug,
 
-	TP_PROTO(struct request_queue *q, unsigned int depth),
+	TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),
 
-	TP_ARGS(q, depth)
+	TP_ARGS(q, depth, explicit)
 );
 
 /**
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3e3970d53d14..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -850,16 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
 }
 
-static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q,
-				    unsigned int depth)
+static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
+				    unsigned int depth, bool explicit)
 {
 	struct blk_trace *bt = q->blk_trace;
 
 	if (bt) {
 		__be64 rpdu = cpu_to_be64(depth);
+		u32 what;
 
-		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
-				sizeof(rpdu), &rpdu);
+		if (explicit)
+			what = BLK_TA_UNPLUG_IO;
+		else
+			what = BLK_TA_UNPLUG_TIMER;
+
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
 	}
 }
 
@@ -1002,7 +1007,7 @@ static void blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_plug(blk_add_trace_plug, NULL);
 	WARN_ON(ret);
-	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+	ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
 	WARN_ON(ret);
 	ret = register_trace_block_split(blk_add_trace_split, NULL);
 	WARN_ON(ret);
@@ -1017,7 +1022,7 @@ static void blk_unregister_tracepoints(void)
 	unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
 	unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
 	unregister_trace_block_split(blk_add_trace_split, NULL);
-	unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+	unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
 	unregister_trace_block_plug(blk_add_trace_plug, NULL);
 	unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
 	unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
@@ -1332,6 +1337,7 @@ static const struct {
 	[__BLK_TA_COMPLETE]	= {{  "C", "complete" },   blk_log_with_error },
 	[__BLK_TA_PLUG]		= {{  "P", "plug" },	   blk_log_plug },
 	[__BLK_TA_UNPLUG_IO]	= {{  "U", "unplug_io" },  blk_log_unplug },
+	[__BLK_TA_UNPLUG_TIMER]	= {{ "UT", "unplug_timer" }, blk_log_unplug },
 	[__BLK_TA_INSERT]	= {{  "I", "insert" },	   blk_log_generic },
 	[__BLK_TA_SPLIT]	= {{  "X", "split" },	   blk_log_split },
 	[__BLK_TA_BOUNCE]	= {{  "B", "bounce" },	   blk_log_generic },
-- 
cgit v1.2.3-55-g7522


From c78193e9c7bcbf25b8237ad0dec82f805c4ea69b Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 18 Apr 2011 10:35:30 -0700
Subject: next_pidmap: fix overflow condition

next_pidmap() just quietly accepted whatever 'last' pid that was passed
in, which is not all that safe when one of the users is /proc.

Admittedly the proc code should do some sanity checking on the range
(and that will be the next commit), but that doesn't mean that the
helper functions should just do that pidmap pointer arithmetic without
checking the range of its arguments.

So clamp 'last' to PID_MAX_LIMIT.  The fact that we then do "last+1"
doesn't really matter, the for-loop does check against the end of the
pidmap array properly (it's only the actual pointer arithmetic overflow
case we need to worry about, and going one bit beyond isn't going to
overflow).

[ Use PID_MAX_LIMIT rather than pid_max as per Eric Biederman ]

Reported-by: Tavis Ormandy <taviso@cmpxchg8b.com>
Analyzed-by: Robert Święcki <robert@swiecki.net>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h | 2 +-
 kernel/pid.c        | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 31afb7ecbe1f..cdced84261d7 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -117,7 +117,7 @@ extern struct pid *find_vpid(int nr);
  */
 extern struct pid *find_get_pid(int nr);
 extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
-int next_pidmap(struct pid_namespace *pid_ns, int last);
+int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
 
 extern struct pid *alloc_pid(struct pid_namespace *ns);
 extern void free_pid(struct pid *pid);
diff --git a/kernel/pid.c b/kernel/pid.c
index 02f221274265..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	return -1;
 }
 
-int next_pidmap(struct pid_namespace *pid_ns, int last)
+int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
 {
 	int offset;
 	struct pidmap *map, *end;
 
+	if (last >= PID_MAX_LIMIT)
+		return -1;
+
 	offset = (last + 1) & BITS_PER_PAGE_MASK;
 	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
 	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
-- 
cgit v1.2.3-55-g7522