From 41acab8851a0408c1d5ad6c21a07456f88b54d40 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 10 Mar 2010 23:37:45 -0300 Subject: sched: Implement group scheduler statistics in one struct Put all statistics fields of sched_entity in one struct, sched_statistics, and embed it into sched_entity. This change makes it possible to memset the sched_statistics to 0 when needed (for instance when forking), avoiding bugs caused by uninitialized fields. Signed-off-by: Lucas De Marchi Signed-off-by: Peter Zijlstra LKML-Reference: <1268275065-18542-1-git-send-email-lucas.de.marchi@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 47 +++++------------------- kernel/sched_debug.c | 101 +++++++++++++++++++-------------------------------- kernel/sched_fair.c | 65 +++++++++++++++++---------------- kernel/sched_rt.c | 2 +- 4 files changed, 81 insertions(+), 134 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2c1db81f80eb..a4aa071f08f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2437,15 +2437,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.nr_wakeups); + schedstat_inc(p, se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.nr_wakeups_sync); + schedstat_inc(p, se.statistics.nr_wakeups_sync); if (orig_cpu != cpu) - schedstat_inc(p, se.nr_wakeups_migrate); + schedstat_inc(p, se.statistics.nr_wakeups_migrate); if (cpu == this_cpu) - schedstat_inc(p, se.nr_wakeups_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); else - schedstat_inc(p, se.nr_wakeups_remote); + schedstat_inc(p, se.statistics.nr_wakeups_remote); activate_task(rq, p, 1); success = 1; @@ -2532,36 +2532,7 @@ static void __sched_fork(struct task_struct *p) p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.wait_max = 0; - p->se.wait_count = 0; - p->se.wait_sum = 0; - - p->se.sleep_start = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - - p->se.block_start = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); @@ -7910,9 +7881,9 @@ void normalize_rt_tasks(void) p->se.exec_start = 0; #ifdef CONFIG_SCHEDSTATS - p->se.wait_start = 0; - p->se.sleep_start = 0; - p->se.block_start = 0; + p->se.statistics.wait_start = 0; + p->se.statistics.sleep_start = 0; + p->se.statistics.block_start = 0; #endif if (!rt_task(p)) { diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 67f95aada4b9..ad9df4422763 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->wait_start); - PN(se->sleep_start); - PN(se->block_start); - PN(se->sleep_max); - PN(se->block_max); - PN(se->exec_max); - PN(se->slice_max); - PN(se->wait_max); - PN(se->wait_sum); - P(se->wait_count); +
PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); #endif P(se->load.weight); #undef PN @@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.sum_sleep_runtime)); + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); #else SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); @@ -413,34 +413,34 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.wait_start); - PN(se.sleep_start); - PN(se.block_start); - PN(se.sleep_max); - PN(se.block_max); - PN(se.exec_max); - PN(se.slice_max); - PN(se.wait_max); - PN(se.wait_sum); - P(se.wait_count); - PN(se.iowait_sum); - P(se.iowait_count); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); P(sched_info.bkl_count); P(se.nr_migrations); - P(se.nr_migrations_cold); - P(se.nr_failed_migrations_affine); - P(se.nr_failed_migrations_running); - P(se.nr_failed_migrations_hot); - P(se.nr_forced_migrations); - P(se.nr_wakeups); - P(se.nr_wakeups_sync); - P(se.nr_wakeups_migrate); - P(se.nr_wakeups_local); - P(se.nr_wakeups_remote); - P(se.nr_wakeups_affine); - P(se.nr_wakeups_affine_attempts); - P(se.nr_wakeups_passive); - P(se.nr_wakeups_idle); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); { u64 avg_atom, avg_per_cpu; @@ -491,32 +491,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.wait_max = 0; - p->se.wait_sum = 0; - p->se.wait_count = 0; - p->se.iowait_sum = 0; - p->se.iowait_count = 0; - p->se.sleep_max = 0; - p->se.sum_sleep_runtime = 0; - p->se.block_max = 0; - p->se.exec_max = 0; - p->se.slice_max = 0; - p->se.nr_migrations = 0; - p->se.nr_migrations_cold = 0; - p->se.nr_failed_migrations_affine = 0; - p->se.nr_failed_migrations_running = 0; - p->se.nr_failed_migrations_hot = 0; - p->se.nr_forced_migrations = 0; - p->se.nr_wakeups = 0; - p->se.nr_wakeups_sync = 0; - p->se.nr_wakeups_migrate = 0; - p->se.nr_wakeups_local = 0; - p->se.nr_wakeups_remote = 0; - p->se.nr_wakeups_affine = 0; - p->se.nr_wakeups_affine_attempts = 0; - p->se.nr_wakeups_passive = 0; - p->se.nr_wakeups_idle = 0; - p->sched_info.bkl_count = 0; + memset(&p->se.statistics, 0, 
sizeof(p->se.statistics)); #endif p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3e1fd96c6cf9..8ad164bbdac1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, { unsigned long delta_exec_weighted; - schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); + schedstat_set(curr->statistics.exec_max, + max((u64)delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); @@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); + schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); } /* @@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - schedstat_set(se->wait_count, se->wait_count + 1); - schedstat_set(se->wait_sum, se->wait_sum + - rq_of(cfs_rq)->clock - se->wait_start); + schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, + rq_of(cfs_rq)->clock - se->statistics.wait_start)); + schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); + schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + + rq_of(cfs_rq)->clock - se->statistics.wait_start); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), - rq_of(cfs_rq)->clock - se->wait_start); + rq_of(cfs_rq)->clock - se->statistics.wait_start); } #endif - schedstat_set(se->wait_start, 0); + schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (entity_is_task(se)) tsk = task_of(se); - if (se->sleep_start) { - u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + if (se->statistics.sleep_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->sleep_max)) - se->sleep_max = delta; + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; - se->sleep_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); trace_sched_stat_sleep(tsk, delta); } } - if (se->block_start) { - u64 delta = rq_of(cfs_rq)->clock - se->block_start; + if (se->statistics.block_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; if ((s64)delta < 0) delta = 0; - if (unlikely(delta > se->block_max)) - se->block_max = delta; + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; - se->block_start = 0; - se->sum_sleep_runtime += delta; + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; if (tsk) { if (tsk->in_iowait) { - se->iowait_sum += delta; - se->iowait_count++; + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; trace_sched_stat_iowait(tsk, delta); } @@ -826,9 +827,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = 
rq_of(cfs_rq)->clock; + se->statistics.sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = rq_of(cfs_rq)->clock; + se->statistics.block_start = rq_of(cfs_rq)->clock; } #endif } @@ -912,7 +913,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * when there are only lesser-weight tasks around): */ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->slice_max = max(se->slice_max, + se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } #endif @@ -1306,7 +1307,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) if (sync && balanced) return 1; - schedstat_inc(p, se.nr_wakeups_affine_attempts); + schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); if (balanced || @@ -1318,7 +1319,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * there is no bad imbalance. */ schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); return 1; } @@ -1844,13 +1845,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { - schedstat_inc(p, se.nr_failed_migrations_affine); + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } *all_pinned = 0; if (task_running(rq, p)) { - schedstat_inc(p, se.nr_failed_migrations_running); + schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -1866,14 +1867,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { schedstat_inc(sd, lb_hot_gained[idle]); - schedstat_inc(p, se.nr_forced_migrations); + schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif return 1; } if (tsk_cache_hot) { - schedstat_inc(p, se.nr_failed_migrations_hot); + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); return 0; } return 1; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c4fb42a66cab..0335e87f5204 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- cgit v1.2.3-55-g7522 From 9b33fa6ba0e2f90fdf407501db801c2511121564 Mon Sep 17 00:00:00 2001 From: eranian@google.com Date: Wed, 10 Mar 2010 22:26:05 -0800 Subject: perf_events: Improve task_sched_in() This patch is an optimization in perf_event_task_sched_in() to avoid scheduling the events twice in a row. Without it, the perf_disable()/perf_enable() pair is invoked twice, so pinned events keep counting while the flexible events are being scheduled, and we go through hw_perf_enable() twice. By encapsulating the whole sequence in perf_disable()/perf_enable() we ensure that hw_perf_enable() is invoked only once, thanks to the refcount protection.
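The refcount idea relied on above can be shown with a minimal, self-contained userspace sketch. This is not the kernel's perf code; disable_count, hw_reprogram(), sketch_disable() and sketch_enable() are made-up names. It only illustrates why the inner disable/enable pair in the context-scheduling path stays harmless once the outer pair is added: only the outermost enable does the expensive work.

#include <stdio.h>

static int disable_count;		/* per-cpu in the real code */

static void hw_reprogram(void)		/* stand-in for hw_perf_enable() */
{
	printf("hw reprogrammed\n");
}

static void sketch_disable(void)
{
	disable_count++;
}

static void sketch_enable(void)
{
	/* only the outermost enable does the expensive work */
	if (--disable_count == 0)
		hw_reprogram();
}

int main(void)
{
	sketch_disable();	/* outer pair, as added by this patch */
	sketch_disable();	/* inner pair, in the scheduling path */
	sketch_enable();	/* no-op: count is still positive     */
	sketch_enable();	/* reprograms the hardware once       */
	return 0;
}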
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1268288765-5326-1-git-send-email-eranian@google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 52c69a34d697..3853d49c7d56 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1368,6 +1368,8 @@ void perf_event_task_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) return; + perf_disable(); + /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1380,6 +1382,8 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; + + perf_enable(); } #define MAX_INTERRUPTS (~0ULL) -- cgit v1.2.3-55-g7522 From 39c0cbe2150cbd848a25ba6cdb271d1ad46818ad Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:13 +0100 Subject: sched: Rate-limit nohz Entering nohz code on every micro-idle is costing ~10% throughput for netperf TCP_RR when scheduling cross-cpu. Rate limiting entry fixes this, but raises ticks a bit. On my Q6600, an idle box goes from ~85 interrupts/sec to 128. The higher the context switch rate, the more nohz entry costs. With this patch and some cycle recovery patches in my tree, max cross cpu context switch rate is improved by ~16%, a large portion of which is this ratelimiting. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301003.6785.28.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ kernel/sched.c | 12 ++++++++++++ kernel/time/tick-sched.c | 3 +++ 3 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8cc863d66477..13efe7dac5fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -271,11 +271,17 @@ extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); extern int get_nohz_load_balancer(void); +extern int nohz_ratelimit(int cpu); #else static inline int select_nohz_load_balancer(int cpu) { return 0; } + +static inline int nohz_ratelimit(int cpu) +{ + return 0; +} #endif /* diff --git a/kernel/sched.c b/kernel/sched.c index a4aa071f08f3..60b1bbe2ad1b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -492,6 +492,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ + u64 nohz_stamp; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ @@ -1228,6 +1229,17 @@ void wake_up_idle_cpu(int cpu) if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); } + +int nohz_ratelimit(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 diff = rq->clock - rq->nohz_stamp; + + rq->nohz_stamp = rq->clock; + + return diff < (NSEC_PER_SEC / HZ) >> 1; +} + #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f992762d7f51..f25735a767af 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -262,6 +262,9 @@ void tick_nohz_stop_sched_tick(int inidle) goto end; } + if (nohz_ratelimit(cpu)) + goto end; + ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { -- cgit v1.2.3-55-g7522 From b42e0c41a422a212ddea0666d5a3a0e3c35206db Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:15:38 +0100
Subject: sched: Remove avg_wakeup Testing the load which led to this heuristic (nfs4 kbuild) shows that it has outlived its usefulness. With intervening load balancing changes, I cannot see any difference with/without, so recover those fastpath cycles. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301062.6785.29.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- kernel/sched.c | 26 ++++---------------------- kernel/sched_debug.c | 1 - kernel/sched_fair.c | 31 ------------------------------- kernel/sched_features.h | 6 ------ 5 files changed, 4 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 13efe7dac5fa..70c560f5ada0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1185,9 +1185,6 @@ struct sched_entity { u64 nr_migrations; - u64 start_runtime; - u64 avg_wakeup; - #ifdef CONFIG_SCHEDSTATS struct sched_statistics statistics; #endif diff --git a/kernel/sched.c b/kernel/sched.c index 60b1bbe2ad1b..35a8626ace7d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1880,9 +1880,6 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { - if (wakeup) - p->se.start_runtime = p->se.sum_exec_runtime; - sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup, head); p->se.on_rq = 1; @@ -1890,17 +1887,11 @@ enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep) { - if (p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; - } else { - update_avg(&p->se.avg_wakeup, - sysctl_sched_wakeup_granularity); - } + if (sleep && p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; } - sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; @@ -2466,13 +2457,6 @@ out_activate: */ if (!in_interrupt()) { struct sched_entity *se = &current->se; - u64 sample = se->sum_exec_runtime; - - if (se->last_wakeup) - sample -= se->last_wakeup; - else - sample -= se->start_runtime; - update_avg(&se->avg_wakeup, sample); se->last_wakeup = se->sum_exec_runtime; } @@ -2540,8 +2524,6 @@ static void __sched_fork(struct task_struct *p) p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; - p->se.start_runtime = 0; - p->se.avg_wakeup = sysctl_sched_wakeup_granularity; #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ad9df4422763..20b95a420fec 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -408,7 +408,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.vruntime); PN(se.sum_exec_runtime); PN(se.avg_overlap); - PN(se.avg_wakeup); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8ad164bbdac1..6fc62854422c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1592,42 +1592,11 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } #endif /* CONFIG_SMP */ -/* - * Adaptive granularity - * - * se->avg_wakeup gives the average time a task runs until it does a wakeup, - * with the limit of wakeup_gran -- when it never does a wakeup.
- * - * So the smaller avg_wakeup is the faster we want this task to preempt, - * but we don't want to treat the preemptee unfairly and therefore allow it - * to run for at least the amount of time we'd like to run. - * - * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one - * - * NOTE: we use *nr_running to scale with load, this nicely matches the - * degrading latency on load. - */ -static unsigned long -adaptive_gran(struct sched_entity *curr, struct sched_entity *se) -{ - u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; - u64 gran = 0; - - if (this_run < expected_wakeup) - gran = expected_wakeup - this_run; - - return min_t(s64, gran, sysctl_sched_wakeup_granularity); -} - static unsigned long wakeup_gran(struct sched_entity *curr, struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; - if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) - gran = adaptive_gran(curr, se); - /* * Since its curr running now, convert the gran from real-time * to virtual-time in his units. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d5059fd761d9..96ef5dbc66e1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -30,12 +30,6 @@ SCHED_FEAT(START_DEBIT, 1) */ SCHED_FEAT(WAKEUP_PREEMPT, 1) -/* - * Compute wakeup_gran based on task behaviour, clipped to - * [0, sched_wakeup_gran_ns] - */ -SCHED_FEAT(ADAPTIVE_GRAN, 1) - /* * When converting the wakeup granularity to virtual time, do it such * that heavier tasks preempting a lighter task have an edge. -- cgit v1.2.3-55-g7522 From e12f31d3e5d36328c7fbd0fce40a95e70b59152c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:15:51 +0100 Subject: sched: Remove avg_overlap Both avg_overlap and avg_wakeup had an inherent problem in that their accuracy was detrimentally affected by cross-cpu wakeups, because we are missing the necessary call to update_curr(). This can't be fixed without increasing overhead in our already too fat fastpath. Additionally, with recent load balancing changes making us prefer to place tasks in an idle cache domain (which is good for compute bound loads), communicating tasks suffer when a sync wakeup, which would enable affine placement, is turned into a non-sync wakeup by SYNC_LESS. With one task on the runqueue, wake_affine() rejects the affine wakeup request, leaving the unfortunate task where it was placed, taking frequent cache misses. Remove it, and recover some fastpath cycles.
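For context: the update_avg() helper visible in the hunks above is essentially a 1/8-weight exponential moving average. The sketch below is illustrative userspace code, not the kernel's; it shows why such an average goes stale when samples are missed, as happens to avg_overlap on cross-cpu wakeups that skip update_curr().

#include <stdint.h>
#include <stdio.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = sample - *avg;

	*avg += diff >> 3;	/* avg += (sample - avg) / 8 */
}

int main(void)
{
	uint64_t avg = 0;

	/* Ten identical 1ms samples still leave the average well short
	 * of 1ms; conversely, an average built from stale samples takes
	 * just as long to decay toward current behaviour. */
	for (int i = 0; i < 10; i++) {
		update_avg(&avg, 1000000);
		printf("sample %2d: avg = %llu ns\n", i + 1,
		       (unsigned long long)avg);
	}
	return 0;
}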
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301121.6785.30.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- kernel/sched.c | 33 --------------------------------- kernel/sched_debug.c | 1 - kernel/sched_fair.c | 18 ------------------ kernel/sched_features.h | 16 ---------------- 5 files changed, 71 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 70c560f5ada0..8604884cee87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1180,9 +1180,6 @@ struct sched_entity { u64 vruntime; u64 prev_sum_exec_runtime; - u64 last_wakeup; - u64 avg_overlap; - u64 nr_migrations; #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched.c b/kernel/sched.c index 35a8626ace7d..68ed6f4f3c13 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1887,11 +1887,6 @@ enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - if (sleep && p->se.last_wakeup) { - update_avg(&p->se.avg_overlap, - p->se.sum_exec_runtime - p->se.last_wakeup); - p->se.last_wakeup = 0; - } sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; @@ -2452,15 +2447,6 @@ out_activate: activate_task(rq, p, 1); success = 1; - /* - * Only attribute actual wakeups done by this task. - */ - if (!in_interrupt()) { - struct sched_entity *se = ¤t->se; - - se->last_wakeup = se->sum_exec_runtime; - } - out_running: trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, wake_flags); @@ -2522,8 +2508,6 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; - p->se.last_wakeup = 0; - p->se.avg_overlap = 0; #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -3594,23 +3578,6 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; - - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - - /* - * In order to avoid avg_overlap growing stale when we are - * indeed overlapping and hence not getting put to sleep, grow - * the avg_overlap on preemption. - * - * We use the average preemption runtime because that - * correlates to the amount of cache footprint a task can - * build up. 
- */ - update_avg(&prev->se.avg_overlap, runtime); - } prev->sched_class->put_prev_task(rq, prev); } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 20b95a420fec..8a46a719f367 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -407,7 +407,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); - PN(se.avg_overlap); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6fc62854422c..c3b69d4b5d65 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1241,7 +1241,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = current; unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; @@ -1256,18 +1255,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (sync) { - if (sched_feat(SYNC_LESS) && - (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; - } else { - if (sched_feat(SYNC_MORE) && - (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost)) - sync = 1; - } - /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1711,11 +1698,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (sched_feat(WAKEUP_SYNC) && sync) goto preempt; - if (sched_feat(WAKEUP_OVERLAP) && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 96ef5dbc66e1..c545e048dfed 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -41,12 +41,6 @@ SCHED_FEAT(ASYM_GRAN, 1) */ SCHED_FEAT(WAKEUP_SYNC, 0) -/* - * Wakeup preempt based on task behaviour. Tasks that do not overlap - * don't get preempted. - */ -SCHED_FEAT(WAKEUP_OVERLAP, 0) - /* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and @@ -63,16 +57,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) */ SCHED_FEAT(AFFINE_WAKEUPS, 1) -/* - * Weaken SYNC hint based on overlap - */ -SCHED_FEAT(SYNC_LESS, 1) - -/* - * Add SYNC hint based on overlap - */ -SCHED_FEAT(SYNC_MORE, 0) - /* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we -- cgit v1.2.3-55-g7522 From a64692a3afd85fe048551ab89142fd5ca99a0dbd Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:16:20 +0100 Subject: sched: Cleanup/optimize clock updates Now that we no longer depend on the clock being updated prior to enqueueing on migratory wakeup, we can clean up a bit, placing calls to update_rq_clock() exactly where they are needed, ie on enqueue, dequeue and schedule events. In the case of a freshly enqueued task immediately preempting, we can skip the update during preemption, as the clock was just updated by the enqueue event. We also save an unneeded call during a migratory wakeup by not updating the previous runqueue, where update_curr() won't be invoked. 
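The pattern described here is small enough to condense into a standalone sketch (illustrative code only: rq_sketch and sched_clock_sketch() are stand-ins for the rq and sched_clock_cpu() of the diff that follows, not kernel code):

#include <stdio.h>

struct rq_sketch {
	unsigned long long clock;
	unsigned int skip_clock_update;
};

static unsigned long long sched_clock_sketch(void)
{
	static unsigned long long now;

	return now += 1000;	/* pretend time advances on each read */
}

static void update_rq_clock_sketch(struct rq_sketch *rq)
{
	if (!rq->skip_clock_update)	/* the guard the patch adds */
		rq->clock = sched_clock_sketch();
}

int main(void)
{
	struct rq_sketch rq = { 0, 0 };

	update_rq_clock_sketch(&rq);	/* enqueue freshens the clock  */
	rq.skip_clock_update = 1;	/* queue event will reschedule */
	update_rq_clock_sketch(&rq);	/* schedule: update skipped    */
	rq.skip_clock_update = 0;	/* put_prev_task clears flag   */
	printf("clock = %llu\n", rq.clock);	/* still the enqueue stamp */
	return 0;
}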
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301199.6785.32.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 ++++++++++++++++---------------- kernel/sched_fair.c | 2 -- 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 68ed6f4f3c13..16559de4edea 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -495,6 +495,8 @@ struct rq { u64 nohz_stamp; unsigned char in_nohz_recently; #endif + unsigned int skip_clock_update; + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -592,6 +594,13 @@ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { rq->curr->sched_class->check_preempt_curr(rq, p, flags); + + /* + * A queue event has occurred, and we're going to schedule. In + * this case, we can save a useless back to back clock update. + */ + if (test_tsk_need_resched(p)) + rq->skip_clock_update = 1; } static inline int cpu_of(struct rq *rq) @@ -626,7 +635,8 @@ static inline int cpu_of(struct rq *rq) inline void update_rq_clock(struct rq *rq) { - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) + rq->clock = sched_clock_cpu(cpu_of(rq)); } /* @@ -1782,8 +1792,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } - update_rq_clock(rq1); - update_rq_clock(rq2); } /* @@ -1880,6 +1888,7 @@ static void update_avg(u64 *avg, u64 sample) static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { + update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup, head); p->se.on_rq = 1; @@ -1887,6 +1896,7 @@ enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { + update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; @@ -2366,7 +2376,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, smp_wmb(); rq = task_rq_lock(p, &flags); - update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -2407,7 +2416,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, rq = cpu_rq(cpu); raw_spin_lock(&rq->lock); - update_rq_clock(rq); /* * We migrated the task without holding either rq->lock, however @@ -2624,7 +2632,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) BUG_ON(p->state != TASK_WAKING); p->state = TASK_RUNNING; - update_rq_clock(rq); activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); @@ -3578,6 +3585,9 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { + if (prev->se.on_rq) + update_rq_clock(rq); + rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@ -3640,7 +3650,6 @@ need_resched_nonpreemptible: hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -4197,7 +4206,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - update_rq_clock(rq); oldprio = p->prio; prev_class = p->sched_class; @@ -4240,7 +4248,6 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another 
CPU. */ rq = task_rq_lock(p, &flags); - update_rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4523,7 +4530,6 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); if (on_rq) @@ -5530,7 +5536,6 @@ void sched_idle_next(void) __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - update_rq_clock(rq); activate_task(rq, p, 0); raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -5585,7 +5590,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu) for ( ; ; ) { if (!rq->nr_running) break; - update_rq_clock(rq); next = pick_next_task(rq); if (!next) break; @@ -5869,7 +5873,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; @@ -7815,7 +7818,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; - update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq, p, 0); @@ -8177,8 +8179,6 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); - update_rq_clock(rq); - running = task_current(rq, tsk); on_rq = tsk->se.on_rq; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c3b69d4b5d65..69e582020ff8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3064,8 +3064,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); - update_rq_clock(busiest_rq); - update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { -- cgit v1.2.3-55-g7522 From 21406928afe43f1db6acab4931bb8c886f4d04ce Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:15 +0100 Subject: sched: Tweak sched_latency and min_granularity Allow LAST_BUDDY to kick in sooner, improving cache utilization as soon as a second buddy pair arrives on scene. The cost is latency starting to climb sooner; the benefit for tbench 8 on my Q6600 box is ~2%. No detrimental effects noted in normal desktop usage.
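A quick arithmetic check of the new defaults. This is a sketch, not kernel code: ilog2_u() is an illustrative helper mirroring the "1 + ilog(ncpus)" logarithmic scaling the tunable comments mention.

#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)
{
	unsigned int l = 0;

	while (n >>= 1)
		l++;
	return l;
}

int main(void)
{
	unsigned int latency = 6000000;		/* ns, new base default */
	unsigned int min_gran = 2000000;	/* ns, new base default */
	unsigned int ncpus = 4;			/* e.g. a Q6600 box     */
	unsigned int factor = 1 + ilog2_u(ncpus);

	/* sched_nr_latency is kept at latency / min_granularity, so it
	 * drops from 5000000/1000000 = 5 to 6000000/2000000 = 3, which
	 * is what lets LAST_BUDDY engage with fewer runnable tasks. */
	printf("factor=%u latency=%u min_gran=%u nr_latency=%u\n",
	       factor, factor * latency, factor * min_gran,
	       latency / min_gran);
	return 0;
}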
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301285.6785.34.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 69e582020ff8..d19df5bccfec 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -35,8 +35,8 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 5000000ULL; -unsigned int normalized_sysctl_sched_latency = 5000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable @@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 1000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 2000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 5; +static unsigned int sched_nr_latency = 3; /* * After fork, child runs first. If set to 0 (default) then -- cgit v1.2.3-55-g7522 From 8b911acdf08477c059d1c36c21113ab1696c612b Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:16 +0100 Subject: sched: Fix select_idle_sibling() Don't bother with selection when the current cpu is idle. Recent load balancing changes also make it no longer necessary to check wake_affine() success before returning the selected sibling, so we now always use it. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301369.6785.36.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d19df5bccfec..0008cc4a1199 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1439,7 +1439,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; - int want_affine = 0; + int want_affine = 0, cpu_idle = !current->pid; int want_sd = 1; int sync = wake_flags & WF_SYNC; @@ -1497,13 +1497,15 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag * If there's an idle sibling in this domain, make that * the wake_affine target instead of the current cpu. 
*/ - if (tmp->flags & SD_SHARE_PKG_RESOURCES) + if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES) target = select_idle_sibling(p, tmp, target); if (target >= 0) { if (tmp->flags & SD_WAKE_AFFINE) { affine_sd = tmp; want_affine = 0; + if (target != cpu) + cpu_idle = 1; } cpu = target; } @@ -1519,6 +1521,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag sd = tmp; } +#ifdef CONFIG_FAIR_GROUP_SCHED if (sched_feat(LB_SHARES_UPDATE)) { /* * Pick the largest domain to update shares over @@ -1532,9 +1535,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag if (tmp) update_shares(tmp); } +#endif - if (affine_sd && wake_affine(affine_sd, p, sync)) - return cpu; + if (affine_sd) { + if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + return cpu; + } while (sd) { int load_idx = sd->forkexec_idx; -- cgit v1.2.3-55-g7522 From 6bc6cf2b61336ed0c55a615eb4c0c8ed5daf3f08 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:17 +0100 Subject: sched: Remove NORMALIZED_SLEEPER This feature hasn't been enabled in a long time, remove effectively dead code. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301447.6785.38.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ---------- kernel/sched_features.h | 7 ------- 2 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0008cc4a1199..de98e2e9d6e1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -741,16 +741,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial && sched_feat(FAIR_SLEEPERS)) { unsigned long thresh = sysctl_sched_latency; - /* - * Convert the sleeper threshold into virtual time. - * SCHED_IDLE is a special sub-class. We care about - * fairness only relative to other SCHED_IDLE tasks, - * all of which have the same weight. - */ - if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) || - task_of(se)->policy != SCHED_IDLE)) - thresh = calc_delta_fair(thresh, se); - /* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: diff --git a/kernel/sched_features.h b/kernel/sched_features.h index c545e048dfed..404288354aee 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -12,13 +12,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1) */ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) -/* - * By not normalizing the sleep time, heavy tasks get an effective - * longer period, and lighter task an effective shorter period they - * are considered running. - */ -SCHED_FEAT(NORMALIZED_SLEEPER, 0) - /* * Place new tasks ahead so that they do not starve already running * tasks -- cgit v1.2.3-55-g7522 From 5ca9880c6f4ba4c84b517bc2fed5366adf63d191 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:17 +0100 Subject: sched: Remove FAIR_SLEEPERS feature Our preemption model relies too heavily on sleeper fairness to disable it without dire consequences. Remove the feature, and save a branch or two. 
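The "sleeper fairness" being kept here is the vruntime credit that place_entity() grants a waking task, fragments of which are visible in the surrounding diffs. A hedged sketch of just that placement rule, with illustrative constants and names rather than kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long long min_vruntime = 100000000ULL;	/* cfs_rq floor */
	unsigned long long thresh = 6000000ULL;	/* sysctl_sched_latency */
	int gentle = 1;			/* GENTLE_FAIR_SLEEPERS enabled */

	if (gentle)
		thresh >>= 1;	/* only 50% of the service deficit */

	/* A sleeper wakes up to 'thresh' ns behind min_vruntime, giving
	 * it a service deficit and hence early preemption; the real
	 * code also never moves an entity's vruntime backwards. */
	printf("sleeper placed at %llu (credit %llu ns)\n",
	       min_vruntime - thresh, thresh);
	return 0;
}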
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301520.6785.40.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- kernel/sched_features.h | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index de98e2e9d6e1..97682f925ed5 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -738,7 +738,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ - if (!initial && sched_feat(FAIR_SLEEPERS)) { + if (!initial) { unsigned long thresh = sysctl_sched_latency; /* diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 404288354aee..850f9809cf81 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,10 +1,3 @@ -/* - * Disregards a certain amount of sleep time (sched_latency_ns) and - * considers the task to be running during that period. This gives it - * a service deficit on wakeup, allowing it to run sooner. - */ -SCHED_FEAT(FAIR_SLEEPERS, 1) - /* * Only give sleepers 50% of their service deficit. This allows * them to run sooner, but does not allow tons of sleepers to -- cgit v1.2.3-55-g7522 From f2e74eeac03ffb779d64b66a643c5e598145a28b Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:18 +0100 Subject: sched: Remove WAKEUP_SYNC feature This feature never earned its keep, remove it. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301591.6785.42.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ---- kernel/sched_features.h | 5 ----- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 97682f925ed5..1d99535b0928 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1658,7 +1658,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int sync = wake_flags & WF_SYNC; int scale = cfs_rq->nr_running >= sched_nr_latency; if (unlikely(rt_prio(p->prio))) @@ -1691,9 +1690,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(curr->policy == SCHED_IDLE)) goto preempt; - if (sched_feat(WAKEUP_SYNC) && sync) - goto preempt; - if (!sched_feat(WAKEUP_PREEMPT)) return; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 850f9809cf81..1cb7c4701bf3 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -22,11 +22,6 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1) */ SCHED_FEAT(ASYM_GRAN, 1) -/* - * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. - */ -SCHED_FEAT(WAKEUP_SYNC, 0) - /* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and -- cgit v1.2.3-55-g7522 From c6ee36c423c3ed1fb86bb3eabba9fc256a300d16 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:16:43 +0100 Subject: sched: Remove SYNC_WAKEUPS feature Sync wakeups are critical functionality with a long history. Remove the feature flag; we don't need the branch or icache footprint.
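All of these removals delete entries from kernel/sched_features.h. The SCHED_FEAT()/sched_feat() machinery behind them is an X-macro scheme roughly like the simplified sketch below; the kernel's actual expansion differs in details, so treat this as an assumption-labeled illustration of why every flag costs a runtime branch:

#include <stdio.h>

#define SCHED_FEATURES(F) \
	F(GENTLE_FAIR_SLEEPERS, 1) \
	F(START_DEBIT, 1) \
	F(WAKEUP_PREEMPT, 1)

enum {
#define F_ENUM(name, enabled) __SCHED_FEAT_##name,
	SCHED_FEATURES(F_ENUM)
#undef F_ENUM
};

static const unsigned int sysctl_sched_features =
#define F_BIT(name, enabled) (enabled << __SCHED_FEAT_##name) |
	SCHED_FEATURES(F_BIT)
#undef F_BIT
	0;

/* every sched_feat() test compiles to a load, a mask and a branch */
#define sched_feat(x) (sysctl_sched_features & (1U << __SCHED_FEAT_##x))

int main(void)
{
	printf("WAKEUP_PREEMPT enabled: %u\n",
	       sched_feat(WAKEUP_PREEMPT) ? 1U : 0U);
	return 0;
}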
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301817.6785.47.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- kernel/sched_features.h | 8 -------- 2 files changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 16559de4edea..cc6dc8caa380 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2369,9 +2369,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, unsigned long flags; struct rq *rq; - if (!sched_feat(SYNC_WAKEUPS)) - wake_flags &= ~WF_SYNC; - this_cpu = get_cpu(); smp_wmb(); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 1cb7c4701bf3..f54b6f9cc3dd 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -22,14 +22,6 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1) */ SCHED_FEAT(ASYM_GRAN, 1) -/* - * Use the SYNC wakeup hint, pipes and the likes use this to indicate - * the remote end is likely to consume the data we just wrote, and - * therefore has cache benefit from being placed on the same cpu, see - * also AFFINE_WAKEUPS. - */ -SCHED_FEAT(SYNC_WAKEUPS, 1) - /* * Based on load and program behaviour, see if it makes sense to place * a newly woken task on the same cpu as the task that woke it -- -- cgit v1.2.3-55-g7522 From 13814d42e45dfbe845a0bbe5184565d9236896ae Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:04 +0100 Subject: sched: Remove ASYM_GRAN feature This feature has been enabled for quite a while, after testing showed that easing preemption for light tasks was harmful to high priority threads. Remove the feature flag. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301675.6785.44.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++----------------- kernel/sched_features.h | 6 ------ 2 files changed, 11 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1d99535b0928..9357ecdb7f6b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1583,24 +1583,18 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) /* * Since its curr running now, convert the gran from real-time * to virtual-time in his units. + * + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy.
- */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - } else { - if (unlikely(curr->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, curr); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, se); return gran; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index f54b6f9cc3dd..83c66e8ad3ee 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -16,12 +16,6 @@ SCHED_FEAT(START_DEBIT, 1) */ SCHED_FEAT(WAKEUP_PREEMPT, 1) -/* - * When converting the wakeup granularity to virtual time, do it such - * that heavier tasks preempting a lighter task have an edge. - */ -SCHED_FEAT(ASYM_GRAN, 1) - /* * Based on load and program behaviour, see if it makes sense to place * a newly woken task on the same cpu as the task that woke it -- -- cgit v1.2.3-55-g7522 From beac4c7e4a1cc6d57801f690e5e82fa2c9c245c8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:20 +0100 Subject: sched: Remove AFFINE_WAKEUPS feature Disabling affine wakeups is too horrible to contemplate. Remove the feature flag. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301890.6785.50.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9357ecdb7f6b..35a5c649638b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1434,8 +1434,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { - if (sched_feat(AFFINE_WAKEUPS) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; new_cpu = prev_cpu; } -- cgit v1.2.3-55-g7522 From f55db609042faecd5e518ce372b87f846659b32e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 11 Mar 2010 14:04:37 -0800 Subject: cpu-timers: Simplify RLIMIT_CPU handling Always set the signal->cputime_expires expiration cache when setting a new itimer, POSIX 1.b timer, or RLIMIT_CPU. Since we are initializing the prof_exp expiration cache during fork(), this allows us to remove the "RLIMIT_CPU != inf" check from fastpath_timer_check() and do some other cleanups. Checked against regression using test cases from: http://marc.info/?l=linux-kernel&m=123749066504641&w=4 http://marc.info/?l=linux-kernel&m=123811277916642&w=2 Signed-off-by: Stanislaw Gruszka Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Hidetoshi Seto Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 75 +++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 1a22dfd42df9..d01e0a348e61 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -11,19 +11,18 @@ #include /* - * Called after updating RLIMIT_CPU to set timer expiration if necessary. + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->cputime_expires expiration cache if necessary. Needs + * siglock protection since other code may update expiration cache as + * well.
*/ void update_rlimit_cpu(unsigned long rlim_new) { cputime_t cputime = secs_to_cputime(rlim_new); - struct signal_struct *const sig = current->signal; - if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || - cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - } + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); } static int check_clock(const clockid_t which_clock) @@ -564,7 +563,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) struct list_head *head, *listpos; struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; - unsigned long i; head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? p->cpu_timers : p->signal->cpu_timers); @@ -630,20 +628,11 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (expires_le(sig->it[CPUCLOCK_VIRT].expires, - exp->cpu)) - break; - sig->cputime_expires.virt_exp = exp->cpu; - break; + if (expires_gt(sig->cputime_expires.virt_exp, exp->cpu)) + sig->cputime_expires.virt_exp = exp->cpu; case CPUCLOCK_PROF: - if (expires_le(sig->it[CPUCLOCK_PROF].expires, - exp->cpu)) - break; - i = sig->rlim[RLIMIT_CPU].rlim_cur; - if (i != RLIM_INFINITY && - i <= cputime_to_secs(exp->cpu)) - break; - sig->cputime_expires.prof_exp = exp->cpu; + if (expires_gt(sig->cputime_expires.prof_exp, exp->cpu)) + sig->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_SCHED: sig->cputime_expires.sched_exp = exp->sched; @@ -1386,7 +1375,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) return 1; } - return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; + return 0; } /* @@ -1452,21 +1441,23 @@ void run_posix_cpu_timers(struct task_struct *tsk) } /* - * Set one of the process-wide special case CPU timers. + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. * The tsk->sighand->siglock must be held by the caller. - * The *newval argument is relative and we update it to be absolute, *oldval - * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { union cpu_time_count now; - struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ @@ -1479,33 +1470,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (cputime_eq(*newval, cputime_zero)) return; *newval = cputime_add(*newval, now.cpu); - - /* - * If the RLIMIT_CPU timer will expire before the - * ITIMER_PROF timer, we have nothing else to do. - */ - if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur - < cputime_to_secs(*newval)) - return; } /* - * Check whether there are any process timers already set to fire - * before this one. If so, we don't have anything more to do. + * Update expiration cache if we are the earliest timer, or eventually + * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. 
*/ - head = &tsk->signal->cpu_timers[clock_idx]; - if (list_empty(head) || - cputime_ge(list_first_entry(head, - struct cpu_timer_list, entry)->expires.cpu, - *newval)) { - switch (clock_idx) { - case CPUCLOCK_PROF: + switch (clock_idx) { + case CPUCLOCK_PROF: + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) tsk->signal->cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: + break; + case CPUCLOCK_VIRT: + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) tsk->signal->cputime_expires.virt_exp = *newval; - break; - } + break; } } -- cgit v1.2.3-55-g7522 From 5eb9aa6414bdab6d075a8763bc3b647181ef3aab Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 11 Mar 2010 14:04:38 -0800 Subject: cpu-timers: Cleanup arm_timer() Signed-off-by: Stanislaw Gruszka Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Hidetoshi Seto Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 104 ++++++++++++++++------------------------------ 1 file changed, 35 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index d01e0a348e61..7c7166f766cc 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -547,97 +547,63 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) cputime_gt(expires, new_exp); } -static inline int expires_le(cputime_t expires, cputime_t new_exp) -{ - return !cputime_eq(expires, cputime_zero) && - cputime_le(expires, new_exp); -} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held * for reading, and interrupts disabled. */ -static void arm_timer(struct k_itimer *timer, union cpu_time_count now) +static void arm_timer(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; struct list_head *head, *listpos; + struct task_cputime *cputime_expires; struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; - head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? - p->cpu_timers : p->signal->cpu_timers); + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + head = p->cpu_timers; + cputime_expires = &p->cputime_expires; + } else { + head = p->signal->cpu_timers; + cputime_expires = &p->signal->cputime_expires; + } head += CPUCLOCK_WHICH(timer->it_clock); BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); listpos = head; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - list_for_each_entry(next, head, entry) { - if (next->expires.sched > nt->expires.sched) - break; - listpos = &next->entry; - } - } else { - list_for_each_entry(next, head, entry) { - if (cputime_gt(next->expires.cpu, nt->expires.cpu)) - break; - listpos = &next->entry; - } + list_for_each_entry(next, head, entry) { + if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + break; + listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { + union cpu_time_count *exp = &nt->expires; + /* - * We are the new earliest-expiring timer. - * If we are a thread timer, there can always - * be a process timer telling us to stop earlier. + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. 
*/ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - union cpu_time_count *exp = &nt->expires; - - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_PROF: - if (expires_gt(p->cputime_expires.prof_exp, - exp->cpu)) - p->cputime_expires.prof_exp = exp->cpu; - break; - case CPUCLOCK_VIRT: - if (expires_gt(p->cputime_expires.virt_exp, - exp->cpu)) - p->cputime_expires.virt_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > exp->sched) - p->cputime_expires.sched_exp = - exp->sched; - break; - } - } else { - struct signal_struct *const sig = p->signal; - union cpu_time_count *exp = &timer->it.cpu.expires; - - /* - * For a process timer, set the cached expiration time. - */ - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_VIRT: - if (expires_gt(sig->cputime_expires.virt_exp, exp->cpu)) - sig->cputime_expires.virt_exp = exp->cpu; - case CPUCLOCK_PROF: - if (expires_gt(sig->cputime_expires.prof_exp, exp->cpu)) - sig->cputime_expires.prof_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - sig->cputime_expires.sched_exp = exp->sched; - break; - } + switch (CPUCLOCK_WHICH(timer->it_clock)) { + case CPUCLOCK_PROF: + if (expires_gt(cputime_expires->prof_exp, exp->cpu)) + cputime_expires->prof_exp = exp->cpu; + break; + case CPUCLOCK_VIRT: + if (expires_gt(cputime_expires->virt_exp, exp->cpu)) + cputime_expires->virt_exp = exp->cpu; + break; + case CPUCLOCK_SCHED: + if (cputime_expires->sched_exp == 0 || + cputime_expires->sched_exp > exp->sched) + cputime_expires->sched_exp = exp->sched; + break; } } @@ -819,7 +785,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, if (new_expires.sched != 0 && (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && cpu_time_before(timer->it_clock, val, new_expires)) { - arm_timer(timer, val); + arm_timer(timer); } read_unlock(&tasklist_lock); @@ -1283,7 +1249,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - arm_timer(timer, now); + arm_timer(timer); out_unlock: read_unlock(&tasklist_lock); -- cgit v1.2.3-55-g7522 From ae1a78eecc45fe41215d9dbfd7079999455772d6 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 11 Mar 2010 14:04:39 -0800 Subject: cpu-timers: Return correct previous timer reload value According to POSIX, we need to correctly set the old timer it_interval value when the user requests that in timer_settime(). Tested using the program below. 
#include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <signal.h> #include <time.h> #include <assert.h> int main(void) { struct sigaction act; struct sigevent evt = { }; timer_t tid; struct itimerspec spec, u_spec, k_spec; evt.sigev_notify = SIGEV_SIGNAL; evt.sigev_signo = SIGPROF; assert(timer_create(CLOCK_PROCESS_CPUTIME_ID, &evt, &tid) == 0); spec.it_value.tv_sec = 1; spec.it_value.tv_nsec = 2; spec.it_interval.tv_sec = 3; spec.it_interval.tv_nsec = 4; u_spec = spec; assert(timer_settime(tid, 0, &spec, NULL) == 0); spec.it_value.tv_sec = 5; spec.it_value.tv_nsec = 6; spec.it_interval.tv_sec = 7; spec.it_interval.tv_nsec = 8; assert(timer_settime(tid, 0, &spec, &k_spec) == 0); #define PRT(val) printf(#val ":\t%d/%d\n", (int) u_spec.val, (int) k_spec.val) PRT(it_value.tv_sec); PRT(it_value.tv_nsec); PRT(it_interval.tv_sec); PRT(it_interval.tv_nsec); return 0; } Signed-off-by: Stanislaw Gruszka Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Hidetoshi Seto Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 7c7166f766cc..cce2f0b2d406 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -676,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, val; + union cpu_time_count old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -707,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, BUG_ON(!irqs_disabled()); ret = 0; + old_incr = timer->it.cpu.incr; spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { @@ -822,7 +823,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, out: if (old) { sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &old->it_interval); + old_incr, &old->it_interval); } return ret; } -- cgit v1.2.3-55-g7522 From 1f169f84d25a74fb2dc67274d31d082ce30c60fb Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 11 Mar 2010 14:04:41 -0800 Subject: cpu-timers: Change SIGEV_NONE timer implementation When a user sets up a timer without an associated signal and the process does not use any other cpu timers and does not exit, tsk->signal->cputimer is enabled and runs forever. Avoid running the timer for no reason. I used the program below to check that the patch does not break current user-space visible behavior. 
#include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <string.h> #include <signal.h> #include <time.h> #include <errno.h> #include <assert.h> void consume_cpu(void) { int i = 0; int count = 0; for(i=0; i<100000000; i++) count++; } int main(void) { int i; struct sigaction act; struct sigevent evt = { }; timer_t tid; struct itimerspec spec = { }; evt.sigev_notify = SIGEV_NONE; assert(timer_create(CLOCK_PROCESS_CPUTIME_ID, &evt, &tid) == 0); spec.it_value.tv_sec = 10; assert(timer_settime(tid, 0, &spec, NULL) == 0); for (i = 0; i < 30; i++) { consume_cpu(); memset(&spec, 0, sizeof(spec)); assert(timer_gettime(tid, &spec) == 0); printf("%lu.%09lu\n", (unsigned long) spec.it_value.tv_sec, (unsigned long) spec.it_value.tv_nsec); } assert(timer_delete(tid) == 0); return 0; } Signed-off-by: Stanislaw Gruszka Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Hidetoshi Seto Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index cce2f0b2d406..7d9d0fab1651 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -615,7 +615,12 @@ static void arm_timer(struct k_itimer *timer) */ static void cpu_timer_fire(struct k_itimer *timer) { - if (unlikely(timer->sigq == NULL)) { + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * User don't want any signal. + */ + timer->it.cpu.expires.sched = 0; + } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. @@ -784,7 +789,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, */ timer->it.cpu.expires = new_expires; if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && cpu_time_before(timer->it_clock, val, new_expires)) { arm_timer(timer); } @@ -809,7 +813,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun = -1; if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && !cpu_time_before(timer->it_clock, val, new_expires)) { /* * The designated time already passed, so we notify @@ -883,25 +886,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) read_unlock(&tasklist_lock); } - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - if (timer->it.cpu.incr.sched == 0 && - cpu_time_before(timer->it_clock, - timer->it.cpu.expires, now)) { - /* - * Do-nothing timer expired and has no reload, - * so it's as if it was never set. - */ - timer->it.cpu.expires.sched = 0; - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; - return; - } - /* - * Account for any expirations and reloads that should - * have happened. - */ - bump_cpu_timer(timer, now); - } - if (unlikely(clear_dead)) { /* * We've noticed that the thread is dead, but -- cgit v1.2.3-55-g7522 From c28739375bf0d6e239b4fa939ec8372aa2c707d2 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 11 Mar 2010 14:04:42 -0800 Subject: cpu-timers: Avoid iterating over all threads in fastpath_timer_check() Spread the p->sighand->siglock locking scope to make sure that fastpath_timer_check() never iterates over all threads. Without locking there is a small possibility that signal->cputimer will stop running while we write values to signal->cputime_expires. 
Calling thread_group_cputime() from fastpath_timer_check() is not only bad because it is slow; it is also racy with __exit_signal(), which can lead to invalid signal->{s,u}time values. Signed-off-by: Stanislaw Gruszka Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Hidetoshi Seto Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 7d9d0fab1651..564b3b0240dd 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -550,7 +550,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held - * for reading, and interrupts disabled. + * for reading, interrupts disabled and p->sighand->siglock taken. */ static void arm_timer(struct k_itimer *timer) { @@ -569,9 +569,6 @@ static void arm_timer(struct k_itimer *timer) } head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); - spin_lock(&p->sighand->siglock); - listpos = head; list_for_each_entry(next, head, entry) { if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) break; listpos = &next->entry; @@ -606,8 +603,6 @@ static void arm_timer(struct k_itimer *timer) break; } } - - spin_unlock(&p->sighand->siglock); } /* @@ -720,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, ret = TIMER_RETRY; } else list_del_init(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); /* * We need to sample the current value to convert the new @@ -774,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * disable this firing since we are already reporting * it as an overrun (thanks to bump_cpu_timer above). */ + spin_unlock(&p->sighand->siglock); read_unlock(&tasklist_lock); goto out; } @@ -793,6 +788,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, arm_timer(timer); } + spin_unlock(&p->sighand->siglock); read_unlock(&tasklist_lock); /* @@ -1206,6 +1202,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ + spin_lock(&p->sighand->siglock); } else { read_lock(&tasklist_lock); if (unlikely(p->signal == NULL)) { @@ -1226,6 +1223,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) clear_dead_task(timer, now); goto out_unlock; } + spin_lock(&p->sighand->siglock); cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); /* Leave the tasklist_lock locked for the call below. */ @@ -1234,7 +1232,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ + BUG_ON(!irqs_disabled()); arm_timer(timer); + spin_unlock(&p->sighand->siglock); out_unlock: read_unlock(&tasklist_lock); -- cgit v1.2.3-55-g7522 From 64ce4c2f5252f25798117fa80a027993163d6d84 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 11 Mar 2010 14:04:47 -0800 Subject: time: Clean up warp_clock() warp_clock() currently accesses timekeeping internal state directly, which is unnecessary. Convert it to use the proper timekeeping interfaces. 
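A minimal user-space sketch (not part of the patch; the 60-minute offset is hypothetical and the call needs CAP_SYS_TIME): warp_clock() runs only once, on the first settimeofday() call that passes a timezone but no time value, so the effect of the rewritten function can be sanity-checked like this.

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
	struct timeval before, after;
	/* Hypothetical zone: pretend we are 60 minutes west of GMT. */
	struct timezone tz = { .tz_minuteswest = 60, .tz_dsttime = 0 };

	gettimeofday(&before, NULL);
	/* The first tz-only settimeofday() after boot calls warp_clock(). */
	if (settimeofday(NULL, &tz) != 0)
		perror("settimeofday");
	gettimeofday(&after, NULL);
	printf("clock warped by ~%ld seconds\n",
	       (long)(after.tv_sec - before.tv_sec));
	return 0;
}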
Signed-off-by: John Stultz Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 804798005d19..2358a3646a63 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -133,12 +133,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, */ static inline void warp_clock(void) { - write_seqlock_irq(&xtime_lock); - wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; - xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); + struct timespec delta, adjust; + delta.tv_sec = sys_tz.tz_minuteswest * 60; + delta.tv_nsec = 0; + adjust = timespec_add_safe(current_kernel_time(), delta); + do_settimeofday(&adjust); } /* -- cgit v1.2.3-55-g7522 From 06f71b922ce5a05352acd706564ca4ae1f2add0e Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 11 Mar 2010 14:04:46 -0800 Subject: timer: Print function name for timer callbacks modifying preemption count A function scheduled with a timer must not exit with a different preempt count than the one it was entered with. To make helping users who run into the corresponding BUG() easier, also print the name of the bad function, not only its address. [ tglx: Sanitized printk ] Signed-off-by: Uwe Kleine-König Cc: johnstul@us.ibm.com Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/timer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index c61a7949387f..f82f4bfe2d88 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1027,11 +1027,8 @@ static inline void __run_timers(struct tvec_base *base) lock_map_release(&lockdep_map); if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); + printk(KERN_ERR "timer: %pF preempt leak: %08x -> %08x\n", + fn, preempt_count, preempt_count()); BUG(); } } -- cgit v1.2.3-55-g7522 From 576da126a6c7364d70dfd58d0bbe43d05cf5859f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Mar 2010 21:10:29 +0100 Subject: timer: Split out timer function call The indent level is starting to be annoying. More white space than actual code. Split out the timer function call into its own function. Signed-off-by: Thomas Gleixner --- kernel/timer.c | 72 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index f82f4bfe2d88..45229694dc6a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -953,6 +953,41 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) return index; } +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), + unsigned long data) +{ + int preempt_count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = timer->lockdep_map; +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map around the fn() + * call here and in del_timer_sync(). 
+ */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer); + fn(data); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (preempt_count != preempt_count()) { + printk(KERN_ERR "timer: %pF preempt leak: %08x -> %08x\n", + fn, preempt_count, preempt_count()); + BUG(); + } +} + #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) /** @@ -996,42 +1031,7 @@ static inline void __run_timers(struct tvec_base *base) detach_timer(timer, 1); spin_unlock_irq(&base->lock); - { - int preempt_count = preempt_count(); - -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the timer from - * inside the function that is called from - * it, this we need to take into account for - * lockdep too. To avoid bogus "held lock - * freed" warnings as well as problems when - * looking into timer->lockdep_map, make a - * copy and use that here. - */ - struct lockdep_map lockdep_map = - timer->lockdep_map; -#endif - /* - * Couple the lock chain with the lock chain at - * del_timer_sync() by acquiring the lock_map - * around the fn() call here and in - * del_timer_sync(). - */ - lock_map_acquire(&lockdep_map); - - trace_timer_expire_entry(timer); - fn(data); - trace_timer_expire_exit(timer); - - lock_map_release(&lockdep_map); - - if (preempt_count != preempt_count()) { - printk(KERN_ERR "timer: %pF preempt leak: %08x -> %08x\n", - fn, preempt_count, preempt_count()); - BUG(); - } - } + call_timer_fn(timer, fn, data); spin_lock_irq(&base->lock); } } -- cgit v1.2.3-55-g7522 From 802702e0c2618465b813242d4dfee6a233ba0beb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Mar 2010 20:13:23 +0100 Subject: timer: Try to survive timer callback preempt_count leak If a timer callback leaks preempt_count, we currently assert a BUG(). That makes it unnecessarily hard to retrieve information about the problem, especially on laptops and headless stations. There is a decent chance to survive the preempt_count leak by restoring the preempt_count to the value before the callback. That allows us, in many cases, to get valuable information about the root cause of the problem. We carried that fixup in preempt-rt for years and were able to decode such wreckage quite a few times. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven --- kernel/timer.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 45229694dc6a..7e12e7bc7ce6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -982,9 +982,15 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), lock_map_release(&lockdep_map); if (preempt_count != preempt_count()) { - printk(KERN_ERR "timer: %pF preempt leak: %08x -> %08x\n", - fn, preempt_count, preempt_count()); - BUG(); + WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + fn, preempt_count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. + */ + preempt_count() = preempt_count; } } -- cgit v1.2.3-55-g7522 From 1d199b1ad606ae8b88acebd295b101c4e1cf2a57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Mar 2010 01:05:02 +0100 Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs perf_arch_fetch_caller_regs() is exported for the overridden x86 version, but not for the generic weak version. 
As a general rule, weak functions should not have their symbol exported in the same file in which they are defined. So let's export it in trace_event_perf.c, as it is used by trace events only. This fixes: ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined! ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined! -v2: And also only build it if trace events are enabled. -v3: Fix changelog mistake Reported-by: Stephen Rothwell Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Paul Mackerras LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 ++- kernel/perf_event.c | 2 ++ kernel/trace/trace_event_perf.c | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 978d297170a1..0d3466cf7f57 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1695,6 +1695,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } +#ifdef CONFIG_EVENT_TRACING void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { regs->ip = ip; @@ -1706,4 +1707,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); +#endif diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8bf61273c58b..455393e71cab 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2790,10 +2790,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } +#ifdef CONFIG_EVENT_TRACING __weak void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { } +#endif /* * Output diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0709e4f75114..7d79a10c3cde 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -12,6 +12,8 @@ DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); +EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); + static char *perf_trace_buf; static char *perf_trace_buf_nmi; -- cgit v1.2.3-55-g7522 From 6427462bfa50f50dc6c088c07037264fcc73eca1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 15 Mar 2010 11:21:48 +0300 Subject: sched: Remove some dead code This was left over from "7c9414385e sched: Remove USER_SCHED". Signed-off-by: Dan Carpenter Acked-by: Dhaval Giani Cc: Kay Sievers Cc: Greg Kroah-Hartman LKML-Reference: <20100315082148.GD18181@bicker> Signed-off-by: Ingo Molnar --- kernel/user.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 766467b3bcb7..ec3b2229893b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -178,8 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) return up; - put_user_ns(new->user_ns); - kmem_cache_free(uid_cachep, new); out_unlock: return NULL; } -- cgit v1.2.3-55-g7522 From 88393161210493e317ae391696ee8ef463cb3c23 Mon Sep 17 00:00:00 2001 From: Thomas Weber Date: Tue, 16 Mar 2010 11:47:56 +0100 Subject: Fix typos in comments [Ss]ytem => [Ss]ystem udpate => update paramters => parameters orginal => original Signed-off-by: Thomas Weber Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- Documentation/cgroups/cgroups.txt | 2 +- Documentation/kbuild/kconfig.txt | 2 +- 
Documentation/sysfs-rules.txt | 2 +- Documentation/trace/events.txt | 8 ++++---- drivers/acpi/osl.c | 4 ++-- drivers/ata/ata_piix.c | 2 +- drivers/firewire/ohci.c | 2 +- drivers/gpu/drm/drm_bufs.c | 2 +- drivers/infiniband/hw/ipath/ipath_iba6110.c | 2 +- drivers/infiniband/hw/ipath/ipath_iba6120.c | 4 ++-- drivers/infiniband/hw/ipath/ipath_iba7220.c | 2 +- drivers/isdn/hisax/hfc4s8s_l1.c | 2 +- drivers/macintosh/windfarm_pm81.c | 2 +- drivers/media/dvb/dvb-usb/friio-fe.c | 2 +- drivers/net/smsc911x.c | 4 ++-- drivers/pci/hotplug/cpqphp_core.c | 2 +- drivers/pci/pci.c | 2 +- drivers/ps3/ps3-sys-manager.c | 2 +- drivers/regulator/core.c | 2 +- drivers/s390/char/sclp_cpi_sys.c | 2 +- drivers/scsi/bfa/include/defs/bfa_defs_cee.h | 2 +- drivers/scsi/bfa/include/defs/bfa_defs_status.h | 4 ++-- drivers/spi/spi_mpc8xxx.c | 2 +- drivers/virtio/virtio_pci.c | 2 +- fs/jfs/jfs_dmap.c | 2 +- kernel/cgroup.c | 2 +- mm/page_alloc.c | 2 +- net/wimax/op-rfkill.c | 2 +- sound/pci/emu10k1/emu10k1_main.c | 2 +- 29 files changed, 36 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index fd588ff0e296..5eb279a48fa4 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -573,7 +573,7 @@ void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, Called when a task attach operation has failed after can_attach() has succeeded. A subsystem whose can_attach() has some side-effects should provide this -function, so that the subsytem can implement a rollback. If not, not necessary. +function, so that the subsystem can implement a rollback. If not, not necessary. This will be called only about subsystems whose can_attach() operation have succeeded. diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt index 49efae703979..b2cb16ebcb16 100644 --- a/Documentation/kbuild/kconfig.txt +++ b/Documentation/kbuild/kconfig.txt @@ -96,7 +96,7 @@ Environment variables for 'silentoldconfig' KCONFIG_NOSILENTUPDATE -------------------------------------------------- If this variable has a non-blank value, it prevents silent kernel -config udpates (requires explicit updates). +config updates (requires explicit updates). KCONFIG_AUTOCONFIG -------------------------------------------------- diff --git a/Documentation/sysfs-rules.txt b/Documentation/sysfs-rules.txt index 5d8bc2cd250c..c1a1fd636bf9 100644 --- a/Documentation/sysfs-rules.txt +++ b/Documentation/sysfs-rules.txt @@ -125,7 +125,7 @@ versions of the sysfs interface. - Block The converted block subsystem at /sys/class/block or /sys/subsystem/block will contain the links for disks and partitions - at the same level, never in a hierarchy. Assuming the block subsytem to + at the same level, never in a hierarchy. Assuming the block subsystem to contain only disks and not partition devices in the same flat list is a bug in the application. diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index 02ac6ed38b2d..b22000dbc57d 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -238,7 +238,7 @@ subsystem's filter file. For convenience, filters for every event in a subsystem can be set or cleared as a group by writing a filter expression into the filter file -at the root of the subsytem. Note however, that if a filter for any +at the root of the subsystem. 
Note however, that if a filter for any event within the subsystem lacks a field specified in the subsystem filter, or if the filter can't be applied for any other reason, the filter for that event will retain its previous setting. This can @@ -250,7 +250,7 @@ fields can be guaranteed to propagate successfully to all events. Here are a few subsystem filter examples that also illustrate the above points: -Clear the filters on all events in the sched subsytem: +Clear the filters on all events in the sched subsystem: # cd /sys/kernel/debug/tracing/events/sched # echo 0 > filter @@ -260,7 +260,7 @@ none none Set a filter using only common fields for all events in the sched -subsytem (all events end up with the same filter): +subsystem (all events end up with the same filter): # cd /sys/kernel/debug/tracing/events/sched # echo common_pid == 0 > filter @@ -270,7 +270,7 @@ common_pid == 0 common_pid == 0 Attempt to set a filter using a non-common field for all events in the -sched subsytem (all events but those that have a prev_pid field retain +sched subsystem (all events but those that have a prev_pid field retain their old filters): # cd /sys/kernel/debug/tracing/events/sched diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 8e6d8665f0ae..f92531fbd501 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1405,7 +1405,7 @@ acpi_os_invalidate_address( switch (space_id) { case ACPI_ADR_SPACE_SYSTEM_IO: case ACPI_ADR_SPACE_SYSTEM_MEMORY: - /* Only interference checks against SystemIO and SytemMemory + /* Only interference checks against SystemIO and SystemMemory are needed */ res.start = address; res.end = address + length - 1; @@ -1457,7 +1457,7 @@ acpi_os_validate_address ( switch (space_id) { case ACPI_ADR_SPACE_SYSTEM_IO: case ACPI_ADR_SPACE_SYSTEM_MEMORY: - /* Only interference checks against SystemIO and SytemMemory + /* Only interference checks against SystemIO and SystemMemory are needed */ res = kzalloc(sizeof(struct acpi_res_list), GFP_KERNEL); if (!res) diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index c33806654e46..b1cb8af6af1c 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -43,7 +43,7 @@ * driver the list of errata that are relevant is below, going back to * PIIX4. Older device documentation is now a bit tricky to find. * - * The chipsets all follow very much the same design. The orginal Triton + * The chipsets all follow very much the same design. The original Triton * series chipsets do _not_ support independant device timings, but this * is fixed in Triton II. With the odd mobile exception the chips then * change little except in gaining more modes until SATA arrives. This diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 75dc6988cffd..8e180a27be7c 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -1342,7 +1342,7 @@ static void bus_reset_tasklet(unsigned long data) * was set up before this reset, the old one is now no longer * in use and we can free it. Update the config rom pointers * to point to the current config rom and clear the - * next_config_rom pointer so a new udpate can take place. + * next_config_rom pointer so a new update can take place. 
*/ if (ohci->next_config_rom != NULL) { diff --git a/drivers/gpu/drm/drm_bufs.c b/drivers/gpu/drm/drm_bufs.c index 8417cc4c43f1..8475d58f2a33 100644 --- a/drivers/gpu/drm/drm_bufs.c +++ b/drivers/gpu/drm/drm_bufs.c @@ -960,7 +960,7 @@ int drm_addbufs_pci(struct drm_device * dev, struct drm_buf_desc * request) dma->buflist[i + dma->buf_count] = &entry->buflist[i]; } - /* No allocations failed, so now we can replace the orginal pagelist + /* No allocations failed, so now we can replace the original pagelist * with the new one. */ if (dma->page_count) { diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index 37d12e5efa49..1d7aea132a09 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c @@ -1474,7 +1474,7 @@ static void ipath_ht_quiet_serdes(struct ipath_devdata *dd) /** * ipath_pe_put_tid - write a TID in chip * @dd: the infinipath device - * @tidptr: pointer to the expected TID (in chip) to udpate + * @tidptr: pointer to the expected TID (in chip) to update * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing * diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index fbf8c5379ea8..4b4a30b0dabd 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -1328,7 +1328,7 @@ bail: /** * ipath_pe_put_tid - write a TID in chip * @dd: the infinipath device - * @tidptr: pointer to the expected TID (in chip) to udpate + * @tidptr: pointer to the expected TID (in chip) to update * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing * @@ -1394,7 +1394,7 @@ static void ipath_pe_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr, /** * ipath_pe_put_tid_2 - write a TID in chip, Revision 2 or higher * @dd: the infinipath device - * @tidptr: pointer to the expected TID (in chip) to udpate + * @tidptr: pointer to the expected TID (in chip) to update * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing * diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index a805402dd4ae..34b778ed97fc 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -1738,7 +1738,7 @@ bail: /** * ipath_7220_put_tid - write a TID to the chip * @dd: the infinipath device - * @tidptr: pointer to the expected TID (in chip) to udpate + * @tidptr: pointer to the expected TID (in chip) to update * @tidtype: 0 for eager, 1 for expected * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing * diff --git a/drivers/isdn/hisax/hfc4s8s_l1.c b/drivers/isdn/hisax/hfc4s8s_l1.c index ab98e135bcbb..7ea0d07836d6 100644 --- a/drivers/isdn/hisax/hfc4s8s_l1.c +++ b/drivers/isdn/hisax/hfc4s8s_l1.c @@ -309,7 +309,7 @@ wait_busy(hfc4s8s_hw * a) /******************************************************/ /* function to read critical counter registers that */ -/* may be udpated by the chip during read */ +/* may be updated by the chip during read */ /******************************************************/ static u_char Read_hfc8_stable(hfc4s8s_hw * hw, int reg) diff --git a/drivers/macintosh/windfarm_pm81.c 
b/drivers/macintosh/windfarm_pm81.c index 565d5b2adc95..129cda737880 100644 --- a/drivers/macintosh/windfarm_pm81.c +++ b/drivers/macintosh/windfarm_pm81.c @@ -188,7 +188,7 @@ struct wf_smu_sys_fans_state { }; /* - * Configs for SMU Sytem Fan control loop + * Configs for SMU System Fan control loop */ static struct wf_smu_sys_fans_param wf_smu_sys_all_params[] = { /* Model ID 2 */ diff --git a/drivers/media/dvb/dvb-usb/friio-fe.c b/drivers/media/dvb/dvb-usb/friio-fe.c index d14bd227b502..93c21ddd0b77 100644 --- a/drivers/media/dvb/dvb-usb/friio-fe.c +++ b/drivers/media/dvb/dvb-usb/friio-fe.c @@ -300,7 +300,7 @@ static int jdvbt90502_set_frontend(struct dvb_frontend *fe, struct dvb_frontend_parameters *p) { /** - * NOTE: ignore all the paramters except frequency. + * NOTE: ignore all the parameters except frequency. * others should be fixed to the proper value for ISDB-T, * but don't check here. */ diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c index 4fd1d8b38788..eb175980a8e0 100644 --- a/drivers/net/smsc911x.c +++ b/drivers/net/smsc911x.c @@ -737,7 +737,7 @@ static void smsc911x_phy_adjust_link(struct net_device *dev) SMSC_TRACE(HW, "configuring for carrier OK"); if ((pdata->gpio_orig_setting & GPIO_CFG_LED1_EN_) && (!pdata->using_extphy)) { - /* Restore orginal GPIO configuration */ + /* Restore original GPIO configuration */ pdata->gpio_setting = pdata->gpio_orig_setting; smsc911x_reg_write(pdata, GPIO_CFG, pdata->gpio_setting); @@ -751,7 +751,7 @@ static void smsc911x_phy_adjust_link(struct net_device *dev) if ((pdata->gpio_setting & GPIO_CFG_LED1_EN_) && (!pdata->using_extphy)) { /* Force 10/100 LED off, after saving - * orginal GPIO configuration */ + * original GPIO configuration */ pdata->gpio_orig_setting = pdata->gpio_setting; pdata->gpio_setting &= ~GPIO_CFG_LED1_EN_; diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c index f184d1d2ecbe..6644337d63d6 100644 --- a/drivers/pci/hotplug/cpqphp_core.c +++ b/drivers/pci/hotplug/cpqphp_core.c @@ -848,7 +848,7 @@ static int cpqhpc_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_disable_device; } - /* Check for the proper subsytem ID's + /* Check for the proper subsystem ID's * Intel uses a different SSID programming model than Compaq. * For Intel, each SSID bit identifies a PHP capability. * Also Intel HPC's may have RID=0. diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index cb1dd5f4988c..ddd55dc927f7 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1506,7 +1506,7 @@ int pci_prepare_to_sleep(struct pci_dev *dev) * pci_back_from_sleep - turn PCI device on during system-wide transition into working state * @dev: Device to handle. * - * Disable device's sytem wake-up capability and put it into D0. + * Disable device's system wake-up capability and put it into D0. */ int pci_back_from_sleep(struct pci_dev *dev) { diff --git a/drivers/ps3/ps3-sys-manager.c b/drivers/ps3/ps3-sys-manager.c index 3cbaf1811bd0..d37c445f0eda 100644 --- a/drivers/ps3/ps3-sys-manager.c +++ b/drivers/ps3/ps3-sys-manager.c @@ -119,7 +119,7 @@ enum ps3_sys_manager_service_id { * enum ps3_sys_manager_attr - Notification attribute (bit position mask). * @PS3_SM_ATTR_POWER: Power button. * @PS3_SM_ATTR_RESET: Reset button, not available on retail console. - * @PS3_SM_ATTR_THERMAL: Sytem thermal alert. + * @PS3_SM_ATTR_THERMAL: System thermal alert. * @PS3_SM_ATTR_CONTROLLER: Remote controller event. * @PS3_SM_ATTR_ALL: Logical OR of all. 
* diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index c7bbe30010f7..7461f5bb2bd8 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1538,7 +1538,7 @@ EXPORT_SYMBOL_GPL(regulator_count_voltages); * Context: can sleep * * Returns a voltage that can be passed to @regulator_set_voltage(), - * zero if this selector code can't be used on this sytem, or a + * zero if this selector code can't be used on this system, or a * negative errno. */ int regulator_list_voltage(struct regulator *regulator, unsigned selector) diff --git a/drivers/s390/char/sclp_cpi_sys.c b/drivers/s390/char/sclp_cpi_sys.c index 62c2647f37f4..4a51e3f09689 100644 --- a/drivers/s390/char/sclp_cpi_sys.c +++ b/drivers/s390/char/sclp_cpi_sys.c @@ -102,7 +102,7 @@ static struct sclp_req *cpi_prepare_req(void) /* set system name */ set_data(evb->system_name, system_name); - /* set sytem level */ + /* set system level */ evb->system_level = system_level; /* set sysplex name */ diff --git a/drivers/scsi/bfa/include/defs/bfa_defs_cee.h b/drivers/scsi/bfa/include/defs/bfa_defs_cee.h index 520a22f52dd1..6217eec8c604 100644 --- a/drivers/scsi/bfa/include/defs/bfa_defs_cee.h +++ b/drivers/scsi/bfa/include/defs/bfa_defs_cee.h @@ -54,7 +54,7 @@ struct bfa_cee_lldp_str_s { }; -/* LLDP paramters */ +/* LLDP parameters */ struct bfa_cee_lldp_cfg_s { struct bfa_cee_lldp_str_s chassis_id; struct bfa_cee_lldp_str_s port_id; diff --git a/drivers/scsi/bfa/include/defs/bfa_defs_status.h b/drivers/scsi/bfa/include/defs/bfa_defs_status.h index cdceaeb9f4b8..03ce0331eb48 100644 --- a/drivers/scsi/bfa/include/defs/bfa_defs_status.h +++ b/drivers/scsi/bfa/include/defs/bfa_defs_status.h @@ -223,9 +223,9 @@ enum bfa_status { BFA_STATUS_IM_PVID_NON_ZERO = 140, /* Port VLAN ID (PVID) is Set to * Non-Zero Value */ BFA_STATUS_IM_INETCFG_LOCK_FAILED = 141, /* Acquiring Network - * Subsytem Lock Failed.Please + * Subsystem Lock Failed.Please * try after some time */ - BFA_STATUS_IM_GET_INETCFG_FAILED = 142, /* Acquiring Network Subsytem + BFA_STATUS_IM_GET_INETCFG_FAILED = 142, /* Acquiring Network Subsystem * handle Failed. Please try * after some time */ BFA_STATUS_IM_NOT_BOUND = 143, /* Brocade 10G Ethernet Service is not diff --git a/drivers/spi/spi_mpc8xxx.c b/drivers/spi/spi_mpc8xxx.c index 4f0cc9d457e0..56efdfe1428d 100644 --- a/drivers/spi/spi_mpc8xxx.c +++ b/drivers/spi/spi_mpc8xxx.c @@ -639,7 +639,7 @@ static int mpc8xxx_spi_setup(struct spi_device *spi) } mpc8xxx_spi = spi_master_get_devdata(spi->master); - hw_mode = cs->hw_mode; /* Save orginal settings */ + hw_mode = cs->hw_mode; /* Save original settings */ cs->hw_mode = mpc8xxx_spi_read_reg(&mpc8xxx_spi->base->mode); /* mask out bits we are going to set */ cs->hw_mode &= ~(SPMODE_CP_BEGIN_EDGECLK | SPMODE_CI_INACTIVEHIGH diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 625447f645d9..7210c0d72e43 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -654,7 +654,7 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev, /* we use the subsystem vendor/device id as the virtio vendor/device * id. 
this allows us to use the same PCI vendor/device id for all * virtio devices and to identify the particular virtio driver by - * the subsytem ids */ + * the subsystem ids */ vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; vp_dev->vdev.id.device = pci_dev->subsystem_device; diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index d9b031cf69f5..d13b93043b04 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -2437,7 +2437,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) /* check if this is a control page update for an allocation. * if so, update the leaf to reflect the new leaf value using - * dbSplit(); otherwise (deallocation), use dbJoin() to udpate + * dbSplit(); otherwise (deallocation), use dbJoin() to update * the leaf with the new value. in addition to updating the * leaf, dbSplit() will also split the binary buddy system of * the leaves, if required, and bubble new values within the diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef909a329750..a3b0f24bddbb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3610,7 +3610,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * @ss: the subsystem to load * * This function should be called in a modular subsystem's initcall. If the - * subsytem is built as a module, it will be assigned a new subsys_id and set + * subsystem is built as a module, it will be assigned a new subsys_id and set * up for use. If the subsystem is built-in anyway, work is delegated to the * simpler cgroup_init_subsys. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d03c946d5566..a6326c71b663 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2579,7 +2579,7 @@ static int default_zonelist_order(void) struct zone *z; int average_size; /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. + * ZONE_DMA and ZONE_DMA32 can be very small area in the system. * If they are really small and used heavily, the system can fall * into OOM very easily. * This function detect ZONE_DMA/DMA32 size and confgigures zone order. diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c index e978c7136c97..2609e445fe7d 100644 --- a/net/wimax/op-rfkill.c +++ b/net/wimax/op-rfkill.c @@ -43,7 +43,7 @@ * wimax_rfkill() Kernel calling wimax_rfkill() * __wimax_rf_toggle_radio() * - * wimax_rfkill_set_radio_block() RF-Kill subsytem calling + * wimax_rfkill_set_radio_block() RF-Kill subsystem calling * __wimax_rf_toggle_radio() * * __wimax_rf_toggle_radio() diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c index f18bd6207c50..66c7fb3ced3e 100644 --- a/sound/pci/emu10k1/emu10k1_main.c +++ b/sound/pci/emu10k1/emu10k1_main.c @@ -1787,7 +1787,7 @@ int __devinit snd_emu10k1_create(struct snd_card *card, else if (subsystem) snd_printdd("Sound card name = %s, " "vendor = 0x%x, device = 0x%x, subsystem = 0x%x. " - "Forced to subsytem = 0x%x\n", c->name, + "Forced to subsystem = 0x%x\n", c->name, pci->vendor, pci->device, emu->serial, c->subsystem); else snd_printdd("Sound card name = %s, " -- cgit v1.2.3-55-g7522 From e1292ba164742e3a236e407148e00300b7196906 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 18 Mar 2010 20:19:27 -0700 Subject: ntp: Make time_adjust static Now that no arches are accessing time_adjust directly, make it static. 
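A minimal user-space sketch (not part of the patch; the half-second delta is hypothetical and the call needs CAP_SYS_TIME): time_adjust holds the remaining adjtime() slew, and with the symbol now private the only way to set or inspect it is through the adjtime()/adjtimex() interfaces.

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
	/* Hypothetical slew: smear in +0.5s instead of stepping the clock. */
	struct timeval delta = { .tv_sec = 0, .tv_usec = 500000 };
	struct timeval olddelta;

	if (adjtime(&delta, &olddelta) != 0) {
		perror("adjtime");
		return 1;
	}
	/* olddelta reports the slew still pending inside the kernel. */
	printf("previously pending: %ld.%06ld s\n",
	       (long)olddelta.tv_sec, (long)olddelta.tv_usec);
	return 0;
}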
Signed-off-by: John Stultz LKML-Reference: <1268968769-19209-1-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 1 - kernel/time/ntp.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index 7a082b32d8e1..e2de51eedf05 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -238,7 +238,6 @@ extern int tickadj; /* amount of adjustment per tick */ * phase-lock loop variables */ extern int time_status; /* clock synchronization status bits */ -extern long time_adjust; /* The amount of adjtime left */ extern void ntp_init(void); extern void ntp_clear(void); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7c0f180d6e9d..c63116863a80 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -69,7 +69,7 @@ static s64 time_freq; /* time at last adjustment (secs): */ static long time_reftime; -long time_adjust; +static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; -- cgit v1.2.3-55-g7522 From 92d6b71ab906be706f3679353b30a8d2c3831144 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Thu, 11 Mar 2010 14:08:56 -0800 Subject: genirq: Expose irq_desc->node in proc/irq Expose irq_desc->node as /proc/irq/*/node. This file provides device hardware locality information for apps desiring to include hardware locality in irq mapping decisions. Signed-off-by: Dimitri Sivanich Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- Documentation/filesystems/proc.txt | 4 ++++ kernel/irq/proc.c | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a4f30faa4f1f..6507d2ae5236 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -566,6 +566,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the IRQs which have not yet been allocated/activated, and hence which lack a /proc/irq/[0-9]* directory. +The node file on an SMP system shows the node to which the device using the IRQ +reports itself as being attached. This hardware locality information does not +include information about any possible driver locality preference. + prof_cpu_mask specifies which CPUs are to be profiled by the system wide profiler. Default value is ffffffff (all cpus). 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6f50eccc79c0..e346e08f5c34 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -146,6 +146,26 @@ static const struct file_operations default_affinity_proc_fops = { .release = single_release, .write = default_affinity_write, }; + +static int irq_node_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "%d\n", desc->node); + return 0; +} + +static int irq_node_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_node_proc_show, PDE(inode)->data); +} + +static const struct file_operations irq_node_proc_fops = { + .open = irq_node_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) @@ -230,6 +250,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create /proc/irq//smp_affinity */ proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); + + proc_create_data("node", 0444, desc->dir, + &irq_node_proc_fops, (void *)(long)irq); #endif proc_create_data("spurious", 0444, desc->dir, -- cgit v1.2.3-55-g7522 From faa4602e47690fb11221e00f9b9697c8dc0d4b19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2010 14:51:50 +0100 Subject: x86, perf, bts, mm: Delete the never used BTS-ptrace code Support for the PMU's BTS features has been upstreamed in v2.6.32, but we still have the old and disabled ptrace-BTS, as Linus noticed it not so long ago. It's buggy: TIF_DEBUGCTLMSR is trampling all over that MSR without regard for other uses (perf) and doesn't provide the flexibility needed for perf either. Its users are ptrace-block-step and ptrace-bts, since ptrace-bts was never used and ptrace-block-step can be implemented using a much simpler approach. So axe all 3000 lines of it. That includes the *locked_memory*() APIs in mm/mlock.c as well. 
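For reference, a user-space sketch of the surviving, simpler block-step interface (not part of the patch; error handling is trimmed, and the request constant is defined locally for toolchains whose headers lack it — 33 is the x86 value from asm/ptrace-abi.h):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

#ifndef PTRACE_SINGLEBLOCK
#define PTRACE_SINGLEBLOCK 33	/* x86: resume until the next branch */
#endif

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		execl("/bin/true", "true", (char *)NULL);
		_exit(1);
	}
	waitpid(child, NULL, 0);	/* tracee stops at exec */
	/* Stop again at the next taken branch, not the next instruction. */
	if (ptrace(PTRACE_SINGLEBLOCK, child, NULL, NULL) != 0)
		perror("PTRACE_SINGLEBLOCK");
	waitpid(child, NULL, 0);
	kill(child, SIGKILL);
	return 0;
}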
Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra Cc: Roland McGrath Cc: Oleg Nesterov Cc: Markus Metzger Cc: Steven Rostedt Cc: Andrew Morton LKML-Reference: <20100325135413.938004390@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 20 - arch/x86/Kconfig.debug | 9 - arch/x86/include/asm/ds.h | 302 -------- arch/x86/include/asm/processor.h | 33 +- arch/x86/include/asm/ptrace-abi.h | 57 +- arch/x86/include/asm/ptrace.h | 6 - arch/x86/include/asm/thread_info.h | 6 +- arch/x86/kernel/Makefile | 2 - arch/x86/kernel/cpu/intel.c | 2 - arch/x86/kernel/ds.c | 1437 ------------------------------------ arch/x86/kernel/ds_selftest.c | 408 ---------- arch/x86/kernel/ds_selftest.h | 15 - arch/x86/kernel/dumpstack.c | 5 - arch/x86/kernel/kprobes.c | 6 +- arch/x86/kernel/process.c | 9 - arch/x86/kernel/process_32.c | 8 - arch/x86/kernel/process_64.c | 8 - arch/x86/kernel/ptrace.c | 382 ---------- arch/x86/kernel/step.c | 36 +- arch/x86/kernel/traps.c | 5 - include/linux/ftrace.h | 12 - include/linux/mm.h | 4 - include/linux/ptrace.h | 12 - include/linux/sched.h | 9 - kernel/fork.c | 3 - kernel/ptrace.c | 1 - kernel/sched.c | 43 -- kernel/trace/Kconfig | 11 - kernel/trace/Makefile | 1 - kernel/trace/trace.h | 4 - kernel/trace/trace_entries.h | 12 - kernel/trace/trace_hw_branches.c | 312 -------- kernel/trace/trace_selftest.c | 57 -- mm/mlock.c | 41 - 34 files changed, 9 insertions(+), 3269 deletions(-) delete mode 100644 arch/x86/include/asm/ds.h delete mode 100644 arch/x86/kernel/ds.c delete mode 100644 arch/x86/kernel/ds_selftest.c delete mode 100644 arch/x86/kernel/ds_selftest.h delete mode 100644 kernel/trace/trace_hw_branches.c (limited to 'kernel') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index a19829374e6a..918fbb1855cc 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -502,23 +502,3 @@ config CPU_SUP_UMC_32 CPU might render the kernel unbootable. If unsure, say N. - -config X86_DS - def_bool X86_PTRACE_BTS - depends on X86_DEBUGCTLMSR - select HAVE_HW_BRANCH_TRACER - -config X86_PTRACE_BTS - bool "Branch Trace Store" - default y - depends on X86_DEBUGCTLMSR - depends on BROKEN - ---help--- - This adds a ptrace interface to the hardware's branch trace store. - - Debuggers may use it to collect an execution trace of the debugged - application in order to answer the question 'how did I get here?'. - Debuggers may trace user mode as well as kernel mode. - - Say Y unless there is no application development on this machine - and you want to save a small amount of code size. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bc01e3ebfeb2..bd58c8abbfbd 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -174,15 +174,6 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config X86_DS_SELFTEST - bool "DS selftest" - default y - depends on DEBUG_KERNEL - depends on X86_DS - ---help--- - Perform Debug Store selftests at boot time. - If in doubt, say "N". - config HAVE_MMIOTRACE_SUPPORT def_bool y diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h deleted file mode 100644 index 70dac199b093..000000000000 --- a/arch/x86/include/asm/ds.h +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Debug Store (DS) support - * - * This provides a low-level interface to the hardware's Debug Store - * feature that is used for branch trace store (BTS) and - * precise-event based sampling (PEBS). 
- * - * It manages: - * - DS and BTS hardware configuration - * - buffer overflow handling (to be done) - * - buffer access - * - * It does not do: - * - security checking (is the caller allowed to trace the task) - * - buffer allocation (memory accounting) - * - * - * Copyright (C) 2007-2009 Intel Corporation. - * Markus Metzger , 2007-2009 - */ - -#ifndef _ASM_X86_DS_H -#define _ASM_X86_DS_H - - -#include -#include -#include - - -#ifdef CONFIG_X86_DS - -struct task_struct; -struct ds_context; -struct ds_tracer; -struct bts_tracer; -struct pebs_tracer; - -typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); -typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); - - -/* - * A list of features plus corresponding macros to talk about them in - * the ds_request function's flags parameter. - * - * We use the enum to index an array of corresponding control bits; - * we use the macro to index a flags bit-vector. - */ -enum ds_feature { - dsf_bts = 0, - dsf_bts_kernel, -#define BTS_KERNEL (1 << dsf_bts_kernel) - /* trace kernel-mode branches */ - - dsf_bts_user, -#define BTS_USER (1 << dsf_bts_user) - /* trace user-mode branches */ - - dsf_bts_overflow, - dsf_bts_max, - dsf_pebs = dsf_bts_max, - - dsf_pebs_max, - dsf_ctl_max = dsf_pebs_max, - dsf_bts_timestamps = dsf_ctl_max, -#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps) - /* add timestamps into BTS trace */ - -#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS) -}; - - -/* - * Request BTS or PEBS - * - * Due to alignement constraints, the actual buffer may be slightly - * smaller than the requested or provided buffer. - * - * Returns a pointer to a tracer structure on success, or - * ERR_PTR(errcode) on failure. - * - * The interrupt threshold is independent from the overflow callback - * to allow users to use their own overflow interrupt handling mechanism. - * - * The function might sleep. - * - * task: the task to request recording for - * cpu: the cpu to request recording for - * base: the base pointer for the (non-pageable) buffer; - * size: the size of the provided buffer in bytes - * ovfl: pointer to a function to be called on buffer overflow; - * NULL if cyclic buffer requested - * th: the interrupt threshold in records from the end of the buffer; - * -1 if no interrupt threshold is requested. - * flags: a bit-mask of the above flags - */ -extern struct bts_tracer *ds_request_bts_task(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct pebs_tracer *ds_request_pebs_cpu(int cpu, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags); - -/* - * Release BTS or PEBS resources - * Suspend and resume BTS or PEBS tracing - * - * Must be called with irq's enabled. 
- * - * tracer: the tracer handle returned from ds_request_~() - */ -extern void ds_release_bts(struct bts_tracer *tracer); -extern void ds_suspend_bts(struct bts_tracer *tracer); -extern void ds_resume_bts(struct bts_tracer *tracer); -extern void ds_release_pebs(struct pebs_tracer *tracer); -extern void ds_suspend_pebs(struct pebs_tracer *tracer); -extern void ds_resume_pebs(struct pebs_tracer *tracer); - -/* - * Release BTS or PEBS resources - * Suspend and resume BTS or PEBS tracing - * - * Cpu tracers must call this on the traced cpu. - * Task tracers must call ds_release_~_noirq() for themselves. - * - * May be called with irq's disabled. - * - * Returns 0 if successful; - * -EPERM if the cpu tracer does not trace the current cpu. - * -EPERM if the task tracer does not trace itself. - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern int ds_release_bts_noirq(struct bts_tracer *tracer); -extern int ds_suspend_bts_noirq(struct bts_tracer *tracer); -extern int ds_resume_bts_noirq(struct bts_tracer *tracer); -extern int ds_release_pebs_noirq(struct pebs_tracer *tracer); -extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer); -extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer); - - -/* - * The raw DS buffer state as it is used for BTS and PEBS recording. - * - * This is the low-level, arch-dependent interface for working - * directly on the raw trace data. - */ -struct ds_trace { - /* the number of bts/pebs records */ - size_t n; - /* the size of a bts/pebs record in bytes */ - size_t size; - /* pointers into the raw buffer: - - to the first entry */ - void *begin; - /* - one beyond the last entry */ - void *end; - /* - one beyond the newest entry */ - void *top; - /* - the interrupt threshold */ - void *ith; - /* flags given on ds_request() */ - unsigned int flags; -}; - -/* - * An arch-independent view on branch trace data. - */ -enum bts_qualifier { - bts_invalid, -#define BTS_INVALID bts_invalid - - bts_branch, -#define BTS_BRANCH bts_branch - - bts_task_arrives, -#define BTS_TASK_ARRIVES bts_task_arrives - - bts_task_departs, -#define BTS_TASK_DEPARTS bts_task_departs - - bts_qual_bit_size = 4, - bts_qual_max = (1 << bts_qual_bit_size), -}; - -struct bts_struct { - __u64 qualifier; - union { - /* BTS_BRANCH */ - struct { - __u64 from; - __u64 to; - } lbr; - /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ - struct { - __u64 clock; - pid_t pid; - } event; - } variant; -}; - - -/* - * The BTS state. - * - * This gives access to the raw DS state and adds functions to provide - * an arch-independent view of the BTS data. - */ -struct bts_trace { - struct ds_trace ds; - - int (*read)(struct bts_tracer *tracer, const void *at, - struct bts_struct *out); - int (*write)(struct bts_tracer *tracer, const struct bts_struct *in); -}; - - -/* - * The PEBS state. - * - * This gives access to the raw DS state and the PEBS-specific counter - * reset value. - */ -struct pebs_trace { - struct ds_trace ds; - - /* the number of valid counters in the below array */ - unsigned int counters; - -#define MAX_PEBS_COUNTERS 4 - /* the counter reset value */ - unsigned long long counter_reset[MAX_PEBS_COUNTERS]; -}; - - -/* - * Read the BTS or PEBS trace. - * - * Returns a view on the trace collected for the parameter tracer. - * - * The view remains valid as long as the traced task is not running or - * the tracer is suspended. - * Writes into the trace buffer are not reflected. 
- * - * tracer: the tracer handle returned from ds_request_~() - */ -extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer); -extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer); - - -/* - * Reset the write pointer of the BTS/PEBS buffer. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern int ds_reset_bts(struct bts_tracer *tracer); -extern int ds_reset_pebs(struct pebs_tracer *tracer); - -/* - * Set the PEBS counter reset value. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_pebs() - * counter: the index of the counter - * value: the new counter reset value - */ -extern int ds_set_pebs_reset(struct pebs_tracer *tracer, - unsigned int counter, u64 value); - -/* - * Initialization - */ -struct cpuinfo_x86; -extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); - -/* - * Context switch work - */ -extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); - -#else /* CONFIG_X86_DS */ - -struct cpuinfo_x86; -static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} -static inline void ds_switch_to(struct task_struct *prev, - struct task_struct *next) {} - -#endif /* CONFIG_X86_DS */ -#endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b753ea59703a..5bec21a66dc5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -21,7 +21,6 @@ struct mm_struct; #include #include #include -#include #include #include @@ -29,6 +28,7 @@ struct mm_struct; #include #include #include +#include #define HBP_NUM 4 /* @@ -473,10 +473,6 @@ struct thread_struct { unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; -/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ - unsigned long debugctlmsr; - /* Debug Store context; see asm/ds.h */ - struct ds_context *ds_ctx; }; static inline unsigned long native_get_debugreg(int regno) @@ -814,21 +810,6 @@ static inline unsigned long get_debugctlmsr(void) return debugctlmsr; } -static inline unsigned long get_debugctlmsr_on_cpu(int cpu) -{ - u64 debugctlmsr = 0; - u32 val1, val2; - -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return 0; -#endif - rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); - debugctlmsr = val1 | ((u64)val2 << 32); - - return debugctlmsr; -} - static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR @@ -838,18 +819,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } -static inline void update_debugctlmsr_on_cpu(int cpu, - unsigned long debugctlmsr) -{ -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return; -#endif - wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, - (u32)((u64)debugctlmsr), - (u32)((u64)debugctlmsr >> 32)); -} - /* * from system description table in BIOS. Mostly for MCA use, but * others may find it useful: diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h index 86723035a515..52b098a6eebb 100644 --- a/arch/x86/include/asm/ptrace-abi.h +++ b/arch/x86/include/asm/ptrace-abi.h @@ -82,61 +82,6 @@ #ifndef __ASSEMBLY__ #include - -/* configuration/status structure used in PTRACE_BTS_CONFIG and - PTRACE_BTS_STATUS commands. 
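To make the trace-access conventions concrete, here is a minimal consumer sketch (bts_dump is a hypothetical helper; it assumes a BTS tracer obtained as above). It walks the window from begin up to top, which points one beyond the newest record, and finally resets the write pointer:

static void bts_dump(struct bts_tracer *tracer)
{
	const struct bts_trace *trace = ds_read_bts(tracer);
	const unsigned char *at;
	struct bts_struct bts;

	if (!trace || !trace->read)
		return;

	/* top points one beyond the newest record. */
	for (at = trace->ds.begin; (void *)at < trace->ds.top;
	     at += trace->ds.size) {
		if (trace->read(tracer, at, &bts) < 0)
			break;
		if (bts.qualifier == BTS_BRANCH)
			printk(KERN_DEBUG "branch %llx -> %llx\n",
			       bts.variant.lbr.from, bts.variant.lbr.to);
	}

	ds_reset_bts(tracer);	/* discard what we just consumed */
}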
-*/ -struct ptrace_bts_config { - /* requested or actual size of BTS buffer in bytes */ - __u32 size; - /* bitmask of below flags */ - __u32 flags; - /* buffer overflow signal */ - __u32 signal; - /* actual size of bts_struct in bytes */ - __u32 bts_size; -}; -#endif /* __ASSEMBLY__ */ - -#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */ -#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */ -#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG on buffer overflow - instead of wrapping around */ -#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */ - -#define PTRACE_BTS_CONFIG 40 -/* Configure branch trace recording. - ADDR points to a struct ptrace_bts_config. - DATA gives the size of that buffer. - A new buffer is allocated, if requested in the flags. - An overflow signal may only be requested for new buffers. - Returns the number of bytes read. -*/ -#define PTRACE_BTS_STATUS 41 -/* Return the current configuration in a struct ptrace_bts_config - pointed to by ADDR; DATA gives the size of that buffer. - Returns the number of bytes written. -*/ -#define PTRACE_BTS_SIZE 42 -/* Return the number of available BTS records for draining. - DATA and ADDR are ignored. -*/ -#define PTRACE_BTS_GET 43 -/* Get a single BTS record. - DATA defines the index into the BTS array, where 0 is the newest - entry, and higher indices refer to older entries. - ADDR is pointing to struct bts_struct (see asm/ds.h). -*/ -#define PTRACE_BTS_CLEAR 44 -/* Clear the BTS buffer. - DATA and ADDR are ignored. -*/ -#define PTRACE_BTS_DRAIN 45 -/* Read all available BTS records and clear the buffer. - ADDR points to an array of struct bts_struct. - DATA gives the size of that buffer. - BTS records are read from oldest to newest. - Returns number of BTS records drained. -*/ +#endif #endif /* _ASM_X86_PTRACE_ABI_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 69a686a7dff0..78cd1ea94500 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -289,12 +289,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -#ifdef CONFIG_X86_PTRACE_BTS -extern void ptrace_bts_untrace(struct task_struct *tsk); - -#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk) -#endif /* CONFIG_X86_PTRACE_BTS */ - #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e0d28901e969..dc85e12d1405 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -92,8 +92,6 @@ struct thread_info { #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FREEZE 23 /* is freezing for suspend */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ -#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ -#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ @@ -115,8 +113,6 @@ struct thread_info { #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FREEZE (1 << TIF_FREEZE) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) -#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) @@ -147,7 +143,7 @@ struct thread_info { /* 
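For the user-space side, a hedged sketch of how a debugger might have driven the PTRACE_BTS_* requests above against a stopped tracee (pid, the buffer size, and the record count are made up; the casts reflect ptrace()'s void * parameters, and error checking is omitted):

	struct ptrace_bts_config cfg = {
		.size  = 4096,				/* buffer to allocate */
		.flags = PTRACE_BTS_O_ALLOC | PTRACE_BTS_O_TRACE,
	};
	struct bts_struct records[128];
	long drained;

	/* ADDR points to the config, DATA gives its size. */
	ptrace(PTRACE_BTS_CONFIG, pid, &cfg, (void *)sizeof(cfg));

	/* ... PTRACE_CONT the child, wait for it to stop again ... */

	/* Read all available records, oldest first, and clear the buffer. */
	drained = ptrace(PTRACE_BTS_DRAIN, pid, records,
			 (void *)sizeof(records));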
flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) + (_TIF_IO_BITMAP|_TIF_NOTSC) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4c58352209e0..e77b22083721 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o -obj-$(CONFIG_X86_DS) += ds.o -obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7e1cca13af35..d72377c41c76 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -367,7 +366,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) set_cpu_cap(c, X86_FEATURE_PEBS); - ds_init_intel(c); } if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c deleted file mode 100644 index 1c47390dd0e5..000000000000 --- a/arch/x86/kernel/ds.c +++ /dev/null @@ -1,1437 +0,0 @@ -/* - * Debug Store support - * - * This provides a low-level interface to the hardware's Debug Store - * feature that is used for branch trace store (BTS) and - * precise-event based sampling (PEBS). - * - * It manages: - * - DS and BTS hardware configuration - * - buffer overflow handling (to be done) - * - buffer access - * - * It does not do: - * - security checking (is the caller allowed to trace the task) - * - buffer allocation (memory accounting) - * - * - * Copyright (C) 2007-2009 Intel Corporation. - * Markus Metzger , 2007-2009 - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "ds_selftest.h" - -/* - * The configuration for a particular DS hardware implementation: - */ -struct ds_configuration { - /* The name of the configuration: */ - const char *name; - - /* The size of pointer-typed fields in DS, BTS, and PEBS: */ - unsigned char sizeof_ptr_field; - - /* The size of a BTS/PEBS record in bytes: */ - unsigned char sizeof_rec[2]; - - /* The number of pebs counter reset values in the DS structure. */ - unsigned char nr_counter_reset; - - /* Control bit-masks indexed by enum ds_feature: */ - unsigned long ctl[dsf_ctl_max]; -}; -static struct ds_configuration ds_cfg __read_mostly; - - -/* Maximal size of a DS configuration: */ -#define MAX_SIZEOF_DS 0x80 - -/* Maximal size of a BTS record: */ -#define MAX_SIZEOF_BTS (3 * 8) - -/* BTS and PEBS buffer alignment: */ -#define DS_ALIGNMENT (1 << 3) - -/* Number of buffer pointers in DS: */ -#define NUM_DS_PTR_FIELDS 8 - -/* Size of a pebs reset value in DS: */ -#define PEBS_RESET_FIELD_SIZE 8 - -/* Mask of control bits in the DS MSR register: */ -#define BTS_CONTROL \ - ( ds_cfg.ctl[dsf_bts] | \ - ds_cfg.ctl[dsf_bts_kernel] | \ - ds_cfg.ctl[dsf_bts_user] | \ - ds_cfg.ctl[dsf_bts_overflow] ) - -/* - * A BTS or PEBS tracer. - * - * This holds the configuration of the tracer and serves as a handle - * to identify tracers. - */ -struct ds_tracer { - /* The DS context (partially) owned by this tracer. */ - struct ds_context *context; - /* The buffer provided on ds_request() and its size in bytes. 
*/ - void *buffer; - size_t size; -}; - -struct bts_tracer { - /* The common DS part: */ - struct ds_tracer ds; - - /* The trace including the DS configuration: */ - struct bts_trace trace; - - /* Buffer overflow notification function: */ - bts_ovfl_callback_t ovfl; - - /* Active flags affecting trace collection. */ - unsigned int flags; -}; - -struct pebs_tracer { - /* The common DS part: */ - struct ds_tracer ds; - - /* The trace including the DS configuration: */ - struct pebs_trace trace; - - /* Buffer overflow notification function: */ - pebs_ovfl_callback_t ovfl; -}; - -/* - * Debug Store (DS) save area configuration (see Intel64 and IA32 - * Architectures Software Developer's Manual, section 18.5) - * - * The DS configuration consists of the following fields; different - * architectures vary in the size of those fields. - * - * - double-word aligned base linear address of the BTS buffer - * - write pointer into the BTS buffer - * - end linear address of the BTS buffer (one byte beyond the end of - * the buffer) - * - interrupt pointer into BTS buffer - * (interrupt occurs when write pointer passes interrupt pointer) - * - double-word aligned base linear address of the PEBS buffer - * - write pointer into the PEBS buffer - * - end linear address of the PEBS buffer (one byte beyond the end of - * the buffer) - * - interrupt pointer into PEBS buffer - * (interrupt occurs when write pointer passes interrupt pointer) - * - value to which counter is reset following counter overflow - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * - * We compute the base address for the first 8 fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - an offset giving the start of the respective region - * - * This offset is further used to index various arrays holding - * information for BTS and PEBS at the respective index. - * - * On later 32bit processors, we only access the lower 32bit of the - * 64bit pointer fields. The upper halves will be zeroed out. - */ - -enum ds_field { - ds_buffer_base = 0, - ds_index, - ds_absolute_maximum, - ds_interrupt_threshold, -}; - -enum ds_qualifier { - ds_bts = 0, - ds_pebs -}; - -static inline unsigned long -ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) -{ - base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); - return *(unsigned long *)base; -} - -static inline void -ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, - unsigned long value) -{ - base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); - (*(unsigned long *)base) = value; -} - - -/* - * Locking is done only for allocating BTS or PEBS resources. - */ -static DEFINE_SPINLOCK(ds_lock); - -/* - * We either support (system-wide) per-cpu or per-thread allocation. - * We distinguish the two based on the task_struct pointer, where a - * NULL pointer indicates per-cpu allocation for the current cpu. - * - * Allocations are use-counted. As soon as resources are allocated, - * further allocations must be of the same type (per-cpu or - * per-thread). We model this by counting allocations (i.e. the number - * of tracers of a certain type) for one type negatively: - * =0 no tracers - * >0 number of per-thread tracers - * <0 number of per-cpu tracers - * - * Tracers essentially give the number of ds contexts for a certain - * type of allocation.
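A worked instance of the ds_get()/ds_set() addressing above: with 64-bit pointer fields (ds_cfg.sizeof_ptr_field == 8), the four BTS fields land at byte offsets 0/8/16/24 of the DS area and the four PEBS fields at 32/40/48/56. Reading the PEBS write pointer therefore resolves to offset 8 * (ds_index + 4 * ds_pebs) = 8 * 5 = 40; context here stands for a struct ds_context, defined further down:

	/* Illustrative only: fetch the PEBS write pointer from the DS area. */
	unsigned long pebs_top = ds_get(context->ds, ds_pebs, ds_index);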
- */ -static atomic_t tracers = ATOMIC_INIT(0); - -static inline int get_tracer(struct task_struct *task) -{ - int error; - - spin_lock_irq(&ds_lock); - - if (task) { - error = -EPERM; - if (atomic_read(&tracers) < 0) - goto out; - atomic_inc(&tracers); - } else { - error = -EPERM; - if (atomic_read(&tracers) > 0) - goto out; - atomic_dec(&tracers); - } - - error = 0; -out: - spin_unlock_irq(&ds_lock); - return error; -} - -static inline void put_tracer(struct task_struct *task) -{ - if (task) - atomic_dec(&tracers); - else - atomic_inc(&tracers); -} - -/* - * The DS context is either attached to a thread or to a cpu: - * - in the former case, the thread_struct contains a pointer to the - * attached context. - * - in the latter case, we use a static array of per-cpu context - * pointers. - * - * Contexts are use-counted. They are allocated on first access and - * deallocated when the last user puts the context. - */ -struct ds_context { - /* The DS configuration; goes into MSR_IA32_DS_AREA: */ - unsigned char ds[MAX_SIZEOF_DS]; - - /* The owner of the BTS and PEBS configuration, respectively: */ - struct bts_tracer *bts_master; - struct pebs_tracer *pebs_master; - - /* Use count: */ - unsigned long count; - - /* Pointer to the context pointer field: */ - struct ds_context **this; - - /* The traced task; NULL for cpu tracing: */ - struct task_struct *task; - - /* The traced cpu; only valid if task is NULL: */ - int cpu; -}; - -static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context); - - -static struct ds_context *ds_get_context(struct task_struct *task, int cpu) -{ - struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu)); - struct ds_context *context = NULL; - struct ds_context *new_context = NULL; - - /* Chances are small that we already have a context. */ - new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); - if (!new_context) - return NULL; - - spin_lock_irq(&ds_lock); - - context = *p_context; - if (likely(!context)) { - context = new_context; - - context->this = p_context; - context->task = task; - context->cpu = cpu; - context->count = 0; - - *p_context = context; - } - - context->count++; - - spin_unlock_irq(&ds_lock); - - if (context != new_context) - kfree(new_context); - - return context; -} - -static void ds_put_context(struct ds_context *context) -{ - struct task_struct *task; - unsigned long irq; - - if (!context) - return; - - spin_lock_irqsave(&ds_lock, irq); - - if (--context->count) { - spin_unlock_irqrestore(&ds_lock, irq); - return; - } - - *(context->this) = NULL; - - task = context->task; - - if (task) - clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - - /* - * We leave the (now dangling) pointer to the DS configuration in - * the DS_AREA msr. This is as good or as bad as replacing it with - * NULL - the hardware would crash if we enabled tracing. - * - * This saves us some problems with having to write an msr on a - * different cpu while preventing others from doing the same for the - * next context for that same cpu. - */ - - spin_unlock_irqrestore(&ds_lock, irq); - - /* The context might still be in use for context switching. */ - if (task && (task != current)) - wait_task_context_switch(task); - - kfree(context); -} - -static void ds_install_ds_area(struct ds_context *context) -{ - unsigned long ds; - - ds = (unsigned long)context->ds; - - /* - * There is a race between the bts master and the pebs master. 
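The allocate-before-locking idiom in ds_get_context() above deserves a note: the allocation may sleep, so it cannot happen under the spinlock, and the loser of the race simply frees its copy. A distilled, self-contained sketch of the pattern (struct obj, obj_lock, and obj_get are hypothetical names, not part of this code):

struct obj {
	unsigned long count;
};
static DEFINE_SPINLOCK(obj_lock);

static struct obj *obj_get(struct obj **slot)
{
	/* Allocate optimistically; kzalloc may sleep, the lock may not. */
	struct obj *new = kzalloc(sizeof(*new), GFP_KERNEL);
	struct obj *obj;

	if (!new)
		return NULL;

	spin_lock_irq(&obj_lock);

	obj = *slot;
	if (likely(!obj))
		obj = *slot = new;	/* we won the race: install our copy */
	obj->count++;

	spin_unlock_irq(&obj_lock);

	if (obj != new)
		kfree(new);		/* we lost: discard our copy */

	return obj;
}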
- * - * The thread/cpu access is synchronized via get/put_cpu() for - * task tracing and via wrmsr_on_cpu for cpu tracing. - * - * If bts and pebs are collected for the same task or same cpu, - * the same configuration is written twice. - */ - if (context->task) { - get_cpu(); - if (context->task == current) - wrmsrl(MSR_IA32_DS_AREA, ds); - set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); - put_cpu(); - } else - wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, - (u32)((u64)ds), (u32)((u64)ds >> 32)); -} - -/* - * Call the tracer's callback on a buffer overflow. - * - * context: the ds context - * qual: the buffer type - */ -static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) -{ - switch (qual) { - case ds_bts: - if (context->bts_master && - context->bts_master->ovfl) - context->bts_master->ovfl(context->bts_master); - break; - case ds_pebs: - if (context->pebs_master && - context->pebs_master->ovfl) - context->pebs_master->ovfl(context->pebs_master); - break; - } -} - - -/* - * Write raw data into the BTS or PEBS buffer. - * - * The remainder of any partially written record is zeroed out. - * - * context: the DS context - * qual: the buffer type - * record: the data to write - * size: the size of the data - */ -static int ds_write(struct ds_context *context, enum ds_qualifier qual, - const void *record, size_t size) -{ - int bytes_written = 0; - - if (!record) - return -EINVAL; - - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; - - /* - * Write as much as possible without producing an - * overflow interrupt. - * - * Interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * Index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - - write_end = min(end, int_th); - - /* - * If we are already beyond the interrupt threshold, - * we fill the entire buffer. - */ - if (write_end <= index) - write_end = end; - - if (write_end <= index) - break; - - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); - - record = (const char *)record + write_size; - size -= write_size; - bytes_written += write_size; - - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; - - /* Zero out trailing bytes. */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; - - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); - - if (index >= int_th) - ds_overflow(context, qual); - } - - return bytes_written; -} - - -/* - * Branch Trace Store (BTS) uses the following format. Different - * architectures vary in the size of those fields. - * - source linear address - * - destination linear address - * - flags - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * We compute the base address for the fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - * In order to store additional information in the BTS buffer, we use - * a special source address to indicate that the record requires - * special interpretation.
- * - * Netburst indicated via a bit in the flags field whether the branch - * was predicted; this is ignored. - * - * We use two levels of abstraction: - * - the raw data level defined here - * - an arch-independent level defined in ds.h - */ - -enum bts_field { - bts_from, - bts_to, - bts_flags, - - bts_qual = bts_from, - bts_clock = bts_to, - bts_pid = bts_flags, - - bts_qual_mask = (bts_qual_max - 1), - bts_escape = ((unsigned long)-1 & ~bts_qual_mask) -}; - -static inline unsigned long bts_get(const char *base, unsigned long field) -{ - base += (ds_cfg.sizeof_ptr_field * field); - return *(unsigned long *)base; -} - -static inline void bts_set(char *base, unsigned long field, unsigned long val) -{ - base += (ds_cfg.sizeof_ptr_field * field); - (*(unsigned long *)base) = val; -} - - -/* - * The raw BTS data is architecture dependent. - * - * For higher-level users, we give an arch-independent view. - * - ds.h defines struct bts_struct - * - bts_read translates one raw bts record into a bts_struct - * - bts_write translates one bts_struct into the raw format and - * writes it into the top of the parameter tracer's buffer. - * - * return: bytes read/written on success; -Eerrno, otherwise - */ -static int -bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) -{ - if (!tracer) - return -EINVAL; - - if (at < tracer->trace.ds.begin) - return -EINVAL; - - if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) - return -EINVAL; - - memset(out, 0, sizeof(*out)); - if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { - out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); - out->variant.event.clock = bts_get(at, bts_clock); - out->variant.event.pid = bts_get(at, bts_pid); - } else { - out->qualifier = bts_branch; - out->variant.lbr.from = bts_get(at, bts_from); - out->variant.lbr.to = bts_get(at, bts_to); - - if (!out->variant.lbr.from && !out->variant.lbr.to) - out->qualifier = bts_invalid; - } - - return ds_cfg.sizeof_rec[ds_bts]; -} - -static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) -{ - unsigned char raw[MAX_SIZEOF_BTS]; - - if (!tracer) - return -EINVAL; - - if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) - return -EOVERFLOW; - - switch (in->qualifier) { - case bts_invalid: - bts_set(raw, bts_from, 0); - bts_set(raw, bts_to, 0); - bts_set(raw, bts_flags, 0); - break; - case bts_branch: - bts_set(raw, bts_from, in->variant.lbr.from); - bts_set(raw, bts_to, in->variant.lbr.to); - bts_set(raw, bts_flags, 0); - break; - case bts_task_arrives: - case bts_task_departs: - bts_set(raw, bts_qual, (bts_escape | in->qualifier)); - bts_set(raw, bts_clock, in->variant.event.clock); - bts_set(raw, bts_pid, in->variant.event.pid); - break; - default: - return -EINVAL; - } - - return ds_write(tracer->ds.context, ds_bts, raw, - ds_cfg.sizeof_rec[ds_bts]); -} - - -static void ds_write_config(struct ds_context *context, - struct ds_trace *cfg, enum ds_qualifier qual) -{ - unsigned char *ds = context->ds; - - ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); - ds_set(ds, qual, ds_index, (unsigned long)cfg->top); - ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); - ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); -} - -static void ds_read_config(struct ds_context *context, - struct ds_trace *cfg, enum ds_qualifier qual) -{ - unsigned char *ds = context->ds; - - cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); - cfg->top = (void *)ds_get(ds, qual, ds_index); - cfg->end = (void *)ds_get(ds, qual, 
ds_absolute_maximum); - cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); -} - -static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, - void *base, size_t size, size_t ith, - unsigned int flags) { - unsigned long buffer, adj; - - /* - * Adjust the buffer address and size to meet alignment - * constraints: - * - buffer is double-word aligned - * - size is multiple of record size - * - * We checked the size at the very beginning; we have enough - * space to do the adjustment. - */ - buffer = (unsigned long)base; - - adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; - buffer += adj; - size -= adj; - - trace->n = size / ds_cfg.sizeof_rec[qual]; - trace->size = ds_cfg.sizeof_rec[qual]; - - size = (trace->n * trace->size); - - trace->begin = (void *)buffer; - trace->top = trace->begin; - trace->end = (void *)(buffer + size); - /* - * The value for 'no threshold' is -1, which will set the - * threshold outside of the buffer, just like we want it. - */ - ith *= ds_cfg.sizeof_rec[qual]; - trace->ith = (void *)(buffer + size - ith); - - trace->flags = flags; -} - - -static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, - enum ds_qualifier qual, struct task_struct *task, - int cpu, void *base, size_t size, size_t th) -{ - struct ds_context *context; - int error; - size_t req_size; - - error = -EOPNOTSUPP; - if (!ds_cfg.sizeof_rec[qual]) - goto out; - - error = -EINVAL; - if (!base) - goto out; - - req_size = ds_cfg.sizeof_rec[qual]; - /* We might need space for alignment adjustments. */ - if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) - req_size += DS_ALIGNMENT; - - error = -EINVAL; - if (size < req_size) - goto out; - - if (th != (size_t)-1) { - th *= ds_cfg.sizeof_rec[qual]; - - error = -EINVAL; - if (size <= th) - goto out; - } - - tracer->buffer = base; - tracer->size = size; - - error = -ENOMEM; - context = ds_get_context(task, cpu); - if (!context) - goto out; - tracer->context = context; - - /* - * Defer any tracer-specific initialization work for the context until - * context ownership has been clarified. - */ - - error = 0; - out: - return error; -} - -static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, - void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th, - unsigned int flags) -{ - struct bts_tracer *tracer; - int error; - - /* Buffer overflow notification is not yet implemented. */ - error = -EOPNOTSUPP; - if (ovfl) - goto out; - - error = get_tracer(task); - if (error < 0) - goto out; - - error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); - if (!tracer) - goto out_put_tracer; - tracer->ovfl = ovfl; - - /* Do some more error checking and acquire a tracing context. */ - error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_bts, task, cpu, base, size, th); - if (error < 0) - goto out_tracer; - - /* Claim the bts part of the tracing context we acquired above. */ - spin_lock_irq(&ds_lock); - - error = -EPERM; - if (tracer->ds.context->bts_master) - goto out_unlock; - tracer->ds.context->bts_master = tracer; - - spin_unlock_irq(&ds_lock); - - /* - * Now that we own the bts part of the context, let's complete the - * initialization for that part. - */ - ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); - ds_install_ds_area(tracer->ds.context); - - tracer->trace.read = bts_read; - tracer->trace.write = bts_write; - - /* Start tracing. 
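Sizing follows from ds_request() and ds_init_ds_trace() above: a caller wanting room for n records must provide n * ds_cfg.sizeof_rec[qual] bytes, plus up to DS_ALIGNMENT bytes of slack when the base is not already double-word aligned; whatever remains after adjustment is trimmed to a whole number of records. A hedged sketch (illustrative only, since ds_cfg is static to this file and not visible to real callers):

	/* Illustrative only: space for at least 100 BTS records. */
	size_t size = 100 * ds_cfg.sizeof_rec[ds_bts] + DS_ALIGNMENT;
	void *base = kzalloc(size, GFP_KERNEL);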
*/ - ds_resume_bts(tracer); - - return tracer; - - out_unlock: - spin_unlock_irq(&ds_lock); - ds_put_context(tracer->ds.context); - out_tracer: - kfree(tracer); - out_put_tracer: - put_tracer(task); - out: - return ERR_PTR(error); -} - -struct bts_tracer *ds_request_bts_task(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags) -{ - return ds_request_bts(task, 0, base, size, ovfl, th, flags); -} - -struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags) -{ - return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); -} - -static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th, - unsigned int flags) -{ - struct pebs_tracer *tracer; - int error; - - /* Buffer overflow notification is not yet implemented. */ - error = -EOPNOTSUPP; - if (ovfl) - goto out; - - error = get_tracer(task); - if (error < 0) - goto out; - - error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); - if (!tracer) - goto out_put_tracer; - tracer->ovfl = ovfl; - - /* Do some more error checking and acquire a tracing context. */ - error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_pebs, task, cpu, base, size, th); - if (error < 0) - goto out_tracer; - - /* Claim the pebs part of the tracing context we acquired above. */ - spin_lock_irq(&ds_lock); - - error = -EPERM; - if (tracer->ds.context->pebs_master) - goto out_unlock; - tracer->ds.context->pebs_master = tracer; - - spin_unlock_irq(&ds_lock); - - /* - * Now that we own the pebs part of the context, let's complete the - * initialization for that part. - */ - ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - ds_install_ds_area(tracer->ds.context); - - /* Start tracing. */ - ds_resume_pebs(tracer); - - return tracer; - - out_unlock: - spin_unlock_irq(&ds_lock); - ds_put_context(tracer->ds.context); - out_tracer: - kfree(tracer); - out_put_tracer: - put_tracer(task); - out: - return ERR_PTR(error); -} - -struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags) -{ - return ds_request_pebs(task, 0, base, size, ovfl, th, flags); -} - -struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags) -{ - return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); -} - -static void ds_free_bts(struct bts_tracer *tracer) -{ - struct task_struct *task; - - task = tracer->ds.context->task; - - WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); - tracer->ds.context->bts_master = NULL; - - /* Make sure tracing stopped and the tracer is not in use. 
*/ - if (task && (task != current)) - wait_task_context_switch(task); - - ds_put_context(tracer->ds.context); - put_tracer(task); - - kfree(tracer); -} - -void ds_release_bts(struct bts_tracer *tracer) -{ - might_sleep(); - - if (!tracer) - return; - - ds_suspend_bts(tracer); - ds_free_bts(tracer); -} - -int ds_release_bts_noirq(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long irq; - int error; - - if (!tracer) - return 0; - - task = tracer->ds.context->task; - - local_irq_save(irq); - - error = -EPERM; - if (!task && - (tracer->ds.context->cpu != smp_processor_id())) - goto out; - - error = -EPERM; - if (task && (task != current)) - goto out; - - ds_suspend_bts_noirq(tracer); - ds_free_bts(tracer); - - error = 0; - out: - local_irq_restore(irq); - return error; -} - -static void update_task_debugctlmsr(struct task_struct *task, - unsigned long debugctlmsr) -{ - task->thread.debugctlmsr = debugctlmsr; - - get_cpu(); - if (task == current) - update_debugctlmsr(debugctlmsr); - put_cpu(); -} - -void ds_suspend_bts(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long debugctlmsr; - int cpu; - - if (!tracer) - return; - - tracer->flags = 0; - - task = tracer->ds.context->task; - cpu = tracer->ds.context->cpu; - - WARN_ON(!task && irqs_disabled()); - - debugctlmsr = (task ? - task->thread.debugctlmsr : - get_debugctlmsr_on_cpu(cpu)); - debugctlmsr &= ~BTS_CONTROL; - - if (task) - update_task_debugctlmsr(task, debugctlmsr); - else - update_debugctlmsr_on_cpu(cpu, debugctlmsr); -} - -int ds_suspend_bts_noirq(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long debugctlmsr, irq; - int cpu, error = 0; - - if (!tracer) - return 0; - - tracer->flags = 0; - - task = tracer->ds.context->task; - cpu = tracer->ds.context->cpu; - - local_irq_save(irq); - - error = -EPERM; - if (!task && (cpu != smp_processor_id())) - goto out; - - debugctlmsr = (task ? - task->thread.debugctlmsr : - get_debugctlmsr()); - debugctlmsr &= ~BTS_CONTROL; - - if (task) - update_task_debugctlmsr(task, debugctlmsr); - else - update_debugctlmsr(debugctlmsr); - - error = 0; - out: - local_irq_restore(irq); - return error; -} - -static unsigned long ds_bts_control(struct bts_tracer *tracer) -{ - unsigned long control; - - control = ds_cfg.ctl[dsf_bts]; - if (!(tracer->trace.ds.flags & BTS_KERNEL)) - control |= ds_cfg.ctl[dsf_bts_kernel]; - if (!(tracer->trace.ds.flags & BTS_USER)) - control |= ds_cfg.ctl[dsf_bts_user]; - - return control; -} - -void ds_resume_bts(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long debugctlmsr; - int cpu; - - if (!tracer) - return; - - tracer->flags = tracer->trace.ds.flags; - - task = tracer->ds.context->task; - cpu = tracer->ds.context->cpu; - - WARN_ON(!task && irqs_disabled()); - - debugctlmsr = (task ? - task->thread.debugctlmsr : - get_debugctlmsr_on_cpu(cpu)); - debugctlmsr |= ds_bts_control(tracer); - - if (task) - update_task_debugctlmsr(task, debugctlmsr); - else - update_debugctlmsr_on_cpu(cpu, debugctlmsr); -} - -int ds_resume_bts_noirq(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long debugctlmsr, irq; - int cpu, error = 0; - - if (!tracer) - return 0; - - tracer->flags = tracer->trace.ds.flags; - - task = tracer->ds.context->task; - cpu = tracer->ds.context->cpu; - - local_irq_save(irq); - - error = -EPERM; - if (!task && (cpu != smp_processor_id())) - goto out; - - debugctlmsr = (task ? 
- task->thread.debugctlmsr : - get_debugctlmsr()); - debugctlmsr |= ds_bts_control(tracer); - - if (task) - update_task_debugctlmsr(task, debugctlmsr); - else - update_debugctlmsr(debugctlmsr); - - error = 0; - out: - local_irq_restore(irq); - return error; -} - -static void ds_free_pebs(struct pebs_tracer *tracer) -{ - struct task_struct *task; - - task = tracer->ds.context->task; - - WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); - tracer->ds.context->pebs_master = NULL; - - ds_put_context(tracer->ds.context); - put_tracer(task); - - kfree(tracer); -} - -void ds_release_pebs(struct pebs_tracer *tracer) -{ - might_sleep(); - - if (!tracer) - return; - - ds_suspend_pebs(tracer); - ds_free_pebs(tracer); -} - -int ds_release_pebs_noirq(struct pebs_tracer *tracer) -{ - struct task_struct *task; - unsigned long irq; - int error; - - if (!tracer) - return 0; - - task = tracer->ds.context->task; - - local_irq_save(irq); - - error = -EPERM; - if (!task && - (tracer->ds.context->cpu != smp_processor_id())) - goto out; - - error = -EPERM; - if (task && (task != current)) - goto out; - - ds_suspend_pebs_noirq(tracer); - ds_free_pebs(tracer); - - error = 0; - out: - local_irq_restore(irq); - return error; -} - -void ds_suspend_pebs(struct pebs_tracer *tracer) -{ - -} - -int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) -{ - return 0; -} - -void ds_resume_pebs(struct pebs_tracer *tracer) -{ - -} - -int ds_resume_pebs_noirq(struct pebs_tracer *tracer) -{ - return 0; -} - -const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) -{ - if (!tracer) - return NULL; - - ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); - return &tracer->trace; -} - -const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return NULL; - - ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - - tracer->trace.counters = ds_cfg.nr_counter_reset; - memcpy(tracer->trace.counter_reset, - tracer->ds.context->ds + - (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), - ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); - - return &tracer->trace; -} - -int ds_reset_bts(struct bts_tracer *tracer) -{ - if (!tracer) - return -EINVAL; - - tracer->trace.ds.top = tracer->trace.ds.begin; - - ds_set(tracer->ds.context->ds, ds_bts, ds_index, - (unsigned long)tracer->trace.ds.top); - - return 0; -} - -int ds_reset_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; - - tracer->trace.ds.top = tracer->trace.ds.begin; - - ds_set(tracer->ds.context->ds, ds_pebs, ds_index, - (unsigned long)tracer->trace.ds.top); - - return 0; -} - -int ds_set_pebs_reset(struct pebs_tracer *tracer, - unsigned int counter, u64 value) -{ - if (!tracer) - return -EINVAL; - - if (ds_cfg.nr_counter_reset < counter) - return -EINVAL; - - *(u64 *)(tracer->ds.context->ds + - (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + - (counter * PEBS_RESET_FIELD_SIZE)) = value; - - return 0; -} - -static const struct ds_configuration ds_cfg_netburst = { - .name = "Netburst", - .ctl[dsf_bts] = (1 << 2) | (1 << 3), - .ctl[dsf_bts_kernel] = (1 << 5), - .ctl[dsf_bts_user] = (1 << 6), - .nr_counter_reset = 1, -}; -static const struct ds_configuration ds_cfg_pentium_m = { - .name = "Pentium M", - .ctl[dsf_bts] = (1 << 6) | (1 << 7), - .nr_counter_reset = 1, -}; -static const struct ds_configuration ds_cfg_core2_atom = { - .name = "Core 2/Atom", - .ctl[dsf_bts] = (1 << 6) | (1 << 7), - .ctl[dsf_bts_kernel] = (1 << 9), - .ctl[dsf_bts_user] = (1 << 10), - .nr_counter_reset = 1, -}; -static const struct 
ds_configuration ds_cfg_core_i7 = { - .name = "Core i7", - .ctl[dsf_bts] = (1 << 6) | (1 << 7), - .ctl[dsf_bts_kernel] = (1 << 9), - .ctl[dsf_bts_user] = (1 << 10), - .nr_counter_reset = 4, -}; - -static void -ds_configure(const struct ds_configuration *cfg, - struct cpuinfo_x86 *cpu) -{ - unsigned long nr_pebs_fields = 0; - - printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); - -#ifdef __i386__ - nr_pebs_fields = 10; -#else - nr_pebs_fields = 18; -#endif - - /* - * Starting with version 2, architectural performance - * monitoring supports a format specifier. - */ - if ((cpuid_eax(0xa) & 0xff) > 1) { - unsigned long perf_capabilities, format; - - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); - - format = (perf_capabilities >> 8) & 0xf; - - switch (format) { - case 0: - nr_pebs_fields = 18; - break; - case 1: - nr_pebs_fields = 22; - break; - default: - printk(KERN_INFO - "[ds] unknown PEBS format: %lu\n", format); - nr_pebs_fields = 0; - break; - } - } - - memset(&ds_cfg, 0, sizeof(ds_cfg)); - ds_cfg = *cfg; - - ds_cfg.sizeof_ptr_field = - (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); - - ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; - ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; - - if (!cpu_has(cpu, X86_FEATURE_BTS)) { - ds_cfg.sizeof_rec[ds_bts] = 0; - printk(KERN_INFO "[ds] bts not available\n"); - } - if (!cpu_has(cpu, X86_FEATURE_PEBS)) { - ds_cfg.sizeof_rec[ds_pebs] = 0; - printk(KERN_INFO "[ds] pebs not available\n"); - } - - printk(KERN_INFO "[ds] sizes: address: %u bit, ", - 8 * ds_cfg.sizeof_ptr_field); - printk("bts/pebs record: %u/%u bytes\n", - ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - - WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); -} - -void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) -{ - /* Only configure the first cpu. Others are identical. */ - if (ds_cfg.name) - return; - - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0x9: - case 0xd: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m, c); - break; - case 0xf: - case 0x17: /* Core2 */ - case 0x1c: /* Atom */ - ds_configure(&ds_cfg_core2_atom, c); - break; - case 0x1a: /* Core i7 */ - ds_configure(&ds_cfg_core_i7, c); - break; - default: - /* Sorry, don't know about them. */ - break; - } - break; - case 0xf: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst, c); - break; - default: - /* Sorry, don't know about them. */ - break; - } - break; - default: - /* Sorry, don't know about them. */ - break; - } -} - -static inline void ds_take_timestamp(struct ds_context *context, - enum bts_qualifier qualifier, - struct task_struct *task) -{ - struct bts_tracer *tracer = context->bts_master; - struct bts_struct ts; - - /* Prevent compilers from reading the tracer pointer twice. */ - barrier(); - - if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) - return; - - memset(&ts, 0, sizeof(ts)); - ts.qualifier = qualifier; - ts.variant.event.clock = trace_clock_global(); - ts.variant.event.pid = task->pid; - - bts_write(tracer, &ts); -} - -/* - * Change the DS configuration from tracing prev to tracing next. - */ -void ds_switch_to(struct task_struct *prev, struct task_struct *next) -{ - struct ds_context *prev_ctx = prev->thread.ds_ctx; - struct ds_context *next_ctx = next->thread.ds_ctx; - unsigned long debugctlmsr = next->thread.debugctlmsr; - - /* Make sure all data is read before we start. 
*/ - barrier(); - - if (prev_ctx) { - update_debugctlmsr(0); - - ds_take_timestamp(prev_ctx, bts_task_departs, prev); - } - - if (next_ctx) { - ds_take_timestamp(next_ctx, bts_task_arrives, next); - - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); - } - - update_debugctlmsr(debugctlmsr); -} - -static __init int ds_selftest(void) -{ - if (ds_cfg.sizeof_rec[ds_bts]) { - int error; - - error = ds_selftest_bts(); - if (error) { - WARN(1, "[ds] selftest failed. disabling bts.\n"); - ds_cfg.sizeof_rec[ds_bts] = 0; - } - } - - if (ds_cfg.sizeof_rec[ds_pebs]) { - int error; - - error = ds_selftest_pebs(); - if (error) { - WARN(1, "[ds] selftest failed. disabling pebs.\n"); - ds_cfg.sizeof_rec[ds_pebs] = 0; - } - } - - return 0; -} -device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c deleted file mode 100644 index 6bc7c199ab99..000000000000 --- a/arch/x86/kernel/ds_selftest.c +++ /dev/null @@ -1,408 +0,0 @@ -/* - * Debug Store support - selftest - * - * - * Copyright (C) 2009 Intel Corporation. - * Markus Metzger , 2009 - */ - -#include "ds_selftest.h" - -#include -#include -#include -#include - -#include - - -#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ -#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ - -struct ds_selftest_bts_conf { - struct bts_tracer *tracer; - int error; - int (*suspend)(struct bts_tracer *); - int (*resume)(struct bts_tracer *); -}; - -static int ds_selftest_bts_consistency(const struct bts_trace *trace) -{ - int error = 0; - - if (!trace) { - printk(KERN_CONT "failed to access trace..."); - /* Bail out. Other tests are pointless. */ - return -1; - } - - if (!trace->read) { - printk(KERN_CONT "bts read not available..."); - error = -1; - } - - /* Do some sanity checks on the trace configuration. */ - if (!trace->ds.n) { - printk(KERN_CONT "empty bts buffer..."); - error = -1; - } - if (!trace->ds.size) { - printk(KERN_CONT "bad bts trace setup..."); - error = -1; - } - if (trace->ds.end != - (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { - printk(KERN_CONT "bad bts buffer setup..."); - error = -1; - } - /* - * We allow top in [begin; end], since it's not clear when the - * overflow adjustment happens: after the increment or before the - * write. - */ - if ((trace->ds.top < trace->ds.begin) || - (trace->ds.end < trace->ds.top)) { - printk(KERN_CONT "bts top out of bounds..."); - error = -1; - } - - return error; -} - -static int ds_selftest_bts_read(struct bts_tracer *tracer, - const struct bts_trace *trace, - const void *from, const void *to) -{ - const unsigned char *at; - - /* - * Check a few things which do not belong to this test. - * They should be covered by other tests. - */ - if (!trace) - return -1; - - if (!trace->read) - return -1; - - if (to < from) - return -1; - - if (from < trace->ds.begin) - return -1; - - if (trace->ds.end < to) - return -1; - - if (!trace->ds.size) - return -1; - - /* Now to the test itself.
*/ - for (at = from; (void *)at < to; at += trace->ds.size) { - struct bts_struct bts; - unsigned long index; - int error; - - if (((void *)at - trace->ds.begin) % trace->ds.size) { - printk(KERN_CONT - "read from non-integer index..."); - return -1; - } - index = ((void *)at - trace->ds.begin) / trace->ds.size; - - memset(&bts, 0, sizeof(bts)); - error = trace->read(tracer, at, &bts); - if (error < 0) { - printk(KERN_CONT - "error reading bts trace at [%lu] (0x%p)...", - index, at); - return error; - } - - switch (bts.qualifier) { - case BTS_BRANCH: - break; - default: - printk(KERN_CONT - "unexpected bts entry %llu at [%lu] (0x%p)...", - bts.qualifier, index, at); - return -1; - } - } - - return 0; -} - -static void ds_selftest_bts_cpu(void *arg) -{ - struct ds_selftest_bts_conf *conf = arg; - const struct bts_trace *trace; - void *top; - - if (IS_ERR(conf->tracer)) { - conf->error = PTR_ERR(conf->tracer); - conf->tracer = NULL; - - printk(KERN_CONT - "initialization failed (err: %d)...", conf->error); - return; - } - - /* We should meanwhile have enough trace. */ - conf->error = conf->suspend(conf->tracer); - if (conf->error < 0) - return; - - /* Let's see if we can access the trace. */ - trace = ds_read_bts(conf->tracer); - - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - /* If everything went well, we should have a few trace entries. */ - if (trace->ds.top == trace->ds.begin) { - /* - * It is possible but highly unlikely that we got a - * buffer overflow and end up at exactly the same - * position we started from. - * Let's issue a warning, but continue. - */ - printk(KERN_CONT "no trace/overflow..."); - } - - /* Let's try to read the trace we collected. */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.top); - if (conf->error < 0) - return; - - /* - * Let's read the trace again. - * Since we suspended tracing, we should get the same result. - */ - top = trace->ds.top; - - trace = ds_read_bts(conf->tracer); - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - if (top != trace->ds.top) { - printk(KERN_CONT "suspend not working..."); - conf->error = -1; - return; - } - - /* Let's collect some more trace - see if resume is working. */ - conf->error = conf->resume(conf->tracer); - if (conf->error < 0) - return; - - conf->error = conf->suspend(conf->tracer); - if (conf->error < 0) - return; - - trace = ds_read_bts(conf->tracer); - - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - if (trace->ds.top == top) { - /* - * It is possible but highly unlikely that we got a - * buffer overflow and end up at exactly the same - * position we started from. - * Let's issue a warning and check the full trace. - */ - printk(KERN_CONT - "no resume progress/overflow..."); - - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.end); - } else if (trace->ds.top < top) { - /* - * We had a buffer overflow - the entire buffer should - * contain trace records. - */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.end); - } else { - /* - * It is quite likely that the buffer did not overflow. - * Let's just check the delta trace. 
- */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, top, - trace->ds.top); - } - if (conf->error < 0) - return; - - conf->error = 0; -} - -static int ds_suspend_bts_wrap(struct bts_tracer *tracer) -{ - ds_suspend_bts(tracer); - return 0; -} - -static int ds_resume_bts_wrap(struct bts_tracer *tracer) -{ - ds_resume_bts(tracer); - return 0; -} - -static void ds_release_bts_noirq_wrap(void *tracer) -{ - (void)ds_release_bts_noirq(tracer); -} - -static int ds_selftest_bts_bad_release_noirq(int cpu, - struct bts_tracer *tracer) -{ - int error = -EPERM; - - /* Try to release the tracer on the wrong cpu. */ - get_cpu(); - if (cpu != smp_processor_id()) { - error = ds_release_bts_noirq(tracer); - if (error != -EPERM) - printk(KERN_CONT "release on wrong cpu..."); - } - put_cpu(); - - return error ? 0 : -1; -} - -static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) -{ - struct bts_tracer *tracer; - int error; - - /* Try to request cpu tracing while task tracing is active. */ - tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, - (size_t)-1, BTS_KERNEL); - error = PTR_ERR(tracer); - if (!IS_ERR(tracer)) { - ds_release_bts(tracer); - error = 0; - } - - if (error != -EPERM) - printk(KERN_CONT "cpu/task tracing overlap..."); - - return error ? 0 : -1; -} - -static int ds_selftest_bts_bad_request_task(void *buffer) -{ - struct bts_tracer *tracer; - int error; - - /* Try to request task tracing while cpu tracing is active. */ - tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, - (size_t)-1, BTS_KERNEL); - error = PTR_ERR(tracer); - if (!IS_ERR(tracer)) { - error = 0; - ds_release_bts(tracer); - } - - if (error != -EPERM) - printk(KERN_CONT "task/cpu tracing overlap..."); - - return error ? 0 : -1; -} - -int ds_selftest_bts(void) -{ - struct ds_selftest_bts_conf conf; - unsigned char buffer[BUFFER_SIZE], *small_buffer; - unsigned long irq; - int cpu; - - printk(KERN_INFO "[ds] bts selftest..."); - conf.error = 0; - - small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; - - get_online_cpus(); - for_each_online_cpu(cpu) { - conf.suspend = ds_suspend_bts_wrap; - conf.resume = ds_resume_bts_wrap; - conf.tracer = - ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_task(buffer); - ds_release_bts(conf.tracer); - if (conf.error < 0) - goto out; - - conf.suspend = ds_suspend_bts_noirq; - conf.resume = ds_resume_bts_noirq; - conf.tracer = - ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); - if (conf.error >= 0) { - conf.error = - ds_selftest_bts_bad_release_noirq(cpu, - conf.tracer); - /* We must not release the tracer twice.
*/ - if (conf.error < 0) - conf.tracer = NULL; - } - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_task(buffer); - smp_call_function_single(cpu, ds_release_bts_noirq_wrap, - conf.tracer, 1); - if (conf.error < 0) - goto out; - } - - conf.suspend = ds_suspend_bts_wrap; - conf.resume = ds_resume_bts_wrap; - conf.tracer = - ds_request_bts_task(current, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); - ds_release_bts(conf.tracer); - if (conf.error < 0) - goto out; - - conf.suspend = ds_suspend_bts_noirq; - conf.resume = ds_resume_bts_noirq; - conf.tracer = - ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - local_irq_save(irq); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); - ds_release_bts_noirq(conf.tracer); - local_irq_restore(irq); - if (conf.error < 0) - goto out; - - conf.error = 0; - out: - put_online_cpus(); - printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); - - return conf.error; -} - -int ds_selftest_pebs(void) -{ - return 0; -} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h deleted file mode 100644 index 2ba8745c6663..000000000000 --- a/arch/x86/kernel/ds_selftest.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Debug Store support - selftest - * - * - * Copyright (C) 2009 Intel Corporation. - * Markus Metzger , 2009 - */ - -#ifdef CONFIG_X86_DS_SELFTEST -extern int ds_selftest_bts(void); -extern int ds_selftest_pebs(void); -#else -static inline int ds_selftest_bts(void) { return 0; } -static inline int ds_selftest_pebs(void) { return 0; } -#endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6d817554780a..c89a386930b7 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; - /* notify the hw-branch tracer so it may disable tracing and - add the last trace to the trace buffer - - the earlier this happens, the more useful the trace. */ - trace_hw_branch_oops(); - oops_enter(); /* racy, but better than risking deadlock. 
*/ diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b43bbaebe2c0..7a880ad3a208 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -422,14 +422,12 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, static void __kprobes clear_btf(void) { - if (test_thread_flag(TIF_DEBUGCTLMSR)) - update_debugctlmsr(0); + /* XXX */ } static void __kprobes restore_btf(void) { - if (test_thread_flag(TIF_DEBUGCTLMSR)) - update_debugctlmsr(current->thread.debugctlmsr); + /* XXX */ } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ad9540676fcc..1a60beb32ede 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -20,7 +20,6 @@ #include #include #include -#include #include unsigned long idle_halt; @@ -50,8 +49,6 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } - - WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } void free_thread_info(struct thread_info *ti) @@ -198,12 +195,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, prev = &prev_p->thread; next = &next_p->thread; - if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || - test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) - ds_switch_to(prev_p, next_p); - else if (next->debugctlmsr != prev->debugctlmsr) - update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f6c62667e30c..75090c589b7a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,7 +55,6 @@ #include #include #include -#include #include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - - clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); - p->thread.ds_ctx = NULL; - - clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); - p->thread.debugctlmsr = 0; - return err; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index dc9690b4c4cc..cc4258f2beb5 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -49,7 +49,6 @@ #include #include #include -#include #include asmlinkage extern void ret_from_fork(void); @@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, if (err) goto out; } - - clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); - p->thread.ds_ctx = NULL; - - clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); - p->thread.debugctlmsr = 0; - err = 0; out: if (err && p->thread.io_bitmap_ptr) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a503b1fd04e5..f2fd3b80e565 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -2,9 +2,6 @@ /* * Pentium III FXSR, SSE support * Gareth Hughes , May 2000 - * - * BTS tracing - * Markus Metzger , Dec 2007 */ #include @@ -21,7 +18,6 @@ #include #include #include -#include #include #include @@ -35,7 +31,6 @@ #include #include #include -#include #include #include "tls.h" @@ -788,342 +783,6 @@ static int ioperm_get(struct task_struct *target, 0, IO_BITMAP_BYTES); } -#ifdef CONFIG_X86_PTRACE_BTS -/* - * A branch trace store context. 
- * - * Contexts may only be installed by ptrace_bts_config() and only for - * ptraced tasks. - * - * Contexts are destroyed when the tracee is detached from the tracer. - * The actual destruction work requires interrupts enabled, so the - * work is deferred and will be scheduled during __ptrace_unlink(). - * - * Contexts hold an additional task_struct reference on the traced - * task, as well as a reference on the tracer's mm. - * - * Ptrace already holds a task_struct for the duration of ptrace operations, - * but since destruction is deferred, it may be executed after both - * tracer and tracee exited. - */ -struct bts_context { - /* The branch trace handle. */ - struct bts_tracer *tracer; - - /* The buffer used to store the branch trace and its size. */ - void *buffer; - unsigned int size; - - /* The mm that paid for the above buffer. */ - struct mm_struct *mm; - - /* The task this context belongs to. */ - struct task_struct *task; - - /* The signal to send on a bts buffer overflow. */ - unsigned int bts_ovfl_signal; - - /* The work struct to destroy a context. */ - struct work_struct work; -}; - -static int alloc_bts_buffer(struct bts_context *context, unsigned int size) -{ - void *buffer = NULL; - int err = -ENOMEM; - - err = account_locked_memory(current->mm, current->signal->rlim, size); - if (err < 0) - return err; - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto out_refund; - - context->buffer = buffer; - context->size = size; - context->mm = get_task_mm(current); - - return 0; - - out_refund: - refund_locked_memory(current->mm, size); - return err; -} - -static inline void free_bts_buffer(struct bts_context *context) -{ - if (!context->buffer) - return; - - kfree(context->buffer); - context->buffer = NULL; - - refund_locked_memory(context->mm, context->size); - context->size = 0; - - mmput(context->mm); - context->mm = NULL; -} - -static void free_bts_context_work(struct work_struct *w) -{ - struct bts_context *context; - - context = container_of(w, struct bts_context, work); - - ds_release_bts(context->tracer); - put_task_struct(context->task); - free_bts_buffer(context); - kfree(context); -} - -static inline void free_bts_context(struct bts_context *context) -{ - INIT_WORK(&context->work, free_bts_context_work); - schedule_work(&context->work); -} - -static inline struct bts_context *alloc_bts_context(struct task_struct *task) -{ - struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); - if (context) { - context->task = task; - task->bts = context; - - get_task_struct(task); - } - - return context; -} - -static int ptrace_bts_read_record(struct task_struct *child, size_t index, - struct bts_struct __user *out) -{ - struct bts_context *context; - const struct bts_trace *trace; - struct bts_struct bts; - const unsigned char *at; - int error; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - at = trace->ds.top - ((index + 1) * trace->ds.size); - if ((void *)at < trace->ds.begin) - at += (trace->ds.n * trace->ds.size); - - if (!trace->read) - return -EOPNOTSUPP; - - error = trace->read(context->tracer, at, &bts); - if (error < 0) - return error; - - if (copy_to_user(out, &bts, sizeof(bts))) - return -EFAULT; - - return sizeof(bts); -} - -static int ptrace_bts_drain(struct task_struct *child, - long size, - struct bts_struct __user *out) -{ - struct bts_context *context; - const struct bts_trace *trace; - const unsigned char *at; - int error, drained = 0; - - context = 
child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - if (!trace->read) - return -EOPNOTSUPP; - - if (size < (trace->ds.top - trace->ds.begin)) - return -EIO; - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - out++, drained++, at += trace->ds.size) { - struct bts_struct bts; - - error = trace->read(context->tracer, at, &bts); - if (error < 0) - return error; - - if (copy_to_user(out, &bts, sizeof(bts))) - return -EFAULT; - } - - memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - - error = ds_reset_bts(context->tracer); - if (error < 0) - return error; - - return drained; -} - -static int ptrace_bts_config(struct task_struct *child, - long cfg_size, - const struct ptrace_bts_config __user *ucfg) -{ - struct bts_context *context; - struct ptrace_bts_config cfg; - unsigned int flags = 0; - - if (cfg_size < sizeof(cfg)) - return -EIO; - - if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - return -EFAULT; - - context = child->bts; - if (!context) - context = alloc_bts_context(child); - if (!context) - return -ENOMEM; - - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - return -EINVAL; - - return -EOPNOTSUPP; - context->bts_ovfl_signal = cfg.signal; - } - - ds_release_bts(context->tracer); - context->tracer = NULL; - - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { - int err; - - free_bts_buffer(context); - if (!cfg.size) - return 0; - - err = alloc_bts_buffer(context, cfg.size); - if (err < 0) - return err; - } - - if (cfg.flags & PTRACE_BTS_O_TRACE) - flags |= BTS_USER; - - if (cfg.flags & PTRACE_BTS_O_SCHED) - flags |= BTS_TIMESTAMPS; - - context->tracer = - ds_request_bts_task(child, context->buffer, context->size, - NULL, (size_t)-1, flags); - if (unlikely(IS_ERR(context->tracer))) { - int error = PTR_ERR(context->tracer); - - free_bts_buffer(context); - context->tracer = NULL; - return error; - } - - return sizeof(cfg); -} - -static int ptrace_bts_status(struct task_struct *child, - long cfg_size, - struct ptrace_bts_config __user *ucfg) -{ - struct bts_context *context; - const struct bts_trace *trace; - struct ptrace_bts_config cfg; - - context = child->bts; - if (!context) - return -ESRCH; - - if (cfg_size < sizeof(cfg)) - return -EIO; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = context->bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); - - if (cfg.signal) - cfg.flags |= PTRACE_BTS_O_SIGNAL; - - if (trace->ds.flags & BTS_USER) - cfg.flags |= PTRACE_BTS_O_TRACE; - - if (trace->ds.flags & BTS_TIMESTAMPS) - cfg.flags |= PTRACE_BTS_O_SCHED; - - if (copy_to_user(ucfg, &cfg, sizeof(cfg))) - return -EFAULT; - - return sizeof(cfg); -} - -static int ptrace_bts_clear(struct task_struct *child) -{ - struct bts_context *context; - const struct bts_trace *trace; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - - return ds_reset_bts(context->tracer); -} - -static int ptrace_bts_size(struct task_struct *child) -{ - struct bts_context *context; - const struct bts_trace *trace; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - return (trace->ds.top - trace->ds.begin) / trace->ds.size; -} - -/* - * Called from 
__ptrace_unlink() after the child has been moved back - * to its original parent. - */ -void ptrace_bts_untrace(struct task_struct *child) -{ - if (unlikely(child->bts)) { - free_bts_context(child->bts); - child->bts = NULL; - } -} -#endif /* CONFIG_X86_PTRACE_BTS */ - /* * Called by kernel/ptrace.c when detaching.. * @@ -1251,39 +910,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; #endif - /* - * These bits need more cooking - not enabled yet: - */ -#ifdef CONFIG_X86_PTRACE_BTS - case PTRACE_BTS_CONFIG: - ret = ptrace_bts_config - (child, data, (struct ptrace_bts_config __user *)addr); - break; - - case PTRACE_BTS_STATUS: - ret = ptrace_bts_status - (child, data, (struct ptrace_bts_config __user *)addr); - break; - - case PTRACE_BTS_SIZE: - ret = ptrace_bts_size(child); - break; - - case PTRACE_BTS_GET: - ret = ptrace_bts_read_record - (child, data, (struct bts_struct __user *) addr); - break; - - case PTRACE_BTS_CLEAR: - ret = ptrace_bts_clear(child); - break; - - case PTRACE_BTS_DRAIN: - ret = ptrace_bts_drain - (child, data, (struct bts_struct __user *) addr); - break; -#endif /* CONFIG_X86_PTRACE_BTS */ - default: ret = ptrace_request(child, request, addr, data); break; @@ -1543,14 +1169,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: -#ifdef CONFIG_X86_PTRACE_BTS - case PTRACE_BTS_CONFIG: - case PTRACE_BTS_STATUS: - case PTRACE_BTS_SIZE: - case PTRACE_BTS_GET: - case PTRACE_BTS_CLEAR: - case PTRACE_BTS_DRAIN: -#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 3149032ff107..7beba0769a8c 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -157,22 +157,6 @@ static int enable_single_step(struct task_struct *child) return 1; } -/* - * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running. - */ -static void write_debugctlmsr(struct task_struct *child, unsigned long val) -{ - if (child->thread.debugctlmsr == val) - return; - - child->thread.debugctlmsr = val; - - if (child != current) - return; - - update_debugctlmsr(val); -} - /* * Enable single or block step. */ @@ -185,17 +169,9 @@ static void enable_step(struct task_struct *child, bool block) * So noone should try to use debugger block stepping in a program * that uses user-mode single stepping itself. */ - if (enable_single_step(child) && block) { - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - write_debugctlmsr(child, - child->thread.debugctlmsr | DEBUGCTLMSR_BTF); - } else { - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); - - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - } + if (!enable_single_step(child)) + return; + /* XXX */ } void user_enable_single_step(struct task_struct *child) @@ -213,11 +189,7 @@ void user_disable_single_step(struct task_struct *child) /* * Make sure block stepping (BTF) is disabled. */ - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); - - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + /* XXX */ /* Always clear TIF_SINGLESTEP... 
*/ clear_tsk_thread_flag(child, TIF_SINGLESTEP); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1168e4454188..e3da5d726a37 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -543,11 +543,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* DR6 may or may not be cleared by the CPU */ set_debugreg(0, 6); - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 01e6adea07ec..cc12b3c556b3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -504,18 +504,6 @@ extern int ftrace_dump_on_oops; #define INIT_TRACE_RECURSION #endif -#ifdef CONFIG_HW_BRANCH_TRACER - -void trace_hw_branch(u64 from, u64 to); -void trace_hw_branch_oops(void); - -#else /* CONFIG_HW_BRANCH_TRACER */ - -static inline void trace_hw_branch(u64 from, u64 to) {} -static inline void trace_hw_branch_oops(void) {} - -#endif /* CONFIG_HW_BRANCH_TRACER */ - #ifdef CONFIG_FTRACE_SYSCALLS unsigned long arch_syscall_addr(int nr); diff --git a/include/linux/mm.h b/include/linux/mm.h index e70f21beb4b4..c8442b655111 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -19,7 +19,6 @@ struct anon_vma; struct file_ra_state; struct user_struct; struct writeback_control; -struct rlimit; #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1449,9 +1448,6 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); -extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, - size_t size); -extern void refund_locked_memory(struct mm_struct *mm, size_t size); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index e1fb60729979..4272521e29e9 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -345,18 +345,6 @@ static inline void user_single_step_siginfo(struct task_struct *tsk, #define arch_ptrace_stop(code, info) do { } while (0) #endif -#ifndef arch_ptrace_untrace -/* - * Do machine-specific work before untracing child. - * - * This is called for a normal detach as well as from ptrace_exit() - * when the tracing task dies. - * - * Called with write_lock(&tasklist_lock) held. - */ -#define arch_ptrace_untrace(task) do { } while (0) -#endif - extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf7..e0447c64af6a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,7 +99,6 @@ struct futex_pi_state; struct robust_list_head; struct bio_list; struct fs_struct; -struct bts_context; struct perf_event_context; /* @@ -1272,12 +1271,6 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; - /* - * This is the tracer handle for the ptrace BTS extension. - * This field actually belongs to the ptracer task. - */ - struct bts_context *bts; - /* PID/PID hash table linkage. 
*/ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; @@ -2123,10 +2116,8 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP -extern void wait_task_context_switch(struct task_struct *p); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else -static inline void wait_task_context_switch(struct task_struct *p) {} static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { diff --git a/kernel/fork.c b/kernel/fork.c index 4799c5f0e6d0..d67f1dbfbe03 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1108,9 +1108,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif - - p->bts = NULL; - p->stack_start = stack_start; /* Perform scheduler related setup. Assign this task to a CPU. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 42ad8ae729a0..9fb51237b18c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -76,7 +76,6 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); - arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } diff --git a/kernel/sched.c b/kernel/sched.c index 9ab3cd7858d3..117b7cad31b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2076,49 +2076,6 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) return 1; } -/* - * wait_task_context_switch - wait for a thread to complete at least one - * context switch. - * - * @p must not be current. - */ -void wait_task_context_switch(struct task_struct *p) -{ - unsigned long nvcsw, nivcsw, flags; - int running; - struct rq *rq; - - nvcsw = p->nvcsw; - nivcsw = p->nivcsw; - for (;;) { - /* - * The runqueue is assigned before the actual context - * switch. We need to take the runqueue lock. - * - * We could check initially without the lock but it is - * very likely that we need to take the lock in every - * iteration. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - task_rq_unlock(rq, &flags); - - if (likely(!running)) - break; - /* - * The switch count is incremented before the actual - * context switch. We thus wait for two switches to be - * sure at least one completed. - */ - if ((p->nvcsw - nvcsw) > 1) - break; - if ((p->nivcsw - nivcsw) > 1) - break; - - cpu_relax(); - } -} - /* * wait_task_inactive - wait for a thread to unschedule. * diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 13e13d428cd3..8b1797c4545b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD help See Documentation/trace/ftrace-design.txt -config HAVE_HW_BRANCH_TRACER - bool - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -374,14 +371,6 @@ config STACK_TRACER Say N if unsure. -config HW_BRANCH_TRACER - depends on HAVE_HW_BRANCH_TRACER - bool "Trace hw branches" - select GENERIC_TRACER - help - This tracer records all branches on the system in a circular - buffer, giving access to the last N branches for each cpu. 
- config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 78edc6490038..ffb1a5b0550e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2825ef2c0b15..bec2c973ff0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,7 +34,6 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, @@ -229,7 +228,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -467,8 +465,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c16a08f399df..dc008c1240da 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(hw_branch, hw_branch_entry, - - TRACE_HW_BRANCHES, - - F_STRUCT( - __field( u64, from ) - __field( u64, to ) - ), - - F_printk("from: %llx to: %llx", __entry->from, __entry->to) -); - FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, TRACE_KMEM_ALLOC, diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c deleted file mode 100644 index 7b97000745f5..000000000000 --- a/kernel/trace/trace_hw_branches.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * h/w branch tracer for x86 based on BTS - * - * Copyright (C) 2008-2009 Intel Corporation. 
- * Markus Metzger , 2008-2009 - */ -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace_output.h" -#include "trace.h" - - -#define BTS_BUFFER_SIZE (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); - -#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) - -static int trace_hw_branches_enabled __read_mostly; -static int trace_hw_branches_suspended __read_mostly; -static struct trace_array *hw_branch_trace __read_mostly; - - -static void bts_trace_init_cpu(int cpu) -{ - per_cpu(hwb_tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), - BTS_BUFFER_SIZE, NULL, (size_t)-1, - BTS_KERNEL); - - if (IS_ERR(per_cpu(hwb_tracer, cpu))) - per_cpu(hwb_tracer, cpu) = NULL; -} - -static int bts_trace_init(struct trace_array *tr) -{ - int cpu; - - hw_branch_trace = tr; - trace_hw_branches_enabled = 0; - - get_online_cpus(); - for_each_online_cpu(cpu) { - bts_trace_init_cpu(cpu); - - if (likely(per_cpu(hwb_tracer, cpu))) - trace_hw_branches_enabled = 1; - } - trace_hw_branches_suspended = 0; - put_online_cpus(); - - /* If we could not enable tracing on a single cpu, we fail. */ - return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - trace_hw_branches_enabled = 0; - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 1; - put_online_cpus(); -} - -static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - /* The notification is sent with interrupts enabled. */ - if (trace_hw_branches_enabled) { - bts_trace_init_cpu(cpu); - - if (trace_hw_branches_suspended && - likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - } - break; - - case CPU_DOWN_PREPARE: - /* The notification is sent with interrupts enabled. 
*/ - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - - return NOTIFY_DONE; -} - -static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { - .notifier_call = bts_hotcpu_handler -}; - -static void bts_trace_print_header(struct seq_file *m) -{ - seq_puts(m, "# CPU# TO <- FROM\n"); -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - unsigned long symflags = TRACE_ITER_SYM_OFFSET; - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct hw_branch_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", iter->cpu) && - seq_print_ip_sym(seq, it->to, symflags) && - trace_seq_printf(seq, "\t <- ") && - seq_print_ip_sym(seq, it->from, symflags) && - trace_seq_printf(seq, "\n")) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_hw_branch(u64 from, u64 to) -{ - struct ftrace_event_call *call = &event_hw_branch; - struct trace_array *tr = hw_branch_trace; - struct ring_buffer_event *event; - struct ring_buffer *buf; - struct hw_branch_entry *entry; - unsigned long irq1; - int cpu; - - if (unlikely(!tr)) - return; - - if (unlikely(!trace_hw_branches_enabled)) - return; - - local_irq_save(irq1); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - buf = tr->buffer; - event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; - entry->from = from; - entry->to = to; - if (!filter_check_discard(call, entry, buf, event)) - trace_buffer_unlock_commit(buf, event, 0, 0); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(irq1); -} - -static void trace_bts_at(const struct bts_trace *trace, void *at) -{ - struct bts_struct bts; - int err = 0; - - WARN_ON_ONCE(!trace->read); - if (!trace->read) - return; - - err = trace->read(this_tracer, at, &bts); - if (err < 0) - return; - - switch (bts.qualifier) { - case BTS_BRANCH: - trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); - break; - } -} - -/* - * Collect the trace on the current cpu and write it into the ftrace buffer. - * - * pre: tracing must be suspended on the current cpu - */ -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *)arg; - const struct bts_trace *trace; - unsigned char *at; - - if (unlikely(!tr)) - return; - - if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) - return; - - if (unlikely(!this_tracer)) - return; - - trace = ds_read_bts(this_tracer); - if (!trace) - return; - - for (at = trace->ds.top; (void *)at < trace->ds.end; - at += trace->ds.size) - trace_bts_at(trace, at); - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - at += trace->ds.size) - trace_bts_at(trace, at); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - /* - * We need to collect the trace on the respective cpu since ftrace - * implicitly adds the record for the current cpu. - * Once that is more flexible, we could collect the data from any cpu. 
- */ - on_each_cpu(trace_bts_cpu, iter->tr, 1); - - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - put_online_cpus(); -} - -static void trace_bts_close(struct trace_iterator *iter) -{ - tracing_reset_online_cpus(iter->tr); -} - -void trace_hw_branch_oops(void) -{ - if (this_tracer) { - ds_suspend_bts_noirq(this_tracer); - trace_bts_cpu(hw_branch_trace); - ds_resume_bts_noirq(this_tracer); - } -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "hw-branch-tracer", - .init = bts_trace_init, - .reset = bts_trace_reset, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare, - .close = trace_bts_close, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_hw_branches, -#endif /* CONFIG_FTRACE_SELFTEST */ -}; - -__init static int init_bts_trace(void) -{ - register_hotcpu_notifier(&bts_hotcpu_notifier); - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea470d67..a7084e7c0427 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,7 +16,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_HW_BRANCHES: case TRACE_KSYM: return 1; } @@ -754,62 +753,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_HW_BRANCH_TRACER -int -trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr) -{ - struct trace_iterator *iter; - struct tracer tracer; - unsigned long count; - int ret; - - if (!trace->open) { - printk(KERN_CONT "missing open function..."); - return -1; - } - - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* - * The hw-branch tracer needs to collect the trace from the various - * cpu trace buffers - before tracing is stopped. 
- */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - memcpy(&tracer, trace, sizeof(tracer)); - - iter->trace = &tracer; - iter->tr = tr; - iter->pos = -1; - mutex_init(&iter->mutex); - - trace->open(iter); - - mutex_destroy(&iter->mutex); - kfree(iter); - - tracing_stop(); - - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT "no entries found.."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_HW_BRANCH_TRACER */ - #ifdef CONFIG_KSYM_TRACER static int ksym_selftest_dummy; diff --git a/mm/mlock.c b/mm/mlock.c index 8f4e2dfceec1..3f82720e0515 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user) spin_unlock(&shmlock_user_lock); free_uid(user); } - -int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, - size_t size) -{ - unsigned long lim, vm, pgsz; - int error = -ENOMEM; - - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(&mm->mmap_sem); - - lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT; - vm = mm->total_vm + pgsz; - if (lim < vm) - goto out; - - lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT; - vm = mm->locked_vm + pgsz; - if (lim < vm) - goto out; - - mm->total_vm += pgsz; - mm->locked_vm += pgsz; - - error = 0; - out: - up_write(&mm->mmap_sem); - return error; -} - -void refund_locked_memory(struct mm_struct *mm, size_t size) -{ - unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(&mm->mmap_sem); - - mm->total_vm -= pgsz; - mm->locked_vm -= pgsz; - - up_write(&mm->mmap_sem); -} -- cgit v1.2.3-55-g7522 From ae832d1e03ac9bf09fb8a07fb37908ab40c7cd0e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Mar 2010 10:57:43 +0800 Subject: tracing: Remove side effect from module tracepoints that caused a GPF Remove the @refcnt argument, because it has side-effects, and arguments with side-effects are not skipped by the jump over disabled instrumentation and are executed even when the tracepoint is disabled. This was also causing a GPF as found by Randy Dunlap: Subject: 2.6.33 GP fault only when built with tracing LKML-Reference: <4BA2B69D.3000309@oracle.com> Note, the current 2.6.34-rc has a fix for the actual cause of the GPF, but this fixes one of its triggers. 
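To make the hazard concrete, here is a condensed before/after of one call site and the event class, taken from the diff below (an illustrative excerpt, not the complete patch):

	/* Before: the refcount read is an argument with a side-effect,
	 * so it runs at the call site even when the tracepoint is off:
	 */
	trace_module_get(module, _THIS_IP_,
			 __this_cpu_read(module->refptr->count));

	/* After: the argument list is side-effect free; the refcount
	 * is read in TP_fast_assign(), i.e. only when the event fires:
	 */
	trace_module_get(module, _THIS_IP_);

	TP_fast_assign(
		__entry->ip	= ip;
		__entry->refcnt	= __this_cpu_read(mod->refptr->count);
		__assign_str(name, mod->name);
	),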
Tested-by: Randy Dunlap Acked-by: Mathieu Desnoyers Signed-off-by: Li Zefan LKML-Reference: <4BA97FA7.6040406@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/module.h | 6 ++---- include/trace/events/module.h | 14 +++++++------- kernel/module.c | 3 +-- 3 files changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 5e869ffd34aa..393ec39b580a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -460,8 +460,7 @@ static inline void __module_get(struct module *module) if (module) { preempt_disable(); __this_cpu_inc(module->refptr->count); - trace_module_get(module, _THIS_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_get(module, _THIS_IP_); preempt_enable(); } } @@ -475,8 +474,7 @@ static inline int try_module_get(struct module *module) if (likely(module_is_live(module))) { __this_cpu_inc(module->refptr->count); - trace_module_get(module, _THIS_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_get(module, _THIS_IP_); } else ret = 0; diff --git a/include/trace/events/module.h b/include/trace/events/module.h index 4b0f48ba16a6..a585f8135bd9 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -53,9 +53,9 @@ TRACE_EVENT(module_free, DECLARE_EVENT_CLASS(module_refcnt, - TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + TP_PROTO(struct module *mod, unsigned long ip), - TP_ARGS(mod, ip, refcnt), + TP_ARGS(mod, ip), TP_STRUCT__entry( __field( unsigned long, ip ) @@ -65,7 +65,7 @@ DECLARE_EVENT_CLASS(module_refcnt, TP_fast_assign( __entry->ip = ip; - __entry->refcnt = refcnt; + __entry->refcnt = __this_cpu_read(mod->refptr->count); __assign_str(name, mod->name); ), @@ -75,16 +75,16 @@ DECLARE_EVENT_CLASS(module_refcnt, DEFINE_EVENT(module_refcnt, module_get, - TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + TP_PROTO(struct module *mod, unsigned long ip), - TP_ARGS(mod, ip, refcnt) + TP_ARGS(mod, ip) ); DEFINE_EVENT(module_refcnt, module_put, - TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + TP_PROTO(struct module *mod, unsigned long ip), - TP_ARGS(mod, ip, refcnt) + TP_ARGS(mod, ip) ); TRACE_EVENT(module_request, diff --git a/kernel/module.c b/kernel/module.c index c968d3606dca..21591ad921f3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -800,8 +800,7 @@ void module_put(struct module *module) preempt_disable(); __this_cpu_dec(module->refptr->count); - trace_module_put(module, _RET_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_put(module, _RET_IP_); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); -- cgit v1.2.3-55-g7522 From eb0c53771fb2f5f66b0edb3ebce33be4bbf1c285 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Mar 2010 14:25:18 -0400 Subject: tracing: Fix compile error in module tracepoints when MODULE_UNLOAD not set If modules are configured in the build but unloading of modules is not, then the refcnt is not defined. Place the get/put module tracepoints under CONFIG_MODULE_UNLOAD since it references this field in the module structure. As a side-effect, this patch also reduces the code when MODULE_UNLOAD is not set, because these unused tracepoints are not created. 
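The shape of the fix, condensed from the diff below (the event class and event bodies are unchanged and elided here with "..."):

	#ifdef CONFIG_MODULE_UNLOAD
	/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */

	DECLARE_EVENT_CLASS(module_refcnt, ...);

	DEFINE_EVENT(module_refcnt, module_get, ...);

	DEFINE_EVENT(module_refcnt, module_put, ...);
	#endif /* CONFIG_MODULE_UNLOAD */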
Signed-off-by: Steven Rostedt --- include/trace/events/module.h | 4 ++++ kernel/module.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/module.h b/include/trace/events/module.h index a585f8135bd9..f07b44a2b240 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -51,6 +51,9 @@ TRACE_EVENT(module_free, TP_printk("%s", __get_str(name)) ); +#ifdef CONFIG_MODULE_UNLOAD +/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */ + DECLARE_EVENT_CLASS(module_refcnt, TP_PROTO(struct module *mod, unsigned long ip), @@ -86,6 +89,7 @@ DEFINE_EVENT(module_refcnt, module_put, TP_ARGS(mod, ip) ); +#endif /* CONFIG_MODULE_UNLOAD */ TRACE_EVENT(module_request, diff --git a/kernel/module.c b/kernel/module.c index 21591ad921f3..d9e237926b69 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -59,8 +59,6 @@ #define CREATE_TRACE_POINTS #include -EXPORT_TRACEPOINT_SYMBOL(module_get); - #if 0 #define DEBUGP printk #else @@ -467,6 +465,9 @@ MODINFO_ATTR(srcversion); static char last_unloaded_module[MODULE_NAME_LEN+1]; #ifdef CONFIG_MODULE_UNLOAD + +EXPORT_TRACEPOINT_SYMBOL(module_get); + /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) { -- cgit v1.2.3-55-g7522 From 66a8cb95ed04025664d1db4e952155ee1dccd048 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 31 Mar 2010 13:21:56 -0400 Subject: ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any placeholder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a producer/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A placeholder cannot be inserted into the ring buffer since there may never be room. A reader could also come in at any time and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Every time a write takes place, if it overwrites the header page (the next read) it updates an "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perform the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages and records the overrun, then performs the swap and the swap succeeds, the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer.
This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits need to be used for the actual size (fewer, actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter Cc: Andi Kleen Cc: Li Zefan Cc: Arnaldo Carvalho de Melo Cc: "Luis Claudio R. Goncalves" Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- drivers/oprofile/cpu_buffer.c | 4 +- include/linux/ring_buffer.h | 6 ++- kernel/trace/ring_buffer.c | 72 +++++++++++++++++++++++++++++++++--- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 4 +- kernel/trace/trace_functions_graph.c | 5 ++- kernel/trace/trace_selftest.c | 2 +- 7 files changed, 79 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 166b67ea622f..7581dbe456da 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -186,14 +186,14 @@ int op_cpu_buffer_write_commit(struct op_entry *entry) struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) { struct ring_buffer_event *e; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL); if (e) goto event; if (ring_buffer_swap_cpu(op_ring_buffer_read, op_ring_buffer_write, cpu)) return NULL; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL); if (e) goto event; return NULL; diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 5fcc31ed5771..c8297761e414 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -120,9 +120,11 @@ int ring_buffer_write(struct ring_buffer *buffer, unsigned long length, void *data); struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts); +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events); struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts); +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events); struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d1187ef20caf..8295650444c5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -318,6 +318,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/* Flag when events were overwritten */ +#define RB_MISSED_EVENTS (1 << 31) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -416,6 +419,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); + ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + ret = trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), @@ -439,6 +448,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long lost_events; + unsigned long last_overrun; local_t commit_overrun; local_t overrun; local_t entries; @@ -2835,6 +2846,7 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long overwrite; unsigned long flags; int nr_loops = 0; int ret; @@ -2895,6 +2907,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* The reader page will be pointing to the new head */ rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); + /* + * We want to make sure we read the overruns after we set up our + * pointers to the next object. The writer side does a + * cmpxchg to cross pages which acts as the mb on the writer + * side. Note, the reader will constantly fail the swap + * while the writer is updating the pointers, so this + * guarantees that the overwrite recorded here is the one we + * want to compare with the last_overrun. + */ + smp_mb(); + overwrite = local_read(&(cpu_buffer->overrun)); + /* * Here's the tricky part. * @@ -2926,6 +2950,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page = reader; rb_reset_reader_page(cpu_buffer); + if (overwrite != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overwrite; + } + goto again; out: @@ -3002,8 +3031,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->lost_events; +} + static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct buffer_page *reader; @@ -3055,6 +3090,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } + if (lost_events) + *lost_events = rb_lost_events(cpu_buffer); return event; default: @@ -3165,12 +3202,14 @@ static inline int rb_ok_to_lock(void) * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. + * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. 
*/ struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -3185,7 +3224,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts); + event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) @@ -3227,13 +3266,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from + * @cpu: the cpu to read the buffer from + * @ts: a variable to store the timestamp (may be NULL) + * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; @@ -3254,9 +3297,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts); - if (event) + event = rb_buffer_peek(cpu_buffer, ts, lost_events); + if (event) { + cpu_buffer->lost_events = 0; rb_advance_reader(cpu_buffer); + } if (dolock) spin_unlock(&cpu_buffer->reader_lock); @@ -3405,6 +3450,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + cpu_buffer->lost_events = 0; + cpu_buffer->last_overrun = 0; + rb_head_page_activate(cpu_buffer); } @@ -3684,6 +3732,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned int commit; unsigned int read; u64 save_timestamp; + int missed_events = 0; int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -3716,6 +3765,10 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = reader->read; commit = rb_page_commit(reader); + /* Check if any events were dropped */ + if (cpu_buffer->lost_events) + missed_events = 1; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -3779,6 +3832,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } ret = read; + cpu_buffer->lost_events = 0; + /* + * Set a flag in the commit field if we lost events + */ + if (missed_events) + local_add(RB_MISSED_EVENTS, &bpage->commit); + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index df74c7982255..dc56556b55a2 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -81,7 +81,7 @@ static enum event_status read_event(int cpu) int *entry; u64 ts; - event = ring_buffer_consume(buffer, cpu, &ts); + event = ring_buffer_consume(buffer, cpu, &ts, NULL); if (!event) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ec2ee6f6560..fabb0033a9be 100644 --- a/kernel/trace/trace.c +++ 
b/kernel/trace/trace.c @@ -1556,7 +1556,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts); + event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL); ftrace_enable_cpu(); @@ -1635,7 +1635,7 @@ static void trace_consume(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL); ftrace_enable_cpu(); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e6989d9b44da..a7f75fb10aa4 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -489,9 +489,10 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + ring_buffer_consume(iter->tr->buffer, iter->cpu, + NULL, NULL); event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); + NULL, NULL); } if (!event) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea470d67..e50180874c63 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* -- cgit v1.2.3-55-g7522 From bc21b478425ac73f66a5ec0b375a5e0d12d609ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 31 Mar 2010 19:49:26 -0400 Subject: tracing: Show the lost events in the trace_pipe output Now that the ring buffer can keep track of where events are lost, use this information in the output of trace_pipe: hackbench-3588 [001] 1326.701660: lock_acquire: ffffffff816591e0 read rcu_read_lock hackbench-3588 [001] 1326.701661: lock_acquire: ffff88003f4091f0 &(&dentry->d_lock)->rlock hackbench-3588 [001] 1326.701664: lock_release: ffff88003f4091f0 &(&dentry->d_lock)->rlock CPU:1 [LOST 673 EVENTS] hackbench-3588 [001] 1326.702711: kmem_cache_free: call_site=ffffffff81102b85 ptr=ffff880026d96738 hackbench-3588 [001] 1326.702712: lock_release: ffff88003e1480a8 &mm->mmap_sem hackbench-3588 [001] 1326.702713: lock_acquire: ffff88003e1480a8 &mm->mmap_sem Even works with the function graph tracer: 2) ! 170.098 us | } 2) 4.036 us | rcu_irq_exit(); 2) 3.657 us | idle_cpu(); 2) ! 190.301 us | } CPU:2 [LOST 2196 EVENTS] 2) 0.853 us | } /* cancel_dirty_page */ 2) | remove_from_page_cache() { 2) 1.578 us | _raw_spin_lock_irq(); 2) | __remove_from_page_cache() { Note, it does not work with the iterator "trace" file, since it requires consuming the page from the ring buffer to determine how many events were lost, which the iterator does not do.
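Roughly how a consuming reader picks up the count (an illustrative sketch pieced together from the trace.c hunks below; the real code routes the count through iter->lost_events via trace_consume() and prints it in print_trace_line()):

	unsigned long lost_events;
	struct ring_buffer_event *event;

	/* A consuming read also reports how many events were lost on
	 * this CPU since the last reader page swap (0 if none).
	 */
	event = ring_buffer_consume(iter->tr->buffer, iter->cpu,
				    &iter->ts, &lost_events);

	if (lost_events)
		trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
				 iter->cpu, lost_events);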
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.c | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c0f4b364c711..39e71b0a3bfd 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -58,6 +58,7 @@ struct trace_iterator { /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; + unsigned long lost_events; int leftover; int cpu; u64 ts; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fabb0033a9be..0498bebcbfd1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1545,7 +1545,8 @@ static void trace_iterator_increment(struct trace_iterator *iter) } static struct trace_entry * -peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; @@ -1556,7 +1557,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL); + event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + lost_events); ftrace_enable_cpu(); @@ -1564,10 +1566,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) } static struct trace_entry * -__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, + unsigned long *missing_events, u64 *ent_ts) { struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + unsigned long lost_events, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; @@ -1580,7 +1584,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (cpu_file > TRACE_PIPE_ALL_CPU) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; - ent = peek_next_entry(iter, cpu_file, ent_ts); + ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; @@ -1592,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ring_buffer_empty_cpu(buffer, cpu)) continue; - ent = peek_next_entry(iter, cpu, &ts); + ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: @@ -1601,6 +1605,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) next = ent; next_cpu = cpu; next_ts = ts; + next_lost = lost_events; } } @@ -1610,6 +1615,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ent_ts) *ent_ts = next_ts; + if (missing_events) + *missing_events = next_lost; + return next; } @@ -1617,13 +1625,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, ent_ts); + return __find_next_entry(iter, ent_cpu, NULL, ent_ts); } /* Find the next real entry, and increment the iterator to the next entry */ static void *find_next_entry_inc(struct trace_iterator *iter) { - iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); + iter->ent = __find_next_entry(iter, &iter->cpu, + &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); @@ -1635,7 
+1644,8 @@ static void trace_consume(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + &iter->lost_events); ftrace_enable_cpu(); } @@ -2030,6 +2040,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); if (ret != TRACE_TYPE_UNHANDLED) -- cgit v1.2.3-55-g7522 From ff0ff84a0767df48d728c36510365344a7e7d582 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 31 Mar 2010 22:11:42 -0400 Subject: ring-buffer: Add lost event count to end of sub buffer Currently, binary readers of the ring buffer only know where events were lost, but not how many events were lost at that location. This information is available, but it would require adding another field to the sub buffer header to include it. But when an event cannot fit at the end of a sub buffer, it is written to the next sub buffer. This means there is a good chance that the buffer may have room to hold this counter. If it does, write the counter at the end of the sub buffer and set another flag in the data size field that states that this information exists. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8295650444c5..dc6d563a6d22 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -320,6 +320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) +/* Missed count stored at end */ +#define RB_MISSED_STORED (1 << 30) struct buffer_data_page { u64 time_stamp; /* page time stamp */ @@ -340,6 +342,7 @@ struct buffer_page { local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ + unsigned long real_end; /* real end of data */ struct buffer_data_page *page; /* Actual data page */ }; @@ -1769,6 +1772,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); + /* + * Save the original length to the meta data. + * This will be used by the reader to add lost event + * counter.
+ */ + tail_page->real_end = tail; + /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the @@ -2888,6 +2898,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->real_end = 0; spin: /* @@ -3728,11 +3739,11 @@ int ring_buffer_read_page(struct ring_buffer *buffer, struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; + unsigned long missed_events; unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int missed_events = 0; int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -3766,8 +3777,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, commit = rb_page_commit(reader); /* Check if any events were dropped */ - if (cpu_buffer->lost_events) - missed_events = 1; + missed_events = cpu_buffer->lost_events; /* * If this page has been partially read or @@ -3829,6 +3839,14 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; + + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); } ret = read; @@ -3836,8 +3854,19 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* * Set a flag in the commit field if we lost events */ - if (missed_events) + if (missed_events) { + commit = local_read(&bpage->commit); + + /* If there is room at the end of the page to save the + * missed events, then record it there. + */ + if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + } local_add(RB_MISSED_EVENTS, &bpage->commit); + } out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3-55-g7522 From 32bd7eb5a7f4596c8440dd9440322fe9e686634d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Mar 2010 13:17:19 +0800 Subject: sched: Remove remaining USER_SCHED code This is left over from commit 7c9414385e ("sched: Remove USER_SCHED") Signed-off-by: Li Zefan Acked-by: Dhaval Giani Signed-off-by: Peter Zijlstra Cc: David Howells LKML-Reference: <4BA9A05F.7010407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- init/Kconfig | 3 +-- kernel/capability.c | 1 - kernel/cred-internals.h | 21 --------------------- kernel/cred.c | 3 --- kernel/exit.c | 1 - kernel/sched_debug.c | 5 ----- kernel/user.c | 10 +--------- 7 files changed, 2 insertions(+), 42 deletions(-) delete mode 100644 kernel/cred-internals.h (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index eb77e8ccde1c..5fe94b82e4c0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -604,8 +604,7 @@ config RT_GROUP_SCHED default n help This feature lets you explicitly allocate real CPU bandwidth - to users or control groups (depending on the "Basis for grouping tasks" - setting below. If enabled, it will also make it impossible to + to task groups. If enabled, it will also make it impossible to schedule realtime tasks for non-root users until you allocate realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.txt for more information.
diff --git a/kernel/capability.c b/kernel/capability.c index 9e4697e9b276..2f05303715a5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -15,7 +15,6 @@ #include #include #include -#include "cred-internals.h" /* * Leveraged for setting/resetting capabilities diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h deleted file mode 100644 index 2dc4fc2d0bf1..000000000000 --- a/kernel/cred-internals.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Internal credentials stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -/* - * user.c - */ -static inline void sched_switch_user(struct task_struct *p) -{ -#ifdef CONFIG_USER_SCHED - sched_move_task(p); -#endif /* CONFIG_USER_SCHED */ -} - diff --git a/kernel/cred.c b/kernel/cred.c index 1b1129d0cce8..19d3ccce3d4d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,7 +16,6 @@ #include #include #include -#include "cred-internals.h" #if 0 #define kdebug(FMT, ...) \ @@ -557,8 +556,6 @@ int commit_creds(struct cred *new) atomic_dec(&old->user->processes); alter_cred_subscribers(old, -2); - sched_switch_user(task); - /* send notifications */ if (new->uid != old->uid || new->euid != old->euid || diff --git a/kernel/exit.c b/kernel/exit.c index cce59cb5ee6a..84dc4b294e47 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -55,7 +55,6 @@ #include #include #include -#include "cred-internals.h" static void exit_mm(struct task_struct * tsk); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8a46a719f367..0932c5c45b34 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -173,11 +173,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) task_group_path(tg, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - { - uid_t uid = cfs_rq->tg->uid; - SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid); - } #else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif diff --git a/kernel/user.c b/kernel/user.c index ec3b2229893b..8e1c8c0a496c 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,7 +16,6 @@ #include #include #include -#include "cred-internals.h" struct user_namespace init_user_ns = { .kref = { @@ -137,9 +136,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up, *new; - /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() - * atomic. - */ + /* Make uid_hash_find() + uid_hash_insert() atomic. */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -161,11 +158,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - /* This case is not possible when CONFIG_USER_SCHED - * is defined, since we serialize alloc_uid() using - * uids_mutex. Hence no need to call - * sched_destroy_user() or remove_user_sysfs_dir(). 
- */ key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); -- cgit v1.2.3-55-g7522 From 897f0b3c3ff40b443c84e271bef19bd6ae885195 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:03 +0100 Subject: sched: Kill the broken and deadlockable cpuset_lock/cpuset_cpus_allowed_locked code This patch just states the fact that the cpusets/cpuhotplug interaction is broken and removes the deadlockable code which only pretends to work. - cpuset_lock() doesn't really work. It is needed for cpuset_cpus_allowed_locked() but we can't take this lock in the try_to_wake_up()->select_fallback_rq() path. - cpuset_lock() is deadlockable. Suppose that a task T bound to CPU takes callback_mutex. If cpu_down(CPU) happens before T drops callback_mutex, stop_machine() preempts T, then migration_call(CPU_DEAD) tries to take cpuset_lock() and hangs forever because CPU is already dead and thus T can't be scheduled. - cpuset_cpus_allowed_locked() is deadlockable too. It takes task_lock(), which is not irq-safe, but try_to_wake_up() can be called from irq. Kill them, and change select_fallback_rq() to use cpu_possible_mask, like we currently do without CONFIG_CPUSETS. Also, with or without this patch, with or without CONFIG_CPUSETS, the callers of select_fallback_rq() can race with each other or with set_cpus_allowed() paths. The subsequent patches try to fix these problems. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091003.GA9123@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 13 ------------- kernel/cpuset.c | 27 +-------------------------- kernel/sched.c | 10 +++------- 3 files changed, 4 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a5740fc4d04b..eeaaee746bee 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -21,8 +21,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system?
*/ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_locked(struct task_struct *p, - struct cpumask *mask); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -69,9 +67,6 @@ struct seq_file; extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); -extern void cpuset_lock(void); -extern void cpuset_unlock(void); - extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) @@ -105,11 +100,6 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, { cpumask_copy(mask, cpu_possible_mask); } -static inline void cpuset_cpus_allowed_locked(struct task_struct *p, - struct cpumask *mask) -{ - cpumask_copy(mask, cpu_possible_mask); -} static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { @@ -157,9 +147,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m, { } -static inline void cpuset_lock(void) {} -static inline void cpuset_unlock(void) {} - static inline int cpuset_mem_spread_node(void) { return 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d10946748ec2..9a747f56d58c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2182,19 +2182,10 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); - cpuset_cpus_allowed_locked(tsk, pmask); - mutex_unlock(&callback_mutex); -} - -/** - * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. - * Must be called with callback_mutex held. - **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) -{ task_lock(tsk); guarantee_online_cpus(task_cs(tsk), pmask); task_unlock(tsk); + mutex_unlock(&callback_mutex); } void cpuset_init_current_mems_allowed(void) @@ -2382,22 +2373,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) return 0; } -/** - * cpuset_lock - lock out any changes to cpuset structures - * - * The out of memory (oom) code needs to mutex_lock cpusets - * from being changed while it scans the tasklist looking for a - * task in an overlapping cpuset. Expose callback_mutex via this - * cpuset_lock() routine, so the oom code can lock it, before - * locking the task list. The tasklist_lock is a spinlock, so - * must be taken inside callback_mutex. - */ - -void cpuset_lock(void) -{ - mutex_lock(&callback_mutex); -} - /** * cpuset_unlock - release lock on cpuset changes * diff --git a/kernel/sched.c b/kernel/sched.c index 52b7efd27416..c0b3ebc16317 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2296,11 +2296,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. 
*/ - if (dest_cpu >= nr_cpu_ids) { - rcu_read_lock(); - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - rcu_read_unlock(); - dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + if (unlikely(dest_cpu >= nr_cpu_ids)) { + cpumask_copy(&p->cpus_allowed, cpu_possible_mask); + dest_cpu = cpumask_any(cpu_active_mask); /* * Don't tell them about moving exiting tasks or @@ -5866,7 +5864,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: case CPU_DEAD_FROZEN: - cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -5879,7 +5876,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); raw_spin_unlock_irq(&rq->lock); - cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); calc_global_load_remove(rq); -- cgit v1.2.3-55-g7522 From 1445c08d06c5594895b4fae952ef8a457e89c390 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:10 +0100 Subject: sched: move_task_off_dead_cpu(): Take rq->lock around select_fallback_rq() move_task_off_dead_cpu()->select_fallback_rq() reads/updates ->cpus_allowed lockless. We can race with set_cpus_allowed() running in parallel. Change it to take rq->lock around select_fallback_rq(). Note that it is not trivial to move this spin_lock() into select_fallback_rq(), we must recheck the task was not migrated after we take the lock and other callers do not need this lock. To avoid the races with other callers of select_fallback_rq() which rely on TASK_WAKING, we also check p->state != TASK_WAKING and do nothing otherwise. The owner of TASK_WAKING must update ->cpus_allowed and choose the correct CPU anyway, and the subsequent __migrate_task() is just meaningless because p->se.on_rq must be false. Alternatively, we could change select_task_rq() to take rq->lock right after it calls sched_class->select_task_rq(), but this looks a bit ugly. Also, change it to not assume irqs are disabled and absorb __migrate_task_irq(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091010.GA9131@redhat.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c0b3ebc16317..27774b5aeb61 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5448,29 +5448,29 @@ static int migration_thread(void *data) } #ifdef CONFIG_HOTPLUG_CPU - -static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) -{ - int ret; - - local_irq_disable(); - ret = __migrate_task(p, src_cpu, dest_cpu); - local_irq_enable(); - return ret; -} - /* * Figure out where task on dead CPU should go, use force if necessary. */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - int dest_cpu; - + struct rq *rq = cpu_rq(dead_cpu); + int needs_cpu, uninitialized_var(dest_cpu); + unsigned long flags; again: - dest_cpu = select_fallback_rq(dead_cpu, p); + local_irq_save(flags); + + raw_spin_lock(&rq->lock); + needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); + if (needs_cpu) + dest_cpu = select_fallback_rq(dead_cpu, p); + raw_spin_unlock(&rq->lock); /* It can have affinity changed while we were choosing. 
*/ - if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) + if (needs_cpu) + needs_cpu = !__migrate_task(p, dead_cpu, dest_cpu); + local_irq_restore(flags); + + if (unlikely(needs_cpu)) goto again; } -- cgit v1.2.3-55-g7522 From c1804d547dc098363443667609c272d1e4d15ee8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:14 +0100 Subject: sched: move_task_off_dead_cpu(): Remove retry logic The previous patch preserved the retry logic, but it looks unneeded. __migrate_task() can only fail if we raced with migration after we dropped the lock, but in this case the caller of set_cpus_allowed/etc must initiate migration itself if ->on_rq == T. We already fixed p->cpus_allowed, and the changes in the active/online masks must be visible to the racer, which should migrate the task to an online cpu correctly. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091014.GA9138@redhat.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 27774b5aeb61..f475c608b073 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5456,7 +5456,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) struct rq *rq = cpu_rq(dead_cpu); int needs_cpu, uninitialized_var(dest_cpu); unsigned long flags; -again: + local_irq_save(flags); raw_spin_lock(&rq->lock); @@ -5464,14 +5464,13 @@ again: if (needs_cpu) dest_cpu = select_fallback_rq(dead_cpu, p); raw_spin_unlock(&rq->lock); - - /* It can have affinity changed while we were choosing. */ + /* + * It can only fail if we race with set_cpus_allowed(), + * in the racer should migrate the task anyway. + */ if (needs_cpu) - needs_cpu = !__migrate_task(p, dead_cpu, dest_cpu); + __migrate_task(p, dead_cpu, dest_cpu); local_irq_restore(flags); - - if (unlikely(needs_cpu)) - goto again; } -- cgit v1.2.3-55-g7522 From 30da688ef6b76e01969b00608202fff1eed2accc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:19 +0100 Subject: sched: sched_exec(): Remove the select_fallback_rq() logic sched_exec()->select_task_rq() reads/updates ->cpus_allowed lockless. This can race with other CPUs updating our ->cpus_allowed, and this looks meaningless to me. The task is current and running, it must have online cpus in ->cpus_allowed, the fallback mode is bogus. And, if ->sched_class returns the "wrong" cpu, this likely means we raced with set_cpus_allowed() which was called for a reason; why should sched_exec() retry and call ->select_task_rq() again? Change the code to call sched_class->select_task_rq() directly and do nothing if the returned cpu is wrong after re-checking under rq->lock. From now on task_struct->cpus_allowed is always stable under TASK_WAKING: select_fallback_rq() is always called under rq->lock, or the caller owns TASK_WAKING (select_task_rq). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091019.GA9141@redhat.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f475c608b073..165b532dd8c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2280,6 +2280,9 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +/* + * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */ static int select_fallback_rq(int cpu, struct task_struct *p) { int dest_cpu; @@ -2316,12 +2319,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * Gets called from 3 sites (exec, fork, wakeup), since it is called without - * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done - * by: - * - * exec: is unstable, retry loop - * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING + * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) @@ -3076,9 +3074,8 @@ void sched_exec(void) unsigned long flags; struct rq *rq; -again: this_cpu = get_cpu(); - dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == this_cpu) { put_cpu(); return; @@ -3086,18 +3083,12 @@ again: rq = task_rq_lock(p, &flags); put_cpu(); - /* * select_task_rq() can race against ->cpus_allowed */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) { - task_rq_unlock(rq, &flags); - goto again; - } - - /* force the process onto the specified CPU */ - if (migrate_task(p, dest_cpu, &req)) { + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && + likely(cpu_active(dest_cpu)) && + migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ struct task_struct *mt = rq->migration_thread; -- cgit v1.2.3-55-g7522 From 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:23 +0100 Subject: sched: _cpu_down(): Don't play with current->cpus_allowed _cpu_down() changes the current task's affinity and then recovers it at the end. The problems are well known: we can't restore old_allowed if it was bound to the now-dead-cpu, and we can race with the userspace which can change cpu-affinity during unplug. _cpu_down() should not play with current->cpus_allowed at all. Instead, take_cpu_down() can migrate the caller of _cpu_down() after __cpu_disable() removes the dying cpu from cpu_online_mask. Signed-off-by: Oleg Nesterov Acked-by: Rafael J. Wysocki Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091023.GA9148@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/cpu.c | 18 ++++++------------ kernel/sched.c | 2 +- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 43c945152732..8bea40725c76 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1843,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU +extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} diff --git a/kernel/cpu.c b/kernel/cpu.c index f8cced2692b3..8d340faac380 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -163,6 +163,7 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { + struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -171,6 +172,7 @@ struct take_cpu_down_param { static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; + unsigned int cpu = (unsigned long)param->hcpu; int err; /* Ensure this CPU doesn't handle any more interrupts. 
*/ @@ -181,6 +183,8 @@ static int __ref take_cpu_down(void *_param) raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, param->hcpu); + if (task_cpu(param->caller) == cpu) + move_task_off_dead_cpu(cpu, param->caller); /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -191,10 +195,10 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - cpumask_var_t old_allowed; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { + .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -205,9 +209,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) - return -ENOMEM; - cpu_hotplug_begin(); set_cpu_active(cpu, false); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, @@ -224,10 +225,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_release; } - /* Ensure that we are not runnable on dying cpu */ - cpumask_copy(old_allowed, &current->cpus_allowed); - set_cpus_allowed_ptr(current, cpu_active_mask); - err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { set_cpu_active(cpu, true); @@ -236,7 +233,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) hcpu) == NOTIFY_BAD) BUG(); - goto out_allowed; + goto out_release; } BUG_ON(cpu_online(cpu)); @@ -254,8 +251,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_allowed: - set_cpus_allowed_ptr(current, old_allowed); out_release: cpu_hotplug_done(); if (!err) { @@ -263,7 +258,6 @@ out_release: hcpu) == NOTIFY_BAD) BUG(); } - free_cpumask_var(old_allowed); return err; } diff --git a/kernel/sched.c b/kernel/sched.c index 165b532dd8c2..11119deffa48 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5442,7 +5442,7 @@ static int migration_thread(void *data) /* * Figure out where task on dead CPU should go, use force if necessary. */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { struct rq *rq = cpu_rq(dead_cpu); int needs_cpu, uninitialized_var(dest_cpu); -- cgit v1.2.3-55-g7522 From 9084bb8246ea935b98320554229e2f371f7f52fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:27 +0100 Subject: sched: Make select_fallback_rq() cpuset friendly Introduce cpuset_cpus_allowed_fallback() helper to fix the cpuset problems with select_fallback_rq(). It can be called from any context and can't use any cpuset locks including task_lock(). It is called when the task doesn't have online cpus in ->cpus_allowed but ttwu/etc must be able to find a suitable cpu. I am not proud of this patch. Everything which needs such a fat comment can't be good even if correct. But I'd prefer to not change the locking rules in the code I hardly understand, and in any case I believe this simple change makes the code much more correct compared to the deadlocks we currently have.
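As an illustration only (not part of the patch), the fallback ordering that select_fallback_rq() ends up with can be sketched as follows; the helper names mirror the hunks below, and the intermediate steps of the real function are elided:

/* Sketch under the above assumptions -- not the actual kernel code. */
static int fallback_cpu_sketch(struct task_struct *p)
{
	int dest_cpu;

	/* any active cpu still permitted by ->cpus_allowed? */
	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
	if (dest_cpu < nr_cpu_ids)
		return dest_cpu;

	/*
	 * No allowed cpu is online: let the cpuset code pick one. It may
	 * widen ->cpus_allowed, ultimately to cpu_possible_mask, with the
	 * pending set_cpus_allowed_ptr() acting as the synchronization point.
	 */
	return cpuset_cpus_allowed_fallback(p);
}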
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091027.GA9155@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 7 +++++++ kernel/cpuset.c | 42 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched.c | 4 +--- 3 files changed, 50 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index eeaaee746bee..a73454aec333 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -21,6 +21,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -101,6 +102,12 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, cpu_possible_mask); } +static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) +{ + cpumask_copy(&p->cpus_allowed, cpu_possible_mask); + return cpumask_any(cpu_active_mask); +} + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9a747f56d58c..9a50c5f6e727 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2188,6 +2188,48 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) mutex_unlock(&callback_mutex); } +int cpuset_cpus_allowed_fallback(struct task_struct *tsk) +{ + const struct cpuset *cs; + int cpu; + + rcu_read_lock(); + cs = task_cs(tsk); + if (cs) + cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); + rcu_read_unlock(); + + /* + * We own tsk->cpus_allowed, nobody can change it under us. + * + * But we used cs && cs->cpus_allowed lockless and thus can + * race with cgroup_attach_task() or update_cpumask() and get + * the wrong tsk->cpus_allowed. However, both cases imply the + * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() + * which takes task_rq_lock(). + * + * If we are called after it dropped the lock we must see all + * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary + * set any mask even if it is not right from task_cs() pov, + * the pending set_cpus_allowed_ptr() will fix things. + */ + + cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); + if (cpu >= nr_cpu_ids) { + /* + * Either tsk->cpus_allowed is wrong (see above) or it + * is actually empty. The latter case is only possible + * if we are racing with remove_tasks_in_empty_cpuset(). + * Like above we can temporary set any mask and rely on + * set_cpus_allowed_ptr() as synchronization point. + */ + cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); + cpu = cpumask_any(cpu_active_mask); + } + + return cpu; +} + void cpuset_init_current_mems_allowed(void) { nodes_setall(current->mems_allowed); diff --git a/kernel/sched.c b/kernel/sched.c index 11119deffa48..9a38c7a24ed7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2300,9 +2300,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* No more Mr. Nice Guy. 
*/ if (unlikely(dest_cpu >= nr_cpu_ids)) { - cpumask_copy(&p->cpus_allowed, cpu_possible_mask); - dest_cpu = cpumask_any(cpu_active_mask); - + dest_cpu = cpuset_cpus_allowed_fallback(p); /* * Don't tell them about moving exiting tasks or * kernel threads (both mm NULL), since they never -- cgit v1.2.3-55-g7522 From 0017d735092844118bef006696a750a0e4ef6ebd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2010 18:34:10 +0100 Subject: sched: Fix TASK_WAKING vs fork deadlock Oleg noticed a few races with the TASK_WAKING usage on fork. - since TASK_WAKING is basically a spinlock, it should be IRQ safe - since we set TASK_WAKING (*) without holding rq->lock, there could still be a rq->lock holder, so full serialization is not actually provided. (*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING. Cure the second issue by not setting TASK_WAKING in sched_fork(), but only temporarily in wake_up_new_task() while calling select_task_rq(). Cure the first by holding rq->lock around the select_task_rq() call; this disables IRQs, but it requires that we push the rq->lock release down into select_task_rq_fair()'s cgroup stuff. Because select_task_rq_fair() still needs to drop the rq->lock we cannot fully get rid of TASK_WAKING. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- kernel/sched.c | 65 ++++++++++++++++++------------------------------- kernel/sched_fair.c | 8 ++++-- kernel/sched_idletask.c | 3 ++- kernel/sched_rt.c | 5 ++-- 5 files changed, 36 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8bea40725c76..fb6c18843ee8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,7 +1046,8 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + int (*select_task_rq)(struct rq *rq, struct task_struct *p, + int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); diff --git a/kernel/sched.c b/kernel/sched.c index 9a38c7a24ed7..dcd17736dae1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -916,14 +916,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) /* * Check whether the task is waking, we use this to synchronize against * ttwu() so that task_cpu() reports a stable number. - * - * We need to make an exception for PF_STARTING tasks because the fork - * path might require task_rq_lock() to work, eg. it can call - * set_cpus_allowed_ptr() from the cpuset clone_ns code. */ static inline int task_is_waking(struct task_struct *p) { - return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); + return unlikely(p->state == TASK_WAKING); } /* @@ -2320,9 +2316,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
*/ static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2393,17 +2389,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (p->sched_class->task_waking) p->sched_class->task_waking(rq, p); - __task_rq_unlock(rq); - - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) { - /* - * Since we migrate the task without holding any rq->lock, - * we need to be careful with task_rq_lock(), since that - * might end up locking an invalid rq. - */ + cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - } + __task_rq_unlock(rq); rq = cpu_rq(cpu); raw_spin_lock(&rq->lock); @@ -2530,11 +2519,11 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); /* - * We mark the process as waking here. This guarantees that + * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_WAKING; + p->state = TASK_RUNNING; /* * Revert to default priority/policy on fork if requested. @@ -2601,28 +2590,25 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) int cpu __maybe_unused = get_cpu(); #ifdef CONFIG_SMP + rq = task_rq_lock(p, &flags); + p->state = TASK_WAKING; + /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug * - * We still have TASK_WAKING but PF_STARTING is gone now, meaning - * ->cpus_allowed is stable, we have preemption disabled, meaning - * cpu_online_mask is stable. + * We set TASK_WAKING so that select_task_rq() can drop rq->lock + * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); set_task_cpu(p, cpu); -#endif - - /* - * Since the task is not on the rq and we still have TASK_WAKING set - * nobody else will migrate this task. 
- */ - rq = cpu_rq(cpu); - raw_spin_lock_irqsave(&rq->lock, flags); - BUG_ON(p->state != TASK_WAKING); p->state = TASK_RUNNING; + task_rq_unlock(rq, &flags); +#endif + + rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); @@ -3068,19 +3054,15 @@ void sched_exec(void) { struct task_struct *p = current; struct migration_req req; - int dest_cpu, this_cpu; unsigned long flags; struct rq *rq; - - this_cpu = get_cpu(); - dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); - if (dest_cpu == this_cpu) { - put_cpu(); - return; - } + int dest_cpu; rq = task_rq_lock(p, &flags); - put_cpu(); + dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + if (dest_cpu == smp_processor_id()) + goto unlock; + /* * select_task_rq() can race against ->cpus_allowed */ @@ -3098,6 +3080,7 @@ void sched_exec(void) return; } +unlock: task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 49ad99378f82..8a5e7632d09b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1423,7 +1423,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) * * preempt must be disabled. */ -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +static int +select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -1521,8 +1522,11 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag cpumask_weight(sched_domain_span(sd)))) tmp = affine_sd; - if (tmp) + if (tmp) { + raw_spin_unlock(&rq->lock); update_shares(tmp); + raw_spin_lock(&rq->lock); + } } #endif diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a8a6d8a50947..5af709f503b0 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,8 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +static int +select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 012d69bb67c7..fde895f8044d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +static int +select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) { - struct rq *rq = task_rq(p); - if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); -- cgit v1.2.3-55-g7522 From 65cc8e4859ff29a9ddc989c88557d6059834c2a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2010 21:05:16 +0100 Subject: sched: Optimize task_rq_lock() Now that we hold the rq->lock over set_task_cpu() again, we can do away with most of the TASK_WAKING checks and reduce them again to set_cpus_allowed_ptr(). Removes some conditionals from scheduling hot-paths. 
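Restated as a standalone sketch (it simply mirrors the hunks below), the simplified lock-and-revalidate pattern in the hot path is:

/* Sketch: lock the task's rq, then recheck that it is still its rq. */
for (;;) {
	rq = task_rq(p);
	raw_spin_lock(&rq->lock);
	if (likely(rq == task_rq(p)))
		break;	/* task_rq(p) is stable while rq->lock is held */
	raw_spin_unlock(&rq->lock);
}

Only set_cpus_allowed_ptr() still has to wait out TASK_WAKING, which the patch does with an explicit retry loop around task_rq_lock().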
Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dcd17736dae1..51d336e08a92 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -914,8 +914,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * Check whether the task is waking, we use this to synchronize against - * ttwu() so that task_cpu() reports a stable number. + * Check whether the task is waking, we use this to synchronize ->cpus_allowed + * against ttwu(). */ static inline int task_is_waking(struct task_struct *p) { @@ -932,11 +932,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) struct rq *rq; for (;;) { - while (task_is_waking(p)) - cpu_relax(); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_is_waking(p))) + if (likely(rq == task_rq(p))) return rq; raw_spin_unlock(&rq->lock); } @@ -953,12 +951,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) struct rq *rq; for (;;) { - while (task_is_waking(p)) - cpu_relax(); local_irq_save(*flags); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_is_waking(p))) + if (likely(rq == task_rq(p))) return rq; raw_spin_unlock_irqrestore(&rq->lock, *flags); } @@ -5262,7 +5258,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; + /* + * Serialize against TASK_WAKING so that ttwu() and wunt() can + * drop the rq->lock and still rely on ->cpus_allowed. + */ +again: + while (task_is_waking(p)) + cpu_relax(); rq = task_rq_lock(p, &flags); + if (task_is_waking(p)) { + task_rq_unlock(rq, &flags); + goto again; + } if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; -- cgit v1.2.3-55-g7522 From cc87f76a601d2d256118f7bab15e35254356ae21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Mar 2010 12:22:14 +0100 Subject: sched: Fix nr_uninterruptible count The cpuload calculation in calc_load_account_active() assumes rq->nr_uninterruptible will not change on an offline cpu after migrate_nr_uninterruptible(). However the recent migrate on wakeup changes broke that and would result in decrementing the offline cpu's rq->nr_uninterruptible. Fix this by accounting the nr_uninterruptible on the waking cpu. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 51d336e08a92..14c8d2a1b38a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2378,8 +2378,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, * * First fix up the nr_uninterruptible count: */ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; + if (task_contributes_to_load(p)) { + if (likely(cpu_online(orig_cpu))) + rq->nr_uninterruptible--; + else + this_rq()->nr_uninterruptible--; + } p->state = TASK_WAKING; if (p->sched_class->task_waking) -- cgit v1.2.3-55-g7522 From 371fd7e7a56a5c136d31aa980011bd2f131c3ef5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2010 16:38:48 +0100 Subject: sched: Add enqueue/dequeue flags In order to reduce the dependency on TASK_WAKING rework the enqueue interface to support a proper flags field. 
Replace the int wakeup, bool head arguments with an int flags argument and create the following flags: ENQUEUE_WAKEUP - the enqueue is a wakeup of a sleeping task, ENQUEUE_WAKING - the enqueue has relative vruntime due to having sched_class::task_waking() called, ENQUEUE_HEAD - the waking task should be placed on the head of the priority queue (where appropriate). For symmetry, also convert sched_class::dequeue() to a flags scheme. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++--- kernel/sched.c | 32 +++++++++++++++++--------------- kernel/sched_fair.c | 25 ++++++++----------------- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 8 ++++---- 5 files changed, 38 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index fb6c18843ee8..e3e900f318d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1032,12 +1032,17 @@ struct sched_domain; #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_WAKING 2 +#define ENQUEUE_HEAD 4 + +#define DEQUEUE_SLEEP 1 + struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, - bool head); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched.c b/kernel/sched.c index 14c8d2a1b38a..4a57e96dd6c7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1877,44 +1877,43 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } -static void -enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup, head); + p->sched_class->enqueue_task(rq, p, flags); p->se.on_rq = 1; } -static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); sched_info_dequeued(p); - p->sched_class->dequeue_task(rq, p, sleep); + p->sched_class->dequeue_task(rq, p, flags); p->se.on_rq = 0; } /* * activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) +static void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup, false); + enqueue_task(rq, p, flags); inc_nr_running(rq); } /* * deactivate_task - remove a task from the runqueue.
*/ -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) +static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible++; - dequeue_task(rq, p, sleep); + dequeue_task(rq, p, flags); dec_nr_running(rq); } @@ -2353,6 +2352,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; + unsigned long en_flags = ENQUEUE_WAKEUP; struct rq *rq; this_cpu = get_cpu(); @@ -2386,8 +2386,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, } p->state = TASK_WAKING; - if (p->sched_class->task_waking) + if (p->sched_class->task_waking) { p->sched_class->task_waking(rq, p); + en_flags |= ENQUEUE_WAKING; + } cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) @@ -2432,7 +2434,7 @@ out_activate: schedstat_inc(p, se.statistics.nr_wakeups_local); else schedstat_inc(p, se.statistics.nr_wakeups_remote); - activate_task(rq, p, 1); + activate_task(rq, p, en_flags); success = 1; out_running: @@ -3623,7 +3625,7 @@ need_resched_nonpreemptible: if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; else - deactivate_task(rq, prev, 1); + deactivate_task(rq, prev, DEQUEUE_SLEEP); switch_count = &prev->nvcsw; } @@ -4193,7 +4195,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0, oldprio < prio); + enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -4236,7 +4238,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0, false); + enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -8180,7 +8182,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, tsk, 0, false); + enqueue_task(rq, tsk, 0); task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8a5e7632d09b..88d3053ac7c2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -757,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_MIGRATE 2 - static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -767,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; /* @@ -803,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update run-time statistics of the 'current'. 
@@ -811,7 +808,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_curr(cfs_rq); update_stats_dequeue(cfs_rq, se); - if (sleep) { + if (flags & DEQUEUE_SLEEP) { #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -836,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */ - if (!sleep) + if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; } @@ -1045,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq) * then put the task into the rbtree: */ static void -enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int flags = 0; - - if (wakeup) - flags |= ENQUEUE_WAKEUP; - if (p->state == TASK_WAKING) - flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) @@ -1072,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) * decreased. We remove the task from the rbtree and * update the fair scheduling stats: */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep); + dequeue_entity(cfs_rq, se, flags); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; - sleep = 1; + flags |= DEQUEUE_SLEEP; } hrtick_update(rq); diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 5af709f503b0..bea2b8f12024 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -33,7 +33,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq) * message if some code attempts to do it: */ static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { raw_spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fde895f8044d..8afb953e31c6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) * Adding/removing a task to/from a priority array: */ static void -enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) +enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; - if (wakeup) + if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, head); + enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; -- cgit v1.2.3-55-g7522 From 3326c1ceee234e63160852720d48be8a8f7a6d08 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Mar 2010 19:09:33 +0100 Subject: perf_event: Make perf fd non seekable Perf_event does not need seeking, so prevent it in order to get rid of default_llseek, which uses the BKL. 
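A minimal sketch of the resulting pattern (example_fops and example_read are hypothetical names, not from the patch):

#include <linux/fs.h>

static const struct file_operations example_fops = {
	.llseek	= no_llseek,	/* lseek(2) on the fd fails with -ESPIPE */
	.read	= example_read,	/* hypothetical read handler */
};

With .llseek left NULL, the VFS of this era falls back to default_llseek(), which is what drags in the BKL; no_llseek sidesteps that by rejecting seeks outright.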
Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar [drop the nonseekable_open, not needed for anon inodes] Signed-off-by: Frederic Weisbecker --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 63fbce1c80b5..4aa50ff4efc0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2645,6 +2645,7 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { + .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, -- cgit v1.2.3-55-g7522 From aa27497c2fb4c7f57706099bd489e683e5cc3e3b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 5 Apr 2010 17:11:05 +0800 Subject: tracing: Fix uninitialized variable of tracing/trace output Because a local variable is not initialized, I got these when I did 'cat tracing/trace' (not trace_pipe): CPU:0 [LOST 18446744071579453134 EVENTS] ps-3099 [000] 560.770221: lock_acquire: ffff880030865010 &(&dentry->d_lock)->rlock CPU:0 [LOST 18446744071579453134 EVENTS] ps-3099 [000] 560.770221: lock_release: ffff880030865010 &(&dentry->d_lock)->rlock CPU:0 [LOST 18446612133255294080 EVENTS] ps-3099 [000] 560.770221: lock_acquire: ffff880030865010 &(&dentry->d_lock)->rlock CPU:0 [LOST 18446744071579453134 EVENTS] ps-3099 [000] 560.770222: lock_release: ffff880030865010 &(&dentry->d_lock)->rlock CPU:0 [LOST 18446744071579453134 EVENTS] ps-3099 [000] 560.770222: lock_release: ffffffff816cfb98 dcache_lock See peek_next_entry(); it does not set *lost_events when we 'cat tracing/trace'. Signed-off-by: Lai Jiangshan LKML-Reference: <4BB9A929.2000303@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0498bebcbfd1..b9be232352b8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1571,7 +1571,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, { struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; - unsigned long lost_events, next_lost = 0; + unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; -- cgit v1.2.3-55-g7522 From bd6d29c25bb1a24a4c160ec5de43e0004e01f72b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 6 Apr 2010 00:10:17 +0200 Subject: lockstat: Make lockstat counting per cpu Locking statistics are implemented using global atomic variables. This is usually fine unless some paths write them very often. This is the case for the function and function graph tracers that disable irqs for each entry saved (except if the function tracer is in preempt disabled only mode). And calls to local_irq_save/restore() increment hardirqs_on_events and hardirqs_off_events stats (or similar stats for redundant versions). Incrementing these global vars for each function ends up in too much cache bouncing if lockstats are enabled. To solve this, implement the debug_atomic_*() operations using per cpu vars. -v2: Use per_cpu() instead of get_cpu_var() to fetch the desired cpu vars on debug_atomic_read() -v3: Store the stats in a structure. No need for local_t as we are NMI/irq safe. -v4: Fix tons of build errors. I thought I had tested it but I probably forgot to select the relevant config.
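Reduced to a minimal standalone sketch (the evt_* names are made up for illustration; this shows the idiom, not the patch itself):

#include <linux/percpu.h>

/* Writes stay cpu-local (no shared cacheline); reads sum all cpus. */
DEFINE_PER_CPU(unsigned long, evt_count);

static inline void evt_inc(void)		/* hot path */
{
	__get_cpu_var(evt_count)++;		/* irqs assumed disabled */
}

static unsigned long long evt_read(void)	/* rare, slow path */
{
	unsigned long long total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += per_cpu(evt_count, cpu);
	return total;
}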
Suggested-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1270505417-8144-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt --- kernel/lockdep.c | 47 +++++++++++------------------ kernel/lockdep_internals.h | 74 +++++++++++++++++++++++++++++++++------------- kernel/lockdep_proc.c | 58 ++++++++++++++++++------------------ 3 files changed, 99 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0c30d0455de1..069af0276bf7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -430,20 +430,7 @@ static struct stack_trace lockdep_init_trace = { /* * Various lockdep statistics: */ -atomic_t chain_lookup_hits; -atomic_t chain_lookup_misses; -atomic_t hardirqs_on_events; -atomic_t hardirqs_off_events; -atomic_t redundant_hardirqs_on; -atomic_t redundant_hardirqs_off; -atomic_t softirqs_on_events; -atomic_t softirqs_off_events; -atomic_t redundant_softirqs_on; -atomic_t redundant_softirqs_off; -atomic_t nr_unused_locks; -atomic_t nr_cyclic_checks; -atomic_t nr_find_usage_forwards_checks; -atomic_t nr_find_usage_backwards_checks; +DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif /* @@ -758,7 +745,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } class = lock_classes + nr_lock_classes++; - debug_atomic_inc(&nr_unused_locks); + debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; @@ -1215,7 +1202,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target, { int result; - debug_atomic_inc(&nr_cyclic_checks); + debug_atomic_inc(nr_cyclic_checks); result = __bfs_forwards(root, target, class_equal, target_entry); @@ -1252,7 +1239,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, { int result; - debug_atomic_inc(&nr_find_usage_forwards_checks); + debug_atomic_inc(nr_find_usage_forwards_checks); result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); @@ -1275,7 +1262,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, { int result; - debug_atomic_inc(&nr_find_usage_backwards_checks); + debug_atomic_inc(nr_find_usage_backwards_checks); result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); @@ -1835,7 +1822,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, list_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: - debug_atomic_inc(&chain_lookup_hits); + debug_atomic_inc(chain_lookup_hits); if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -1900,7 +1887,7 @@ cache_hit: chain_hlocks[chain->base + j] = class - lock_classes; } list_add_tail_rcu(&chain->entry, hash_head); - debug_atomic_inc(&chain_lookup_misses); + debug_atomic_inc(chain_lookup_misses); inc_chains(); return 1; @@ -2321,7 +2308,7 @@ void trace_hardirqs_on_caller(unsigned long ip) return; if (unlikely(curr->hardirqs_enabled)) { - debug_atomic_inc(&redundant_hardirqs_on); + debug_atomic_inc(redundant_hardirqs_on); return; } /* we'll do an OFF -> ON transition: */ @@ -2348,7 +2335,7 @@ void trace_hardirqs_on_caller(unsigned long ip) curr->hardirq_enable_ip = ip; curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_on_events); + debug_atomic_inc(hardirqs_on_events); } EXPORT_SYMBOL(trace_hardirqs_on_caller); @@ -2380,9 +2367,9 @@ void 
trace_hardirqs_off_caller(unsigned long ip) curr->hardirqs_enabled = 0; curr->hardirq_disable_ip = ip; curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&hardirqs_off_events); + debug_atomic_inc(hardirqs_off_events); } else - debug_atomic_inc(&redundant_hardirqs_off); + debug_atomic_inc(redundant_hardirqs_off); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -2406,7 +2393,7 @@ void trace_softirqs_on(unsigned long ip) return; if (curr->softirqs_enabled) { - debug_atomic_inc(&redundant_softirqs_on); + debug_atomic_inc(redundant_softirqs_on); return; } @@ -2416,7 +2403,7 @@ void trace_softirqs_on(unsigned long ip) curr->softirqs_enabled = 1; curr->softirq_enable_ip = ip; curr->softirq_enable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_on_events); + debug_atomic_inc(softirqs_on_events); /* * We are going to turn softirqs on, so set the * usage bit for all held locks, if hardirqs are @@ -2446,10 +2433,10 @@ void trace_softirqs_off(unsigned long ip) curr->softirqs_enabled = 0; curr->softirq_disable_ip = ip; curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(&softirqs_off_events); + debug_atomic_inc(softirqs_off_events); DEBUG_LOCKS_WARN_ON(!softirq_count()); } else - debug_atomic_inc(&redundant_softirqs_off); + debug_atomic_inc(redundant_softirqs_off); } static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) @@ -2654,7 +2641,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - debug_atomic_dec(&nr_unused_locks); + debug_atomic_dec(nr_unused_locks); break; default: if (!debug_locks_off_graph_unlock()) @@ -2760,7 +2747,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!class) return 0; } - debug_atomic_inc((atomic_t *)&class->ops); + atomic_inc((atomic_t *)&class->ops); if (very_verbose(class)) { printk("\nacquire class [%p] %s", class->key, class->name); if (class->name_version > 1) diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2ee95ad1313..8d7d4b6c741a 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -110,29 +110,61 @@ lockdep_count_backward_deps(struct lock_class *class) #endif #ifdef CONFIG_DEBUG_LOCKDEP + +#include /* - * Various lockdep statistics: + * Various lockdep statistics. + * We want them per cpu as they are often accessed in fast path + * and we want to avoid too much cache bouncing. 
*/ -extern atomic_t chain_lookup_hits; -extern atomic_t chain_lookup_misses; -extern atomic_t hardirqs_on_events; -extern atomic_t hardirqs_off_events; -extern atomic_t redundant_hardirqs_on; -extern atomic_t redundant_hardirqs_off; -extern atomic_t softirqs_on_events; -extern atomic_t softirqs_off_events; -extern atomic_t redundant_softirqs_on; -extern atomic_t redundant_softirqs_off; -extern atomic_t nr_unused_locks; -extern atomic_t nr_cyclic_checks; -extern atomic_t nr_cyclic_check_recursions; -extern atomic_t nr_find_usage_forwards_checks; -extern atomic_t nr_find_usage_forwards_recursions; -extern atomic_t nr_find_usage_backwards_checks; -extern atomic_t nr_find_usage_backwards_recursions; -# define debug_atomic_inc(ptr) atomic_inc(ptr) -# define debug_atomic_dec(ptr) atomic_dec(ptr) -# define debug_atomic_read(ptr) atomic_read(ptr) +struct lockdep_stats { + int chain_lookup_hits; + int chain_lookup_misses; + int hardirqs_on_events; + int hardirqs_off_events; + int redundant_hardirqs_on; + int redundant_hardirqs_off; + int softirqs_on_events; + int softirqs_off_events; + int redundant_softirqs_on; + int redundant_softirqs_off; + int nr_unused_locks; + int nr_cyclic_checks; + int nr_cyclic_check_recursions; + int nr_find_usage_forwards_checks; + int nr_find_usage_forwards_recursions; + int nr_find_usage_backwards_checks; + int nr_find_usage_backwards_recursions; +}; + +DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); + +#define debug_atomic_inc(ptr) { \ + struct lockdep_stats *__cpu_lockdep_stats; \ + \ + WARN_ON_ONCE(!irqs_disabled()); \ + __cpu_lockdep_stats = &__get_cpu_var(lockdep_stats); \ + __cpu_lockdep_stats->ptr++; \ +} + +#define debug_atomic_dec(ptr) { \ + struct lockdep_stats *__cpu_lockdep_stats; \ + \ + WARN_ON_ONCE(!irqs_disabled()); \ + __cpu_lockdep_stats = &__get_cpu_var(lockdep_stats); \ + __cpu_lockdep_stats->ptr--; \ +} + +#define debug_atomic_read(ptr) ({ \ + struct lockdep_stats *__cpu_lockdep_stats; \ + unsigned long long __total = 0; \ + int __cpu; \ + for_each_possible_cpu(__cpu) { \ + __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \ + __total += __cpu_lockdep_stats->ptr; \ + } \ + __total; \ +}) #else # define debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_dec(ptr) do { } while (0) diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d4aba4f3584c..59b76c8ce9d7 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = { static void lockdep_stats_debug_show(struct seq_file *m) { #ifdef CONFIG_DEBUG_LOCKDEP - unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), - hi2 = debug_atomic_read(&hardirqs_off_events), - hr1 = debug_atomic_read(&redundant_hardirqs_on), - hr2 = debug_atomic_read(&redundant_hardirqs_off), - si1 = debug_atomic_read(&softirqs_on_events), - si2 = debug_atomic_read(&softirqs_off_events), - sr1 = debug_atomic_read(&redundant_softirqs_on), - sr2 = debug_atomic_read(&redundant_softirqs_off); - - seq_printf(m, " chain lookup misses: %11u\n", - debug_atomic_read(&chain_lookup_misses)); - seq_printf(m, " chain lookup hits: %11u\n", - debug_atomic_read(&chain_lookup_hits)); - seq_printf(m, " cyclic checks: %11u\n", - debug_atomic_read(&nr_cyclic_checks)); - seq_printf(m, " find-mask forwards checks: %11u\n", - debug_atomic_read(&nr_find_usage_forwards_checks)); - seq_printf(m, " find-mask backwards checks: %11u\n", - debug_atomic_read(&nr_find_usage_backwards_checks)); - - seq_printf(m, " hardirq on events: 
%11u\n", hi1); - seq_printf(m, " hardirq off events: %11u\n", hi2); - seq_printf(m, " redundant hardirq ons: %11u\n", hr1); - seq_printf(m, " redundant hardirq offs: %11u\n", hr2); - seq_printf(m, " softirq on events: %11u\n", si1); - seq_printf(m, " softirq off events: %11u\n", si2); - seq_printf(m, " redundant softirq ons: %11u\n", sr1); - seq_printf(m, " redundant softirq offs: %11u\n", sr2); + unsigned long long hi1 = debug_atomic_read(hardirqs_on_events), + hi2 = debug_atomic_read(hardirqs_off_events), + hr1 = debug_atomic_read(redundant_hardirqs_on), + hr2 = debug_atomic_read(redundant_hardirqs_off), + si1 = debug_atomic_read(softirqs_on_events), + si2 = debug_atomic_read(softirqs_off_events), + sr1 = debug_atomic_read(redundant_softirqs_on), + sr2 = debug_atomic_read(redundant_softirqs_off); + + seq_printf(m, " chain lookup misses: %11llu\n", + debug_atomic_read(chain_lookup_misses)); + seq_printf(m, " chain lookup hits: %11llu\n", + debug_atomic_read(chain_lookup_hits)); + seq_printf(m, " cyclic checks: %11llu\n", + debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " find-mask forwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_forwards_checks)); + seq_printf(m, " find-mask backwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_backwards_checks)); + + seq_printf(m, " hardirq on events: %11llu\n", hi1); + seq_printf(m, " hardirq off events: %11llu\n", hi2); + seq_printf(m, " redundant hardirq ons: %11llu\n", hr1); + seq_printf(m, " redundant hardirq offs: %11llu\n", hr2); + seq_printf(m, " softirq on events: %11llu\n", si1); + seq_printf(m, " softirq off events: %11llu\n", si2); + seq_printf(m, " redundant softirq ons: %11llu\n", sr1); + seq_printf(m, " redundant softirq offs: %11llu\n", sr2); #endif } @@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #endif } #ifdef CONFIG_DEBUG_LOCKDEP - DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); + DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); -- cgit v1.2.3-55-g7522 From 3bbb9ec946428b96657126768f65487a48dd090c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 11 Mar 2010 14:04:36 -0800 Subject: timers: Introduce the concept of timer slack for legacy timers While HR timers have had the concept of timer slack for quite some time now, the legacy timers lacked this concept, and had to make do with round_jiffies() and friends. Timer slack is important for power management; grouping timers reduces the number of wakeups which in turn reduces power consumption. 
This patch introduces timer slack to the legacy timers using the following pieces: * A slack field in the timer struct * An API (set_timer_slack) that callers can use to set explicit timer slack * A default slack of 0.4% of the requested delay for callers that do not set any explicit slack * Rounding code in mod_timer() that tries to group timers around jiffies values at every 'power of two' (so quick timers will group around every 2 jiffies, while longer timers group around every 4, 8, 16, 32, etc.) Signed-off-by: Arjan van de Ven Cc: johnstul@us.ibm.com Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 10 ++++++++- kernel/timer.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index a2d1eb6cb3f0..ea965b857a50 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -10,13 +10,19 @@ struct tvec_base; struct timer_list { + /* + * All fields that change during normal runtime are grouped to the + * same cacheline + */ struct list_head entry; unsigned long expires; + struct tvec_base *base; void (*function)(unsigned long); unsigned long data; - struct tvec_base *base; + int slack; + #ifdef CONFIG_TIMER_STATS void *start_site; char start_comm[16]; @@ -165,6 +171,8 @@ extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); +extern void set_timer_slack(struct timer_list *timer, int slack_hz); + #define TIMER_NOT_PINNED 0 #define TIMER_PINNED 1 /* diff --git a/kernel/timer.c b/kernel/timer.c index 7e12e7bc7ce6..49773f38c9bc 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -318,6 +318,24 @@ unsigned long round_jiffies_up_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); +/** + * set_timer_slack - set the allowed slack for a timer + * @slack_hz: the amount of time (in jiffies) allowed for rounding + * + * Set the amount of time, in jiffies, that a certain timer has + * in terms of slack. By setting this value, the timer subsystem + * will schedule the actual timer somewhere between + * the time mod_timer() asks for, and that time plus the slack. + * + * By setting the slack to -1, a percentage of the delay is used + * instead.
+ */ +void set_timer_slack(struct timer_list *timer, int slack_hz) +{ + timer->slack = slack_hz; +} +EXPORT_SYMBOL_GPL(set_timer_slack); + static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) @@ -549,6 +567,7 @@ static void __init_timer(struct timer_list *timer, { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); + timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; timer->start_pid = -1; @@ -714,6 +733,41 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) } EXPORT_SYMBOL(mod_timer_pending); +/* + * Decide where to put the timer while taking the slack into account + * + * Algorithm: + * 1) calculate the maximum (absolute) time + * 2) calculate the highest bit where the expires and new max are different + * 3) use this bit to make a mask + * 4) use the bitmask to round down the maximum time, so that all last + * bits are zeros + */ +static inline +unsigned long apply_slack(struct timer_list *timer, unsigned long expires) +{ + unsigned long expires_limit, mask; + int bit; + + expires_limit = expires + timer->slack; + + if (timer->slack < 0) /* auto slack: use 0.4% */ + expires_limit = expires + (expires - jiffies)/256; + + mask = expires ^ expires_limit; + + if (mask == 0) + return expires; + + bit = find_last_bit(&mask, BITS_PER_LONG); + + mask = (1 << bit) - 1; + + expires_limit = expires_limit & ~(mask); + + return expires_limit; +} + /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -744,6 +798,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; + expires = apply_slack(timer, expires); + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); -- cgit v1.2.3-55-g7522 From 351b3f7a21e413a9b14d0393171497d2373bd702 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Fri, 2 Apr 2010 22:40:19 +0200 Subject: hrtimers: Provide schedule_hrtimeout for CLOCK_REALTIME The current version of schedule_hrtimeout() always uses the monotonic clock. Some system calls such as mq_timedsend() and mq_timedreceive(), however, require the use of the wall clock due to the definition of the system call. This patch provides the infrastructure to use schedule_hrtimeout() with a CLOCK_REALTIME timer. 
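As a usage sketch (hypothetical caller, not part of this patch; abs_timeout is an assumed CLOCK_REALTIME deadline held in a struct timespec):

	ktime_t expires = timespec_to_ktime(abs_timeout);
	int ret;

	set_current_state(TASK_INTERRUPTIBLE);
	/* 0 means the wall-clock deadline passed; -EINTR means a signal arrived */
	ret = schedule_hrtimeout_range_clock(&expires, 0, HRTIMER_MODE_ABS,
					     CLOCK_REALTIME);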
Signed-off-by: Carsten Emde Tested-by: Pradyumna Sampath Cc: Andrew Morton Cc: Arjan van de Ven LKML-Reference: <20100402204331.167439615@osadl.org> Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 ++ kernel/hrtimer.c | 67 ++++++++++++++++++++++++++++++------------------- 2 files changed, 43 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5d86fb2309d2..fd0c1b857d3d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -422,6 +422,8 @@ extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, const enum hrtimer_mode mode); +extern int schedule_hrtimeout_range_clock(ktime_t *expires, + unsigned long delta, const enum hrtimer_mode mode, int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0086628b6e97..b9b134b35088 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1749,35 +1749,15 @@ void __init hrtimers_init(void) } /** - * schedule_hrtimeout_range - sleep until timeout + * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. - * The kernel gives the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Returns 0 when the timer has expired otherwise -EINTR + * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, - const enum hrtimer_mode mode) +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); + hrtimer_init_on_stack(&t.timer, clock, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, return !t.task ? 0 : -EINTR; } + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()).
+ * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel gives the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** -- cgit v1.2.3-55-g7522 From 5534ecb2dda04345e8243901e0e49599228b4273 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 27 Feb 2010 19:49:37 +0100 Subject: ptrace: kill BKL in ptrace syscall The comment suggests that this usage is stale. There is no BKL in the exec path, so if there is a race lurking there, the BKL in ptrace is not going to help in this regard. Overview of the possibility of "accidental" races this BKL might protect: - ptrace_traceme() is protected against task removal and concurrent read/write on current->ptrace as it locks write tasklist_lock. - arch_ptrace_attach() is serialized by ptrace_traceme() against concurrent PTRACE_TRACEME or PTRACE_ATTACH - ptrace_attach() is protected the same way as ptrace_traceme() and in turn serializes arch_ptrace_attach() - ptrace_check_attach() does its own well-described serializing too. There is no obvious race here. Signed-off-by: Arnd Bergmann Signed-off-by: Frederic Weisbecker Acked-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Andrew Morton Cc: Roland McGrath --- kernel/ptrace.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 42ad8ae729a0..53575020f82b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -666,10 +666,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) struct task_struct *child; long ret; - /* - * This lock_kernel fixes a subtle race with suid exec - */ - lock_kernel(); if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); if (!ret) @@ -703,7 +699,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) out_put_task_struct: put_task_struct(child); out: - unlock_kernel(); return ret; } @@ -813,10 +808,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, struct task_struct *child; long ret; - /* - * This lock_kernel fixes a subtle race with suid exec - */ - lock_kernel(); if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); goto out; @@ -846,7 +837,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, out_put_task_struct: put_task_struct(child); out: - unlock_kernel(); return ret; } #endif /* CONFIG_COMPAT */ -- cgit v1.2.3-55-g7522 From 0968d0060a3c885e53d453380266c7792a55d302 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 7 Apr 2010 15:14:56 -0400 Subject: security: remove dead hook cred_commit Unused hook. Remove.
Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/security.h | 11 ----------- kernel/cred.c | 2 -- security/capability.c | 5 ----- security/security.c | 5 ----- 4 files changed, 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index ef6edc759891..33b0c1b27f82 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -652,10 +652,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @old points to the original credentials. * @gfp indicates the atomicity of any memory allocations. * Prepare a new set of credentials by copying the data from the old set. - * @cred_commit: - * @new points to the new credentials. - * @old points to the original credentials. - * Install a new set of credentials. * @cred_transfer: * @new points to the new credentials. * @old points to the original credentials. @@ -1536,7 +1532,6 @@ struct security_operations { void (*cred_free) (struct cred *cred); int (*cred_prepare)(struct cred *new, const struct cred *old, gfp_t gfp); - void (*cred_commit)(struct cred *new, const struct cred *old); void (*cred_transfer)(struct cred *new, const struct cred *old); int (*kernel_act_as)(struct cred *new, u32 secid); int (*kernel_create_files_as)(struct cred *new, struct inode *inode); @@ -1794,7 +1789,6 @@ int security_task_create(unsigned long clone_flags); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); -void security_commit_creds(struct cred *new, const struct cred *old); void security_transfer_creds(struct cred *new, const struct cred *old); int security_kernel_act_as(struct cred *new, u32 secid); int security_kernel_create_files_as(struct cred *new, struct inode *inode); @@ -2315,11 +2309,6 @@ static inline int security_prepare_creds(struct cred *new, return 0; } -static inline void security_commit_creds(struct cred *new, - const struct cred *old) -{ -} - static inline void security_transfer_creds(struct cred *new, const struct cred *old) { diff --git a/kernel/cred.c b/kernel/cred.c index 1b1129d0cce8..fecb34640482 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -520,8 +520,6 @@ int commit_creds(struct cred *new) #endif BUG_ON(atomic_read(&new->usage) < 1); - security_commit_creds(new, old); - get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ diff --git a/security/capability.c b/security/capability.c index b855e9f27f0e..a927bdea1816 100644 --- a/security/capability.c +++ b/security/capability.c @@ -373,10 +373,6 @@ static int cap_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) return 0; } -static void cap_cred_commit(struct cred *new, const struct cred *old) -{ -} - static void cap_cred_transfer(struct cred *new, const struct cred *old) { } @@ -972,7 +968,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, cred_alloc_blank); set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, cred_prepare); - set_to_cap_if_null(ops, cred_commit); set_to_cap_if_null(ops, cred_transfer); set_to_cap_if_null(ops, kernel_act_as); set_to_cap_if_null(ops, kernel_create_files_as); diff --git a/security/security.c b/security/security.c index c65b0bca05bb..6e5942653d4f 100644 --- a/security/security.c +++ b/security/security.c @@ -712,11 +712,6 @@ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) return 
security_ops->cred_prepare(new, old, gfp); } -void security_commit_creds(struct cred *new, const struct cred *old) -{ - security_ops->cred_commit(new, old); -} - void security_transfer_creds(struct cred *new, const struct cred *old) { security_ops->cred_transfer(new, old); -- cgit v1.2.3-55-g7522 From 43ed8c3b4573d5f5cd314937fee63b4ab046ac5f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 7 Apr 2010 15:15:02 -0400 Subject: security: remove dead hook task_setuid Unused hook. Remove. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/security.h | 20 -------------------- kernel/sys.c | 15 --------------- security/capability.c | 6 ------ security/security.c | 5 ----- 4 files changed, 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 33b0c1b27f82..447c57fcec88 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -674,18 +674,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * userspace to load a kernel module with the given name. * @kmod_name name of the module requested by the kernel * Return 0 if successful. - * @task_setuid: - * Check permission before setting one or more of the user identity - * attributes of the current process. The @flags parameter indicates - * which of the set*uid system calls invoked this hook and how to - * interpret the @id0, @id1, and @id2 parameters. See the LSM_SETID - * definitions at the beginning of this file for the @flags values and - * their meanings. - * @id0 contains a uid. - * @id1 contains a uid. - * @id2 contains a uid. - * @flags contains one of the LSM_SETID_* values. - * Return 0 if permission is granted. * @task_fix_setuid: * Update the module's state after setting one or more of the user * identity attributes of the current process. 
The @flags parameter @@ -1536,7 +1524,6 @@ struct security_operations { int (*kernel_act_as)(struct cred *new, u32 secid); int (*kernel_create_files_as)(struct cred *new, struct inode *inode); int (*kernel_module_request)(char *kmod_name); - int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); int (*task_setgid) (gid_t id0, gid_t id1, gid_t id2, int flags); @@ -1793,7 +1780,6 @@ void security_transfer_creds(struct cred *new, const struct cred *old); int security_kernel_act_as(struct cred *new, u32 secid); int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_kernel_module_request(char *kmod_name); -int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags); @@ -2330,12 +2316,6 @@ static inline int security_kernel_module_request(char *kmod_name) return 0; } -static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, - int flags) -{ - return 0; -} - static inline int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) diff --git a/kernel/sys.c b/kernel/sys.c index 8298878f4f71..396c11cd9a20 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -609,10 +609,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) return -ENOMEM; old = current_cred(); - retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - retval = -EPERM; if (ruid != (uid_t) -1) { new->uid = ruid; @@ -674,10 +670,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) return -ENOMEM; old = current_cred(); - retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; @@ -718,9 +710,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) if (!new) return -ENOMEM; - retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); - if (retval) - goto error; old = current_cred(); retval = -EPERM; @@ -850,9 +839,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) old = current_cred(); old_fsuid = old->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) - goto error; - if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || capable(CAP_SETUID)) { @@ -863,7 +849,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) } } -error: abort_creds(new); return old_fsuid; diff --git a/security/capability.c b/security/capability.c index a927bdea1816..41ff54f3b4d8 100644 --- a/security/capability.c +++ b/security/capability.c @@ -392,11 +392,6 @@ static int cap_kernel_module_request(char *kmod_name) return 0; } -static int cap_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) -{ - return 0; -} - static int cap_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) { return 0; @@ -972,7 +967,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, kernel_act_as); set_to_cap_if_null(ops, kernel_create_files_as); set_to_cap_if_null(ops, kernel_module_request); - set_to_cap_if_null(ops, task_setuid); set_to_cap_if_null(ops, task_fix_setuid); set_to_cap_if_null(ops, task_setgid); set_to_cap_if_null(ops, task_setpgid); diff --git a/security/security.c b/security/security.c index 6e5942653d4f..3900da3da87b 100644 --- a/security/security.c +++ b/security/security.c @@ -732,11 +732,6 @@ int 
security_kernel_module_request(char *kmod_name) return security_ops->kernel_module_request(kmod_name); } -int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags) -{ - return security_ops->task_setuid(id0, id1, id2, flags); -} - int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { -- cgit v1.2.3-55-g7522 From 06ad187e280e725e356c62c3a30ddcd01564f8be Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 7 Apr 2010 15:15:08 -0400 Subject: security: remove dead hook task_setgid Unused hook. Remove. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/security.h | 20 -------------------- kernel/sys.c | 16 ---------------- security/capability.c | 6 ------ security/security.c | 5 ----- 4 files changed, 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 447c57fcec88..04ce0d6e4edf 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -683,18 +683,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @old is the set of credentials that are being replaces * @flags contains one of the LSM_SETID_* values. * Return 0 on success. - * @task_setgid: - * Check permission before setting one or more of the group identity - * attributes of the current process. The @flags parameter indicates - * which of the set*gid system calls invoked this hook and how to - * interpret the @id0, @id1, and @id2 parameters. See the LSM_SETID - * definitions at the beginning of this file for the @flags values and - * their meanings. - * @id0 contains a gid. - * @id1 contains a gid. - * @id2 contains a gid. - * @flags contains one of the LSM_SETID_* values. - * Return 0 if permission is granted. * @task_setpgid: * Check permission before setting the process group identifier of the * process @p to @pgid. 
@@ -1526,7 +1514,6 @@ struct security_operations { int (*kernel_module_request)(char *kmod_name); int (*task_fix_setuid) (struct cred *new, const struct cred *old, int flags); - int (*task_setgid) (gid_t id0, gid_t id1, gid_t id2, int flags); int (*task_setpgid) (struct task_struct *p, pid_t pgid); int (*task_getpgid) (struct task_struct *p); int (*task_getsid) (struct task_struct *p); @@ -1782,7 +1769,6 @@ int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_kernel_module_request(char *kmod_name); int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags); -int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags); int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); int security_task_getsid(struct task_struct *p); @@ -2323,12 +2309,6 @@ static inline int security_task_fix_setuid(struct cred *new, return cap_task_fix_setuid(new, old, flags); } -static inline int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, - int flags) -{ - return 0; -} - static inline int security_task_setpgid(struct task_struct *p, pid_t pgid) { return 0; diff --git a/kernel/sys.c b/kernel/sys.c index 396c11cd9a20..eb1ec5c7f03b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -491,10 +491,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - retval = -EPERM; if (rgid != (gid_t) -1) { if (old->gid == rgid || @@ -542,10 +538,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - retval = -EPERM; if (capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; @@ -776,10 +768,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); - if (retval) - goto error; - retval = -EPERM; if (!capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && @@ -872,9 +860,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) old = current_cred(); old_fsgid = old->fsgid; - if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - goto error; - if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || capable(CAP_SETGID)) { @@ -884,7 +869,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) } } -error: abort_creds(new); return old_fsgid; diff --git a/security/capability.c b/security/capability.c index 41ff54f3b4d8..66cd7453563f 100644 --- a/security/capability.c +++ b/security/capability.c @@ -392,11 +392,6 @@ static int cap_kernel_module_request(char *kmod_name) return 0; } -static int cap_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) -{ - return 0; -} - static int cap_task_setpgid(struct task_struct *p, pid_t pgid) { return 0; @@ -968,7 +963,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, kernel_create_files_as); set_to_cap_if_null(ops, kernel_module_request); set_to_cap_if_null(ops, task_fix_setuid); - set_to_cap_if_null(ops, task_setgid); set_to_cap_if_null(ops, task_setpgid); set_to_cap_if_null(ops, task_getpgid); set_to_cap_if_null(ops, task_getsid); diff --git a/security/security.c b/security/security.c index 3900da3da87b..1e35dd669209 100644 --- a/security/security.c +++ b/security/security.c @@ -738,11 +738,6 @@ int 
security_task_fix_setuid(struct cred *new, const struct cred *old, return security_ops->task_fix_setuid(new, old, flags); } -int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags) -{ - return security_ops->task_setgid(id0, id1, id2, flags); -} - int security_task_setpgid(struct task_struct *p, pid_t pgid) { return security_ops->task_setpgid(p, pgid); -- cgit v1.2.3-55-g7522 From 6307f8fee295b364716d28686df6e69c2fee751a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 7 Apr 2010 15:15:13 -0400 Subject: security: remove dead hook task_setgroups Unused hook. Remove. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/security.h | 12 ------------ kernel/groups.c | 6 ------ security/capability.c | 6 ------ security/security.c | 5 ----- 4 files changed, 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 04ce0d6e4edf..73505f0c9b75 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -704,11 +704,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @p contains the task_struct for the process and place is into @secid. * In case of failure, @secid will be set to zero. * - * @task_setgroups: - * Check permission before setting the supplementary group set of the - * current process. - * @group_info contains the new group information. - * Return 0 if permission is granted. * @task_setnice: * Check permission before setting the nice value of @p to @nice. * @p contains the task_struct of process. @@ -1518,7 +1513,6 @@ struct security_operations { int (*task_getpgid) (struct task_struct *p); int (*task_getsid) (struct task_struct *p); void (*task_getsecid) (struct task_struct *p, u32 *secid); - int (*task_setgroups) (struct group_info *group_info); int (*task_setnice) (struct task_struct *p, int nice); int (*task_setioprio) (struct task_struct *p, int ioprio); int (*task_getioprio) (struct task_struct *p); @@ -1773,7 +1767,6 @@ int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); int security_task_getsid(struct task_struct *p); void security_task_getsecid(struct task_struct *p, u32 *secid); -int security_task_setgroups(struct group_info *group_info); int security_task_setnice(struct task_struct *p, int nice); int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); @@ -2329,11 +2322,6 @@ static inline void security_task_getsecid(struct task_struct *p, u32 *secid) *secid = 0; } -static inline int security_task_setgroups(struct group_info *group_info) -{ - return 0; -} - static inline int security_task_setnice(struct task_struct *p, int nice) { return cap_task_setnice(p, nice); diff --git a/kernel/groups.c b/kernel/groups.c index 2b45b2ee3964..53b1916c9492 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp) */ int set_groups(struct cred *new, struct group_info *group_info) { - int retval; - - retval = security_task_setgroups(group_info); - if (retval) - return retval; - put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); diff --git a/security/capability.c b/security/capability.c index 66cd7453563f..247c04edd468 100644 --- a/security/capability.c +++ b/security/capability.c @@ -412,11 +412,6 @@ static void cap_task_getsecid(struct task_struct *p, u32 *secid) *secid = 0; } -static int cap_task_setgroups(struct group_info *group_info) 
-{ - return 0; -} - static int cap_task_getioprio(struct task_struct *p) { return 0; @@ -967,7 +962,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, task_getpgid); set_to_cap_if_null(ops, task_getsid); set_to_cap_if_null(ops, task_getsecid); - set_to_cap_if_null(ops, task_setgroups); set_to_cap_if_null(ops, task_setnice); set_to_cap_if_null(ops, task_setioprio); set_to_cap_if_null(ops, task_getioprio); diff --git a/security/security.c b/security/security.c index 1e35dd669209..5cf9ca6890f6 100644 --- a/security/security.c +++ b/security/security.c @@ -759,11 +759,6 @@ void security_task_getsecid(struct task_struct *p, u32 *secid) } EXPORT_SYMBOL(security_task_getsecid); -int security_task_setgroups(struct group_info *group_info) -{ - return security_ops->task_setgroups(group_info); -} - int security_task_setnice(struct task_struct *p, int nice) { return security_ops->task_setnice(p, nice); -- cgit v1.2.3-55-g7522 From 05b90496f2f366b9d3eea468351888ddf010782a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 7 Apr 2010 15:15:25 -0400 Subject: security: remove dead hook acct Unused hook. Remove. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/security.h | 14 -------------- kernel/acct.c | 20 +++++--------------- security/capability.c | 6 ------ security/security.c | 5 ----- 4 files changed, 5 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index ac536eedec90..d670c9a3ec2b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1254,13 +1254,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @cap contains the capability . * @audit: Whether to write an audit message or not * Return 0 if the capability is granted for @tsk. - * @acct: - * Check permission before enabling or disabling process accounting. If - * accounting is being enabled, then @file refers to the open file used to - * store accounting records. If accounting is being disabled, then @file - * is NULL. - * @file contains the file structure for the accounting file (may be NULL). - * Return 0 if permission is granted. * @sysctl: * Check permission before accessing the @table sysctl variable in the * manner specified by @op. 
@@ -1383,7 +1376,6 @@ struct security_operations { const kernel_cap_t *permitted); int (*capable) (struct task_struct *tsk, const struct cred *cred, int cap, int audit); - int (*acct) (struct file *file); int (*sysctl) (struct ctl_table *table, int op); int (*quotactl) (int cmds, int type, int id, struct super_block *sb); int (*quota_on) (struct dentry *dentry); @@ -1665,7 +1657,6 @@ int security_capset(struct cred *new, const struct cred *old, int security_capable(int cap); int security_real_capable(struct task_struct *tsk, int cap); int security_real_capable_noaudit(struct task_struct *tsk, int cap); -int security_acct(struct file *file); int security_sysctl(struct ctl_table *table, int op); int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); @@ -1883,11 +1874,6 @@ int security_real_capable_noaudit(struct task_struct *tsk, int cap) return ret; } -static inline int security_acct(struct file *file) -{ - return 0; -} - static inline int security_sysctl(struct ctl_table *table, int op) { return 0; diff --git a/kernel/acct.c b/kernel/acct.c index 24f8c81fc48d..9e53bb2acfff 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -216,7 +216,6 @@ static int acct_on(char *name) { struct file *file; struct vfsmount *mnt; - int error; struct pid_namespace *ns; struct bsd_acct_struct *acct = NULL; @@ -244,13 +243,6 @@ static int acct_on(char *name) } } - error = security_acct(file); - if (error) { - kfree(acct); - filp_close(file, NULL); - return error; - } - spin_lock(&acct_lock); if (ns->bacct == NULL) { ns->bacct = acct; @@ -281,7 +273,7 @@ static int acct_on(char *name) */ SYSCALL_DEFINE1(acct, const char __user *, name) { - int error; + int error = 0; if (!capable(CAP_SYS_PACCT)) return -EPERM; @@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (acct == NULL) return 0; - error = security_acct(NULL); - if (!error) { - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - } + spin_lock(&acct_lock); + acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); } + return error; } diff --git a/security/capability.c b/security/capability.c index 8cc2b8f3b166..7f093d573ede 100644 --- a/security/capability.c +++ b/security/capability.c @@ -12,11 +12,6 @@ #include -static int cap_acct(struct file *file) -{ - return 0; -} - static int cap_sysctl(ctl_table *table, int op) { return 0; @@ -865,7 +860,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, ptrace_traceme); set_to_cap_if_null(ops, capget); set_to_cap_if_null(ops, capset); - set_to_cap_if_null(ops, acct); set_to_cap_if_null(ops, capable); set_to_cap_if_null(ops, quotactl); set_to_cap_if_null(ops, quota_on); diff --git a/security/security.c b/security/security.c index 490f77753b2d..8585019a1a59 100644 --- a/security/security.c +++ b/security/security.c @@ -190,11 +190,6 @@ int security_real_capable_noaudit(struct task_struct *tsk, int cap) return ret; } -int security_acct(struct file *file) -{ - return security_ops->acct(file); -} - int security_sysctl(struct ctl_table *table, int op) { return security_ops->sysctl(table, op); -- cgit v1.2.3-55-g7522 From 6a867a395558a7f882d041783e4cdea6744ca2bf Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 6 Apr 2010 14:30:51 -0700 Subject: time: Remove xtime_cache With the earlier logarithmic time accumulation patch, xtime will now always be within one "tick" of the current time, instead of possibly half a second off. 
This removes the need for the xtime_cache value, which always stored the time at the last interrupt, so this patch cleans that up, removing the xtime_cache-related code. This patch also addresses an issue with an earlier version of this change, where xtime_cache was normalizing xtime, which could in some cases be invalid (i.e., tv_nsec == NSEC_PER_SEC). This is fixed by handling the edge case in update_wall_time(). Signed-off-by: John Stultz Cc: Petr Titěra LKML-Reference: <1270589451-30773-1-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/time.h | 1 - kernel/time/timekeeping.c | 35 ++++++++++++++++------------------- 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 6e026e45a179..ea3559f0b3f2 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -150,7 +150,6 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern void update_wall_time(void); -extern void update_xtime_cache(u64 nsec); extern void timekeeping_leap_insert(int leapsecond); struct tms; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 16736379a9ca..1137f245a4ba 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -165,13 +165,6 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); -} - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { @@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; - update_xtime_cache(0); - timekeeper.ntp_error = 0; ntp_clear(); @@ -559,7 +550,6 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } - update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -788,7 +777,6 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -846,7 +834,9 @@ void update_wall_time(void) timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } - /* store full nanoseconds into xtime after rounding it up and + + /* + * Store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference.
*/ xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; @@ -854,8 +844,15 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); + /* + * Finally, make sure that after the rounding + * xtime.tv_nsec isn't larger then NSEC_PER_SEC + */ + if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { + xtime.tv_nsec -= NSEC_PER_SEC; + xtime.tv_sec++; + second_overflow(); + } /* check to see if there is a new clocksource to use */ update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); @@ -895,13 +892,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + return xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + return xtime; } struct timespec current_kernel_time(void) @@ -912,7 +909,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime_cache; + now = xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -927,7 +924,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime_cache; + now = xtime; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); -- cgit v1.2.3-55-g7522 From ae731f8d0785ccd3380f511bae888933b6562e45 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 15 Mar 2010 22:56:33 +0000 Subject: genirq: Introduce request_any_context_irq() Now that we enjoy threaded interrupts, we're starting to see irq_chip implementations (wm831x, pca953x) that make use of threaded interrupts for the controller, and nested interrupts for the client interrupt. It all works very well, with one drawback: Drivers requesting an IRQ must now know whether the handler will run in a thread context or not, and call request_threaded_irq() or request_irq() accordingly. The problem is that the requesting driver sometimes doesn't know about the nature of the interrupt, specially when the interrupt controller is a discrete chip (typically a GPIO expander connected over I2C) that can be connected to a wide variety of otherwise perfectly supported hardware. This patch introduces the request_any_context_irq() function that mostly mimics the usual request_irq(), except that it checks whether the irq level is configured as nested or not, and calls the right backend. On success, it also returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. [ tglx: Made return value an enum, simplified code and made the export of request_any_context_irq GPL ] Signed-off-by: Marc Zyngier Cc: LKML-Reference: <927ea285bd0c68934ddae1a47e44a9ba@localhost> Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 23 +++++++++++++++++++++++ kernel/irq/manage.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 75f3f00ac1e5..d7e7a7660c6c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -77,6 +77,18 @@ enum { IRQTF_AFFINITY, }; +/** + * These values can be returned by request_any_context_irq() and + * describe the context the interrupt will be run in. 
+ * + * IRQC_IS_HARDIRQ - interrupt runs in hardirq context + * IRQC_IS_NESTED - interrupt runs in a nested threaded context + */ +enum { + IRQC_IS_HARDIRQ = 0, + IRQC_IS_NESTED, +}; + typedef irqreturn_t (*irq_handler_t)(int, void *); /** @@ -120,6 +132,10 @@ request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, return request_threaded_irq(irq, handler, NULL, flags, name, dev); } +extern int __must_check +request_any_context_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev_id); + extern void exit_irq_thread(void); #else @@ -141,6 +157,13 @@ request_threaded_irq(unsigned int irq, irq_handler_t handler, return request_irq(irq, handler, flags, name, dev); } +static inline int __must_check +request_any_context_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev_id) +{ + return request_irq(irq, handler, flags, name, dev_id); +} + static inline void exit_irq_thread(void) { } #endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 704e488730a5..84f32278ff1f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1120,3 +1120,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, return retval; } EXPORT_SYMBOL(request_threaded_irq); + +/** + * request_any_context_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @flags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. It selects either a + * hardirq or threaded handling method depending on the + * context. + * + * On failure, it returns a negative value. On success, + * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. + */ +int request_any_context_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + int ret; + + if (!desc) + return -EINVAL; + + if (desc->status & IRQ_NESTED_THREAD) { + ret = request_threaded_irq(irq, NULL, handler, + flags, name, dev_id); + return !ret ? IRQC_IS_NESTED : ret; + } + + ret = request_irq(irq, handler, flags, name, dev_id); + return !ret ? IRQC_IS_HARDIRQ : ret; +} +EXPORT_SYMBOL_GPL(request_any_context_irq); -- cgit v1.2.3-55-g7522 From e58aa3d2d0cc01ad8d6f7f640a0670433f794922 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Mar 2010 00:06:51 +0000 Subject: genirq: Run irq handlers with interrupts disabled Running interrupt handlers with interrupts enabled can cause stack overflows. That has been observed with multiqueue NICs delivering all their interrupts to a single core. We might band aid that somehow by checking the interrupt stacks, but the real safe fix is to run the irq handlers with interrupts disabled. Drivers for whacky hardware still can reenable them in the handler itself, if the need arises. 
(They do already due to lockdep) The risk of doing this is rather low: - lockdep already enforces this - CONFIG_NOHZ has shaken out the drivers which relied on jiffies updates - timekeeping is no longer sensitive to the timer interrupt being delayed Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Alan Cox Cc: Andi Kleen Cc: David Miller Cc: Greg Kroah-Hartman Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds LKML-Reference: <20100326000405.758579387@linutronix.de> --- kernel/irq/handle.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 76d5a671bfe1..27e5c6911223 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); - do { trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); -- cgit v1.2.3-55-g7522 From 6932bf37bed45ce8ed531928b1b0f98162fe6df6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2010 00:06:55 +0000 Subject: genirq: Remove IRQF_DISABLED from core code Remove all code which is related to IRQF_DISABLED from the core kernel code. IRQF_DISABLED still exists as a flag, but becomes a NOOP and will be removed after a grace period. That way we can easily revert to the previous behaviour by just restoring the core code. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Alan Cox Cc: Andi Kleen Cc: David Miller Cc: Greg Kroah-Hartman Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds LKML-Reference: <20100326000405.991244690@linutronix.de> --- Documentation/feature-removal-schedule.txt | 7 +++++++ include/linux/interrupt.h | 3 ++- kernel/irq/manage.c | 30 ------------------------------ 3 files changed, 9 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index ed511af0f79a..9c31c2e63684 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -589,3 +589,10 @@ Why: Useful in 2003, implementation is a hack. Generally invoked by accident today. Seen as doing more harm than good. Who: Len Brown + +---------------------------- + +What: IRQF_DISABLED +When: 2.6.36 +Why: The flag is a NOOP as we run interrupt handlers with interrupts disabled +Who: Thomas Gleixner diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index d7e7a7660c6c..e6d2f4441fda 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -39,7 +39,8 @@ * These flags used only by the kernel as part of the * irq handling routines. * - * IRQF_DISABLED - keep irqs disabled when calling the action handler + * IRQF_DISABLED - keep irqs disabled when calling the action handler. + * DEPRECATED.
This flag is a NOOP and scheduled to be removed * IRQF_SAMPLE_RANDOM - irq is used to feed the random generator * IRQF_SHARED - allow sharing the irq among several devices * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 84f32278ff1f..444d5a81a209 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -757,16 +757,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; - /* - * Force MSI interrupts to run with interrupts - * disabled. The multi vector cards can cause stack - * overflows due to nested interrupts when enough of - * them are directed to a core and fire at the same - * time. - */ - if (desc->msi_desc) - new->flags |= IRQF_DISABLED; - if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; @@ -1027,7 +1017,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_DISABLED Disable local interrupts while processing * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * @@ -1040,25 +1029,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; - /* - * handle_IRQ_event() always ignores IRQF_DISABLED except for - * the _first_ irqaction (sigh). That can cause oopsing, but - * the behavior is classified as "will not fix" so we need to - * start nudging drivers away from using that idiom. - */ - if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == - (IRQF_SHARED|IRQF_DISABLED)) { - pr_warning( - "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", - irq, devname); - } - -#ifdef CONFIG_LOCKDEP - /* - * Lockdep wants atomic interrupt handlers: - */ - irqflags |= IRQF_DISABLED; -#endif /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out -- cgit v1.2.3-55-g7522 From 97f5f0cd8cd0a05449cbb77d1e6f02e026875802 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 21 Mar 2010 22:31:26 -0700 Subject: Input: implement SysRq as a separate input handler Instead of keeping SysRq support inside of the legacy keyboard driver, split it out into a separate input handler (filter). This stops most SysRq input events from leaking into evdev clients (some events, such as the first SysRq scancode - not keycode - event, are still leaked into both legacy keyboard and evdev).
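The input core consults a handler's ->filter() callback before an event is delivered to the other handlers; returning true consumes the event. A minimal skeleton of that contract (illustrative only, separate from the actual sysrq_filter() added below; this one swallows nothing but KEY_F13 events):

	static bool example_filter(struct input_handle *handle, unsigned int type,
				   unsigned int code, int value)
	{
		/* true: eat the event; false: let it reach other handlers */
		return type == EV_KEY && code == KEY_F13;
	}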
[martinez.javier@gmail.com: fix compile error when CONFIG_MAGIC_SYSRQ is not defined] Signed-off-by: Dmitry Torokhov --- drivers/char/keyboard.c | 40 +------- drivers/char/sysrq.c | 243 ++++++++++++++++++++++++++++++++++++++++++------ include/linux/sysrq.h | 23 ++--- kernel/sysctl.c | 23 ++++- 4 files changed, 246 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index ada25bb8941e..50f6c01f44ec 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -82,8 +81,7 @@ void compute_shiftstate(void); typedef void (k_handler_fn)(struct vc_data *vc, unsigned char value, char up_flag); static k_handler_fn K_HANDLERS; -k_handler_fn *k_handler[16] = { K_HANDLERS }; -EXPORT_SYMBOL_GPL(k_handler); +static k_handler_fn *k_handler[16] = { K_HANDLERS }; #define FN_HANDLERS\ fn_null, fn_enter, fn_show_ptregs, fn_show_mem,\ @@ -147,22 +145,6 @@ static struct ledptr { unsigned char valid:1; } ledptrs[3]; -/* Simple translation table for the SysRq keys */ - -#ifdef CONFIG_MAGIC_SYSRQ -unsigned char kbd_sysrq_xlate[KEY_MAX + 1] = - "\000\0331234567890-=\177\t" /* 0x00 - 0x0f */ - "qwertyuiop[]\r\000as" /* 0x10 - 0x1f */ - "dfghjkl;'`\000\\zxcv" /* 0x20 - 0x2f */ - "bnm,./\000*\000 \000\201\202\203\204\205" /* 0x30 - 0x3f */ - "\206\207\210\211\212\000\000789-456+1" /* 0x40 - 0x4f */ - "230\177\000\000\213\214\000\000\000\000\000\000\000\000\000\000" /* 0x50 - 0x5f */ - "\r\000/"; /* 0x60 - 0x6f */ -static int sysrq_down; -static int sysrq_alt_use; -#endif -static int sysrq_alt; - /* * Notifier list for console keyboard events */ @@ -1108,7 +1090,8 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode, * pressing PrtSc/SysRq alone, but simply 0x54 * when pressing Alt+PrtSc/SysRq. */ - if (sysrq_alt) { + if (test_bit(KEY_LEFTALT, key_down) || + test_bit(KEY_RIGHTALT, key_down)) { put_queue(vc, 0x54 | up_flag); } else { put_queue(vc, 0xe0); @@ -1176,8 +1159,6 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) kbd = kbd_table + vc->vc_num; - if (keycode == KEY_LEFTALT || keycode == KEY_RIGHTALT) - sysrq_alt = down ? 
keycode : 0; #ifdef CONFIG_SPARC if (keycode == KEY_STOP) sparc_l1_a_state = down; @@ -1190,21 +1171,6 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) if (keycode < BTN_MISC && printk_ratelimit()) printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode); -#ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ - if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) { - if (!sysrq_down) { - sysrq_down = down; - sysrq_alt_use = sysrq_alt; - } - return; - } - if (sysrq_down && !down && keycode == sysrq_alt_use) - sysrq_down = 0; - if (sysrq_down && down && !rep) { - handle_sysrq(kbd_sysrq_xlate[keycode], tty); - return; - } -#endif #ifdef CONFIG_SPARC if (keycode == KEY_A && sparc_l1_a_state) { sparc_l1_a_state = 0; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 59de2525d303..193f9c214946 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -1,7 +1,4 @@ -/* -*- linux-c -*- - * - * $Id: sysrq.c,v 1.15 1998/08/23 14:56:41 mj Exp $ - * +/* * Linux Magic System Request Key Hacks * * (c) 1997 Martin Mares @@ -10,8 +7,13 @@ * (c) 2000 Crutcher Dunnavant * overhauled to use key registration * based upon discusions in irc://irc.openprojects.net/#kernelnewbies + * + * Copyright (c) 2010 Dmitry Torokhov + * Input handler conversion */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -39,33 +41,34 @@ #include #include #include +#include #include #include /* Whether we react on sysrq keys or just ignore them */ -int __read_mostly __sysrq_enabled = 1; - -static int __read_mostly sysrq_always_enabled; +static int __read_mostly sysrq_enabled = 1; +static bool __read_mostly sysrq_always_enabled; -int sysrq_on(void) +static bool sysrq_on(void) { - return __sysrq_enabled || sysrq_always_enabled; + return sysrq_enabled || sysrq_always_enabled; } /* * A value of 1 means 'all', other nonzero values are an op mask: */ -static inline int sysrq_on_mask(int mask) +static bool sysrq_on_mask(int mask) { - return sysrq_always_enabled || __sysrq_enabled == 1 || - (__sysrq_enabled & mask); + return sysrq_always_enabled || + sysrq_enabled == 1 || + (sysrq_enabled & mask); } static int __init sysrq_always_enabled_setup(char *str) { - sysrq_always_enabled = 1; - printk(KERN_INFO "debug: sysrq always enabled.\n"); + sysrq_always_enabled = true; + pr_info("sysrq always enabled.\n"); return 1; } @@ -76,6 +79,7 @@ __setup("sysrq_always_enabled", sysrq_always_enabled_setup); static void sysrq_handle_loglevel(int key, struct tty_struct *tty) { int i; + i = key - '0'; console_loglevel = 7; printk("Loglevel set to %d\n", i); @@ -101,7 +105,7 @@ static struct sysrq_key_op sysrq_SAK_op = { .enable_mask = SYSRQ_ENABLE_KEYBOARD, }; #else -#define sysrq_SAK_op (*(struct sysrq_key_op *)0) +#define sysrq_SAK_op (*(struct sysrq_key_op *)NULL) #endif #ifdef CONFIG_VT @@ -119,7 +123,7 @@ static struct sysrq_key_op sysrq_unraw_op = { .enable_mask = SYSRQ_ENABLE_KEYBOARD, }; #else -#define sysrq_unraw_op (*(struct sysrq_key_op *)0) +#define sysrq_unraw_op (*(struct sysrq_key_op *)NULL) #endif /* CONFIG_VT */ static void sysrq_handle_crash(int key, struct tty_struct *tty) @@ -195,7 +199,7 @@ static struct sysrq_key_op sysrq_showlocks_op = { .action_msg = "Show Locks Held", }; #else -#define sysrq_showlocks_op (*(struct sysrq_key_op *)0) +#define sysrq_showlocks_op (*(struct sysrq_key_op *)NULL) #endif #ifdef CONFIG_SMP @@ -298,7 +302,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; 
#else -#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0) +#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)NULL) #endif static void sysrq_handle_showmem(int key, struct tty_struct *tty) @@ -477,6 +481,7 @@ struct sysrq_key_op *__sysrq_get_key_op(int key) i = sysrq_key_table_key2index(key); if (i != -1) op_p = sysrq_key_table[i]; + return op_p; } @@ -488,11 +493,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) sysrq_key_table[i] = op_p; } -/* - * This is the non-locking version of handle_sysrq. It must/can only be called - * by sysrq key handlers, as they are inside of the lock - */ -void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) +static void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) { struct sysrq_key_op *op_p; int orig_log_level; @@ -544,10 +545,6 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) spin_unlock_irqrestore(&sysrq_key_table_lock, flags); } -/* - * This function is called by the keyboard handler when SysRq is pressed - * and any other keycode arrives. - */ void handle_sysrq(int key, struct tty_struct *tty) { if (sysrq_on()) @@ -555,10 +552,177 @@ void handle_sysrq(int key, struct tty_struct *tty) } EXPORT_SYMBOL(handle_sysrq); +#ifdef CONFIG_INPUT + +/* Simple translation table for the SysRq keys */ +static const unsigned char sysrq_xlate[KEY_MAX + 1] = + "\000\0331234567890-=\177\t" /* 0x00 - 0x0f */ + "qwertyuiop[]\r\000as" /* 0x10 - 0x1f */ + "dfghjkl;'`\000\\zxcv" /* 0x20 - 0x2f */ + "bnm,./\000*\000 \000\201\202\203\204\205" /* 0x30 - 0x3f */ + "\206\207\210\211\212\000\000789-456+1" /* 0x40 - 0x4f */ + "230\177\000\000\213\214\000\000\000\000\000\000\000\000\000\000" /* 0x50 - 0x5f */ + "\r\000/"; /* 0x60 - 0x6f */ + +static bool sysrq_down; +static int sysrq_alt_use; +static int sysrq_alt; + +static bool sysrq_filter(struct input_handle *handle, unsigned int type, + unsigned int code, int value) +{ + if (type != EV_KEY) + goto out; + + switch (code) { + + case KEY_LEFTALT: + case KEY_RIGHTALT: + if (value) + sysrq_alt = code; + else if (sysrq_down && code == sysrq_alt_use) + sysrq_down = false; + break; + + case KEY_SYSRQ: + if (value == 1 && sysrq_alt) { + sysrq_down = true; + sysrq_alt_use = sysrq_alt; + } + break; + + default: + if (sysrq_down && value && value != 2) + __handle_sysrq(sysrq_xlate[code], NULL, 1); + break; + } + +out: + return sysrq_down; +} + +static int sysrq_connect(struct input_handler *handler, + struct input_dev *dev, + const struct input_device_id *id) +{ + struct input_handle *handle; + int error; + + sysrq_down = false; + sysrq_alt = 0; + + handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->dev = dev; + handle->handler = handler; + handle->name = "sysrq"; + + error = input_register_handle(handle); + if (error) { + pr_err("Failed to register input sysrq handler, error %d\n", + error); + goto err_free; + } + + error = input_open_device(handle); + if (error) { + pr_err("Failed to open input device, error %d\n", error); + goto err_unregister; + } + + return 0; + + err_unregister: + input_unregister_handle(handle); + err_free: + kfree(handle); + return error; +} + +static void sysrq_disconnect(struct input_handle *handle) +{ + input_close_device(handle); + input_unregister_handle(handle); + kfree(handle); +} + +/* + * We are matching on KEY_LEFTALT instead of KEY_SYSRQ because not all + * keyboards have the SysRq key predefined and so the user may add it to the keymap + * later, but we expect all such
keyboards to have left alt. + */ +static const struct input_device_id sysrq_ids[] = { + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT | + INPUT_DEVICE_ID_MATCH_KEYBIT, + .evbit = { BIT_MASK(EV_KEY) }, + .keybit = { BIT_MASK(KEY_LEFTALT) }, + }, + { }, +}; + +static struct input_handler sysrq_handler = { + .filter = sysrq_filter, + .connect = sysrq_connect, + .disconnect = sysrq_disconnect, + .name = "sysrq", + .id_table = sysrq_ids, +}; + +static bool sysrq_handler_registered; + +static inline void sysrq_register_handler(void) +{ + int error; + + error = input_register_handler(&sysrq_handler); + if (error) + pr_err("Failed to register input handler, error %d", error); + else + sysrq_handler_registered = true; +} + +static inline void sysrq_unregister_handler(void) +{ + if (sysrq_handler_registered) { + input_unregister_handler(&sysrq_handler); + sysrq_handler_registered = false; + } +} + +#else + +static inline void sysrq_register_handler(void) +{ +} + +static inline void sysrq_unregister_handler(void) +{ +} + +#endif /* CONFIG_INPUT */ + +int sysrq_toggle_support(int enable_mask) +{ + bool was_enabled = sysrq_on(); + + sysrq_enabled = enable_mask; + + if (was_enabled != sysrq_on()) { + if (sysrq_on()) + sysrq_register_handler(); + else + sysrq_unregister_handler(); + } + + return 0; +} + static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p, struct sysrq_key_op *remove_op_p) { - int retval; unsigned long flags; @@ -599,6 +763,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, return -EFAULT; __handle_sysrq(c, NULL, 0); } + return count; } @@ -606,10 +771,28 @@ static const struct file_operations proc_sysrq_trigger_operations = { .write = write_sysrq_trigger, }; +static void sysrq_init_procfs(void) +{ + if (!proc_create("sysrq-trigger", S_IWUSR, NULL, + &proc_sysrq_trigger_operations)) + pr_err("Failed to register proc interface\n"); +} + +#else + +static inline void sysrq_init_procfs(void) +{ +} + +#endif /* CONFIG_PROC_FS */ + static int __init sysrq_init(void) { - proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); + sysrq_init_procfs(); + + if (sysrq_on()) + sysrq_register_handler(); + return 0; } module_init(sysrq_init); -#endif diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 99adcdc0d3ca..4496322e28dd 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -39,41 +39,34 @@ struct sysrq_key_op { #ifdef CONFIG_MAGIC_SYSRQ -extern int sysrq_on(void); - -/* - * Do not use this one directly: - */ -extern int __sysrq_enabled; - /* Generic SysRq interface -- you may call it from any device driver, supplying * ASCII code of the key, pointer to registers and kbd/tty structs (if they * are available -- else NULL's). 
*/ void handle_sysrq(int key, struct tty_struct *tty); -void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); int register_sysrq_key(int key, struct sysrq_key_op *op); int unregister_sysrq_key(int key, struct sysrq_key_op *op); struct sysrq_key_op *__sysrq_get_key_op(int key); +int sysrq_toggle_support(int enable_mask); + #else -static inline int sysrq_on(void) +static inline void handle_sysrq(int key, struct tty_struct *tty) { - return 0; } -static inline int __reterr(void) + +static inline int register_sysrq_key(int key, struct sysrq_key_op *op) { return -EINVAL; } -static inline void handle_sysrq(int key, struct tty_struct *tty) + +static inline int unregister_sysrq_key(int key, struct sysrq_key_op *op) { + return -EINVAL; } -#define register_sysrq_key(ig,nore) __reterr() -#define unregister_sysrq_key(ig,nore) __reterr() - #endif #endif /* _LINUX_SYSRQ_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..ce724a0dd0bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,6 +163,27 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_MAGIC_SYSRQ +static int __sysrq_enabled; /* Note: sysrq code uses its own private copy */ + +static int sysrq_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int error; + + error = proc_dointvec(table, write, buffer, lenp, ppos); + if (error) + return error; + + if (write) + sysrq_toggle_support(__sysrq_enabled); + + return 0; +} + +#endif + static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { @@ -567,7 +588,7 @@ static struct ctl_table kern_table[] = { .data = &__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysrq_sysctl_handler, }, #endif #ifdef CONFIG_PROC_SYSCTL -- cgit v1.2.3-55-g7522 From 76e1d9047e4edefb8ada20aa90d5762306082bd6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 5 Apr 2010 15:35:57 +0200 Subject: perf: Store active software events in a hashlist Each time a software event triggers, we need to walk through the entire list of events from the current cpu and task contexts to retrieve a running perf event that matches. We also need to check that a matching perf event is actually counting. This walk is wasteful and makes the event fast path scale poorly as the number of events running on the same contexts grows. To solve this, we store the running perf events in a hashlist so that we get immediate access to them, keyed by their type:event_id, when they trigger. v2: - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic maths along the way) - Only allocate the hlist for online cpus, but keep track of the refcount on offline possible cpus too, so that we can allocate it if needed when the cpu comes online. - Drop the kref use as it's not adapted to our tricks anymore. v3: - Fix bad refcount check (address instead of value). Thanks to Eric Dumazet who spotted this. - While a cpu is exiting, move the hlist release out of the IPI path so the hlist mutex can be locked sanely.
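To make the bucketing concrete, here is a minimal userspace model of the swevent_hash() scheme in this patch. The table size and the type/event_id packing mirror the diff; hash_64() is approximated by the equivalent multiplicative form of the kernel's golden-ratio hash of that era, so treat the constant as illustrative rather than normative:

#include <stdint.h>
#include <stdio.h>

#define SWEVENT_HLIST_BITS	8
#define SWEVENT_HLIST_SIZE	(1 << SWEVENT_HLIST_BITS)

/* Stand-in for the kernel's hash_64(): multiplicative hash, keep top bits */
static uint64_t hash_64(uint64_t val, unsigned int bits)
{
	return (val * 0x9e37fffffffc0001ULL) >> (64 - bits);
}

/* Mirrors swevent_hash() in the diff: type in the high word, id in the low */
static uint64_t swevent_hash(uint64_t type, uint32_t event_id)
{
	uint64_t val = event_id | (type << 32);

	return hash_64(val, SWEVENT_HLIST_BITS);
}

int main(void)
{
	uint32_t id;

	/* PERF_TYPE_SOFTWARE == 1; show which bucket each event id lands in */
	for (id = 0; id < 8; id++)
		printf("type 1, id %u -> bucket %llu of %d\n", id,
		       (unsigned long long)swevent_hash(1, id),
		       SWEVENT_HLIST_SIZE);
	return 0;
}

When an event fires, only the matching bucket is walked, so the fast path no longer degrades linearly with the total number of active software events on the context.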
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- include/linux/perf_event.h | 12 +++ kernel/perf_event.c | 246 +++++++++++++++++++++++++++++++++------------ 2 files changed, 195 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6e96cc8225d4..bf896d0b2e9c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -589,6 +589,14 @@ enum perf_group_flag { PERF_GROUP_SOFTWARE = 0x1, }; +#define SWEVENT_HLIST_BITS 8 +#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) + +struct swevent_hlist { + struct hlist_head heads[SWEVENT_HLIST_SIZE]; + struct rcu_head rcu_head; +}; + /** * struct perf_event - performance event kernel representation: */ @@ -597,6 +605,7 @@ struct perf_event { struct list_head group_entry; struct list_head event_entry; struct list_head sibling_list; + struct hlist_node hlist_entry; int nr_siblings; int group_flags; struct perf_event *group_leader; @@ -744,6 +753,9 @@ struct perf_cpu_context { int active_oncpu; int max_pertask; int exclusive; + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; /* * Recursion avoidance: diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fcf42dcd6089..9efdfe5b8d3b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_swevent_is_counting(struct perf_event *event) -{ - /* - * The event is active, we're good! - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - return 1; - - /* - * The event is off/error, not counting. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE) - return 0; - - /* - * The event is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (event->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. 
- */ - return 1; -} - static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data); @@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - if (event->cpu != -1 && event->cpu != smp_processor_id()) - return 0; - - if (!perf_swevent_is_counting(event)) - return 0; - if (event->attr.type != type) return 0; @@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event, return 1; } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + u64 hash; + struct swevent_hlist *hlist; + + hash = swevent_hash(type, event_id); + + hlist = rcu_dereference(ctx->swevent_hlist); + if (!hlist) + return NULL; + + return &hlist->heads[hash]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { + struct perf_cpu_context *cpuctx; struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + cpuctx = &__get_cpu_var(perf_cpu_context); + + rcu_read_lock(); + + head = find_swevent_head(cpuctx, type, event_id); + + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } +end: + rcu_read_unlock(); } int perf_swevent_get_recursion_context(void) @@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx) } EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, - nr, nmi, data, regs); - /* - * doesn't really matter which of the child contexts the - * events ends up in. 
- */ - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); - rcu_read_unlock(); -} void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_cpu_context *cpuctx; + struct hlist_head *head; + + cpuctx = &__get_cpu_var(perf_cpu_context); if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } + + head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + return 0; } static void perf_swevent_disable(struct perf_event *event) { + hlist_del_rcu(&event->hlist_entry); } static const struct pmu perf_ops_generic = { @@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event, return 0; } +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) +{ + struct swevent_hlist *hlist; + + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ + struct swevent_hlist *hlist; + + if (!cpuctx->swevent_hlist) + return; + + hlist = cpuctx->swevent_hlist; + rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + + mutex_lock(&cpuctx->hlist_mutex); + + if (!--cpuctx->hlist_refcount) + swevent_hlist_release(cpuctx); + + mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + int err = 0; + + mutex_lock(&cpuctx->hlist_mutex); + + if (!cpuctx->swevent_hlist && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + cpuctx->hlist_refcount++; + exit: + mutex_unlock(&cpuctx->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; + fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); + swevent_hlist_put(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) { + int err; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. 
@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return NULL; event->destroy = tp_perf_event_destroy; + err = swevent_hlist_get(event); + if (err) { + perf_trace_disable(event->attr.config); + return ERR_PTR(err); + } return &perf_ops_generic; } @@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); } static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return ERR_PTR(err); + atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); + mutex_init(&cpuctx->hlist_mutex); __perf_event_init_context(&cpuctx->ctx, NULL); } } @@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu) spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); + + mutex_lock(&cpuctx->hlist_mutex); + if (cpuctx->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + WARN_ON_ONCE(!hlist); + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + mutex_unlock(&cpuctx->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU @@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu) struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_event_context *ctx = &cpuctx->ctx; + mutex_lock(&cpuctx->hlist_mutex); + swevent_hlist_release(cpuctx); + mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); -- cgit v1.2.3-55-g7522 From df8290bf7ea6b3051e2f315579a6e829309ec1ed Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2010 00:28:14 +0200 Subject: perf: Make clock software events consistent with general exclusion rules The cpu/task clock events implement their own version of exclusion on top of exclude_user and exclude_kernel. The result is that when the event triggers in the kernel but we have exclude_kernel set, we try to rewind using task_pt_regs. There are two side effects of this: - we call task_pt_regs even on kernel threads, which doesn't give us the desired result. - if the event occurred in the kernel, we shouldn't rewind to the user context. We want to actually ignore the event. get_irq_regs() will always give us the right interrupted context, so use its result and submit it to perf_exclude_event(), which knows when an event must be ignored.
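perf_exclude_event() itself does not appear in this series, so the following userspace sketch is an assumption about its behaviour, reconstructed from the changelog: a sample is simply dropped when the interrupted context is one the event asked to exclude. All names here are stand-ins, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>

struct fake_attr {
	bool exclude_user;
	bool exclude_kernel;
};

/* Assumed behaviour of perf_exclude_event(): true means "ignore the sample" */
static bool exclude_event(const struct fake_attr *attr, bool user_mode)
{
	if (attr->exclude_user && user_mode)
		return true;
	if (attr->exclude_kernel && !user_mode)
		return true;
	return false;
}

int main(void)
{
	struct fake_attr attr = { .exclude_kernel = true };

	/* Old code rewound kernel hits to task_pt_regs(); new code drops them */
	printf("kernel hit dropped: %s\n",
	       exclude_event(&attr, false) ? "yes" : "no");
	printf("user hit kept: %s\n",
	       exclude_event(&attr, true) ? "no" : "yes");
	return 0;
}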
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/perf_event.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9efdfe5b8d3b..095101d685bc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4164,15 +4164,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((event->attr.exclude_kernel || !regs) && - !event->attr.exclude_user) - regs = task_pt_regs(current); - if (regs) { + if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; -- cgit v1.2.3-55-g7522 From 93ccae7a2227466a0d071fe52c51319f2f34c365 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 12 Apr 2010 13:17:08 -0400 Subject: tracing/kprobes: Support basic types on dynamic events Support basic integer types (u8, u16, u32, u64, s8, s16, s32, s64) in the kprobe tracer. With this patch, users can specify one of the above basic types for each argument after ':'. If omitted, the argument type is set as unsigned long (u32 or u64, arch-dependent). e.g. echo 'p account_system_time+0 hardirq_offset=%si:s32' > kprobe_events adds a probe that records hardirq_offset as a signed 32-bit value on entry to account_system_time. Cc: Ingo Molnar Cc: Steven Rostedt Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frederic Weisbecker LKML-Reference: <20100412171708.3790.18599.stgit@localhost6.localdomain6> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/trace/kprobetrace.txt | 4 +- kernel/trace/trace.h | 16 +- kernel/trace/trace_kprobe.c | 535 ++++++++++++++++++++++-------------- 3 files changed, 334 insertions(+), 221 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index a9100b28eb84..ec94748ae65b 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -40,7 +40,9 @@ Synopsis of kprobe_events $stack : Fetch stack address. $retval : Fetch return value.(*) +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) - NAME=FETCHARG: Set NAME as the argument name of FETCHARG. + NAME=FETCHARG : Set NAME as the argument name of FETCHARG. + FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types + (u8/u16/u32/u64/s8/s16/s32/s64) are supported. (*) only for return probe. (**) this is useful for fetching a field of data structures.
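The mechanical heart of the trace_kprobe.c rewrite below is that fetch functions no longer return an unsigned long: each one writes its declared width into a packed record at a per-argument offset, the way parse_probe_arg() accumulates tp->size. This userspace sketch models only that layout; the names are stand-ins, and the real fetch functions read from pt_regs rather than a source pointer:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's fetch_func_t; the real one takes pt_regs */
typedef void (*fetch_func_t)(void *src, void *dest);

struct fake_probe_arg {
	fetch_func_t fn;
	size_t offset;		/* offset of this argument in the record */
	size_t size;		/* byte size of the declared type */
};

static void fetch_u32(void *src, void *dest) { *(uint32_t *)dest = *(uint32_t *)src; }
static void fetch_s32(void *src, void *dest) { *(int32_t *)dest = *(int32_t *)src; }

int main(void)
{
	uint32_t a = 0xdeadbeef;
	int32_t b = -42;
	void *src[2] = { &a, &b };
	/* Offsets assigned the way parse_probe_arg() does: each argument
	 * starts where the previous one ended, so the record stays packed. */
	struct fake_probe_arg args[2] = {
		{ fetch_u32, 0, sizeof(uint32_t) },
		{ fetch_s32, sizeof(uint32_t), sizeof(int32_t) },
	};
	union {
		uint8_t bytes[8];	/* packed argument area, as in the ring buffer */
		uint32_t words[2];	/* for aligned read-back in the demo */
	} record;
	int i;

	for (i = 0; i < 2; i++)
		args[i].fn(src[i], record.bytes + args[i].offset);

	printf("record: %#x %d (8 bytes total)\n",
	       record.words[0], (int32_t)record.words[1]);
	return 0;
}

Packing by declared size is what lets an s32 argument occupy four bytes in the ring buffer instead of a full unsigned long slot.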
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bec2c973ff0c..3ebdb6bd2362 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -102,29 +102,17 @@ struct syscall_trace_exit { long ret; }; -struct kprobe_trace_entry { +struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - -struct kretprobe_trace_entry { +struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kretprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1251e367bae9..a7514326052b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include "trace.h" #include "trace_output.h" @@ -40,7 +42,6 @@ /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_NARGS "__probe_nargs" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" @@ -52,56 +53,102 @@ const char *reserved_field_names[] = { "common_tgid", "common_lock_depth", FIELD_STRING_IP, - FIELD_STRING_NARGS, FIELD_STRING_RETIP, FIELD_STRING_FUNC, }; -struct fetch_func { - unsigned long (*func)(struct pt_regs *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, void *data)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); + +struct fetch_param { + fetch_func_t fn; void *data; }; -static __kprobes unsigned long call_fetch(struct fetch_func *f, - struct pt_regs *regs) +static __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) { - return f->func(regs, f->data); + return fprm->fn(regs, fprm->data, dest); } -/* fetch handlers */ -static __kprobes unsigned long fetch_register(struct pt_regs *regs, - void *offset) -{ - return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. 
+ */ +#define DEFINE_BASIC_FETCH_FUNCS(kind) \ +DEFINE_FETCH_##kind(u8) \ +DEFINE_FETCH_##kind(u16) \ +DEFINE_FETCH_##kind(u32) \ +DEFINE_FETCH_##kind(u64) + +#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ + ((FETCH_FUNC_NAME(kind, u8) == fn) || \ + (FETCH_FUNC_NAME(kind, u16) == fn) || \ + (FETCH_FUNC_NAME(kind, u32) == fn) || \ + (FETCH_FUNC_NAME(kind, u64) == fn)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ } - -static __kprobes unsigned long fetch_stack(struct pt_regs *regs, - void *num) -{ - return regs_get_kernel_stack_nth(regs, - (unsigned int)((unsigned long)num)); +DEFINE_BASIC_FETCH_FUNCS(reg) + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ } +DEFINE_BASIC_FETCH_FUNCS(stack) -static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) -{ - unsigned long retval; - - if (probe_kernel_address(addr, retval)) - return 0; - return retval; +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ } - -static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, - void *dummy) -{ - return regs_return_value(regs); -} - -static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, - void *dummy) -{ - return kernel_stack_pointer(regs); +DEFINE_BASIC_FETCH_FUNCS(retval) + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ } +DEFINE_BASIC_FETCH_FUNCS(memory) /* Memory fetching by symbol */ struct symbol_cache { @@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) return sc; } -static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) -{ - struct symbol_cache *sc = data; - - if (sc->addr) - return fetch_memory(regs, (void *)sc->addr); - else - return 0; +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(symbol) -/* Special indirect memory access interface */ -struct indirect_fetch_data { - struct fetch_func orig; +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; long offset; }; -static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) -{ - struct indirect_fetch_data *ind = data; - unsigned long addr; - - addr = call_fetch(&ind->orig, regs); - if (addr) { - addr += ind->offset; - return fetch_memory(regs, (void *)addr); - } else - return 0; +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if 
(addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(deref) -static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (data->orig.func == fetch_indirect) - free_indirect_fetch_data(data->orig.data); - else if (data->orig.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(kind, type) \ + .kind = FETCH_FUNC_NAME(kind, type) + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + {.name = #ptype, \ + .size = sizeof(ftype), \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } + +/* Fetch type information table */ +static const struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + /* Fetch functions */ + fetch_func_t reg; + fetch_func_t stack; + fetch_func_t retval; + fetch_func_t memory; + fetch_func_t symbol; + fetch_func_t deref; +} fetch_type_table[] = { + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + /** * Kprobe event core functions */ struct probe_arg { - struct fetch_func fetch; - const char *name; + struct fetch_param fetch; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ }; /* Flags for trace_probe */ @@ -204,6 +326,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_call call; struct trace_event event; + ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; @@ -212,6 +335,7 @@ struct trace_probe { (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) + static __kprobes int probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; @@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp) return 
tp->symbol ? tp->symbol : "unknown"; } -static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) -{ - int ret = -EINVAL; - - if (ff->func == fetch_register) { - const char *name; - name = regs_query_register_name((unsigned int)((long)ff->data)); - ret = snprintf(buf, n, "%%%s", name); - } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); - else if (ff->func == fetch_memory) - ret = snprintf(buf, n, "@0x%p", ff->data); - else if (ff->func == fetch_symbol) { - struct symbol_cache *sc = ff->data; - if (sc->offset) - ret = snprintf(buf, n, "@%s%+ld", sc->symbol, - sc->offset); - else - ret = snprintf(buf, n, "@%s", sc->symbol); - } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "$retval"); - else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "$stack"); - else if (ff->func == fetch_indirect) { - struct indirect_fetch_data *id = ff->data; - size_t l = 0; - ret = snprintf(buf, n, "%+ld(", id->offset); - if (ret >= n) - goto end; - l += ret; - ret = probe_arg_string(buf + l, n - l, &id->orig); - if (ret < 0) - goto end; - l += ret; - ret = snprintf(buf + l, n - l, ")"); - ret += l; - } -end: - if (ret >= n) - return -ENOSPC; - return ret; -} - static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -347,11 +428,12 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (arg->fetch.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); - else if (arg->fetch.func == fetch_indirect) - free_indirect_fetch_data(arg->fetch.data); kfree(arg->name); + kfree(arg->comm); } static void free_trace_probe(struct trace_probe *tp) @@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_ARGS 16 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) { - ff->func = fetch_retvalue; - ff->data = NULL; - } else + if (is_return) + f->fn = t->retval; + else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - ff->func = fetch_stack_address; - ff->data = NULL; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; } else if (isdigit(arg[5])) { ret = strict_strtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - ff->func = fetch_stack; - ff->data = (void *)param; + f->fn = t->stack; + f->data = (void *)param; } } else ret = -EINVAL; @@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) } /* Recursive argument parser */ -static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int __parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; @@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, ff, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return); break; case '%': /* named register */ ret = 
regs_query_register_offset(arg + 1); if (ret >= 0) { - ff->func = fetch_register; - ff->data = (void *)(unsigned long)ret; + f->fn = t->reg; + f->data = (void *)(unsigned long)ret; ret = 0; } break; @@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - ff->func = fetch_memory; - ff->data = (void *)param; + f->fn = t->memory; + f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); if (ret) break; - ff->data = alloc_symbol_cache(arg + 1, offset); - if (ff->data) - ff->func = fetch_symbol; - else - ret = -EINVAL; + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->symbol; } break; - case '+': /* indirect memory */ + case '+': /* deref memory */ case '-': tmp = strchr(arg, '('); - if (!tmp) { - ret = -EINVAL; + if (!tmp) break; - } *tmp = '\0'; ret = strict_strtol(arg + 1, 0, &offset); if (ret) @@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { - struct indirect_fetch_data *id; + struct deref_fetch_param *dprm; + const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - id = kzalloc(sizeof(struct indirect_fetch_data), - GFP_KERNEL); - if (!id) + dprm = kzalloc(sizeof(struct deref_fetch_param), + GFP_KERNEL); + if (!dprm) return -ENOMEM; - id->offset = offset; - ret = __parse_probe_arg(arg, &id->orig, is_return); + dprm->offset = offset; + ret = __parse_probe_arg(arg, t2, &dprm->orig, + is_return); if (ret) - kfree(id); + kfree(dprm); else { - ff->func = fetch_indirect; - ff->data = (void *)id; + f->fn = t->deref; + f->data = (void *)dprm; } - } else - ret = -EINVAL; + } break; - default: - /* TODO: support custom handler */ - ret = -EINVAL; } + if (!ret && !f->fn) + ret = -EINVAL; return ret; } /* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_arg(char *arg, struct trace_probe *tp, + struct probe_arg *parg, int is_return) { + const char *t; + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; } - return __parse_probe_arg(arg, ff, is_return); + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = tp->size; + tp->size += parg->type->size; + return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); } /* Return 1 if name is reserved or already used by another argument */ @@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG - * Indirect memory fetch: + * Dereferencing memory fetch: * +|-offs(ARG) : fetch memory at ARG +|- offs address. * Alias name of args: * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_probe *tp; int i, ret = 0; int is_return = 0, is_delete = 0; - char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg, *tmp; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv) else arg = argv[i]; - if (conflict_field_name(argv[i], tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { pr_info("Failed to allocate argument%d name '%s'.\n", @@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv) ret = -ENOMEM; goto error; } + tmp = strchr(tp->args[i].name, ':'); + if (tmp) + *tmp = '_'; /* convert : to _ */ + + if (conflict_field_name(tp->args[i].name, tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } /* Parse fetch argument */ - ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { pr_info("Parse error at argument%d. (%d)\n", i, ret); kfree(tp->args[i].name); @@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_probe *tp = v; - int i, ret; - char buf[MAX_ARGSTR_LEN + 1]; + int i; seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); @@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v) else seq_printf(m, " %s", probe_symbol(tp)); - for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); - if (ret < 0) { - pr_warning("Argument%d decoding error(%d).\n", i, ret); - return ret; - } - seq_printf(m, " %s=%s", tp->args[i].name, buf); - } + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); seq_printf(m, "\n"); + return 0; } @@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = { static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); @@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct 
pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); @@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags) { - struct kprobe_trace_entry *field; + struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kprobe_trace_entry *)iter->ent; + field = (struct kprobe_trace_entry_head *)iter->ent; event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); @@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1046,13 +1151,14 @@ partial: enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags) { - struct kretprobe_trace_entry *field; + struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kretprobe_trace_entry *)iter->ent; + field = (struct kretprobe_trace_entry_head *)iter->ent; event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); @@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kprobe_trace_entry field; + struct kprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - DEFINE_FIELD(int, nargs, 
FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kretprobe_trace_entry field; + struct kretprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } @@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", - tp->args[i].name); + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); @@ -1219,12 +1340,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1235,10 +1357,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, if (!entry) return; - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } @@ -1249,12 +1371,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1265,11 +1388,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, if (!entry) return; - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + 
call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags, regs); -- cgit v1.2.3-55-g7522 From 95476b64ab11d528de2557366ec584977c215b9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Apr 2010 23:42:18 +0200 Subject: perf: Fix hlist related build error hlist helpers need to be available for all software events, not only trace events. Pull them out outside the ifdef CONFIG_EVENT_TRACING section. Fixes: kernel/perf_event.c:4573: error: implicit declaration of function 'swevent_hlist_put' kernel/perf_event.c:4614: error: implicit declaration of function 'swevent_hlist_get' kernel/perf_event.c:5534: error: implicit declaration of function 'swevent_hlist_release Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1271281338-23491-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 095101d685bc..07b7a435bf03 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4313,36 +4313,6 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_TRACING - -void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size, struct pt_regs *regs) -{ - struct perf_sample_data data; - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - perf_sample_data_init(&data, addr); - data.raw = &raw; - - /* Trace events already protected against recursion */ - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); -} -EXPORT_SYMBOL_GPL(perf_tp_event); - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) { struct swevent_hlist *hlist; @@ -4442,6 +4412,36 @@ static int swevent_hlist_get(struct perf_event *event) return err; } +#ifdef CONFIG_EVENT_TRACING + +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size, struct pt_regs *regs) +{ + struct perf_sample_data data; + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr); + data.raw = &raw; + + /* Trace events already protected against recursion */ + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); -- cgit v1.2.3-55-g7522 From b6ac23af2c66e114d3a87ef28d56f1ceec283007 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 15 Apr 2010 08:54:59 +0200 Subject: blkio: fix for modular blk-cgroup build After merging the block tree, 20100414's linux-next build (x86_64 allmodconfig) failed like this: ERROR: "get_gendisk" [block/blk-cgroup.ko] undefined! ERROR: "sched_clock" [block/blk-cgroup.ko] undefined! 
This happens because the two symbols aren't exported and hence not available when blk-cgroup code is built as a module. I've tried to stay consistent with the use of EXPORT_SYMBOL or EXPORT_SYMBOL_GPL with the other symbols in the respective files. Signed-off-by: Divyesh Shah Acked-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/genhd.c | 1 + kernel/sched_clock.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/block/genhd.c b/block/genhd.c index d13ba76a169c..154b5f80b3ab 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) return disk; } +EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5b496132c28a..906a0f718cb3 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); } +EXPORT_SYMBOL_GPL(sched_clock); static __read_mostly int sched_clock_running; -- cgit v1.2.3-55-g7522 From 09a40af5240de02d848247ab82440ad75b31ab11 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 15 Apr 2010 07:29:59 +0200 Subject: sched: Fix UP update_avg() build warning update_avg() is only used for SMP builds, move it to the nearest SMP block. Reported-by: Stephen Rothwell Signed-off-by: Mike Galbraith Cc: Peter Zijlstra LKML-Reference: <1271309399.14779.17.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ab562ae4007c..de0da71daf77 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1872,12 +1872,6 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); @@ -2332,6 +2326,12 @@ int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_ return cpu; } + +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} #endif /*** -- cgit v1.2.3-55-g7522 From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001 From: Zhang, Yanmin Date: Mon, 19 Apr 2010 13:32:41 +0800 Subject: perf: Enhance perf to allow for guest statistic collection from host Below patch introduces perf_guest_info_callbacks and related register/unregister functions. Add more PERF_RECORD_MISC_XXX bits meaning guest kernel and guest user space. 
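The registration model is deliberately a single pointer rather than a list, as the comment in the diff below notes. This userspace sketch models the dispatch: the struct shape and the guest/user split mirror the patch, while the misc values and the KVM-like client are invented for illustration:

#include <stdio.h>

struct guest_info_callbacks {
	int (*is_in_guest)(void);
	int (*is_user_mode)(void);
	unsigned long (*get_guest_ip)(void);
};

/* Models perf_guest_cbs: one registered consumer (e.g. KVM), or NULL */
static struct guest_info_callbacks *guest_cbs;

enum {
	MISC_KERNEL = 1, MISC_USER = 2,
	MISC_GUEST_KERNEL = 4, MISC_GUEST_USER = 5,
};

/* Mirrors the new perf_misc_flags() logic: guest callbacks take priority */
static int misc_flags(int host_user_mode)
{
	if (guest_cbs && guest_cbs->is_in_guest())
		return guest_cbs->is_user_mode() ? MISC_GUEST_USER
						 : MISC_GUEST_KERNEL;
	return host_user_mode ? MISC_USER : MISC_KERNEL;
}

/* Invented KVM-like client */
static int in_guest(void) { return 1; }
static int guest_user_mode(void) { return 0; }
static unsigned long guest_ip(void) { return 0xc0000000UL; }

static struct guest_info_callbacks kvm_cbs = {
	.is_in_guest = in_guest,
	.is_user_mode = guest_user_mode,
	.get_guest_ip = guest_ip,
};

int main(void)
{
	printf("unregistered: misc=%d\n", misc_flags(0));	/* MISC_KERNEL */
	guest_cbs = &kvm_cbs;	/* perf_register_guest_info_callbacks() */
	printf("registered: misc=%d\n", misc_flags(0));	/* MISC_GUEST_KERNEL */
	return 0;
}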
Signed-off-by: Zhang Yanmin Signed-off-by: Avi Kivity --- arch/x86/include/asm/perf_event.h | 15 ++++----------- arch/x86/kernel/cpu/perf_event.c | 31 +++++++++++++++++++++++++++++++ include/linux/perf_event.h | 21 ++++++++++++++++++++- kernel/perf_event.c | 23 ++++++++++++++++++++++- 4 files changed, 77 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f6d43dbfd8e7..254883d0c7e0 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -135,17 +135,10 @@ extern void perf_events_lapic_init(void); */ #define PERF_EFLAGS_EXACT (1UL << 3) -#define perf_misc_flags(regs) \ -({ int misc = 0; \ - if (user_mode(regs)) \ - misc |= PERF_RECORD_MISC_USER; \ - else \ - misc |= PERF_RECORD_MISC_KERNEL; \ - if (regs->flags & PERF_EFLAGS_EXACT) \ - misc |= PERF_RECORD_MISC_EXACT; \ - misc; }) - -#define perf_instruction_pointer(regs) ((regs)->ip) +struct pt_regs; +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); +extern unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs) perf_misc_flags(regs) #else static inline void init_hw_perf_events(void) { } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 626154a9f535..2ea78abf69d9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1720,6 +1720,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) { struct perf_callchain_entry *entry; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return NULL; + } + if (in_nmi()) entry = &__get_cpu_var(pmc_nmi_entry); else @@ -1743,3 +1748,29 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski regs->cs = __KERNEL_CS; local_save_flags(regs->flags); } + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + unsigned long ip; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + ip = perf_guest_cbs->get_guest_ip(); + else + ip = instruction_pointer(regs); + return ip; +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + int misc = 0; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + misc |= perf_guest_cbs->is_user_mode() ? + PERF_RECORD_MISC_GUEST_USER : + PERF_RECORD_MISC_GUEST_KERNEL; + } else + misc |= user_mode(regs) ? 
PERF_RECORD_MISC_USER : + PERF_RECORD_MISC_KERNEL; + if (regs->flags & PERF_EFLAGS_EXACT) + misc |= PERF_RECORD_MISC_EXACT; + + return misc; +} diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bf896d0b2e9c..24de5f181a41 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -288,11 +288,13 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ }; -#define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0) +#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) #define PERF_RECORD_MISC_USER (2 << 0) #define PERF_RECORD_MISC_HYPERVISOR (3 << 0) +#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) +#define PERF_RECORD_MISC_GUEST_USER (5 << 0) #define PERF_RECORD_MISC_EXACT (1 << 14) /* @@ -446,6 +448,12 @@ enum perf_callchain_context { # include #endif +struct perf_guest_info_callbacks { + int (*is_in_guest) (void); + int (*is_user_mode) (void); + unsigned long (*get_guest_ip) (void); +}; + #ifdef CONFIG_HAVE_HW_BREAKPOINT #include #endif @@ -932,6 +940,12 @@ static inline void perf_event_mmap(struct vm_area_struct *vma) __perf_event_mmap(vma); } +extern struct perf_guest_info_callbacks *perf_guest_cbs; +extern int perf_register_guest_info_callbacks( + struct perf_guest_info_callbacks *); +extern int perf_unregister_guest_info_callbacks( + struct perf_guest_info_callbacks *); + extern void perf_event_comm(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); @@ -1001,6 +1015,11 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, static inline void perf_bp_event(struct perf_event *event, void *data) { } +static inline int perf_register_guest_info_callbacks +(struct perf_guest_info_callbacks *) {return 0; } +static inline int perf_unregister_guest_info_callbacks +(struct perf_guest_info_callbacks *) {return 0; } + static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 07b7a435bf03..9dbe8cdaf145 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2797,6 +2797,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski } +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + /* * Output */ @@ -3749,7 +3770,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) .event_id = { .header = { .type = PERF_RECORD_MMAP, - .misc = 0, + .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ -- cgit v1.2.3-55-g7522 From cecbca96da387428e220e307a9c945e37e2f4d9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 18 Apr 2010 19:08:41 +0200 Subject: tracing: Dump either the oops's cpu source or all cpus buffers The ftrace_dump_on_oops kernel parameter, sysctl and sysrq let one dump every cpu buffers when an oops or panic happens. 
It's nice when you have few cpus but it may take ages if you have many, plus you miss the real origin of the problem in all the cpu traces. Sometimes, all you need is to dump the cpu buffer that triggered the oops; most of the time that is our main interest. This patch modifies ftrace_dump_on_oops to handle this choice. The ftrace_dump_on_oops kernel parameter, when it comes alone, has the same behaviour as before. But ftrace_dump_on_oops=orig_cpu will only dump the buffer of the cpu that oops'ed. Similarly, sysctl kernel.ftrace_dump_on_oops=1 and echo 1 > /proc/sys/kernel/ftrace_dump_on_oops keep their previous behaviour. But setting it to 2 switches to cpu origin dump mode. v2: Fix double setup v3: Fix spelling issues reported by Randy Dunlap v4: Also update __ftrace_dump in the selftests Signed-off-by: Frederic Weisbecker Acked-by: David S. Miller Acked-by: Steven Rostedt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Li Zefan Cc: Lai Jiangshan --- Documentation/kernel-parameters.txt | 6 ++++- Documentation/trace/ftrace.txt | 6 +++-- drivers/char/sysrq.c | 2 +- include/linux/ftrace.h | 4 ++- include/linux/kernel.h | 11 ++++++-- kernel/trace/trace.c | 51 ++++++++++++++++++++++++++++--------- kernel/trace/trace_selftest.c | 5 ++-- 7 files changed, 64 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e4cbca58536c..ab67b33300fb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -789,8 +789,12 @@ and is between 256 and 4096 characters. It is defined in the file as early as possible in order to facilitate early boot debugging. - ftrace_dump_on_oops + ftrace_dump_on_oops[=orig_cpu] [FTRACE] will dump the trace buffers on oops. + If no parameter is passed, ftrace will dump + buffers of all CPUs, but if you pass orig_cpu, it will + dump only the buffer of the CPU that triggered the + oops. ftrace_filter=[function-list] [FTRACE] Limit the functions traced by the function diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 03485bfbd797..52011815c905 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1337,12 +1337,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one can either use the sysctl function or set it via the proc system interface. - sysctl kernel.ftrace_dump_on_oops=1 + sysctl kernel.ftrace_dump_on_oops=n or - echo 1 > /proc/sys/kernel/ftrace_dump_on_oops + echo n > /proc/sys/kernel/ftrace_dump_on_oops +If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will +only dump the buffer of the CPU that triggered the oops.
Here's an example of such a dump after a null pointer dereference in a kernel module: diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 59de2525d303..d4e8b213a462 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -289,7 +289,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = { static void sysrq_ftrace_dump(int key, struct tty_struct *tty) { - ftrace_dump(); + ftrace_dump(DUMP_ALL); } static struct sysrq_key_op sysrq_ftrace_dump_op = { .handler = sysrq_ftrace_dump, diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 01e6adea07ec..ea5b1aae0e8b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -492,7 +492,9 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) return tsk->trace & TSK_TRACE_FL_GRAPH; } -extern int ftrace_dump_on_oops; +enum ftrace_dump_mode; + +extern enum ftrace_dump_mode ftrace_dump_on_oops; #ifdef CONFIG_PREEMPT #define INIT_TRACE_RECURSION .trace_recursion = 0, diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9365227dbaf6..9fb1c1299032 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -490,6 +490,13 @@ static inline void tracing_off(void) { } static inline void tracing_off_permanent(void) { } static inline int tracing_is_on(void) { return 0; } #endif + +enum ftrace_dump_mode { + DUMP_NONE, + DUMP_ALL, + DUMP_ORIG, +}; + #ifdef CONFIG_TRACING extern void tracing_start(void); extern void tracing_stop(void); @@ -571,7 +578,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); -extern void ftrace_dump(void); +extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); #else static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } @@ -592,7 +599,7 @@ ftrace_vprintk(const char *fmt, va_list ap) { return 0; } -static inline void ftrace_dump(void) { } +static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* CONFIG_TRACING */ /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bed83cab6da2..7b516c7ef9a0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask; * * It is default off, but you can enable it with either specifying * "ftrace_dump_on_oops" in the kernel command line, or setting - * /proc/sys/kernel/ftrace_dump_on_oops to true. 
+ * /proc/sys/kernel/ftrace_dump_on_oops + * Set 1 if you want to dump buffers of all CPUs + * Set 2 if you want to dump the buffer of the CPU that triggered oops */ -int ftrace_dump_on_oops; + +enum ftrace_dump_mode ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); @@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { - ftrace_dump_on_oops = 1; - return 1; + if (*str++ != '=' || !*str) { + ftrace_dump_on_oops = DUMP_ALL; + return 1; + } + + if (!strcmp("orig_cpu", str)) { + ftrace_dump_on_oops = DUMP_ORIG; + return 1; + } + + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -4338,7 +4350,7 @@ static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); return NOTIFY_OK; } @@ -4355,7 +4367,7 @@ static int trace_die_handler(struct notifier_block *self, switch (val) { case DIE_OOPS: if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); break; default: break; @@ -4396,7 +4408,8 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -static void __ftrace_dump(bool disable_tracing) +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { static arch_spinlock_t ftrace_dump_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -4429,12 +4442,25 @@ static void __ftrace_dump(bool disable_tracing) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - printk(KERN_TRACE "Dumping ftrace buffer:\n"); - /* Simulate the iterator */ iter.tr = &global_trace; iter.trace = current_trace; - iter.cpu_file = TRACE_PIPE_ALL_CPU; + + switch (oops_dump_mode) { + case DUMP_ALL: + iter.cpu_file = TRACE_PIPE_ALL_CPU; + break; + case DUMP_ORIG: + iter.cpu_file = raw_smp_processor_id(); + break; + case DUMP_NONE: + goto out_enable; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + iter.cpu_file = TRACE_PIPE_ALL_CPU; + } + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); /* * We need to stop all tracing on all CPUS to read the @@ -4473,6 +4499,7 @@ static void __ftrace_dump(bool disable_tracing) else printk(KERN_TRACE "---------------------------------\n"); + out_enable: /* Re-enable tracing if requested */ if (!disable_tracing) { trace_flags |= old_userobj; @@ -4489,9 +4516,9 @@ static void __ftrace_dump(bool disable_tracing) } /* By default: disable tracing after the dump */ -void ftrace_dump(void) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - __ftrace_dump(true); + __ftrace_dump(true, oops_dump_mode); } __init static int tracer_alloc_buffers(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9398034f814a..6a9d36ddfcf2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -256,7 +256,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void __ftrace_dump(bool disable_tracing); +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -267,7 +268,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); if 
(ftrace_dump_on_oops) - __ftrace_dump(false); + __ftrace_dump(false, DUMP_ALL); return 0; } -- cgit v1.2.3-55-g7522 From 4b402210486c6414fe5fbfd85934a0a22da56b04 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 16 Apr 2010 23:20:00 +0200 Subject: mutex: Don't spin when the owner CPU is offline or other weird cases Due to recent load-balancer changes that delay the task migration to the next wakeup, the adaptive mutex spinning ends up in a live lock when the owner's CPU gets offlined because the cpu_online() check lives before the owner running check. This patch changes mutex_spin_on_owner() to return 0 (don't spin) in any case where we aren't sure about the owner struct validity or CPU number, or if said CPU is offline. There is no point going back and re-evaluating spinning in corner cases like that; let's just go to sleep. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Peter Zijlstra LKML-Reference: <1271212509.13059.135.camel@pasglop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6af210a7de70..de0bd26e520a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3780,7 +3780,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -3790,14 +3790,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -3816,7 +3816,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif -- cgit v1.2.3-55-g7522 From 74f5187ac873042f502227701ed1727e7c5fbfa9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Apr 2010 21:50:19 +0200 Subject: sched: Cure load average vs NO_HZ woes Chase reported that due to us decrementing calc_load_tasks prematurely (before the next LOAD_FREQ sample), the load average could be skewed by as much as the number of CPUs in the machine. This patch, based on Chase's patch, cures the problem by keeping the delta of the CPU going into NO_HZ idle separately and folding that in on the next LOAD_FREQ update. This restores the balance and we get strict LOAD_FREQ period samples.
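To make the folding concrete, here is a minimal user-space sketch of the scheme (C11 atomics stand in for the kernel's atomic_long_t; go_nohz_idle() and fold_load() are illustrative names, not the functions added below):

    #include <stdatomic.h>

    static atomic_long calc_load_tasks;      /* folded once per LOAD_FREQ period */
    static atomic_long calc_load_tasks_idle; /* deltas parked by cpus going NO_HZ idle */

    /* A cpu entering NO_HZ idle parks its delta instead of folding it early. */
    static void go_nohz_idle(long delta)
    {
            if (delta)
                    atomic_fetch_add(&calc_load_tasks_idle, delta);
    }

    /* The next LOAD_FREQ update folds the parked idle deltas back in. */
    static void fold_load(long active_delta)
    {
            long idle = atomic_exchange(&calc_load_tasks_idle, 0);

            atomic_fetch_add(&calc_load_tasks, active_delta + idle);
    }

The single xchg keeps the idle fold cheap; as the comment in the code below notes, the read/xchg race is tolerated since it can only defer a delta to the next sample, not lose it.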
Signed-off-by: Peter Zijlstra Acked-by: Chase Douglas LKML-Reference: <1271934490.1776.343.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 80 +++++++++++++++++++++++++++++++++++++++++-------- kernel/sched_idletask.c | 3 +- 2 files changed, 68 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index de0da71daf77..0cc913a8554f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1815,7 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif -static void calc_load_account_active(struct rq *this_rq); +static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); @@ -2950,6 +2950,61 @@ static unsigned long calc_load_update; unsigned long avenrun[3]; EXPORT_SYMBOL(avenrun); +static long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +#ifdef CONFIG_NO_HZ +/* + * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_tasks_idle; + +static void calc_load_account_idle(struct rq *this_rq) +{ + long delta; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks_idle); +} + +static long calc_load_fold_idle(void) +{ + long delta = 0; + + /* + * Its got a race, we don't care... + */ + if (atomic_long_read(&calc_load_tasks_idle)) + delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + + return delta; +} +#else +static void calc_load_account_idle(struct rq *this_rq) +{ +} + +static inline long calc_load_fold_idle(void) +{ + return 0; +} +#endif + /** * get_avenrun - get the load average array * @loads: pointer to dest load array @@ -2996,20 +3051,22 @@ void calc_global_load(void) } /* - * Either called from update_cpu_load() or from a cpu going idle + * Called from update_cpu_load() to periodically update this CPU's + * active count. 
*/ static void calc_load_account_active(struct rq *this_rq) { - long nr_active, delta; + long delta; - nr_active = this_rq->nr_running; - nr_active += (long) this_rq->nr_uninterruptible; + if (time_before(jiffies, this_rq->calc_load_update)) + return; - if (nr_active != this_rq->calc_load_active) { - delta = nr_active - this_rq->calc_load_active; - this_rq->calc_load_active = nr_active; + delta = calc_load_fold_active(this_rq); + delta += calc_load_fold_idle(); + if (delta) atomic_long_add(delta, &calc_load_tasks); - } + + this_rq->calc_load_update += LOAD_FREQ; } /* @@ -3041,10 +3098,7 @@ static void update_cpu_load(struct rq *this_rq) this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } - if (time_after_eq(jiffies, this_rq->calc_load_update)) { - this_rq->calc_load_update += LOAD_FREQ; - calc_load_account_active(this_rq); - } + calc_load_account_active(this_rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index bea2b8f12024..9fa0f402c87c 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -23,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - /* adjust the active tasks as we might go into a long sleep */ - calc_load_account_active(rq); + calc_load_account_idle(rq); return rq->idle; } -- cgit v1.2.3-55-g7522 From 669c55e9f99b90e46eaa0f98a67ec53d46dc969a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 16 Apr 2010 14:59:29 +0200 Subject: sched: Pre-compute cpumask_weight(sched_domain_span(sd)) Dave reported that his large SPARC machines spend lots of time in hweight64(), try and optimize some of those needless cpumask_weight() invocations (esp. with the large offstack cpumasks these are very expensive indeed). Reported-by: David Miller Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 3 +++ kernel/sched_fair.c | 12 +++++------- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index e3e900f318d7..dfea40574b2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -960,6 +960,7 @@ struct sched_domain { char *name; #endif + unsigned int span_weight; /* * Span of all CPUs in this domain. * diff --git a/kernel/sched.c b/kernel/sched.c index 0cc913a8554f..4956ed092838 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6271,6 +6271,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + for (tmp = sd; tmp; tmp = tmp->parent) + tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); + /* Remove the sched domains which do not contribute to scheduling. 
*/ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 88d3053ac7c2..0a413c7e3ab8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1508,9 +1508,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ * Pick the largest domain to update shares over */ tmp = sd; - if (affine_sd && (!tmp || - cpumask_weight(sched_domain_span(affine_sd)) > - cpumask_weight(sched_domain_span(sd)))) + if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) tmp = affine_sd; if (tmp) { @@ -1554,10 +1552,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); + weight = sd->span_weight; sd = NULL; for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) + if (weight <= tmp->span_weight) break; if (tmp->flags & sd_flag) sd = tmp; @@ -2243,7 +2241,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; smt_gain /= weight; @@ -2276,7 +2274,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; -- cgit v1.2.3-55-g7522 From 99bd5e2f245d8cd17d040c82d40becdb3efd9b69 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 31 Mar 2010 16:47:45 -0700 Subject: sched: Fix select_idle_sibling() logic in select_task_rq_fair() Issues in the current select_idle_sibling() logic in select_task_rq_fair() in the context of a task wake-up: a) Once we select the idle sibling, we use that domain (spanning the cpu that the task is currently woken-up on and the idle sibling that we found) in our wake_affine() decisions. This domain is completely different from the domain (we are supposed to use) that spans the cpu that the task is currently woken-up on and the cpu where the task previously ran. b) We do the select_idle_sibling() check only for the cpu that the task is currently woken-up on. If select_task_rq_fair() selects the previously run cpu for waking the task, doing a select_idle_sibling() check for that cpu also helps, and we don't do this currently. c) In the scenarios where the cpu that the task is woken-up on is busy but its HT siblings are idle, we are selecting the task to be woken-up on the idle HT sibling instead of a core that it previously ran on and which is currently completely idle. i.e., we are not taking decisions based on wake_affine() but directly selecting an idle sibling that can cause an imbalance at the SMT/MC level which will be later corrected by the periodic load balancer. Fix this by first going through the load imbalance calculations using wake_affine() and, once we make a decision between the woken-up cpu and the previously-ran cpu, then choosing a possible idle sibling for waking up the task on.
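Condensed, the reworked ordering looks like this (wake_target() is a hypothetical wrapper for illustration; the patch open-codes this logic in select_task_rq_fair()):

    static int wake_target(struct task_struct *p, struct sched_domain *affine_sd,
                           int cpu, int prev_cpu, int sync)
    {
            int target;

            /* 1) imbalance decision: waking cpu vs previously-ran cpu */
            if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
                    target = cpu;
            else
                    target = prev_cpu;

            /* 2) only then search the cache-sharing domains around that target */
            return select_idle_sibling(p, target);
    }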
Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra LKML-Reference: <1270079265.7835.8.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 82 ++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0a413c7e3ab8..cbd8b8a296d1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1375,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* * Try and locate an idle CPU in the sched_domain. */ -static int -select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_sibling(struct task_struct *p, int target) { int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); + struct sched_domain *sd; int i; /* - * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE - * test in select_task_rq_fair) and the prev_cpu is idle then that's - * always a better target than the current cpu. + * If the task is going to be woken-up on this cpu and if it is + * already idle, then it is the right target. + */ + if (target == cpu && idle_cpu(cpu)) + return cpu; + + /* + * If the task is going to be woken-up on the cpu where it previously + * ran and if it is currently idle, then it the right target. */ - if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + if (target == prev_cpu && idle_cpu(prev_cpu)) return prev_cpu; /* - * Otherwise, iterate the domain and find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (!cpu_rq(i)->cfs.nr_running) { - target = i; + for_each_domain(target, sd) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; + + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (idle_cpu(i)) { + target = i; + break; + } } + + /* + * Lets stop looking for an idle sibling when we reached + * the domain that spans the current cpu and prev_cpu. + */ + if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && + cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + break; } return target; @@ -1421,7 +1440,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; - int want_affine = 0, cpu_idle = !current->pid; + int want_affine = 0; int want_sd = 1; int sync = wake_flags & WF_SYNC; @@ -1460,36 +1479,13 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ } /* - * While iterating the domains looking for a spanning - * WAKE_AFFINE domain, adjust the affine target to any idle cpu - * in cache sharing domains along the way. + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. */ - if (want_affine) { - int target = -1; - - /* - * If both cpu and prev_cpu are part of this domain, - * cpu is a valid SD_WAKE_AFFINE target. - */ - if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) - target = cpu; - - /* - * If there's an idle sibling in this domain, make that - * the wake_affine target instead of the current cpu. 
- */ - if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES) - target = select_idle_sibling(p, tmp, target); - - if (target >= 0) { - if (tmp->flags & SD_WAKE_AFFINE) { - affine_sd = tmp; - want_affine = 0; - if (target != cpu) - cpu_idle = 1; - } - cpu = target; - } + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + affine_sd = tmp; + want_affine = 0; } if (!want_sd && !want_affine) @@ -1520,8 +1516,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ #endif if (affine_sd) { - if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - return cpu; + if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + return select_idle_sibling(p, cpu); + else + return select_idle_sibling(p, prev_cpu); } while (sd) { -- cgit v1.2.3-55-g7522 From 9106b69382912ddc403a307b69bf894a6f3004e4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 2 Apr 2010 19:01:20 +0200 Subject: tracing: Add ftrace events for graph tracer Add ftrace events for graph tracer, so the graph output could be shared with other tracers. Signed-off-by: Jiri Olsa LKML-Reference: <1270227683-14631-2-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 669b9c31861d..db9e06bb766e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -40,7 +40,7 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -1096,6 +1096,12 @@ print_graph_function(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t +print_graph_function_event(struct trace_iterator *iter, int flags) +{ + return print_graph_function(iter); +} + static void print_lat_header(struct seq_file *s) { static const char spaces[] = " " /* 16 spaces */ @@ -1199,6 +1205,16 @@ static void graph_trace_close(struct trace_iterator *iter) } } +static struct trace_event graph_trace_entry_event = { + .type = TRACE_GRAPH_ENT, + .trace = print_graph_function_event, +}; + +static struct trace_event graph_trace_ret_event = { + .type = TRACE_GRAPH_RET, + .trace = print_graph_function_event, +}; + static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, @@ -1220,6 +1236,16 @@ static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + if (!register_ftrace_event(&graph_trace_entry_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + + if (!register_ftrace_event(&graph_trace_ret_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + return register_tracer(&graph_trace); } -- cgit v1.2.3-55-g7522 From d7a8d9e907cc294ec7a4a7046d1886375fbcc82e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 2 Apr 2010 19:01:21 +0200 Subject: tracing: Have graph flags passed in to ouput functions Let the function graph tracer have custom flags passed to its output functions. 
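The essence of the change, as a sketch (show_duration() is an invented stand-in for the various print_graph_* helpers, which take more arguments than this):

    /* Helpers test a caller-supplied mask instead of the tracer-private global. */
    static int show_duration(struct trace_seq *s, u32 flags)
    {
            if (!(flags & TRACE_GRAPH_PRINT_DURATION))
                    return 1;       /* this caller disabled the column */
            return trace_seq_printf(s, " | ");
    }

    /* The old entry point becomes a wrapper passing the graph tracer's own flags. */
    static enum print_line_t print_graph_function(struct trace_iterator *iter)
    {
            return print_graph_function_flags(iter, tracer_flags.val);
    }

This is what allows another tracer, such as the irqsoff tracer later in this series, to reuse the graph output with its own flag set.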
Signed-off-by: Jiri Olsa LKML-Reference: <1270227683-14631-3-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 6 +- kernel/trace/trace_functions_graph.c | 123 ++++++++++++++++++++--------------- 2 files changed, 73 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2825ef2c0b15..970004c5fa79 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -491,7 +491,9 @@ extern int trace_clock_id; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -extern enum print_line_t print_graph_function(struct trace_iterator *iter); +extern enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags); +extern void print_graph_headers_flags(struct seq_file *s, u32 flags); extern enum print_line_t trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); @@ -524,7 +526,7 @@ static inline int ftrace_graph_addr(unsigned long addr) #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t -print_graph_function(struct trace_iterator *iter) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index db9e06bb766e..de5f6518aba0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -527,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter, /* Signal a overhead of time execution to the output */ static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s) +print_graph_overhead(unsigned long long duration, struct trace_seq *s, + u32 flags) { /* If duration disappear, we don't need anything */ - if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) return 1; /* Non nested entry or return */ if (duration == -1) return trace_seq_printf(s, " "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { /* Duration exceeded 100 msecs */ if (duration > 100000ULL) return trace_seq_printf(s, "! 
"); @@ -563,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s) static enum print_line_t print_graph_irq(struct trace_iterator *iter, unsigned long addr, - enum trace_type type, int cpu, pid_t pid) + enum trace_type type, int cpu, pid_t pid, u32 flags) { int ret; struct trace_seq *s = &iter->seq; @@ -573,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_UNHANDLED; /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + if (flags & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + if (flags & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -597,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -610,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_PARTIAL_LINE; /* Don't close the duration column if haven't one */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) trace_seq_printf(s, " |"); ret = trace_seq_printf(s, "\n"); @@ -680,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, - struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) + struct ftrace_graph_ret_entry *ret_entry, + struct trace_seq *s, u32 flags) { struct fgraph_data *data = iter->private; struct ftrace_graph_ret *graph_ret; @@ -712,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter, } /* Overhead */ - ret = print_graph_overhead(duration, s); + ret = print_graph_overhead(duration, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = print_graph_duration(duration, s); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -740,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, static enum print_line_t print_graph_entry_nested(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, - struct trace_seq *s, int cpu) + struct trace_seq *s, int cpu, u32 flags) { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; @@ -760,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* No time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -791,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter, static enum print_line_t print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, - int type, unsigned long addr) + int type, unsigned long addr, u32 flags) { struct fgraph_data *data = iter->private; struct trace_entry *ent = 
iter->ent; @@ -804,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (type) { /* Interrupt */ - ret = print_graph_irq(iter, addr, type, cpu, ent->pid); + ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + if (flags & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + if (flags & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, ent->pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -846,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter) + struct trace_iterator *iter, u32 flags) { struct fgraph_data *data = iter->private; struct ftrace_graph_ent *call = &field->graph_ent; @@ -854,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t ret; int cpu = iter->cpu; - if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s); + ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); else - ret = print_graph_entry_nested(iter, field, s, cpu); + ret = print_graph_entry_nested(iter, field, s, cpu, flags); if (data) { /* @@ -880,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, struct trace_iterator *iter) + struct trace_entry *ent, struct trace_iterator *iter, + u32 flags) { unsigned long long duration = trace->rettime - trace->calltime; struct fgraph_data *data = iter->private; @@ -910,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } } - if (print_graph_prologue(iter, s, 0, 0)) + if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; /* Overhead */ - ret = print_graph_overhead(duration, s); + ret = print_graph_overhead(duration, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = print_graph_duration(duration, s); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -949,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } /* Overrun */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + if (flags & TRACE_GRAPH_PRINT_OVERRUN) { ret = trace_seq_printf(s, " (Overruns: %lu)\n", trace->overrun); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } - ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); + ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -964,8 +968,8 @@ print_graph_return(struct 
ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct trace_seq *s, struct trace_entry *ent, - struct trace_iterator *iter) +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter, u32 flags) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; @@ -977,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (data) depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; - if (print_graph_prologue(iter, s, 0, 0)) + if (print_graph_prologue(iter, s, 0, 0, flags)) return TRACE_TYPE_PARTIAL_LINE; /* No overhead */ - ret = print_graph_overhead(-1, s); + ret = print_graph_overhead(-1, s, flags); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* No time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + if (flags & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1041,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function(struct trace_iterator *iter) +print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1062,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter) if (data && data->failed) { field = &data->ent; iter->cpu = data->cpu; - ret = print_graph_entry(field, s, iter); + ret = print_graph_entry(field, s, iter, flags); if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; ret = TRACE_TYPE_NO_CONSUME; @@ -1082,38 +1086,44 @@ print_graph_function(struct trace_iterator *iter) struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); saved = *field; - return print_graph_entry(&saved, s, iter); + return print_graph_entry(&saved, s, iter, flags); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter); + return print_graph_return(&field->ret, s, entry, iter, flags); } default: - return print_graph_comment(s, entry, iter); + return print_graph_comment(s, entry, iter, flags); } return TRACE_TYPE_HANDLED; } +static enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return print_graph_function_flags(iter, tracer_flags.val); +} + static enum print_line_t print_graph_function_event(struct trace_iterator *iter, int flags) { return print_graph_function(iter); } -static void print_lat_header(struct seq_file *s) +static void print_lat_header(struct seq_file *s, u32 flags) { static const char spaces[] = " " /* 16 spaces */ " " /* 4 spaces */ " "; /* 17 spaces */ int size = 0; - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) size += 16; - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) size += 4; - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) size += 17; seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); @@ -1124,42 +1134,47 @@ static void print_lat_header(struct seq_file *s) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -static void print_graph_headers(struct seq_file *s) +void print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; if (lat) - print_lat_header(s); + print_lat_header(s, flags); /* 1st line */ seq_printf(s, "#"); - if (tracer_flags.val & 
TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) seq_printf(s, " CPU"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " TASK/PID "); if (lat) seq_printf(s, "|||||"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ seq_printf(s, "#"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + if (flags & TRACE_GRAPH_PRINT_CPU) seq_printf(s, " | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + if (flags & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " | | "); if (lat) seq_printf(s, "|||||"); - if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + if (flags & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); } +static void print_graph_headers(struct seq_file *s) +{ + print_graph_headers_flags(s, tracer_flags.val); +} + static void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ -- cgit v1.2.3-55-g7522 From b8bc1389b74c2b66255651a6fcfae56c78b6e63f Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Sun, 25 Apr 2010 13:18:48 +0200 Subject: ptrace: Cleanup useless header The BKL isn't present in this file anymore, thus we can safely remove the smp_lock.h inclusion. Signed-off-by: Alessio Igor Bogani Cc: Roland McGrath Cc: Oleg Nesterov Cc: Andrew Morton Cc: James Morris Cc: Ingo Molnar Signed-off-by: Frederic Weisbecker --- kernel/ptrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 53575020f82b..2f0f50b450a3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-55-g7522 From 62b915f1060996a8e1f69be50e3b8e9e43b710cb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 2 Apr 2010 19:01:22 +0200 Subject: tracing: Add graph output support for irqsoff tracer Add function graph output to irqsoff tracer. The graph output is enabled by setting the new 'display-graph' trace option.
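For example, assuming the usual debugfs mount point for the tracing files (the path may differ on your system, and the option file only appears while a tracer that defines it is selected):

    # echo irqsoff > /sys/kernel/debug/tracing/current_tracer
    # echo 1 > /sys/kernel/debug/tracing/options/display-graph
    # cat /sys/kernel/debug/tracing/trace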
Signed-off-by: Jiri Olsa LKML-Reference: <1270227683-14631-4-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 15 +- kernel/trace/trace.c | 35 +++-- kernel/trace/trace.h | 21 +++ kernel/trace/trace_functions_graph.c | 15 +- kernel/trace/trace_irqsoff.c | 271 +++++++++++++++++++++++++++++++++-- 5 files changed, 324 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ea5b1aae0e8b..8415a522f430 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -352,6 +352,10 @@ struct ftrace_graph_ret { int depth; }; +/* Type of the callback handlers for tracing function graph*/ +typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ +typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* for init task */ @@ -400,10 +404,6 @@ extern char __irqentry_text_end[]; #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 -/* Type of the callback handlers for tracing function graph*/ -typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ -typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ - extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc); @@ -441,6 +441,13 @@ static inline void unpause_graph_tracing(void) static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } +static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) +{ + return -1; +} +static inline void unregister_ftrace_graph(void) { } + static inline int task_curr_ret_stack(struct task_struct *tsk) { return -1; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7b516c7ef9a0..8b9ba41ec146 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1808,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m) } -static void +void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -2017,7 +2017,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) return event ? 
event->binary(iter, 0) : TRACE_TYPE_HANDLED; } -static int trace_empty(struct trace_iterator *iter) +int trace_empty(struct trace_iterator *iter) { int cpu; @@ -2084,6 +2084,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_default_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2096,17 +2113,9 @@ static int s_show(struct seq_file *m, void *v) } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); - else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return 0; - print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_lat_help_header(m); - } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); - } + else + trace_default_header(m); + } else if (iter->leftover) { /* * If we filled the seq_file buffer earlier, we diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 970004c5fa79..911e9864e94a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -378,6 +378,9 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_default_header(struct seq_file *m); +void print_trace_header(struct seq_file *m, struct trace_iterator *iter); +int trace_empty(struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); @@ -491,11 +494,29 @@ extern int trace_clock_id; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 + extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); extern enum print_line_t trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); +extern void graph_trace_open(struct trace_iterator *iter); +extern void graph_trace_close(struct trace_iterator *iter); +extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, int pc); +extern void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, int pc); + #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index de5f6518aba0..dd11c830eb84 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } -static int __trace_graph_entry(struct trace_array *tr, +int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -246,7 +246,7 @@ 
int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } -static void __trace_graph_return(struct trace_array *tr, +void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc) @@ -1093,6 +1093,11 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter, flags); } + case TRACE_STACK: + case TRACE_FN: + /* dont trace stack and functions as comments */ + return TRACE_TYPE_UNHANDLED; + default: return print_graph_comment(s, entry, iter, flags); } @@ -1170,12 +1175,12 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) seq_printf(s, " | | | |\n"); } -static void print_graph_headers(struct seq_file *s) +void print_graph_headers(struct seq_file *s) { print_graph_headers_flags(s, tracer_flags.val); } -static void graph_trace_open(struct trace_iterator *iter) +void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ struct fgraph_data *data; @@ -1210,7 +1215,7 @@ static void graph_trace_open(struct trace_iterator *iter) pr_warning("function graph tracer: not enough memory\n"); } -static void graph_trace_close(struct trace_iterator *iter) +void graph_trace_close(struct trace_iterator *iter) { struct fgraph_data *data = iter->private; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2974bc7538c7..6fd486e0cef4 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -34,6 +34,9 @@ static int trace_type __read_mostly; static int save_lat_flag; +static void stop_irqsoff_tracer(struct trace_array *tr, int graph); +static int start_irqsoff_tracer(struct trace_array *tr, int graph); + #ifdef CONFIG_PREEMPT_TRACER static inline int preempt_trace(void) @@ -55,6 +58,23 @@ irq_trace(void) # define irq_trace() (0) #endif +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +{ + int cpu; + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_irqsoff_tracer(irqsoff_trace, !set); + + for_each_possible_cpu(cpu) + per_cpu(tracing_cpu, cpu) = 0; + + tracing_max_latency = 0; + tracing_reset_online_cpus(irqsoff_trace); + + return start_irqsoff_tracer(irqsoff_trace, set); +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return 0; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return 0; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if 
(likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else + ret = 0; + + atomic_dec(&data->disabled); + return ret; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return; + + local_save_flags(flags); + /* slight chance to get a false positive on tracing_cpu */ + if (!irqs_disabled_flags(flags)) + return; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + + atomic_dec(&data->disabled); +} + +static void irqsoff_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); + +} + +static void irqsoff_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + u32 flags = GRAPH_TRACER_FLAGS; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, flags); + + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_print_header(struct seq_file *s) +{ + if (is_graph()) { + struct trace_iterator *iter = s->private; + u32 flags = GRAPH_TRACER_FLAGS; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + print_graph_headers_flags(s, flags); + } else + trace_default_header(s); +} + +static void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (!is_graph()) + trace_function(tr, ip, parent_ip, flags, pc); + else { + trace_graph_function(tr, parent_ip, flags, pc); + trace_graph_function(tr, ip, flags, pc); + } +} + +#else +#define __trace_function trace_function + +static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } +static void irqsoff_print_header(struct seq_file *s) { } +static void irqsoff_trace_open(struct trace_iterator *iter) { } +static void irqsoff_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? 
*/ @@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); /* Skip 5 functions to get to the irq/preempt enable function */ __trace_stack(tr, flags, 5, pc); @@ -172,7 +388,7 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static void start_irqsoff_tracer(struct trace_array *tr) +static int start_irqsoff_tracer(struct trace_array *tr, int graph) { - register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) + int ret = 0; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&irqsoff_graph_return, + &irqsoff_graph_entry); + + if (!ret && tracing_is_enabled()) tracer_enabled = 1; else tracer_enabled = 0; + + return ret; } -static void stop_irqsoff_tracer(struct trace_array *tr) +static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr) /* make sure that the tracer is visible */ smp_wmb(); tracing_reset_online_cpus(tr); - start_irqsoff_tracer(tr); + + if (start_irqsoff_tracer(tr, is_graph())) + printk(KERN_ERR "failed to start irqsoff tracer\n"); } static void irqsoff_tracer_reset(struct trace_array *tr) { - stop_irqsoff_tracer(tr); + stop_irqsoff_tracer(tr, is_graph()); if (!save_lat_flag) trace_flags &= ~TRACE_ITER_LATENCY_FMT; @@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, }; 
# define register_preemptoff(trace) register_tracer(&trace) #else @@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, }; # define register_preemptirqsoff(trace) register_tracer(&trace) -- cgit v1.2.3-55-g7522 From 72c9ddfd4c5bf54ef03cfdf57026416cb678eeba Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 20 Apr 2010 15:47:11 -0700 Subject: ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_read_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is fully visible to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase, since we now know that none of the cpus are adding trace entries any more. This makes opening the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S.
Miller LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 4 ++- kernel/trace/ring_buffer.c | 64 ++++++++++++++++++++++++++++++++++++++------- kernel/trace/trace.c | 11 +++++--- 3 files changed, 65 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index c8297761e414..25b4f686d918 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -127,7 +127,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events); struct ring_buffer_iter * -ring_buffer_read_start(struct ring_buffer *buffer, int cpu); +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu); +void ring_buffer_read_prepare_sync(void); +void ring_buffer_read_start(struct ring_buffer_iter *iter); void ring_buffer_read_finish(struct ring_buffer_iter *iter); struct ring_buffer_event * diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5885cdfc41f3..2a090448ef6b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3332,23 +3332,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_start - start a non consuming read of the buffer + * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * - * This starts up an iteration through the buffer. It also disables - * the recording to the buffer until the reading is finished. - * This prevents the reading from being corrupted. This is not - * a consuming read, so a producer is not expected. + * This performs the initial preparations necessary to iterate + * through the buffer. Memory is allocated, buffer recording + * is disabled, and the iterator pointer is returned to the caller. - * Must be paired with ring_buffer_finish. + * Disabling buffer recording prevents the reading from being + * corrupted. This is not a consuming read, so a producer is not + * expected. + * + * After a sequence of ring_buffer_read_prepare calls, the user is + * expected to make at least one call to ring_buffer_read_prepare_sync. + * Afterwards, ring_buffer_read_start is invoked to get things going + * for real. + * + * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_start(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; - unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; @@ -3362,15 +3369,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) iter->cpu_buffer = cpu_buffer; atomic_inc(&cpu_buffer->record_disabled); + + return iter; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); + +/** + * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls + * + * All previously invoked ring_buffer_read_prepare calls to prepare + * iterators will be synchronized. Afterwards, ring_buffer_read_start + * calls on those iterators are allowed. + */ +void +ring_buffer_read_prepare_sync(void) +{ synchronize_sched(); +} +EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @iter: The iterator returned by ring_buffer_read_prepare + * + * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and + * an intervening ring_buffer_read_prepare_sync must have been + * performed. + * + * Must be paired with ring_buffer_read_finish. + */ +void +ring_buffer_read_start(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8b9ba41ec146..756d7283318b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2201,15 +2201,20 @@ __tracing_open(struct inode *inode, struct file *file) if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { - iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->tr->buffer, cpu); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare_sync(); + ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } -- cgit v1.2.3-55-g7522 From a838b2e634405fb89ddbf4fa9412acb33911911f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Apr 2010 13:26:58 -0400 Subject: ring-buffer: Make benchmark handle missed events With the addition of the "missed events" flag that is stored in the commit field of the ring buffer page, the ring_buffer_benchmark was not updated to handle this. If events are missed, then the missed events flag is set in the ring buffer page, and the benchmark will count that flag as part of the size of the page and will hit the BUG() when it tries to read beyond the page. The solution is simply to have the ring buffer benchmark mask off the extra bits. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index dc56556b55a2..302f8a614635 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -113,7 +113,8 @@ static enum event_status read_page(int cpu) ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); if (ret >= 0) { rpage = bpage; - commit = local_read(&rpage->commit); + /* The commit may have missed event flags set, clear them */ + commit = local_read(&rpage->commit) & 0xfffff; for (i = 0; i < commit && !kill_test; i += inc) { if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { -- cgit v1.2.3-55-g7522 From e330b3bcd83199dd63a819d8d12e40f9edae6c77 Mon Sep 17 00:00:00 2001 From: Chase Douglas Date: Mon, 26 Apr 2010 14:02:05 -0400 Subject: tracing: Show sample std dev in function profiling When combined with function graph tracing, the ftrace function profiler also prints the average run time of functions. While this gives us some good information, it doesn't tell us anything about the variance of the run times of the function. This change prints out the s^2 sample standard deviation alongside the average.
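For reference, the patch that follows derives that value from two running sums, sum(t) (rec->time) and sum(t^2) (rec->time_squared), via the identity s^2 = (sum(t^2) - n*avg^2) / (n - 1). A minimal userspace sketch with made-up timings (not kernel code; the kernel version additionally divides by 1000 for the ns^2 -> us^2 conversion):

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical per-call runtimes in ns, stand-ins for calltime. */
		unsigned long long t[] = { 1200, 1350, 1100, 1900, 1250 };
		unsigned long long n = sizeof(t) / sizeof(t[0]);
		unsigned long long sum = 0, sum_sq = 0;

		for (unsigned long long i = 0; i < n; i++) {
			sum += t[i];            /* accumulates like rec->time */
			sum_sq += t[i] * t[i];  /* like rec->time_squared */
		}

		unsigned long long avg = sum / n;  /* truncating, as do_div() is */
		/*
		 * Sample variance from the running sums.  Slightly approximate
		 * because of the truncated integer average, which is equally
		 * true of the kernel code.
		 */
		unsigned long long s2 = (sum_sq - n * avg * avg) / (n - 1);

		printf("avg=%llu ns, s^2=%llu ns^2\n", avg, s2);
		return 0;
	}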
This change adds one entry to the profile record structure. This increases the memory footprint of the function profiler by 1/3 on a 32-bit system, and by 1/5 on a 64-bit system when function graphing is enabled, though the memory is only allocated when the profiler is turned on. During profiling, one extra line of code adds the squared calltime to the new record entry, so this should not adversely affect performance. Note that the square of the sample standard deviation is printed because there is no sqrt implementation for unsigned long long in the kernel. Signed-off-by: Chase Douglas LKML-Reference: <1272304925-2436-1-git-send-email-chase.douglas@canonical.com> [ fixed comment about ns^2 -> us^2 conversion ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2404b59b3097..3bcb340d6f02 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -264,6 +264,7 @@ struct ftrace_profile { unsigned long counter; #ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned long long time; + unsigned long long time_squared; #endif }; @@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " Function " - "Hit Time Avg\n" + "Hit Time Avg s^2\n" " -------- " - "--- ---- ---\n"); + "--- ---- --- ---\n"); #else seq_printf(m, " Function Hit\n" " -------- ---\n"); #endif @@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v) static DEFINE_MUTEX(mutex); static struct trace_seq s; unsigned long long avg; + unsigned long long stddev; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v) avg = rec->time; do_div(avg, rec->counter); + /* Sample standard deviation (s^2) */ + if (rec->counter <= 1) + stddev = 0; + else { + stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Divide by only 1000 for the ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + do_div(stddev, (rec->counter - 1) * 1000); + } + mutex_lock(&mutex); trace_seq_init(&s); trace_print_graph_duration(rec->time, &s); trace_seq_puts(&s, " "); trace_print_graph_duration(avg, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(stddev, &s); trace_print_seq(m, &s); mutex_unlock(&mutex); #endif @@ -668,8 +684,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) } rec = ftrace_find_profiled_func(stat, trace->func); - if (rec) + if (rec) { rec->time += calltime; + rec->time_squared += calltime * calltime; + } out: local_irq_restore(flags); -- cgit v1.2.3-55-g7522 From 37e44bc50d91df1fe7edcf6f02fe168c6d802e64 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Apr 2010 21:04:24 -0400 Subject: tracing: Fix sleep time function profiling When sleep_time is off, the function profiler ignores the time that a task is scheduled out. When the task is scheduled out, a timestamp is taken. When the task is scheduled back in, the timestamp is compared to the current time and the saved calltimes are adjusted accordingly. But when stopping the function profiler, the sched switch hook that does this adjustment was stopped before shutting down the tracer. This allowed some tasks to not get their timestamps set when they scheduled out. When the function profiler started again, this would skew the times of the scheduler functions.
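In other words, the teardown order is what matters here. A sketch of the corrected ordering, mirroring the hunk below with explanatory comments added (treat it as an outline rather than the verbatim function):

	/* Outline of unregister_ftrace_graph() after the fix below. */
	void unregister_my_graph(void)
	{
		/* 1) Stop producing graph events first. */
		ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
		ftrace_graph_entry = ftrace_graph_entry_stub;
		ftrace_shutdown(FTRACE_STOP_FUNC_RET);
		unregister_pm_notifier(&ftrace_suspend_notifier);

		/*
		 * 2) Only then unhook the sched-switch probe, so every task
		 * that scheduled out while profiling was live still gets its
		 * sleep time stamped and accounted on the way back in.
		 */
		unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
	}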
This patch moves the stopping of the sched switch to after the function profiler is stopped. It also ignores zeroed calltimes, which may happen at startup. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3bcb340d6f02..8c9c2934c45f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -666,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash || !ftrace_profile_enabled) goto out; + /* If the calltime was zeroed, ignore it */ + if (!trace->calltime) + goto out; + calltime = trace->rettime - trace->calltime; if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { @@ -3357,11 +3361,11 @@ void unregister_ftrace_graph(void) goto out; ftrace_graph_active--; - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); out: mutex_unlock(&ftrace_lock); -- cgit v1.2.3-55-g7522 From 47dd5be2d6a82b8153e059a1d09eb3879d485bfd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 30 Apr 2010 07:23:51 +0200 Subject: workqueue: flush_delayed_work: keep the original workqueue for re-queueing flush_delayed_work() always uses keventd_wq for re-queueing, but it should use the workqueue this dwork was queued on. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee48658805c..5bfb213984b2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); + cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); __queue_work(cwq, &dwork->work); put_cpu(); } -- cgit v1.2.3-55-g7522 From eef6a7d5c2f38adadab8240fabf43730fe796482 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 12 Feb 2010 17:39:21 +0900 Subject: workqueue: warn about flush_scheduled_work() This patch (as1319) adds kerneldoc and a pointed warning to flush_scheduled_work(). Signed-off-by: Alan Stern Signed-off-by: Tejun Heo --- kernel/workqueue.c | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5bfb213984b2..0225fea89340 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -845,6 +845,30 @@ int schedule_on_each_cpu(work_func_t func) return 0; } +/** + * flush_scheduled_work - ensure that any scheduled work has run to completion. + * + * Forces execution of the kernel-global workqueue and blocks until its + * completion. + * + * Think twice before calling this function! It's very easy to get into + * trouble if you don't take great care. Either of the following situations + * will lead to deadlock: + * + * One of the work items currently on the workqueue needs to acquire + * a lock held by your code or its caller. + * + * Your code is running in the context of a work routine. + * + * They will be detected by lockdep when they occur, but the first might not + * occur very often.
It depends on what work items are on the workqueue and + * what locks they need, which you have no control over. + * + * In most situations flushing the entire workqueue is overkill; you merely + * need to know that a particular work item isn't queued and isn't running. + * In such cases you should use cancel_delayed_work_sync() or + * cancel_work_sync() instead. + */ void flush_scheduled_work(void) { flush_workqueue(keventd_wq); -- cgit v1.2.3-55-g7522 From 4d707b9f48e2c4aa94b96f1133813b73df71fb55 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Apr 2010 17:40:40 +0200 Subject: workqueue: change cancel_work_sync() to clear work->data In short: change cancel_work_sync(work) to mark this work as "never queued" upon return. When cancel_work_sync(work) succeeds, we know that this work can't be queued or running, and since we own WORK_STRUCT_PENDING, nobody can change the bits in work->data under us. This means we can also clear the "cwq" part along with the _PENDING bit locklessly before returning; unless the work is queued, nobody can assume get_wq_data() is stable even under cwq->lock. This change can speed up the subsequent cancel/flush requests, and as Dmitry pointed out this simplifies the usage of work_structs which can be queued on different workqueues. Consider this pseudo code from the input subsystem: struct workqueue_struct *WQ; struct work_struct *WORK; for (;;) { WQ = create_workqueue(); ... if (condition()) queue_work(WQ, WORK); ... cancel_work_sync(WORK); destroy_workqueue(WQ); } If condition() returns T and then F, cancel_work_sync() will crash the kernel because WORK->data still points to the already destroyed workqueue. With this patch, code like the above becomes correct. Suggested-by: Dmitry Torokhov Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0225fea89340..77dabbf64b8f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work, atomic_long_set(&work->data, new); } +/* + * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. + */ +static inline void clear_wq_data(struct work_struct *work) +{ + unsigned long flags = *work_data_bits(work) & + (1UL << WORK_STRUCT_STATIC); + atomic_long_set(&work->data, flags); +} + static inline struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) { @@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - work_clear_pending(work); + clear_wq_data(work); return ret; } -- cgit v1.2.3-55-g7522 From 8b08ca52f5942c21564bbb90ccfb61053f2c26a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Apr 2010 13:02:07 -0700 Subject: rcu: Fix RCU lockdep splat in set_task_cpu on fork path Add an RCU read-side critical section to suppress this false positive. Located-by: Eric Paris Signed-off-by: Peter Zijlstra Signed-off-by: Paul E.
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: eric.dumazet@gmail.com LKML-Reference: <1271880131-3951-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index de0bd26e520a..3c2a54f70ffe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -323,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { + /* + * Strictly speaking this rcu_read_lock() is not needed since the + * task_group is tied to the cgroup, which in turn can never go away + * as long as there are tasks attached to it. + * + * However, since task_group() uses task_subsys_state(), which is an + * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. + */ + rcu_read_lock(); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; @@ -332,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; #endif + rcu_read_unlock(); } #else -- cgit v1.2.3-55-g7522 From 8b46f880841aac821af8efa6581bb0e46b8b9845 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 21 Apr 2010 13:02:08 -0700 Subject: rcu: Fix RCU lockdep splat on freezer_fork path Add an RCU read-side critical section to suppress this false positive. Located-by: Eric Paris Signed-off-by: Paul E. McKenney Acked-by: Li Zefan Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: eric.dumazet@gmail.com LKML-Reference: <1271880131-3951-2-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/cgroup_freezer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index da5e13975531..e5c0244962b0 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -205,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) * No lock is needed, since the task isn't on tasklist yet, * so it can't be moved to another cgroup, which means the * freezer won't be removed and will be valid during this - * function call. + * function call. Nevertheless, apply an RCU read-side critical + * section to suppress RCU lockdep false positives. */ + rcu_read_lock(); freezer = task_freezer(task); + rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the -- cgit v1.2.3-55-g7522 From 8795d7717c467bea7b0a0649d44a258e09f34db2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 15 Apr 2010 23:10:42 +0200 Subject: lockdep: Fix redundant_hardirqs_on incremented with irqs enabled When a path restores the flags while irqs are already enabled, we update the per cpu var redundant_hardirqs_on in a racy fashion and debug_atomic_inc() warns about this situation.
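The resolution below simply accepts the race for this one counter: a per-cpu debugging statistic may be bumped without irq or preemption protection when an occasional lost hit is harmless. A minimal sketch of that idiom, with a hypothetical my_stats structure standing in for lockdep_stats:

	#include <linux/percpu.h>

	/* Hypothetical per-cpu statistics, in the mold of lockdep_stats. */
	struct my_stats {
		unsigned long redundant_hardirqs_on;
	};
	DEFINE_PER_CPU(struct my_stats, my_stats);

	static void count_redundant_enable(void)
	{
		/*
		 * May run with irqs and preemption enabled; a migration in
		 * the middle can lose one hit.  Acceptable here because the
		 * counter is purely informational.
		 */
		this_cpu_inc(my_stats.redundant_hardirqs_on);
	}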
In this particular case, losing a few hits in a stat is not a big deal, so increment it without protection. v2: Don't bother with disabling irqs; we can miss one count in rare situations Reported-by: Stephen Rothwell Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: David Miller Cc: Benjamin Herrenschmidt --- kernel/lockdep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 78325f8f1139..1b58a1bbcc87 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2298,7 +2298,12 @@ void trace_hardirqs_on_caller(unsigned long ip) return; if (unlikely(curr->hardirqs_enabled)) { - debug_atomic_inc(redundant_hardirqs_on); + /* + * Neither irq nor preemption is disabled here, + * so this is racy by nature, but losing one hit + * in a stat is not a big deal. + */ + this_cpu_inc(lockdep_stats.redundant_hardirqs_on); return; } /* we'll do an OFF -> ON transition: */ -- cgit v1.2.3-55-g7522 From 913769f24eadcd38a936ffae41d9b4895ec02e43 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 15 Apr 2010 23:10:43 +0200 Subject: lockdep: Simplify debug atomic ops Simplify debug_atomic_inc/dec by using this_cpu_inc/dec() instead of doing it through an indirect get_cpu_var() and a manual increment. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra --- kernel/lockdep_internals.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 8d7d4b6c741a..2b174762fa0e 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -140,19 +140,13 @@ struct lockdep_stats { DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); #define debug_atomic_inc(ptr) { \ - struct lockdep_stats *__cpu_lockdep_stats; \ - \ WARN_ON_ONCE(!irqs_disabled()); \ - __cpu_lockdep_stats = &__get_cpu_var(lockdep_stats); \ - __cpu_lockdep_stats->ptr++; \ + this_cpu_inc(lockdep_stats.ptr); \ } #define debug_atomic_dec(ptr) { \ - struct lockdep_stats *__cpu_lockdep_stats; \ - \ WARN_ON_ONCE(!irqs_disabled()); \ - __cpu_lockdep_stats = &__get_cpu_var(lockdep_stats); \ - __cpu_lockdep_stats->ptr--; \ + this_cpu_dec(lockdep_stats.ptr); \ } #define debug_atomic_read(ptr) ({ \ -- cgit v1.2.3-55-g7522 From 5a2e3995951176e1aaa63d17ae2e1d26ac99003d Mon Sep 17 00:00:00 2001 From: Kei Tokunaga Date: Thu, 1 Apr 2010 20:40:58 +0900 Subject: [SCSI] ftrace: add __print_hex() __print_hex() prints the values in an array in hex (w/o '0x'), space separated. EX) 92 33 32 f3 ee 4d Signed-off-by: Li Zefan Signed-off-by: Tomohiro Kusumi Signed-off-by: Kei Tokunaga Acked-by: Steven Rostedt Signed-off-by: James Bottomley --- include/linux/ftrace_event.h | 3 +++ include/trace/ftrace.h | 3 +++ kernel/trace/trace_output.c | 15 +++++++++++++++ 3 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c0f4b364c711..c3c5aaaae53a 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -25,6 +25,9 @@ const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); +const char *ftrace_print_hex_seq(struct trace_seq *p, + const unsigned char *buf, int len); + /* * The trace entry - the most basic unit of tracing. This is what
This is what * is printed in the end as a single line in the trace output, such as: diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index ea6f9d4a20e9..c48320b3dabd 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -198,6 +198,9 @@ ftrace_print_symbols_seq(p, value, symbols); \ }) +#undef __print_hex +#define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8e46b3323cdc..7c4a0ca650b5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -355,6 +355,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, } EXPORT_SYMBOL(ftrace_print_symbols_seq); +const char * +ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + p->len; + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_hex_seq); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3-55-g7522 From bf81623542332bc2cedf3db49cbb2edb724780d2 Mon Sep 17 00:00:00 2001 From: Kei Tokunaga Date: Thu, 1 Apr 2010 20:41:40 +0900 Subject: [SCSI] add scsi trace core functions and put trace points Signed-off-by: Xiao Guangrong Signed-off-by: Tomohiro Kusumi Signed-off-by: Kei Tokunaga Signed-off-by: James Bottomley --- drivers/scsi/Makefile | 1 + drivers/scsi/scsi.c | 6 + drivers/scsi/scsi_error.c | 4 + drivers/scsi/scsi_trace.c | 185 +++++++++++++++++++++++++ include/trace/events/scsi.h | 328 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 1 + 6 files changed, 525 insertions(+) create mode 100644 drivers/scsi/scsi_trace.c create mode 100644 include/trace/events/scsi.h (limited to 'kernel') diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index 92a8c500b23d..1c7ac49be649 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -162,6 +162,7 @@ scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o scsi_mod-$(CONFIG_SCSI_NETLINK) += scsi_netlink.o scsi_mod-$(CONFIG_SYSCTL) += scsi_sysctl.o scsi_mod-$(CONFIG_SCSI_PROC_FS) += scsi_proc.o +scsi_mod-y += scsi_trace.o scsi_tgt-y += scsi_tgt_lib.o scsi_tgt_if.o diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1c08f6164658..ad0ed212db4a 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -67,6 +67,9 @@ #include "scsi_priv.h" #include "scsi_logging.h" +#define CREATE_TRACE_POINTS +#include + static void scsi_done(struct scsi_cmnd *cmd); /* @@ -747,10 +750,12 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd) cmd->result = (DID_NO_CONNECT << 16); scsi_done(cmd); } else { + trace_scsi_dispatch_cmd_start(cmd); rtn = host->hostt->queuecommand(cmd, scsi_done); } spin_unlock_irqrestore(host->host_lock, flags); if (rtn) { + trace_scsi_dispatch_cmd_error(cmd, rtn); if (rtn != SCSI_MLQUEUE_DEVICE_BUSY && rtn != SCSI_MLQUEUE_TARGET_BUSY) rtn = SCSI_MLQUEUE_HOST_BUSY; @@ -781,6 +786,7 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd) */ static void scsi_done(struct scsi_cmnd *cmd) { + trace_scsi_dispatch_cmd_done(cmd); blk_complete_request(cmd->request); } diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 33175974b55a..f31d868f9362 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -39,6 +39,8 @@ 
#include "scsi_logging.h" #include "scsi_transport_api.h" +#include + #define SENSE_TIMEOUT (10*HZ) /* @@ -52,6 +54,7 @@ void scsi_eh_wakeup(struct Scsi_Host *shost) { if (shost->host_busy == shost->host_failed) { + trace_scsi_eh_wakeup(shost); wake_up_process(shost->ehandler); SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler thread\n")); @@ -127,6 +130,7 @@ enum blk_eh_timer_return scsi_times_out(struct request *req) struct scsi_cmnd *scmd = req->special; enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED; + trace_scsi_dispatch_cmd_timeout(scmd); scsi_log_completion(scmd, TIMEOUT_ERROR); if (scmd->device->host->transportt->eh_timed_out) diff --git a/drivers/scsi/scsi_trace.c b/drivers/scsi/scsi_trace.c new file mode 100644 index 000000000000..9a3342a2fa36 --- /dev/null +++ b/drivers/scsi/scsi_trace.c @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2010 FUJITSU LIMITED + * Copyright (C) 2010 Tomohiro Kusumi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include + +#define SERVICE_ACTION(cdb) ((cdb[8] << 8) | cdb[9]) + +static const char * +scsi_trace_misc(struct trace_seq *, unsigned char *, int); + +static const char * +scsi_trace_rw6(struct trace_seq *p, unsigned char *cdb, int len) +{ + const char *ret = p->buffer + p->len; + sector_t lba = 0, txlen = 0; + + lba |= ((cdb[1] & 0x1F) << 16); + lba |= (cdb[2] << 8); + lba |= cdb[3]; + txlen = cdb[4]; + + trace_seq_printf(p, "lba=%llu txlen=%llu", + (unsigned long long)lba, (unsigned long long)txlen); + trace_seq_putc(p, 0); + + return ret; +} + +static const char * +scsi_trace_rw10(struct trace_seq *p, unsigned char *cdb, int len) +{ + const char *ret = p->buffer + p->len; + sector_t lba = 0, txlen = 0; + + lba |= (cdb[2] << 24); + lba |= (cdb[3] << 16); + lba |= (cdb[4] << 8); + lba |= cdb[5]; + txlen |= (cdb[7] << 8); + txlen |= cdb[8]; + + trace_seq_printf(p, "lba=%llu txlen=%llu", + (unsigned long long)lba, (unsigned long long)txlen); + trace_seq_putc(p, 0); + + return ret; +} + +static const char * +scsi_trace_rw12(struct trace_seq *p, unsigned char *cdb, int len) +{ + const char *ret = p->buffer + p->len; + sector_t lba = 0, txlen = 0; + + lba |= (cdb[2] << 24); + lba |= (cdb[3] << 16); + lba |= (cdb[4] << 8); + lba |= cdb[5]; + txlen |= (cdb[6] << 24); + txlen |= (cdb[7] << 16); + txlen |= (cdb[8] << 8); + txlen |= cdb[9]; + + trace_seq_printf(p, "lba=%llu txlen=%llu", + (unsigned long long)lba, (unsigned long long)txlen); + trace_seq_putc(p, 0); + + return ret; +} + +static const char * +scsi_trace_rw16(struct trace_seq *p, unsigned char *cdb, int len) +{ + const char *ret = p->buffer + p->len; + sector_t lba = 0, txlen = 0; + + lba |= ((u64)cdb[2] << 56); + lba |= ((u64)cdb[3] << 48); + lba |= ((u64)cdb[4] << 40); + lba |= ((u64)cdb[5] << 32); + lba |= (cdb[6] << 24); + lba |= (cdb[7] << 16); + lba |= (cdb[8] << 8); + lba |= cdb[9]; + txlen |= (cdb[10] << 24); + txlen |= (cdb[11] << 16); + txlen |= 
(cdb[12] << 8); + txlen |= cdb[13]; + + trace_seq_printf(p, "lba=%llu txlen=%llu", + (unsigned long long)lba, (unsigned long long)txlen); + trace_seq_putc(p, 0); + + return ret; +} + +static const char * +scsi_trace_rw32(struct trace_seq *p, unsigned char *cdb, int len) +{ + const char *ret = p->buffer + p->len; + sector_t lba = 0, txlen = 0; + + lba |= ((u64)cdb[12] << 56); + lba |= ((u64)cdb[13] << 48); + lba |= ((u64)cdb[14] << 40); + lba |= ((u64)cdb[15] << 32); + lba |= (cdb[16] << 24); + lba |= (cdb[17] << 16); + lba |= (cdb[18] << 8); + lba |= cdb[19]; + txlen |= (cdb[28] << 24); + txlen |= (cdb[29] << 16); + txlen |= (cdb[30] << 8); + txlen |= cdb[31]; + + trace_seq_printf(p, "%s_32 lba=%llu txlen=%llu", + (SERVICE_ACTION(cdb) == READ_32 ? "READ" : "WRITE"), + (unsigned long long)lba, (unsigned long long)txlen); + + trace_seq_putc(p, 0); + + return ret; +} + +static const char * +scsi_trace_varlen(struct trace_seq *p, unsigned char *cdb, int len) +{ + switch (SERVICE_ACTION(cdb)) { + case READ_32: + case WRITE_32: + return scsi_trace_rw32(p, cdb, len); + default: + return scsi_trace_misc(p, cdb, len); + } +} + +static const char * +scsi_trace_misc(struct trace_seq *p, unsigned char *cdb, int len) +{ + const char *ret = p->buffer + p->len; + + trace_seq_printf(p, "-"); + trace_seq_putc(p, 0); + + return ret; +} + +const char * +scsi_trace_parse_cdb(struct trace_seq *p, unsigned char *cdb, int len) +{ + switch (cdb[0]) { + case READ_6: + case WRITE_6: + return scsi_trace_rw6(p, cdb, len); + case READ_10: + case WRITE_10: + return scsi_trace_rw10(p, cdb, len); + case READ_12: + case WRITE_12: + return scsi_trace_rw12(p, cdb, len); + case READ_16: + case WRITE_16: + return scsi_trace_rw16(p, cdb, len); + case VARIABLE_LENGTH_CMD: + return scsi_trace_varlen(p, cdb, len); + default: + return scsi_trace_misc(p, cdb, len); + } +} diff --git a/include/trace/events/scsi.h b/include/trace/events/scsi.h new file mode 100644 index 000000000000..76cbf828eb86 --- /dev/null +++ b/include/trace/events/scsi.h @@ -0,0 +1,328 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM scsi + +#if !defined(_TRACE_SCSI_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SCSI_H + +#include +#include +#include +#include + +#define scsi_opcode_name(opcode) { opcode, #opcode } +#define show_opcode_name(val) \ + __print_symbolic(val, \ + scsi_opcode_name(TEST_UNIT_READY), \ + scsi_opcode_name(REZERO_UNIT), \ + scsi_opcode_name(REQUEST_SENSE), \ + scsi_opcode_name(FORMAT_UNIT), \ + scsi_opcode_name(READ_BLOCK_LIMITS), \ + scsi_opcode_name(REASSIGN_BLOCKS), \ + scsi_opcode_name(INITIALIZE_ELEMENT_STATUS), \ + scsi_opcode_name(READ_6), \ + scsi_opcode_name(WRITE_6), \ + scsi_opcode_name(SEEK_6), \ + scsi_opcode_name(READ_REVERSE), \ + scsi_opcode_name(WRITE_FILEMARKS), \ + scsi_opcode_name(SPACE), \ + scsi_opcode_name(INQUIRY), \ + scsi_opcode_name(RECOVER_BUFFERED_DATA), \ + scsi_opcode_name(MODE_SELECT), \ + scsi_opcode_name(RESERVE), \ + scsi_opcode_name(RELEASE), \ + scsi_opcode_name(COPY), \ + scsi_opcode_name(ERASE), \ + scsi_opcode_name(MODE_SENSE), \ + scsi_opcode_name(START_STOP), \ + scsi_opcode_name(RECEIVE_DIAGNOSTIC), \ + scsi_opcode_name(SEND_DIAGNOSTIC), \ + scsi_opcode_name(ALLOW_MEDIUM_REMOVAL), \ + scsi_opcode_name(SET_WINDOW), \ + scsi_opcode_name(READ_CAPACITY), \ + scsi_opcode_name(READ_10), \ + scsi_opcode_name(WRITE_10), \ + scsi_opcode_name(SEEK_10), \ + scsi_opcode_name(POSITION_TO_ELEMENT), \ + scsi_opcode_name(WRITE_VERIFY), \ + scsi_opcode_name(VERIFY), \ + scsi_opcode_name(SEARCH_HIGH), \ + 
scsi_opcode_name(SEARCH_EQUAL), \ + scsi_opcode_name(SEARCH_LOW), \ + scsi_opcode_name(SET_LIMITS), \ + scsi_opcode_name(PRE_FETCH), \ + scsi_opcode_name(READ_POSITION), \ + scsi_opcode_name(SYNCHRONIZE_CACHE), \ + scsi_opcode_name(LOCK_UNLOCK_CACHE), \ + scsi_opcode_name(READ_DEFECT_DATA), \ + scsi_opcode_name(MEDIUM_SCAN), \ + scsi_opcode_name(COMPARE), \ + scsi_opcode_name(COPY_VERIFY), \ + scsi_opcode_name(WRITE_BUFFER), \ + scsi_opcode_name(READ_BUFFER), \ + scsi_opcode_name(UPDATE_BLOCK), \ + scsi_opcode_name(READ_LONG), \ + scsi_opcode_name(WRITE_LONG), \ + scsi_opcode_name(CHANGE_DEFINITION), \ + scsi_opcode_name(WRITE_SAME), \ + scsi_opcode_name(UNMAP), \ + scsi_opcode_name(READ_TOC), \ + scsi_opcode_name(LOG_SELECT), \ + scsi_opcode_name(LOG_SENSE), \ + scsi_opcode_name(XDWRITEREAD_10), \ + scsi_opcode_name(MODE_SELECT_10), \ + scsi_opcode_name(RESERVE_10), \ + scsi_opcode_name(RELEASE_10), \ + scsi_opcode_name(MODE_SENSE_10), \ + scsi_opcode_name(PERSISTENT_RESERVE_IN), \ + scsi_opcode_name(PERSISTENT_RESERVE_OUT), \ + scsi_opcode_name(VARIABLE_LENGTH_CMD), \ + scsi_opcode_name(REPORT_LUNS), \ + scsi_opcode_name(MAINTENANCE_IN), \ + scsi_opcode_name(MAINTENANCE_OUT), \ + scsi_opcode_name(MOVE_MEDIUM), \ + scsi_opcode_name(EXCHANGE_MEDIUM), \ + scsi_opcode_name(READ_12), \ + scsi_opcode_name(WRITE_12), \ + scsi_opcode_name(WRITE_VERIFY_12), \ + scsi_opcode_name(SEARCH_HIGH_12), \ + scsi_opcode_name(SEARCH_EQUAL_12), \ + scsi_opcode_name(SEARCH_LOW_12), \ + scsi_opcode_name(READ_ELEMENT_STATUS), \ + scsi_opcode_name(SEND_VOLUME_TAG), \ + scsi_opcode_name(WRITE_LONG_2), \ + scsi_opcode_name(READ_16), \ + scsi_opcode_name(WRITE_16), \ + scsi_opcode_name(VERIFY_16), \ + scsi_opcode_name(WRITE_SAME_16), \ + scsi_opcode_name(SERVICE_ACTION_IN), \ + scsi_opcode_name(SAI_READ_CAPACITY_16), \ + scsi_opcode_name(SAI_GET_LBA_STATUS), \ + scsi_opcode_name(MI_REPORT_TARGET_PGS), \ + scsi_opcode_name(MO_SET_TARGET_PGS), \ + scsi_opcode_name(READ_32), \ + scsi_opcode_name(WRITE_32), \ + scsi_opcode_name(WRITE_SAME_32), \ + scsi_opcode_name(ATA_16), \ + scsi_opcode_name(ATA_12)) + +#define scsi_hostbyte_name(result) { result, #result } +#define show_hostbyte_name(val) \ + __print_symbolic(val, \ + scsi_hostbyte_name(DID_OK), \ + scsi_hostbyte_name(DID_NO_CONNECT), \ + scsi_hostbyte_name(DID_BUS_BUSY), \ + scsi_hostbyte_name(DID_TIME_OUT), \ + scsi_hostbyte_name(DID_BAD_TARGET), \ + scsi_hostbyte_name(DID_ABORT), \ + scsi_hostbyte_name(DID_PARITY), \ + scsi_hostbyte_name(DID_ERROR), \ + scsi_hostbyte_name(DID_RESET), \ + scsi_hostbyte_name(DID_BAD_INTR), \ + scsi_hostbyte_name(DID_PASSTHROUGH), \ + scsi_hostbyte_name(DID_SOFT_ERROR), \ + scsi_hostbyte_name(DID_IMM_RETRY), \ + scsi_hostbyte_name(DID_REQUEUE), \ + scsi_hostbyte_name(DID_TRANSPORT_DISRUPTED), \ + scsi_hostbyte_name(DID_TRANSPORT_FAILFAST)) + +#define scsi_driverbyte_name(result) { result, #result } +#define show_driverbyte_name(val) \ + __print_symbolic(val, \ + scsi_driverbyte_name(DRIVER_OK), \ + scsi_driverbyte_name(DRIVER_BUSY), \ + scsi_driverbyte_name(DRIVER_SOFT), \ + scsi_driverbyte_name(DRIVER_MEDIA), \ + scsi_driverbyte_name(DRIVER_ERROR), \ + scsi_driverbyte_name(DRIVER_INVALID), \ + scsi_driverbyte_name(DRIVER_TIMEOUT), \ + scsi_driverbyte_name(DRIVER_HARD), \ + scsi_driverbyte_name(DRIVER_SENSE)) + +#define scsi_msgbyte_name(result) { result, #result } +#define show_msgbyte_name(val) \ + __print_symbolic(val, \ + scsi_msgbyte_name(COMMAND_COMPLETE), \ + scsi_msgbyte_name(EXTENDED_MESSAGE), \ + 
scsi_msgbyte_name(SAVE_POINTERS), \ + scsi_msgbyte_name(RESTORE_POINTERS), \ + scsi_msgbyte_name(DISCONNECT), \ + scsi_msgbyte_name(INITIATOR_ERROR), \ + scsi_msgbyte_name(ABORT_TASK_SET), \ + scsi_msgbyte_name(MESSAGE_REJECT), \ + scsi_msgbyte_name(NOP), \ + scsi_msgbyte_name(MSG_PARITY_ERROR), \ + scsi_msgbyte_name(LINKED_CMD_COMPLETE), \ + scsi_msgbyte_name(LINKED_FLG_CMD_COMPLETE), \ + scsi_msgbyte_name(TARGET_RESET), \ + scsi_msgbyte_name(ABORT_TASK), \ + scsi_msgbyte_name(CLEAR_TASK_SET), \ + scsi_msgbyte_name(INITIATE_RECOVERY), \ + scsi_msgbyte_name(RELEASE_RECOVERY), \ + scsi_msgbyte_name(CLEAR_ACA), \ + scsi_msgbyte_name(LOGICAL_UNIT_RESET), \ + scsi_msgbyte_name(SIMPLE_QUEUE_TAG), \ + scsi_msgbyte_name(HEAD_OF_QUEUE_TAG), \ + scsi_msgbyte_name(ORDERED_QUEUE_TAG), \ + scsi_msgbyte_name(IGNORE_WIDE_RESIDUE), \ + scsi_msgbyte_name(ACA), \ + scsi_msgbyte_name(QAS_REQUEST), \ + scsi_msgbyte_name(BUS_DEVICE_RESET), \ + scsi_msgbyte_name(ABORT)) + +#define scsi_statusbyte_name(result) { result, #result } +#define show_statusbyte_name(val) \ + __print_symbolic(val, \ + scsi_statusbyte_name(SAM_STAT_GOOD), \ + scsi_statusbyte_name(SAM_STAT_CHECK_CONDITION), \ + scsi_statusbyte_name(SAM_STAT_CONDITION_MET), \ + scsi_statusbyte_name(SAM_STAT_BUSY), \ + scsi_statusbyte_name(SAM_STAT_INTERMEDIATE), \ + scsi_statusbyte_name(SAM_STAT_INTERMEDIATE_CONDITION_MET), \ + scsi_statusbyte_name(SAM_STAT_RESERVATION_CONFLICT), \ + scsi_statusbyte_name(SAM_STAT_COMMAND_TERMINATED), \ + scsi_statusbyte_name(SAM_STAT_TASK_SET_FULL), \ + scsi_statusbyte_name(SAM_STAT_ACA_ACTIVE), \ + scsi_statusbyte_name(SAM_STAT_TASK_ABORTED)) + +const char *scsi_trace_parse_cdb(struct trace_seq*, unsigned char*, int); +#define __parse_cdb(cdb, len) scsi_trace_parse_cdb(p, cdb, len) + +TRACE_EVENT(scsi_dispatch_cmd_start, + + TP_PROTO(struct scsi_cmnd *cmd), + + TP_ARGS(cmd), + + TP_STRUCT__entry( + __field( unsigned int, host_no ) + __field( unsigned int, channel ) + __field( unsigned int, id ) + __field( unsigned int, lun ) + __field( unsigned int, opcode ) + __field( unsigned int, cmd_len ) + __dynamic_array(unsigned char, cmnd, cmd->cmd_len) + ), + + TP_fast_assign( + __entry->host_no = cmd->device->host->host_no; + __entry->channel = cmd->device->channel; + __entry->id = cmd->device->id; + __entry->lun = cmd->device->lun; + __entry->opcode = cmd->cmnd[0]; + __entry->cmd_len = cmd->cmd_len; + memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); + ), + + TP_printk("host_no=%u channel=%u id=%u lun=%u cmnd=(%s %s raw=%s)", + __entry->host_no, __entry->channel, __entry->id, + __entry->lun, show_opcode_name(__entry->opcode), + __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), + __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len)) +); + +TRACE_EVENT(scsi_dispatch_cmd_error, + + TP_PROTO(struct scsi_cmnd *cmd, int rtn), + + TP_ARGS(cmd, rtn), + + TP_STRUCT__entry( + __field( unsigned int, host_no ) + __field( unsigned int, channel ) + __field( unsigned int, id ) + __field( unsigned int, lun ) + __field( int, rtn ) + __field( unsigned int, opcode ) + __field( unsigned int, cmd_len ) + __dynamic_array(unsigned char, cmnd, cmd->cmd_len) + ), + + TP_fast_assign( + __entry->host_no = cmd->device->host->host_no; + __entry->channel = cmd->device->channel; + __entry->id = cmd->device->id; + __entry->lun = cmd->device->lun; + __entry->rtn = rtn; + __entry->opcode = cmd->cmnd[0]; + __entry->cmd_len = cmd->cmd_len; + memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); + ), + + TP_printk("host_no=%u 
channel=%u id=%u lun=%u cmnd=(%s %s raw=%s)" + " rtn=%d", + __entry->host_no, __entry->channel, __entry->id, + __entry->lun, show_opcode_name(__entry->opcode), + __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), + __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), + __entry->rtn) +); + +DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template, + + TP_PROTO(struct scsi_cmnd *cmd), + + TP_ARGS(cmd), + + TP_STRUCT__entry( + __field( unsigned int, host_no ) + __field( unsigned int, channel ) + __field( unsigned int, id ) + __field( unsigned int, lun ) + __field( int, result ) + __field( unsigned int, opcode ) + __field( unsigned int, cmd_len ) + __dynamic_array(unsigned char, cmnd, cmd->cmd_len) + ), + + TP_fast_assign( + __entry->host_no = cmd->device->host->host_no; + __entry->channel = cmd->device->channel; + __entry->id = cmd->device->id; + __entry->lun = cmd->device->lun; + __entry->result = cmd->result; + __entry->opcode = cmd->cmnd[0]; + __entry->cmd_len = cmd->cmd_len; + memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); + ), + + TP_printk("host_no=%u channel=%u id=%u lun=%u cmnd=(%s %s raw=%s) " + "result=(driver=%s host=%s message=%s status=%s)", + __entry->host_no, __entry->channel, __entry->id, + __entry->lun, show_opcode_name(__entry->opcode), + __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), + __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), + show_driverbyte_name(((__entry->result) >> 24) & 0xff), + show_hostbyte_name(((__entry->result) >> 16) & 0xff), + show_msgbyte_name(((__entry->result) >> 8) & 0xff), + show_statusbyte_name(__entry->result & 0xff)) +); + +DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_done, + TP_PROTO(struct scsi_cmnd *cmd), + TP_ARGS(cmd)); + +DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_timeout, + TP_PROTO(struct scsi_cmnd *cmd), + TP_ARGS(cmd)); + +TRACE_EVENT(scsi_eh_wakeup, + + TP_PROTO(struct Scsi_Host *shost), + + TP_ARGS(shost), + + TP_STRUCT__entry( + __field( unsigned int, host_no ) + ), + + TP_fast_assign( + __entry->host_no = shost->host_no; + ), + + TP_printk("host_no=%u", __entry->host_no) +); + +#endif /* _TRACE_SCSI_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7c4a0ca650b5..9cb5df5dc656 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -209,6 +209,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } +EXPORT_SYMBOL(trace_seq_putc); int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { -- cgit v1.2.3-55-g7522 From 87e9b2024659c614a876ce359a57e98a47b5ef37 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 17 Apr 2010 18:11:59 +0200 Subject: hw-breakpoints: Check disabled breakpoints again We stopped checking disabled breakpoints because we weren't allowing breakpoints on NULL addresses. And gdb tends to set NULL addresses on inactive breakpoints. But refusing NULL addresses was actually a regression that has been fixed now. There is no reason anymore to not validate inactive breakpoint settings. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. 
Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel Cc: Ingo Molnar --- kernel/hw_breakpoint.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 03808ed342a6..9ed9ae3a48b3 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -316,17 +316,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (ret) return ret; - /* - * Ptrace breakpoints can be temporary perf events only - * meant to reserve a slot. In this case, it is created disabled and - * we don't want to check the params right now (as we put a null addr) - * But perf tools create events as disabled and we want to check - * the params for them. - * This is a quick hack that will be removed soon, once we remove - * the tmp breakpoints from ptrace - */ - if (!bp->attr.disabled || !bp->overflow_handler) - ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); /* if arch_validate_hwbkpt_settings() fails then release bp slot */ if (ret) -- cgit v1.2.3-55-g7522 From b2812d031dea86926e9c10f7714af33ac2f6b43d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 18 Apr 2010 18:11:53 +0200 Subject: hw-breakpoints: Change/Enforce some breakpoints policies The current policies of breakpoints in x86 and SH are the following: - task bound breakpoints can only break on userspace addresses - cpu wide breakpoints can only break on kernel addresses The former rule prevents ptrace breakpoints from being set to trigger on kernel addresses, which is good. But as a side effect, we can't breakpoint on kernel addresses for task bound breakpoints. The latter rule simply makes no sense; there is no reason why we can't set breakpoints on userspace while performing cpu bound profiles. We want the following new policies: - task bound breakpoints can set userspace address breakpoints, with no particular privilege required. - task bound breakpoints can set kernelspace address breakpoints but must be privileged to do that. - cpu bound breakpoints can do what they want as they are privileged already. To implement these new policies, this patch checks if we are dealing with a kernel address breakpoint; if so, and if the exclude_kernel parameter is set, we tell the user that the breakpoint is invalid, which makes a good generic ptrace protection. If we don't have exclude_kernel, ensure the user has the right privileges, as kernel breakpoints are quite sensitive (risk of trap recursion attacks and global performance impacts). [ Paul Mundt: keep addr space check for sh signal delivery and fix double function declaration] Signed-off-by: Frederic Weisbecker Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K.
Prasad Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel Cc: Ingo Molnar Signed-off-by: Paul Mundt --- arch/sh/include/asm/hw_breakpoint.h | 5 ++--- arch/sh/kernel/hw_breakpoint.c | 34 ++++++------------------------ arch/x86/include/asm/hw_breakpoint.h | 5 ++--- arch/x86/kernel/hw_breakpoint.c | 41 ++++++------------------------------ kernel/hw_breakpoint.c | 26 +++++++++++++++++++++-- 5 files changed, 41 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 965dd780d51b..382bad937dcc 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -47,9 +47,8 @@ struct pmu; #define HBP_NUM 2 /* arch/sh/kernel/hw_breakpoint.c */ -extern int arch_check_va_in_userspace(unsigned long va, u16 hbp_len); -extern int arch_validate_hwbkpt_settings(struct perf_event *bp, - struct task_struct *tsk); +extern int arch_check_bp_in_kernelspace(struct perf_event *bp); +extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c index 675eea7785d9..1f2cf6229862 100644 --- a/arch/sh/kernel/hw_breakpoint.c +++ b/arch/sh/kernel/hw_breakpoint.c @@ -119,26 +119,17 @@ static int get_hbp_len(u16 hbp_len) return len_in_bytes; } -/* - * Check for virtual address in user space. - */ -int arch_check_va_in_userspace(unsigned long va, u16 hbp_len) -{ - unsigned int len; - - len = get_hbp_len(hbp_len); - - return (va <= TASK_SIZE - len); -} - /* * Check for virtual address in kernel space. */ -static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +int arch_check_bp_in_kernelspace(struct perf_event *bp) { unsigned int len; + unsigned long va; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); - len = get_hbp_len(hbp_len); + va = info->address; + len = get_hbp_len(info->len); return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } @@ -226,8 +217,7 @@ static int arch_build_bp_info(struct perf_event *bp) /* * Validate the arch-specific HW Breakpoint register settings */ -int arch_validate_hwbkpt_settings(struct perf_event *bp, - struct task_struct *tsk) +int arch_validate_hwbkpt_settings(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned int align; @@ -270,15 +260,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, if (info->address & align) return -EINVAL; - /* Check that the virtual address is in the proper range */ - if (tsk) { - if (!arch_check_va_in_userspace(info->address, info->len)) - return -EFAULT; - } else { - if (!arch_check_va_in_kernelspace(info->address, info->len)) - return -EFAULT; - } - return 0; } @@ -363,8 +344,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) perf_bp_event(bp, args->regs); /* Deliver the signal to userspace */ - if (arch_check_va_in_userspace(bp->attr.bp_addr, - bp->attr.bp_len)) { + if (!arch_check_bp_in_kernelspace(bp)) { siginfo_t info; info.si_signo = args->signr; diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 2a1bd8f4f23a..c77a5a6fab9d 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -44,9 +44,8 @@ struct arch_hw_breakpoint { struct perf_event; struct pmu; -extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); -extern int 
arch_validate_hwbkpt_settings(struct perf_event *bp, - struct task_struct *tsk); +extern int arch_check_bp_in_kernelspace(struct perf_event *bp); +extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d6cc065f519f..a8f1b803d2fd 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -188,26 +188,17 @@ static int get_hbp_len(u8 hbp_len) return len_in_bytes; } -/* - * Check for virtual address in user space. - */ -int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) -{ - unsigned int len; - - len = get_hbp_len(hbp_len); - - return (va <= TASK_SIZE - len); -} - /* * Check for virtual address in kernel space. */ -static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +int arch_check_bp_in_kernelspace(struct perf_event *bp) { unsigned int len; + unsigned long va; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); - len = get_hbp_len(hbp_len); + va = info->address; + len = get_hbp_len(info->len); return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } @@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp) /* * Validate the arch-specific HW Breakpoint register settings */ -int arch_validate_hwbkpt_settings(struct perf_event *bp, - struct task_struct *tsk) +int arch_validate_hwbkpt_settings(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned int align; @@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, ret = -EINVAL; - if (info->type == X86_BREAKPOINT_EXECUTE) - /* - * Ptrace-refactoring code - * For now, we'll allow instruction breakpoint only for user-space - * addresses - */ - if ((!arch_check_va_in_userspace(info->address, info->len)) && - info->len != X86_BREAKPOINT_EXECUTE) - return ret; - switch (info->len) { case X86_BREAKPOINT_LEN_1: align = 0; @@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, if (info->address & align) return -EINVAL; - /* Check that the virtual address is in the proper range */ - if (tsk) { - if (!arch_check_va_in_userspace(info->address, info->len)) - return -EFAULT; - } else { - if (!arch_check_va_in_kernelspace(info->address, info->len)) - return -EFAULT; - } - return 0; } diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 9ed9ae3a48b3..89e8a050c43a 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -308,6 +308,28 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. 
+ */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} + int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -316,7 +338,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (ret) return ret; - ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + ret = validate_hw_breakpoint(bp); /* if arch_validate_hwbkpt_settings() fails then release bp slot */ if (ret) @@ -363,7 +385,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr if (attr->disabled) goto end; - err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + err = validate_hw_breakpoint(bp); if (!err) perf_event_enable(bp); -- cgit v1.2.3-55-g7522 From 0102752e4c9e0655b39734550d4c35327954f7f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 11 Apr 2010 18:55:56 +0200 Subject: hw-breakpoints: Separate constraint space for data and instruction breakpoints There are two common fashions for archs to implement hardware breakpoints. The first is to separate the breakpoint address pattern definition space between data and instruction breakpoints. We then typically have distinct instruction address breakpoint registers and data address breakpoint registers, delivered with separate control registers for data and instruction breakpoints as well. This is the case for PowerPC and ARM, for example. The second consists in having a merged breakpoint address space definition between data and instruction breakpoints. Address registers can host either an instruction or a data address, and the access mode for the breakpoint is defined in a control register. This is the case for x86 and SuperH. This patch adds a new CONFIG_HAVE_MIXED_BREAKPOINTS_REGS config that archs can select if they belong to the second case. Those will have their slot allocation merged for instruction and data breakpoints. The others will have separate slot tracking between data and instruction breakpoints. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar --- arch/Kconfig | 11 ++++++ arch/sh/Kconfig | 1 + arch/x86/Kconfig | 1 + include/linux/hw_breakpoint.h | 9 +++-- kernel/hw_breakpoint.c | 86 +++++++++++++++++++++++++++++-------------- 5 files changed, 78 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index f06010fb4838..acda512da2e2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -137,6 +137,17 @@ config HAVE_HW_BREAKPOINT bool depends on PERF_EVENTS +config HAVE_MIXED_BREAKPOINTS_REGS + bool + depends on HAVE_HW_BREAKPOINT + help + Depending on the arch implementation of hardware breakpoints, + some of them have separate registers for data and instruction + breakpoint addresses, while others have mixed registers to store + them but define the access type in a control register. + Select this option if your arch implements breakpoints in the + latter fashion.
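The slot-accounting consequence can be seen in the enum the patch adds below: with mixed registers, TYPE_INST and TYPE_DATA alias to index 0 and TYPE_MAX collapses to 1, so both breakpoint classes draw from a single constraint table. A small compile-time sketch of that layout (userspace code, assuming the same enum shape as the patch; the macro stands in for the Kconfig symbol):

	#include <stdio.h>

	/* Stand-in for CONFIG_HAVE_MIXED_BREAKPOINTS_REGS: 1 for x86/SH-style
	 * mixed registers, 0 for PowerPC/ARM-style separate registers. */
	#define HAVE_MIXED_BREAKPOINTS_REGS 1

	enum bp_type_idx {
		TYPE_INST = 0,
	#if HAVE_MIXED_BREAKPOINTS_REGS
		TYPE_DATA = 0,	/* shares the instruction slot pool */
	#else
		TYPE_DATA = 1,	/* tracked separately */
	#endif
		TYPE_MAX	/* 1 when mixed, 2 when separate */
	};

	int main(void)
	{
		printf("TYPE_INST=%d TYPE_DATA=%d TYPE_MAX=%d\n",
		       TYPE_INST, TYPE_DATA, TYPE_MAX);
		return 0;
	}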
+ config HAVE_USER_RETURN_NOTIFIER bool diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 8d90564c2bcf..e6d8ab5cfa9d 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -44,6 +44,7 @@ config SUPERH32 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_ARCH_KGDB select HAVE_HW_BREAKPOINT + select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS if HAVE_HW_BREAKPOINT select ARCH_HIBERNATION_POSSIBLE if MMU diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 97a95dfd1181..01177dcbe261 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -53,6 +53,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_HW_BREAKPOINT + select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS select ANON_INODES select HAVE_ARCH_KMEMCHECK diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a0aa5a9cfb0e..7e8899093098 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -9,9 +9,12 @@ enum { }; enum { - HW_BREAKPOINT_R = 1, - HW_BREAKPOINT_W = 2, - HW_BREAKPOINT_X = 4, + HW_BREAKPOINT_EMPTY = 0, + HW_BREAKPOINT_R = 1, + HW_BREAKPOINT_W = 2, + HW_BREAKPOINT_RW = HW_BREAKPOINT_R | HW_BREAKPOINT_W, + HW_BREAKPOINT_X = 4, + HW_BREAKPOINT_INVALID = HW_BREAKPOINT_RW | HW_BREAKPOINT_X, }; #ifdef __KERNEL__ diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 89e8a050c43a..8ead1345e33b 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -45,18 +45,28 @@ #include +enum bp_type_idx { + TYPE_INST = 0, +#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS + TYPE_DATA = 0, +#else + TYPE_DATA = 1, +#endif + TYPE_MAX +}; + /* * Constraints data */ /* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[TYPE_MAX][HBP_NUM]); /* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { @@ -67,14 +77,22 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + /* * Report the maximum number of pinned breakpoints a task * have in this cpu */ -static unsigned int max_task_bp_pinned(int cpu) +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); for (i = HBP_NUM -1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -84,7 +102,7 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } -static int task_bp_pinned(struct task_struct *tsk) +static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) { struct perf_event_context *ctx = tsk->perf_event_ctxp; struct list_head *list; @@ -105,7 +123,8 @@ static int task_bp_pinned(struct task_struct *tsk) */ list_for_each_entry(bp, list, event_entry) { if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; + if (find_slot_idx(bp) == type) + count++; } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -118,18 +137,19 @@ static int task_bp_pinned(struct 
task_struct *tsk) * a given cpu (cpu > -1) or in all of them (cpu = -1). */ static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - slots->pinned += max_task_bp_pinned(cpu); + slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk); - slots->flexible = per_cpu(nr_bp_flexible, cpu); + slots->pinned += task_bp_pinned(tsk, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; } @@ -137,16 +157,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) for_each_online_cpu(cpu) { unsigned int nr; - nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - nr += max_task_bp_pinned(cpu); + nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk); + nr += task_bp_pinned(tsk, type); if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible, cpu); + nr = per_cpu(nr_bp_flexible[type], cpu); if (nr > slots->flexible) slots->flexible = nr; @@ -156,14 +176,15 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, + enum bp_type_idx type) { unsigned int *tsk_pinned; int count = 0; - count = task_bp_pinned(tsk); + count = task_bp_pinned(tsk, type); - tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { tsk_pinned[count]++; if (count > 0) @@ -178,7 +199,8 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) /* * Add/remove the given breakpoint in our constraint table */ -static void toggle_bp_slot(struct perf_event *bp, bool enable) +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; @@ -186,20 +208,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) /* Pinned counter task profiling */ if (tsk) { if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type); return; } for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type); return; } /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu)++; else - per_cpu(nr_cpu_bp_pinned, bp->cpu)--; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu)--; } /* @@ -246,14 +268,21 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; + enum bp_type_idx type; - fetch_bp_busy_slots(&slots, bp); + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + fetch_bp_busy_slots(&slots, bp, type); /* Flexible counters need to keep at least one slot */ if (slots.pinned + (!!slots.flexible) == HBP_NUM) return -ENOSPC; - toggle_bp_slot(bp, true); + toggle_bp_slot(bp, true, type); return 0; } @@ -273,7 +302,10 @@ int 
reserve_bp_slot(struct perf_event *bp) static void __release_bp_slot(struct perf_event *bp) { - toggle_bp_slot(bp, false); + enum bp_type_idx type; + + type = find_slot_idx(bp); + toggle_bp_slot(bp, false, type); } void release_bp_slot(struct perf_event *bp) -- cgit v1.2.3-55-g7522 From f93a20541134fa767e8dc4eb32e956d30b9f6b92 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Apr 2010 00:32:30 +0200 Subject: hw-breakpoints: Handle breakpoint weight in allocation constraints Depending on their nature and on what an arch supports, breakpoints may consume more than one address register. For example a simple absolute address match usually only requires one address register. But an address range match may consume two registers. Currently our slot allocation constraints, that tend to reflect the limited arch's resources, always consider that a breakpoint consumes one slot. Then provide a way for archs to tell us the weight of a breakpoint through a new hw_breakpoint_weight() helper. This weight will be computed against the generic allocation constraints instead of a constant value. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/hw_breakpoint.c | 63 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 8ead1345e33b..974498b858fc 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -77,6 +77,11 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) { if (bp->attr.bp_type & HW_BREAKPOINT_RW) @@ -124,7 +129,7 @@ static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) list_for_each_entry(bp, list, event_entry) { if (bp->attr.type == PERF_TYPE_BREAKPOINT) if (find_slot_idx(bp) == type) - count++; + count += hw_breakpoint_weight(bp); } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -173,26 +178,41 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, } } +/* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. 
+ */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + /* * Add a pinned breakpoint for the given task in our constraint table */ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, - enum bp_type_idx type) + enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; - int count = 0; + int old_count = 0; + int old_idx = 0; + int idx = 0; - count = task_bp_pinned(tsk, type); + old_count = task_bp_pinned(tsk, type); + old_idx = old_count - 1; + idx = old_idx + weight; tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { - tsk_pinned[count]++; - if (count > 0) - tsk_pinned[count-1]--; + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; } else { - tsk_pinned[count]--; - if (count > 0) - tsk_pinned[count-1]++; + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; } } @@ -200,7 +220,8 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, * Add/remove the given breakpoint in our constraint table */ static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type) +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; @@ -208,20 +229,20 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type) /* Pinned counter task profiling */ if (tsk) { if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable, type); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable, type); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu)++; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu)--; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; } /* @@ -269,6 +290,7 @@ static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; enum bp_type_idx type; + int weight; /* Basic checks */ if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || @@ -276,13 +298,16 @@ static int __reserve_bp_slot(struct perf_event *bp) return -EINVAL; type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + fetch_bp_busy_slots(&slots, bp, type); + fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) + if (slots.pinned + (!!slots.flexible) > HBP_NUM) return -ENOSPC; - toggle_bp_slot(bp, true, type); + toggle_bp_slot(bp, true, type, weight); return 0; } @@ -303,9 +328,11 @@ int reserve_bp_slot(struct perf_event *bp) static void __release_bp_slot(struct perf_event *bp) { enum bp_type_idx type; + int weight; type = find_slot_idx(bp); - toggle_bp_slot(bp, false, type); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); } void release_bp_slot(struct perf_event *bp) -- cgit v1.2.3-55-g7522 From feef47d0cb530e8419dfa0b48141b538b89b1b1a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 23 Apr 2010 05:59:55 +0200 Subject: hw-breakpoints: Get the number of available registers on boot dynamically The breakpoint generic layer assumes that archs always know in advance the static number of address registers available to host breakpoints through the HBP_NUM macro. However, this is not true for every arch.
For example, ARM needs to get this information dynamically to handle compatibility between different versions. To solve this, this patch proposes to drop the static HBP_NUM macro and let the arch provide the number of available slots through a new hw_breakpoint_slots() function. For archs that have CONFIG_HAVE_MIXED_BREAKPOINTS_REGS selected, it will be called once, since the registers are shared between instruction and data breakpoints. For the others, it will be called first to get the number of instruction breakpoint registers and a second time to get the number of data breakpoint registers; the targeted type is passed as a parameter of hw_breakpoint_slots(). Reported-by: Will Deacon Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel Cc: Ingo Molnar --- arch/sh/include/asm/hw_breakpoint.h | 5 ++++ arch/x86/include/asm/hw_breakpoint.h | 5 ++++ include/linux/hw_breakpoint.h | 10 +++++++ kernel/hw_breakpoint.c | 53 ++++++++++++++++++++++++++++-------- kernel/trace/trace_ksym.c | 26 +++++------------- 5 files changed, 68 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 382bad937dcc..e14cad96798f 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -46,6 +46,11 @@ struct pmu; /* Maximum number of UBC channels */ #define HBP_NUM 2 +static inline int hw_breakpoint_slots(int type) +{ + return HBP_NUM; +} + /* arch/sh/kernel/hw_breakpoint.c */ extern int arch_check_bp_in_kernelspace(struct perf_event *bp); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index c77a5a6fab9d..942255310e6a 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -41,6 +41,11 @@ struct arch_hw_breakpoint { /* Total number of available HW breakpoint registers */ #define HBP_NUM 4 +static inline int hw_breakpoint_slots(int type) +{ + return HBP_NUM; +} + struct perf_event; struct pmu; diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 7e8899093098..a2d6ea49ec56 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -17,6 +17,16 @@ enum { HW_BREAKPOINT_INVALID = HW_BREAKPOINT_RW | HW_BREAKPOINT_X, }; +enum bp_type_idx { + TYPE_INST = 0, +#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS + TYPE_DATA = 0, +#else + TYPE_DATA = 1, +#endif + TYPE_MAX +}; + #ifdef __KERNEL__ #include diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 974498b858fc..684b710cbb91 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,20 +40,12 @@ #include #include #include +#include #include #include #include -enum bp_type_idx { - TYPE_INST = 0, -#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS - TYPE_DATA = 0, -#else - TYPE_DATA = 1, -#endif - TYPE_MAX -}; /* * Constraints data @@ -63,11 +55,15 @@ enum bp_type_idx { static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[TYPE_MAX][HBP_NUM]); +static DEFINE_PER_CPU(unsigned int, *nr_task_bp_pinned[TYPE_MAX]); /* Number of non-pinned cpu/task breakpoints in a cpu */ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); +static int nr_slots[TYPE_MAX]; + +static int constraints_initialized; + /* Gather the number of total pinned
and un-pinned bp in a cpuset */ struct bp_busy_slots { unsigned int pinned; @@ -99,7 +95,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) int i; unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - for (i = HBP_NUM -1; i >= 0; i--) { + for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -292,6 +288,10 @@ static int __reserve_bp_slot(struct perf_event *bp) enum bp_type_idx type; int weight; + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; + /* Basic checks */ if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || bp->attr.bp_type == HW_BREAKPOINT_INVALID) @@ -304,7 +304,7 @@ static int __reserve_bp_slot(struct perf_event *bp) fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > HBP_NUM) + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; toggle_bp_slot(bp, true, type, weight); @@ -551,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { static int __init init_hw_breakpoint(void) { + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); + *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], + GFP_KERNEL); + if (!*task_bp_pinned) + goto err_alloc; + } + } + + constraints_initialized = 1; + return register_die_notifier(&hw_breakpoint_exceptions_nb); + + err_alloc: + for_each_possible_cpu(err_cpu) { + if (err_cpu == cpu) + break; + for (i = 0; i < TYPE_MAX; i++) + kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + } + + return -ENOMEM; } core_initcall(init_hw_breakpoint); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index d59cd6879477..8eaf00749b65 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -34,12 +34,6 @@ #include -/* - * For now, let us restrict the no. of symbols traced simultaneously to number - * of available hardware breakpoint registers. - */ -#define KSYM_TRACER_MAX HBP_NUM - #define KSYM_TRACER_OP_LEN 3 /* rw- */ struct trace_ksym { @@ -53,7 +47,6 @@ struct trace_ksym { static struct trace_array *ksym_trace_array; -static unsigned int ksym_filter_entry_count; static unsigned int ksym_tracing_enabled; static HLIST_HEAD(ksym_filter_head); @@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) struct trace_ksym *entry; int ret = -ENOMEM; - if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { - printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" - " new requests for tracing can be accepted now.\n", - KSYM_TRACER_MAX); - return -ENOSPC; - } - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); if (!entry) return -ENOMEM; @@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) if (IS_ERR(entry->ksym_hbp)) { ret = PTR_ERR(entry->ksym_hbp); - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); + if (ret == -ENOSPC) { + printk(KERN_ERR "ksym_tracer: Maximum limit reached." + " No new requests for tracing can be accepted now.\n"); + } else { + printk(KERN_INFO "ksym_tracer request failed. 
Try again" + " later!!\n"); + } goto err; } hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - ksym_filter_entry_count++; return 0; @@ -265,7 +255,6 @@ static void __ksym_trace_reset(void) hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, ksym_hlist) { unregister_wide_hw_breakpoint(entry->ksym_hbp); - ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); @@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file, goto out_unlock; } /* Error or "symbol:---" case: drop it */ - ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); -- cgit v1.2.3-55-g7522 From 048c852051d2bd5da54a4488bc1f16b0fc74c695 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 1 May 2010 10:11:35 +0200 Subject: perf: Fix resource leak in failure path of perf_event_open() perf_event_open() kfrees event after init failure which doesn't release all resources allocated by perf_event_alloc(). Use free_event() instead. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: LKML-Reference: <4BDBE237.1040809@kernel.org> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2f3fbf84215a..3d1552d3c12b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4897,7 +4897,7 @@ err_fput_free_put_context: err_free_put_context: if (err < 0) - kfree(event); + free_event(event); err_put_context: if (err < 0) -- cgit v1.2.3-55-g7522 From e7a297b0d7d6049bd4e423ac1e17da31e4c401b8 Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Fri, 30 Apr 2010 14:44:50 -0700 Subject: genirq: Add CPU mask affinity hint This patch adds a cpumask affinity hint to the irq_desc structure, along with a registration function and a read-only proc entry for each interrupt. This affinity_hint handle for each interrupt can be used by underlying drivers that need a better mechanism to control interrupt affinity. The underlying driver can register a cpumask for the interrupt, which will allow the driver to provide the CPU mask for the interrupt to anything that requests it. The intent is to extend the userspace daemon, irqbalance, to help hint to it a preferred CPU mask to balance the interrupt into. 
[ tglx: Fixed compile warnings, added WARN_ON, made SMP only ] Signed-off-by: Peter P Waskiewicz Jr Cc: davem@davemloft.net Cc: arjan@linux.jf.intel.com Cc: bhutchings@solarflare.com LKML-Reference: <20100430214445.3992.41647.stgit@ppwaskie-hc2.jf.intel.com> Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 6 ++++++ include/linux/irq.h | 1 + kernel/irq/manage.c | 22 ++++++++++++++++++++++ kernel/irq/proc.c | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index e6d2f4441fda..5137db3317f9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -233,6 +233,7 @@ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); +extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) @@ -247,6 +248,11 @@ static inline int irq_can_set_affinity(unsigned int irq) static inline int irq_select_affinity(unsigned int irq) { return 0; } +static inline int irq_set_affinity_hint(unsigned int irq, + const struct cpumask *m) +{ + return -EINVAL; +} #endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */ #ifdef CONFIG_GENERIC_HARDIRQS diff --git a/include/linux/irq.h b/include/linux/irq.h index 707ab122e2e6..c03243ad84b4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -195,6 +195,7 @@ struct irq_desc { raw_spinlock_t lock; #ifdef CONFIG_SMP cpumask_var_t affinity; + const struct cpumask *affinity_hint; unsigned int node; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 444d5a81a209..3164ba7ce151 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) return 0; } +int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->affinity_hint = m; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_hint); + #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. 
@@ -906,6 +922,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } +#ifdef CONFIG_SMP + /* make sure affinity_hint is cleaned up */ + if (WARN_ON_ONCE(desc->affinity_hint)) + desc->affinity_hint = NULL; +#endif + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index fe92dc5190dd..4f9427a30e14 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -32,6 +32,29 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) return 0; } +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long)m->private); + unsigned long flags; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + raw_spin_lock_irqsave(&desc->lock, flags); + if (desc->affinity_hint) + cpumask_copy(mask, desc->affinity_hint); + else + cpumask_setall(mask); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + seq_cpumask(m, mask); + seq_putc(m, '\n'); + free_cpumask_var(mask); + + return 0; +} + #ifndef is_affinity_mask_valid #define is_affinity_mask_valid(val) 1 #endif @@ -84,6 +107,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file) return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } +static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); +} + static const struct file_operations irq_affinity_proc_fops = { .open = irq_affinity_proc_open, .read = seq_read, @@ -92,6 +120,13 @@ static const struct file_operations irq_affinity_proc_fops = { .write = irq_affinity_proc_write, }; +static const struct file_operations irq_affinity_hint_proc_fops = { + .open = irq_affinity_hint_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int default_affinity_show(struct seq_file *m, void *v) { seq_cpumask(m, irq_default_affinity); @@ -252,6 +287,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); + /* create /proc/irq//affinity_hint */ + proc_create_data("affinity_hint", 0400, desc->dir, + &irq_affinity_hint_proc_fops, (void *)(long)irq); + proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, (void *)(long)irq); #endif -- cgit v1.2.3-55-g7522 From ba697f40dbb704956a4cf67a7845b538015a01ea Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 4 May 2010 04:47:25 +0200 Subject: lockdep: Provide off case for redundant_hardirqs_on increment We forgot to provide a !CONFIG_DEBUG_LOCKDEP case for the redundant_hardirqs_on stat handling. Manage that in the headers with a new __debug_atomic_inc() helper. Fixes: kernel/lockdep.c:2306: error: 'lockdep_stats' undeclared (first use in this function) kernel/lockdep.c:2306: error: (Each undeclared identifier is reported only once kernel/lockdep.c:2306: error: for each function it appears in.) 
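Condensed, the shape of the fix (mirroring the hunks below): provide the increment under CONFIG_DEBUG_LOCKDEP and an empty stub otherwise, so call sites compile either way:

	#ifdef CONFIG_DEBUG_LOCKDEP
	/* No irqs-disabled check: this caller is knowingly racy. */
	#define __debug_atomic_inc(ptr)			\
		this_cpu_inc(lockdep_stats.ptr);
	#else
	# define __debug_atomic_inc(ptr)	do { } while (0)
	#endif
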
Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra --- kernel/lockdep.c | 2 +- kernel/lockdep_internals.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1b58a1bbcc87..9cf79858fd82 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2303,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) * so this is racy by nature but loosing one hit * in a stat is not a big deal. */ - this_cpu_inc(lockdep_stats.redundant_hardirqs_on); + __debug_atomic_inc(redundant_hardirqs_on); return; } /* we'll do an OFF -> ON transition: */ diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 2b174762fa0e..7de27a80f802 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -139,6 +139,9 @@ struct lockdep_stats { DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); +#define __debug_atomic_inc(ptr) \ + this_cpu_inc(lockdep_stats.ptr); + #define debug_atomic_inc(ptr) { \ WARN_ON_ONCE(!irqs_disabled()); \ this_cpu_inc(lockdep_stats.ptr); \ @@ -160,6 +163,7 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); __total; \ }) #else +# define __debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_inc(ptr) do { } while (0) # define debug_atomic_dec(ptr) do { } while (0) # define debug_atomic_read(ptr) 0 -- cgit v1.2.3-55-g7522 From fa9a97dec611c5356301645d576b523ce3919eba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 4 May 2010 04:52:48 +0200 Subject: lockdep: Actually _dec_ in debug_atomic_dec Fix a silly copy-paste mistake that was making debug_atomic_dec use this_cpu_inc instead of this_cpu_dec. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra --- kernel/lockdep_internals.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 7de27a80f802..8d929c717d3e 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -149,7 +149,7 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); #define debug_atomic_dec(ptr) { \ WARN_ON_ONCE(!irqs_disabled()); \ - this_cpu_inc(lockdep_stats.ptr); \ + this_cpu_dec(lockdep_stats.ptr); \ } #define debug_atomic_read(ptr) ({ \ -- cgit v1.2.3-55-g7522 From 54d47a2be5e7f928fb77b2f5a0761f6bd3c9dbff Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 4 May 2010 04:54:47 +0200 Subject: lockdep: No need to disable preemption in debug atomic ops No need to disable preemption in the debug_atomic_* ops, as we ensure interrupts are disabled already. So let's use the __this_cpu_ops() rather than this_cpu_ops() that enclose the ops in a preempt disabled section. 
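Conceptually, the two families differ only in the implicit preemption guard; a sketch under that assumption (sketch_this_cpu_inc is an illustrative name, and the real per-cpu ops are arch-optimized rather than literal macro expansions):

	/* What this_cpu_inc(var) roughly amounts to on the generic path: */
	#define sketch_this_cpu_inc(var)	\
	do {					\
		preempt_disable();		\
		__this_cpu_inc(var);		\
		preempt_enable();		\
	} while (0)

	/*
	 * __this_cpu_inc() alone is only safe when the caller cannot
	 * migrate. debug_atomic_inc()/_dec() already assert
	 * WARN_ON_ONCE(!irqs_disabled()), so the preempt disable/enable
	 * pair around the increment is pure overhead there.
	 */
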
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra --- kernel/lockdep_internals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 8d929c717d3e..4f560cfedc8f 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -144,12 +144,12 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); #define debug_atomic_inc(ptr) { \ WARN_ON_ONCE(!irqs_disabled()); \ - this_cpu_inc(lockdep_stats.ptr); \ + __this_cpu_inc(lockdep_stats.ptr); \ } #define debug_atomic_dec(ptr) { \ WARN_ON_ONCE(!irqs_disabled()); \ - this_cpu_dec(lockdep_stats.ptr); \ + __this_cpu_dec(lockdep_stats.ptr); \ } #define debug_atomic_read(ptr) ({ \ -- cgit v1.2.3-55-g7522 From 777d0411cd1e384115985dac5ccd42031e3eee2b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 3 May 2010 15:39:45 +0200 Subject: hw_breakpoints: Fix percpu build failure Fix this build error: kernel/hw_breakpoint.c:58:1: error: pasting "__pcpu_scope_" and "*" does not give a valid preprocessing token It happens if CONFIG_DEBUG_FORCE_WEAK_PER_CPU, because we concatenate someting with the name and we have the "*" in the name. Cc: Frederic Weisbecker Cc: Will Deacon Cc: Paul Mundt Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel LKML-Reference: <20100503133942.GA5497@nowhere> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 684b710cbb91..7a56b22e0602 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -55,7 +55,7 @@ static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, *nr_task_bp_pinned[TYPE_MAX]); +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); /* Number of non-pinned cpu/task breakpoints in a cpu */ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); -- cgit v1.2.3-55-g7522 From 956097912c40a03bf22603a3be73503fd9ea9e44 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 2 May 2010 08:03:54 +0200 Subject: ring-buffer: Wrap open-coded WARN_ONCE Wrap open-coded WARN_ONCE functionality into the equivalent macro. Signed-off-by: Borislav Petkov LKML-Reference: <20100502060354.GA5281@liondog.tnic> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2a090448ef6b..7f6059c5aa94 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2000,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) { struct ring_buffer_event *event; - static int once; int ret; - if (unlikely(*delta > (1ULL << 59) && !once++)) { - printk(KERN_WARNING "Delta way too big! %llu" - " ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, - (unsigned long long)*ts, - (unsigned long long)cpu_buffer->write_stamp); - WARN_ON(1); - } + WARN_ONCE(*delta > (1ULL << 59), + KERN_WARNING "Delta way too big! 
%llu ts=%llu write stamp = %llu\n", + (unsigned long long)*delta, + (unsigned long long)*ts, + (unsigned long long)cpu_buffer->write_stamp); /* * The delta is too big, we to add a -- cgit v1.2.3-55-g7522 From 9a9686b634acc5cb6b7c601c171ae64af0318a24 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 22 Apr 2010 17:29:24 +0800 Subject: cgroup: Fix an RCU warning in cgroup_path() with CONFIG_PROVE_RCU=y, a warning can be triggered: # mount -t cgroup -o debug xxx /mnt # cat /proc/$$/cgroup ... kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection! ... This is a false-positive, because cgroup_path() can be called with either rcu_read_lock() held or cgroup_mutex held. Signed-off-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/cgroup.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2769e13980c..4ca928db890c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1646,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry) int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; - struct dentry *dentry = rcu_dereference(cgrp->dentry); + struct dentry *dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!dentry || cgrp == dummytop) { /* @@ -1662,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { int len = dentry->d_name.len; + if ((start -= len) < buf) return -ENAMETOOLONG; - memcpy(start, cgrp->dentry->d_name.name, len); + memcpy(start, dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; - dentry = rcu_dereference(cgrp->dentry); + + dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!cgrp->parent) continue; if (--start < buf) -- cgit v1.2.3-55-g7522 From fae9c791703606636c1220e47f6690660042ce7f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 22 Apr 2010 17:30:00 +0800 Subject: cgroup: Fix an RCU warning in alloc_css_id() With CONFIG_PROVE_RCU=y, a warning can be triggered: # mount -t cgroup -o memory xxx /mnt # mkdir /mnt/0 ... kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection! ... This is a false-positive. It's safe to directly access parent_css->id. Signed-off-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4ca928db890c..3a53c771e503 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4561,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, { int subsys_id, i, depth = 0; struct cgroup_subsys_state *parent_css, *child_css; - struct css_id *child_id, *parent_id = NULL; + struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; - depth = css_depth(parent_css) + 1; parent_id = parent_css->id; + depth = parent_id->depth; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) -- cgit v1.2.3-55-g7522 From b629317e66fb1c6066c550dded45ab85a936163c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 22 Apr 2010 17:30:40 +0800 Subject: sched: Fix an RCU warning in print_task() With CONFIG_PROVE_RCU=y, a warning can be triggered: $ cat /proc/sched_debug ... kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection! ... 
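For reference, the dereference inside cgroup_path(), as updated two patches above, accepts either of two calling contexts, and the fix below supplies the first one around the call in print_task():

	/* Guard inside cgroup_path(): */
	dentry = rcu_dereference_check(cgrp->dentry,
				       rcu_read_lock_held() ||
				       cgroup_lock_is_held());

	/* Hence the caller brackets the call: */
	rcu_read_lock();
	cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
	rcu_read_unlock();
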
Both cgroup_path() and task_group() should be called with either rcu_read_lock or cgroup_mutex held. The rcu_dereference_check() does include cgroup_lock_is_held(), so we know that this lock is not held. Therefore, in a CONFIG_PREEMPT kernel, to say nothing of a CONFIG_PREEMPT_RT kernel, the original code could have ended up copying a string out of the freelist. This patch inserts RCU read-side primitives needed to prevent this scenario. Signed-off-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/sched_debug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 9b49db144037..19be00ba6123 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { char path[64]; + rcu_read_lock(); cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); + rcu_read_unlock(); SEQ_printf(m, " %s", path); } #endif -- cgit v1.2.3-55-g7522 From 668eb65f092902eb7dd526af73d4a7f025a94612 Mon Sep 17 00:00:00 2001 From: Thiago Farina Date: Sun, 24 Jan 2010 11:03:50 -0500 Subject: tracing: Fix "integer as NULL pointer" warning. kernel/trace/trace_output.c:256:24: warning: Using plain integer as NULL pointer Signed-off-by: Thiago Farina LKML-Reference: <1264349038-1766-3-git-send-email-tfransosi@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8e46b3323cdc..2404c129a8c9 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -253,7 +253,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) void *ret; if (s->full) - return 0; + return NULL; if (len > ((PAGE_SIZE - 1) - s->len)) { s->full = 1; -- cgit v1.2.3-55-g7522 From ee84b8243b07c33a5c8aed42b4b2da60cb16d1d2 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 May 2010 09:28:41 -0700 Subject: rcu: create rcu_my_thread_group_empty() wrapper Some RCU-lockdep splat repairs need to know whether they are running in a single-threaded process. Unfortunately, the thread_group_empty() primitive is defined in sched.h, and can induce #include hell. This commit therefore introduces a rcu_my_thread_group_empty() wrapper that is defined in rcupdate.c, thus avoiding the need to include sched.h everywhere. Signed-off-by: "Paul E. McKenney" --- include/linux/rcupdate.h | 2 ++ kernel/rcupdate.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 07db2feb8572..db266bbed23f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -190,6 +190,8 @@ static inline int rcu_read_lock_sched_held(void) #ifdef CONFIG_PROVE_RCU +extern int rcu_my_thread_group_empty(void); + /** * rcu_dereference_check - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 03a7ea1579f6..49d808e833b0 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -122,3 +122,14 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } + +#ifdef CONFIG_PROVE_RCU +/* + * wrapper function to avoid #include problems. 
+ */ +int rcu_my_thread_group_empty(void) +{ + return thread_group_empty(current); +} +EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); +#endif /* #ifdef CONFIG_PROVE_RCU */ -- cgit v1.2.3-55-g7522 From 1142d810298e694754498dbb4983fcb6cb7fd884 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:20 +0200 Subject: cpu_stop: implement stop_cpu[s]() Implement a simplistic per-cpu maximum priority cpu monopolization mechanism. A non-sleeping callback can be scheduled to run on one or multiple cpus with maximum priority monopolozing those cpus. This is primarily to replace and unify RT workqueue usage in stop_machine and scheduler migration_thread which currently is serving multiple purposes. Four functions are provided - stop_one_cpu(), stop_one_cpu_nowait(), stop_cpus() and try_stop_cpus(). This is to allow clean sharing of resources among stop_cpu and all the migration thread users. One stopper thread per cpu is created which is currently named "stopper/CPU". This will eventually replace the migration thread and take on its name. * This facility was originally named cpuhog and lived in separate files but Peter Zijlstra nacked the name and thus got renamed to cpu_stop and moved into stop_machine.c. * Better reporting of preemption leak as per Peter's suggestion. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Dimitri Sivanich --- include/linux/stop_machine.h | 39 ++++- kernel/stop_machine.c | 372 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 402 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index baba3a23a814..efcbd6c37947 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -1,15 +1,46 @@ #ifndef _LINUX_STOP_MACHINE #define _LINUX_STOP_MACHINE -/* "Bogolock": stop the entire machine, disable interrupts. This is a - very heavy lock, which is equivalent to grabbing every spinlock - (and more). So the "read" side to such a lock is anything which - disables preeempt. */ + #include #include +#include #include #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +/* + * stop_cpu[s]() is simplistic per-cpu maximum priority cpu + * monopolization mechanism. The caller can specify a non-sleeping + * function to be executed on a single or multiple cpus preempting all + * other processes and monopolizing those cpus until it finishes. + * + * Resources for this mechanism are preallocated when a cpu is brought + * up and requests are guaranteed to be served as long as the target + * cpus are online. + */ + +typedef int (*cpu_stop_fn_t)(void *arg); + +struct cpu_stop_work { + struct list_head list; /* cpu_stopper->works */ + cpu_stop_fn_t fn; + void *arg; + struct cpu_stop_done *done; +}; + +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf); +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); + +/* + * stop_machine "Bogolock": stop the entire machine, disable + * interrupts. This is a very heavy lock, which is equivalent to + * grabbing every spinlock (and more). So the "read" side to such a + * lock is anything which disables preeempt. 
+ */ + /** * stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 9bb9fb1bd79c..7e3f9182aef3 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,17 +1,379 @@ -/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. - * GPL v2 and any later version. +/* + * kernel/stop_machine.c + * + * Copyright (C) 2008, 2005 IBM Corporation. + * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo + * + * This file is released under the GPLv2 and any later version. */ +#include #include -#include +#include #include #include +#include #include #include -#include #include +#include #include -#include + +/* + * Structure to determine completion condition and record errors. May + * be shared by works on different cpus. + */ +struct cpu_stop_done { + atomic_t nr_todo; /* nr left to execute */ + bool executed; /* actually executed? */ + int ret; /* collected return value */ + struct completion completion; /* fired if nr_todo reaches 0 */ +}; + +/* the actual stopper, one per every possible cpu, enabled on online cpus */ +struct cpu_stopper { + spinlock_t lock; + struct list_head works; /* list of pending works */ + struct task_struct *thread; /* stopper thread */ + bool enabled; /* is this stopper enabled? */ +}; + +static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); + +static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) +{ + memset(done, 0, sizeof(*done)); + atomic_set(&done->nr_todo, nr_todo); + init_completion(&done->completion); +} + +/* signal completion unless @done is NULL */ +static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +{ + if (done) { + if (executed) + done->executed = true; + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); + } +} + +/* queue @work to @stopper. if offline, @work is completed immediately */ +static void cpu_stop_queue_work(struct cpu_stopper *stopper, + struct cpu_stop_work *work) +{ + unsigned long flags; + + spin_lock_irqsave(&stopper->lock, flags); + + if (stopper->enabled) { + list_add_tail(&work->list, &stopper->works); + wake_up_process(stopper->thread); + } else + cpu_stop_signal_done(work->done, false); + + spin_unlock_irqrestore(&stopper->lock, flags); +} + +/** + * stop_one_cpu - stop a cpu + * @cpu: cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Execute @fn(@arg) on @cpu. @fn is run in a process context with + * the highest priority preempting any task on the cpu and + * monopolizing it. This function returns after the execution is + * complete. + * + * This function doesn't guarantee @cpu stays online till @fn + * completes. If @cpu goes down in the middle, execution may happen + * partially or fully on different cpus. @fn should either be ready + * for that or the caller should ensure that @cpu stays online until + * this function completes. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -ENOENT if @fn(@arg) was not executed because @cpu was offline; + * otherwise, the return value of @fn. + */ +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_done done; + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; + + cpu_stop_init_done(&done, 1); + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); + wait_for_completion(&done.completion); + return done.executed ? 
done.ret : -ENOENT; +} + +/** + * stop_one_cpu_nowait - stop a cpu but don't wait for completion + * @cpu: cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Similar to stop_one_cpu() but doesn't wait for completion. The + * caller is responsible for ensuring @work_buf is currently unused + * and will remain untouched until stopper starts executing @fn. + * + * CONTEXT: + * Don't care. + */ +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) +{ + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); +} + +/* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); +static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); + +int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_work *work; + struct cpu_stop_done done; + unsigned int cpu; + + /* initialize works and done */ + for_each_cpu(cpu, cpumask) { + work = &per_cpu(stop_cpus_work, cpu); + work->fn = fn; + work->arg = arg; + work->done = &done; + } + cpu_stop_init_done(&done, cpumask_weight(cpumask)); + + /* + * Disable preemption while queueing to avoid getting + * preempted by a stopper which might wait for other stoppers + * to enter @fn which can lead to deadlock. + */ + preempt_disable(); + for_each_cpu(cpu, cpumask) + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), + &per_cpu(stop_cpus_work, cpu)); + preempt_enable(); + + wait_for_completion(&done.completion); + return done.executed ? done.ret : -ENOENT; +} + +/** + * stop_cpus - stop multiple cpus + * @cpumask: cpus to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu, + * @fn is run in a process context with the highest priority + * preempting any task on the cpu and monopolizing it. This function + * returns after all executions are complete. + * + * This function doesn't guarantee the cpus in @cpumask stay online + * till @fn completes. If some cpus go down in the middle, execution + * on the cpu may happen partially or fully on different cpus. @fn + * should either be ready for that or the caller should ensure that + * the cpus stay online until this function completes. + * + * All stop_cpus() calls are serialized making it safe for @fn to wait + * for all cpus to start executing it. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -ENOENT if @fn(@arg) was not executed at all because all cpus in + * @cpumask were offline; otherwise, 0 if all executions of @fn + * returned 0, any non zero return value if any returned non zero. + */ +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + int ret; + + /* static works are used, process one request at a time */ + mutex_lock(&stop_cpus_mutex); + ret = __stop_cpus(cpumask, fn, arg); + mutex_unlock(&stop_cpus_mutex); + return ret; +} + +/** + * try_stop_cpus - try to stop multiple cpus + * @cpumask: cpus to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Identical to stop_cpus() except that it fails with -EAGAIN if + * someone else is already using the facility. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -EAGAIN if someone else is already stopping cpus, -ENOENT if + * @fn(@arg) was not executed at all because all cpus in @cpumask were + * offline; otherwise, 0 if all executions of @fn returned 0, any non + * zero return value if any returned non zero. 
+ */ +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + int ret; + + /* static works are used, process one request at a time */ + if (!mutex_trylock(&stop_cpus_mutex)) + return -EAGAIN; + ret = __stop_cpus(cpumask, fn, arg); + mutex_unlock(&stop_cpus_mutex); + return ret; +} + +static int cpu_stopper_thread(void *data) +{ + struct cpu_stopper *stopper = data; + struct cpu_stop_work *work; + int ret; + +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + + work = NULL; + spin_lock_irq(&stopper->lock); + if (!list_empty(&stopper->works)) { + work = list_first_entry(&stopper->works, + struct cpu_stop_work, list); + list_del_init(&work->list); + } + spin_unlock_irq(&stopper->lock); + + if (work) { + cpu_stop_fn_t fn = work->fn; + void *arg = work->arg; + struct cpu_stop_done *done = work->done; + char ksym_buf[KSYM_NAME_LEN]; + + __set_current_state(TASK_RUNNING); + + /* cpu stop callbacks are not allowed to sleep */ + preempt_disable(); + + ret = fn(arg); + if (ret) + done->ret = ret; + + /* restore preemption and check it's still balanced */ + preempt_enable(); + WARN_ONCE(preempt_count(), + "cpu_stop: %s(%p) leaked preempt count\n", + kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, + ksym_buf), arg); + + cpu_stop_signal_done(done, true); + } else + schedule(); + + goto repeat; +} + +/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ +static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + unsigned int cpu = (unsigned long)hcpu; + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + struct cpu_stop_work *work; + struct task_struct *p; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + BUG_ON(stopper->thread || stopper->enabled || + !list_empty(&stopper->works)); + p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d", + cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); + get_task_struct(p); + stopper->thread = p; + break; + + case CPU_ONLINE: + kthread_bind(stopper->thread, cpu); + /* strictly unnecessary, as first user will wake it */ + wake_up_process(stopper->thread); + /* mark enabled */ + spin_lock_irq(&stopper->lock); + stopper->enabled = true; + spin_unlock_irq(&stopper->lock); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_DEAD: + /* kill the stopper */ + kthread_stop(stopper->thread); + /* drain remaining works */ + spin_lock_irq(&stopper->lock); + list_for_each_entry(work, &stopper->works, list) + cpu_stop_signal_done(work->done, false); + stopper->enabled = false; + spin_unlock_irq(&stopper->lock); + /* release the stopper */ + put_task_struct(stopper->thread); + stopper->thread = NULL; + break; +#endif + } + + return NOTIFY_OK; +} + +/* + * Give it a higher priority so that cpu stopper is available to other + * cpu notifiers. It currently shares the same priority as sched + * migration_notifier. 
+ */ +static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { + .notifier_call = cpu_stop_cpu_callback, + .priority = 10, +}; + +static int __init cpu_stop_init(void) +{ + void *bcpu = (void *)(long)smp_processor_id(); + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + + spin_lock_init(&stopper->lock); + INIT_LIST_HEAD(&stopper->works); + } + + /* start one for the boot cpu */ + err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, + bcpu); + BUG_ON(err == NOTIFY_BAD); + cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); + register_cpu_notifier(&cpu_stop_cpu_notifier); + + return 0; +} +early_initcall(cpu_stop_init); /* This controls the threads on each CPU. */ enum stopmachine_state { -- cgit v1.2.3-55-g7522 From 3fc1f1e27a5b807791d72e5d992aa33b668a6626 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:20 +0200 Subject: stop_machine: reimplement using cpu_stop Reimplement stop_machine using cpu_stop. As cpu stoppers are guaranteed to be available for all online cpus, stop_machine_create/destroy() are no longer necessary and removed. With resource management and synchronization handled by cpu_stop, the new implementation is much simpler. Asking the cpu_stop to execute the stop_cpu() state machine on all online cpus with cpu hotplug disabled is enough. stop_machine itself doesn't need to manage any global resources anymore, so all per-instance information is rolled into struct stop_machine_data and the mutex and all static data variables are removed. The previous implementation created and destroyed RT workqueues as necessary which made stop_machine() calls highly expensive on very large machines. According to Dimitri Sivanich, preventing the dynamic creation/destruction makes booting faster more than twice on very large machines. cpu_stop resources are preallocated for all online cpus and should have the same effect. 
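For callers, the visible change is that the stop_machine_create()/stop_machine_destroy() bracketing disappears, as the xen, cpu hotplug and module diffs below show. A minimal sketch of a resulting call site (the callback and its payload are illustrative):

	/*
	 * Runs while every online cpu is held by its stopper with irqs
	 * disabled; with a NULL mask the callback itself executes on the
	 * first online cpu only. Must not sleep.
	 */
	static int example_flip_state(void *data)
	{
		*(int *)data = 1;	/* atomic w.r.t. all other cpus */
		return 0;
	}

	static int example_apply(void)
	{
		int flag = 0;

		/* No create/destroy pair: stopper threads are preallocated. */
		return stop_machine(example_flip_state, &flag, NULL);
	}
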
Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Dimitri Sivanich --- arch/s390/kernel/time.c | 1 - drivers/xen/manage.c | 14 +--- include/linux/stop_machine.h | 20 ------ kernel/cpu.c | 8 --- kernel/module.c | 14 +--- kernel/stop_machine.c | 158 +++++++++++-------------------------------- 6 files changed, 42 insertions(+), 173 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index fba6dec156bf..03d96569f187 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -390,7 +390,6 @@ static void __init time_init_wq(void) if (time_sync_wq) return; time_sync_wq = create_singlethread_workqueue("timesync"); - stop_machine_create(); } /* diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 2ac4440e7b08..8943b8ccee1a 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -80,12 +80,6 @@ static void do_suspend(void) shutting_down = SHUTDOWN_SUSPEND; - err = stop_machine_create(); - if (err) { - printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err); - goto out; - } - #ifdef CONFIG_PREEMPT /* If the kernel is preemptible, we need to freeze all the processes to prevent them from being in the middle of a pagetable update @@ -93,7 +87,7 @@ static void do_suspend(void) err = freeze_processes(); if (err) { printk(KERN_ERR "xen suspend: freeze failed %d\n", err); - goto out_destroy_sm; + goto out; } #endif @@ -136,12 +130,8 @@ out_resume: out_thaw: #ifdef CONFIG_PREEMPT thaw_processes(); - -out_destroy_sm: -#endif - stop_machine_destroy(); - out: +#endif shutting_down = SHUTDOWN_INVALID; } #endif /* CONFIG_PM_SLEEP */ diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index efcbd6c37947..0e552e72a4c4 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -67,23 +67,6 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); */ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); -/** - * stop_machine_create: create all stop_machine threads - * - * Description: This causes all stop_machine threads to be created before - * stop_machine actually gets called. This can be used by subsystems that - * need a non failing stop_machine infrastructure. - */ -int stop_machine_create(void); - -/** - * stop_machine_destroy: destroy all stop_machine threads - * - * Description: This causes all stop_machine threads which were created with - * stop_machine_create to be destroyed again. 
- */ -void stop_machine_destroy(void); - #else static inline int stop_machine(int (*fn)(void *), void *data, @@ -96,8 +79,5 @@ static inline int stop_machine(int (*fn)(void *), void *data, return ret; } -static inline int stop_machine_create(void) { return 0; } -static inline void stop_machine_destroy(void) { } - #endif /* CONFIG_SMP */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 914aedcde849..545777574779 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -266,9 +266,6 @@ int __ref cpu_down(unsigned int cpu) { int err; - err = stop_machine_create(); - if (err) - return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -280,7 +277,6 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); - stop_machine_destroy(); return err; } EXPORT_SYMBOL(cpu_down); @@ -361,9 +357,6 @@ int disable_nonboot_cpus(void) { int cpu, first_cpu, error; - error = stop_machine_create(); - if (error) - return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); /* @@ -394,7 +387,6 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); - stop_machine_destroy(); return error; } diff --git a/kernel/module.c b/kernel/module.c index 1016b75b026a..0838246d8c94 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -723,16 +723,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - /* Create stop_machine threads since free_module relies on - * a non-failing stop_machine call. */ - ret = stop_machine_create(); - if (ret) - return ret; - - if (mutex_lock_interruptible(&module_mutex) != 0) { - ret = -EINTR; - goto out_stop; - } + if (mutex_lock_interruptible(&module_mutex) != 0) + return -EINTR; mod = find_module(name); if (!mod) { @@ -792,8 +784,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, out: mutex_unlock(&module_mutex); -out_stop: - stop_machine_destroy(); return ret; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 7e3f9182aef3..884c7a1afeed 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -388,174 +388,92 @@ enum stopmachine_state { /* Exit */ STOPMACHINE_EXIT, }; -static enum stopmachine_state state; struct stop_machine_data { - int (*fn)(void *); - void *data; - int fnret; + int (*fn)(void *); + void *data; + /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ + unsigned int num_threads; + const struct cpumask *active_cpus; + + enum stopmachine_state state; + atomic_t thread_ack; }; -/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ -static unsigned int num_threads; -static atomic_t thread_ack; -static DEFINE_MUTEX(lock); -/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ -static DEFINE_MUTEX(setup_lock); -/* Users of stop_machine. */ -static int refcount; -static struct workqueue_struct *stop_machine_wq; -static struct stop_machine_data active, idle; -static const struct cpumask *active_cpus; -static void __percpu *stop_machine_work; - -static void set_state(enum stopmachine_state newstate) +static void set_state(struct stop_machine_data *smdata, + enum stopmachine_state newstate) { /* Reset ack counter. */ - atomic_set(&thread_ack, num_threads); + atomic_set(&smdata->thread_ack, smdata->num_threads); smp_wmb(); - state = newstate; + smdata->state = newstate; } /* Last one to ack a state moves to the next state. 
*/ -static void ack_state(void) +static void ack_state(struct stop_machine_data *smdata) { - if (atomic_dec_and_test(&thread_ack)) - set_state(state + 1); + if (atomic_dec_and_test(&smdata->thread_ack)) + set_state(smdata, smdata->state + 1); } -/* This is the actual function which stops the CPU. It runs - * in the context of a dedicated stopmachine workqueue. */ -static void stop_cpu(struct work_struct *unused) +/* This is the cpu_stop function which stops the CPU. */ +static int stop_machine_cpu_stop(void *data) { + struct stop_machine_data *smdata = data; enum stopmachine_state curstate = STOPMACHINE_NONE; - struct stop_machine_data *smdata = &idle; - int cpu = smp_processor_id(); - int err; + int cpu = smp_processor_id(), err = 0; + bool is_active; + + if (!smdata->active_cpus) + is_active = cpu == cpumask_first(cpu_online_mask); + else + is_active = cpumask_test_cpu(cpu, smdata->active_cpus); - if (!active_cpus) { - if (cpu == cpumask_first(cpu_online_mask)) - smdata = &active; - } else { - if (cpumask_test_cpu(cpu, active_cpus)) - smdata = &active; - } /* Simple state machine */ do { /* Chill out and ensure we re-read stopmachine_state. */ cpu_relax(); - if (state != curstate) { - curstate = state; + if (smdata->state != curstate) { + curstate = smdata->state; switch (curstate) { case STOPMACHINE_DISABLE_IRQ: local_irq_disable(); hard_irq_disable(); break; case STOPMACHINE_RUN: - /* On multiple CPUs only a single error code - * is needed to tell that something failed. */ - err = smdata->fn(smdata->data); - if (err) - smdata->fnret = err; + if (is_active) + err = smdata->fn(smdata->data); break; default: break; } - ack_state(); + ack_state(smdata); } } while (curstate != STOPMACHINE_EXIT); local_irq_enable(); + return err; } -/* Callback for CPUs which aren't supposed to do anything. */ -static int chill(void *unused) -{ - return 0; -} - -int stop_machine_create(void) -{ - mutex_lock(&setup_lock); - if (refcount) - goto done; - stop_machine_wq = create_rt_workqueue("kstop"); - if (!stop_machine_wq) - goto err_out; - stop_machine_work = alloc_percpu(struct work_struct); - if (!stop_machine_work) - goto err_out; -done: - refcount++; - mutex_unlock(&setup_lock); - return 0; - -err_out: - if (stop_machine_wq) - destroy_workqueue(stop_machine_wq); - mutex_unlock(&setup_lock); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(stop_machine_create); - -void stop_machine_destroy(void) -{ - mutex_lock(&setup_lock); - refcount--; - if (refcount) - goto done; - destroy_workqueue(stop_machine_wq); - free_percpu(stop_machine_work); -done: - mutex_unlock(&setup_lock); -} -EXPORT_SYMBOL_GPL(stop_machine_destroy); - int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { - struct work_struct *sm_work; - int i, ret; - - /* Set up initial state. */ - mutex_lock(&lock); - num_threads = num_online_cpus(); - active_cpus = cpus; - active.fn = fn; - active.data = data; - active.fnret = 0; - idle.fn = chill; - idle.data = NULL; - - set_state(STOPMACHINE_PREPARE); - - /* Schedule the stop_cpu work on all cpus: hold this CPU so one - * doesn't hit this CPU until we're ready. */ - get_cpu(); - for_each_online_cpu(i) { - sm_work = per_cpu_ptr(stop_machine_work, i); - INIT_WORK(sm_work, stop_cpu); - queue_work_on(i, stop_machine_wq, sm_work); - } - /* This will release the thread on our CPU. 
*/ - put_cpu(); - flush_workqueue(stop_machine_wq); - ret = active.fnret; - mutex_unlock(&lock); - return ret; + struct stop_machine_data smdata = { .fn = fn, .data = data, + .num_threads = num_online_cpus(), + .active_cpus = cpus }; + + /* Set the initial state and stop all online cpus. */ + set_state(&smdata, STOPMACHINE_PREPARE); + return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); } int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; - ret = stop_machine_create(); - if (ret) - return ret; /* No CPUs can come up or down during this. */ get_online_cpus(); ret = __stop_machine(fn, data, cpus); put_online_cpus(); - stop_machine_destroy(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); -- cgit v1.2.3-55-g7522 From 969c79215a35b06e5e3efe69b9412f858df7856c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:21 +0200 Subject: sched: replace migration_thread with cpu_stop Currently migration_thread is serving three purposes - migration pusher, context to execute active_load_balance() and forced context switcher for expedited RCU synchronize_sched. All three roles are hardcoded into migration_thread() and determining which job is scheduled is slightly messy. This patch kills migration_thread and replaces all three uses with cpu_stop. The three different roles of migration_thread() are split into three separate cpu_stop callbacks - migration_cpu_stop(), active_load_balance_cpu_stop() and synchronize_sched_expedited_cpu_stop() - and each use case now simply asks cpu_stop to execute the callback as necessary. synchronize_sched_expedited() was implemented with private preallocated resources and custom multi-cpu queueing and waiting logic, both of which are provided by cpu_stop. synchronize_sched_expedited_count is made atomic and all other shared resources along with the mutex are dropped. synchronize_sched_expedited() also implemented a check to detect cases where not all the callbacks got executed on their assigned cpus, falling back to synchronize_sched() when that happened. If called with cpu hotplug blocked, cpu_stop already guarantees that and the condition cannot happen; otherwise, stop_machine() would break. However, this patch preserves the paranoid check using a cpumask to record on which cpus the stopper ran so that it can serve as a bisection point if something actually goes wrong there. Because the internal execution state is no longer visible, rcu_expedited_torture_stats() is removed. This patch also renames the cpu_stop threads from "stopper/%d" to "migration/%d". The names of these threads ultimately don't matter and there's no reason to make unnecessary userland-visible changes. With this patch applied, stop_machine() and sched now share the same resources. stop_machine() is faster without wasting any resources and sched migration users are much cleaner. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Dipankar Sarma Cc: Josh Triplett Cc: Paul E.
McKenney Cc: Oleg Nesterov Cc: Dimitri Sivanich --- Documentation/RCU/torture.txt | 10 -- include/linux/rcutiny.h | 2 - include/linux/rcutree.h | 1 - kernel/rcutorture.c | 2 +- kernel/sched.c | 315 ++++++++++++------------------------------ kernel/sched_fair.c | 48 +++++-- kernel/stop_machine.c | 2 +- 7 files changed, 127 insertions(+), 253 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 0e50bc2aa1e2..5d9016795fd8 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following: sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 - state: -1 / 0:0 3:0 4:0 - -As before, the first four lines are similar to those for RCU. -The last line shows the task-migration state. The first number is --1 if synchronize_sched_expedited() is idle, -2 if in the process of -posting wakeups to the migration kthreads, and N when waiting on CPU N. -Each of the colon-separated fields following the "/" is a CPU:state pair. -Valid states are "0" for idle, "1" for waiting for quiescent state, -"2" for passed through quiescent state, and "3" when a race with a -CPU-hotplug event forces use of the synchronize_sched() primitive. USAGE diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index a5195875480a..0006b2df00e1 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void) return 0; } -extern int rcu_expedited_torture_stats(char *page); - static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 42cc3a04779e..24e467e526b8 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,7 +35,6 @@ struct notifier_block; extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern int rcu_needs_cpu(int cpu); -extern int rcu_expedited_torture_stats(char *page); #ifdef CONFIG_TREE_PREEMPT_RCU diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 58df55bf83ed..2b676f3a0f26 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = { .sync = synchronize_sched_expedited, .cb_barrier = NULL, .fqs = rcu_sched_force_quiescent_state, - .stats = rcu_expedited_torture_stats, + .stats = NULL, .irq_capable = 1, .name = "sched_expedited" }; diff --git a/kernel/sched.c b/kernel/sched.c index 4956ed092838..f1d577a0a8ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -55,9 +55,9 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -539,15 +539,13 @@ struct rq { int post_schedule; int active_balance; int push_cpu; + struct cpu_stop_work active_balance_work; /* cpu of this runqueue: */ int cpu; int online; unsigned long avg_load_per_task; - struct task_struct *migration_thread; - struct list_head migration_queue; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -2037,21 +2035,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } -struct migration_req { - struct list_head list; - +struct migration_arg { struct task_struct *task; int dest_cpu; - - struct completion done; }; +static int migration_cpu_stop(void *data); + /* * 
The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +static bool migrate_task(struct task_struct *p, int dest_cpu) { struct rq *rq = task_rq(p); @@ -2059,15 +2054,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - if (!p->se.on_rq && !task_running(rq, p)) - return 0; - - init_completion(&req->done); - req->task = p; - req->dest_cpu = dest_cpu; - list_add(&req->list, &rq->migration_queue); - - return 1; + return p->se.on_rq || task_running(rq, p); } /* @@ -3110,7 +3097,6 @@ static void update_cpu_load(struct rq *this_rq) void sched_exec(void) { struct task_struct *p = current; - struct migration_req req; unsigned long flags; struct rq *rq; int dest_cpu; @@ -3124,17 +3110,11 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && - migrate_task(p, dest_cpu, &req)) { - /* Need to wait for migration thread (might exit: take ref). */ - struct task_struct *mt = rq->migration_thread; + likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + struct migration_arg arg = { p, dest_cpu }; - get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(mt); - put_task_struct(mt); - wait_for_completion(&req.done); - + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); return; } unlock: @@ -5290,17 +5270,15 @@ static inline void sched_init_granularity(void) /* * This is how migration works: * - * 1) we queue a struct migration_req structure in the source CPU's - * runqueue and wake up that CPU's migration thread. - * 2) we down() the locked semaphore => thread blocks. - * 3) migration thread wakes up (implicitly it forces the migrated - * thread off the CPU) - * 4) it gets the migration request and checks whether the migrated - * task is still in the wrong runqueue. - * 5) if it's in the wrong runqueue then the migration thread removes + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes * it and puts it into the right queue. - * 6) migration thread up()s the semaphore. - * 7) we wake up and the migration is done. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. */ /* @@ -5314,9 +5292,9 @@ static inline void sched_init_granularity(void) */ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - struct migration_req req; unsigned long flags; struct rq *rq; + unsigned int dest_cpu; int ret = 0; /* @@ -5354,15 +5332,12 @@ again: if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (migrate_task(p, dest_cpu)) { + struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. 
*/ - struct task_struct *mt = rq->migration_thread; - - get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(mt); - put_task_struct(mt); - wait_for_completion(&req.done); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } @@ -5420,70 +5395,22 @@ fail: return ret; } -#define RCU_MIGRATION_IDLE 0 -#define RCU_MIGRATION_NEED_QS 1 -#define RCU_MIGRATION_GOT_QS 2 -#define RCU_MIGRATION_MUST_SYNC 3 - /* - * migration_thread - this is a highprio system thread that performs - * thread migration by bumping thread off CPU then 'pushing' onto - * another runqueue. + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. */ -static int migration_thread(void *data) +static int migration_cpu_stop(void *data) { - int badcpu; - int cpu = (long)data; - struct rq *rq; - - rq = cpu_rq(cpu); - BUG_ON(rq->migration_thread != current); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - struct migration_req *req; - struct list_head *head; - - raw_spin_lock_irq(&rq->lock); - - if (cpu_is_offline(cpu)) { - raw_spin_unlock_irq(&rq->lock); - break; - } - - if (rq->active_balance) { - active_load_balance(rq, cpu); - rq->active_balance = 0; - } - - head = &rq->migration_queue; - - if (list_empty(head)) { - raw_spin_unlock_irq(&rq->lock); - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - continue; - } - req = list_entry(head->next, struct migration_req, list); - list_del_init(head->next); - - if (req->task != NULL) { - raw_spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - } else if (likely(cpu == (badcpu = smp_processor_id()))) { - req->dest_cpu = RCU_MIGRATION_GOT_QS; - raw_spin_unlock(&rq->lock); - } else { - req->dest_cpu = RCU_MIGRATION_MUST_SYNC; - raw_spin_unlock(&rq->lock); - WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); - } - local_irq_enable(); - - complete(&req->done); - } - __set_current_state(TASK_RUNNING); + struct migration_arg *arg = data; + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ + local_irq_disable(); + __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); + local_irq_enable(); return 0; } @@ -5850,35 +5777,20 @@ static void set_rq_offline(struct rq *rq) static int __cpuinit migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct task_struct *p; int cpu = (long)hcpu; unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); - if (IS_ERR(p)) - return NOTIFY_BAD; - kthread_bind(p, cpu); - /* Must be high prio: stop_machine expects to yield to it. */ - rq = task_rq_lock(p, &flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - task_rq_unlock(rq, &flags); - get_task_struct(p); - cpu_rq(cpu)->migration_thread = p; rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - /* Strictly unnecessary, as first user will wake it. 
*/ - wake_up_process(cpu_rq(cpu)->migration_thread); - /* Update our root-domain */ - rq = cpu_rq(cpu); raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -5889,25 +5801,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!cpu_rq(cpu)->migration_thread) - break; - /* Unbind it from offline cpu so it can run. Fall thru. */ - kthread_bind(cpu_rq(cpu)->migration_thread, - cpumask_any(cpu_online_mask)); - kthread_stop(cpu_rq(cpu)->migration_thread); - put_task_struct(cpu_rq(cpu)->migration_thread); - cpu_rq(cpu)->migration_thread = NULL; - break; - case CPU_DEAD: case CPU_DEAD_FROZEN: migrate_live_tasks(cpu); - rq = cpu_rq(cpu); - kthread_stop(rq->migration_thread); - put_task_struct(rq->migration_thread); - rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ raw_spin_lock_irq(&rq->lock); deactivate_task(rq, rq->idle, 0); @@ -5918,29 +5814,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); calc_global_load_remove(rq); - /* - * No need to migrate the tasks: it was best-effort if - * they didn't take sched_hotcpu_mutex. Just wake up - * the requestors. - */ - raw_spin_lock_irq(&rq->lock); - while (!list_empty(&rq->migration_queue)) { - struct migration_req *req; - - req = list_entry(rq->migration_queue.next, - struct migration_req, list); - list_del_init(&req->list); - raw_spin_unlock_irq(&rq->lock); - complete(&req->done); - raw_spin_lock_irq(&rq->lock); - } - raw_spin_unlock_irq(&rq->lock); break; case CPU_DYING: case CPU_DYING_FROZEN: /* Update our root-domain */ - rq = cpu_rq(cpu); raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7757,10 +7635,8 @@ void __init sched_init(void) rq->push_cpu = 0; rq->cpu = i; rq->online = 0; - rq->migration_thread = NULL; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif init_rq_hrtick(rq); @@ -9054,43 +8930,39 @@ struct cgroup_subsys cpuacct_subsys = { #ifndef CONFIG_SMP -int rcu_expedited_torture_stats(char *page) -{ - return 0; -} -EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); - void synchronize_sched_expedited(void) { + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. 
+ */ + smp_mb(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); #else /* #ifndef CONFIG_SMP */ -static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); -static DEFINE_MUTEX(rcu_sched_expedited_mutex); - -#define RCU_EXPEDITED_STATE_POST -2 -#define RCU_EXPEDITED_STATE_IDLE -1 +static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); -static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; - -int rcu_expedited_torture_stats(char *page) +static int synchronize_sched_expedited_cpu_stop(void *data) { - int cnt = 0; - int cpu; + static DEFINE_SPINLOCK(done_mask_lock); + struct cpumask *done_mask = data; - cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); - for_each_online_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d:%d", - cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + if (done_mask) { + spin_lock(&done_mask_lock); + cpumask_set_cpu(smp_processor_id(), done_mask); + spin_unlock(&done_mask_lock); } - cnt += sprintf(&page[cnt], "\n"); - return cnt; + return 0; } -EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); - -static long synchronize_sched_expedited_count; /* * Wait for an rcu-sched grace period to elapse, but use "big hammer" @@ -9104,60 +8976,55 @@ static long synchronize_sched_expedited_count; */ void synchronize_sched_expedited(void) { - int cpu; - unsigned long flags; - bool need_full_sync = 0; - struct rq *rq; - struct migration_req *req; - long snap; - int trycount = 0; + cpumask_var_t done_mask_var; + struct cpumask *done_mask = NULL; + int snap, trycount = 0; + + /* + * done_mask is used to check that all cpus actually have + * finished running the stopper, which is guaranteed by + * stop_cpus() if it's called with cpu hotplug blocked. Keep + * the paranoia for now but it's best effort if cpumask is off + * stack. + */ + if (zalloc_cpumask_var(&done_mask_var, GFP_ATOMIC)) + done_mask = done_mask_var; smp_mb(); /* ensure prior mod happens before capturing snap. 
*/ - snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + snap = atomic_read(&synchronize_sched_expedited_count) + 1; get_online_cpus(); - while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + done_mask) == -EAGAIN) { put_online_cpus(); if (trycount++ < 10) udelay(trycount * num_online_cpus()); else { synchronize_sched(); - return; + goto free_out; } - if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { smp_mb(); /* ensure test happens before caller kfree */ - return; + goto free_out; } get_online_cpus(); } - rcu_expedited_state = RCU_EXPEDITED_STATE_POST; - for_each_online_cpu(cpu) { - rq = cpu_rq(cpu); - req = &per_cpu(rcu_migration_req, cpu); - init_completion(&req->done); - req->task = NULL; - req->dest_cpu = RCU_MIGRATION_NEED_QS; - raw_spin_lock_irqsave(&rq->lock, flags); - list_add(&req->list, &rq->migration_queue); - raw_spin_unlock_irqrestore(&rq->lock, flags); - wake_up_process(rq->migration_thread); - } - for_each_online_cpu(cpu) { - rcu_expedited_state = cpu; - req = &per_cpu(rcu_migration_req, cpu); - rq = cpu_rq(cpu); - wait_for_completion(&req->done); - raw_spin_lock_irqsave(&rq->lock, flags); - if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) - need_full_sync = 1; - req->dest_cpu = RCU_MIGRATION_IDLE; - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; - synchronize_sched_expedited_count++; - mutex_unlock(&rcu_sched_expedited_mutex); + atomic_inc(&synchronize_sched_expedited_count); + if (done_mask) + cpumask_xor(done_mask, done_mask, cpu_online_mask); put_online_cpus(); - if (need_full_sync) + + /* paranoia - this can't happen */ + if (done_mask && cpumask_weight(done_mask)) { + char buf[80]; + + cpulist_scnprintf(buf, sizeof(buf), done_mask); + WARN_ONCE(1, "synchronize_sched_expedited: cpu online and done masks disagree on %d cpus: %s\n", + cpumask_weight(done_mask), buf); synchronize_sched(); + } +free_out: + free_cpumask_var(done_mask_var); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cbd8b8a296d1..217e4a9393e4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2798,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +static int active_load_balance_cpu_stop(void *data); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2887,8 +2889,9 @@ redo: if (need_active_balance(sd, sd_idle, idle)) { raw_spin_lock_irqsave(&busiest->lock, flags); - /* don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu + /* don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest cpu can't be + * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { @@ -2898,14 +2901,22 @@ redo: goto out_one_pinned; } + /* + * ->active_balance synchronizes accesses to + * ->active_balance_work. Once set, it's cleared + * only after active load balance is finished. 
+ */ if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; } raw_spin_unlock_irqrestore(&busiest->lock, flags); + if (active_balance) - wake_up_process(busiest->migration_thread); + stop_one_cpu_nowait(cpu_of(busiest), + active_load_balance_cpu_stop, busiest, + &busiest->active_balance_work); /* * We've kicked active balancing, reset the failure @@ -3012,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq) } /* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +static int active_load_balance_cpu_stop(void *data) { + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; - struct rq *target_rq; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) - return; - - target_rq = cpu_rq(target_cpu); + goto out_unlock; /* * This condition is "impossible", if it occurs @@ -3058,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_failed); } double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; } #ifdef CONFIG_NO_HZ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 884c7a1afeed..5b20141a5ec1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,7 +301,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d", + p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", cpu); if (IS_ERR(p)) return NOTIFY_BAD; -- cgit v1.2.3-55-g7522 From 94458d5ecb3da844823cc191e73e5c5ead98a464 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:21 +0200 Subject: sched: kill paranoia check in synchronize_sched_expedited() The paranoid check which verifies that the cpu_stop callback is actually called on all online cpus is completely superfluous. It's guaranteed by the cpu_stop facility and if it didn't work as advertised, other things would go horribly wrong and trying to recover using synchronize_sched() wouldn't be very meaningful. Kill the paranoid check. Removal of this feature is done as a separate step so that it can serve as a bisection point if something actually goes wrong. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Dipankar Sarma Cc: Josh Triplett Cc: Paul E.
McKenney Cc: Oleg Nesterov Cc: Dimitri Sivanich --- kernel/sched.c | 40 +++------------------------------------- 1 file changed, 3 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f1d577a0a8ab..e9c6d798831a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8953,14 +8953,6 @@ static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); static int synchronize_sched_expedited_cpu_stop(void *data) { - static DEFINE_SPINLOCK(done_mask_lock); - struct cpumask *done_mask = data; - - if (done_mask) { - spin_lock(&done_mask_lock); - cpumask_set_cpu(smp_processor_id(), done_mask); - spin_unlock(&done_mask_lock); - } return 0; } @@ -8976,55 +8968,29 @@ static int synchronize_sched_expedited_cpu_stop(void *data) */ void synchronize_sched_expedited(void) { - cpumask_var_t done_mask_var; - struct cpumask *done_mask = NULL; int snap, trycount = 0; - /* - * done_mask is used to check that all cpus actually have - * finished running the stopper, which is guaranteed by - * stop_cpus() if it's called with cpu hotplug blocked. Keep - * the paranoia for now but it's best effort if cpumask is off - * stack. - */ - if (zalloc_cpumask_var(&done_mask_var, GFP_ATOMIC)) - done_mask = done_mask_var; - smp_mb(); /* ensure prior mod happens before capturing snap. */ snap = atomic_read(&synchronize_sched_expedited_count) + 1; get_online_cpus(); while (try_stop_cpus(cpu_online_mask, synchronize_sched_expedited_cpu_stop, - done_mask) == -EAGAIN) { + NULL) == -EAGAIN) { put_online_cpus(); if (trycount++ < 10) udelay(trycount * num_online_cpus()); else { synchronize_sched(); - goto free_out; + return; } if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { smp_mb(); /* ensure test happens before caller kfree */ - goto free_out; + return; } get_online_cpus(); } atomic_inc(&synchronize_sched_expedited_count); - if (done_mask) - cpumask_xor(done_mask, done_mask, cpu_online_mask); put_online_cpus(); - - /* paranoia - this can't happen */ - if (done_mask && cpumask_weight(done_mask)) { - char buf[80]; - - cpulist_scnprintf(buf, sizeof(buf), done_mask); - WARN_ONCE(1, "synchronize_sched_expedited: cpu online and done masks disagree on %d cpus: %s\n", - cpumask_weight(done_mask), buf); - synchronize_sched(); - } -free_out: - free_cpumask_var(done_mask_var); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v1.2.3-55-g7522 From cc631fb732b8ccd6a0cc45557475ea09b0c21a68 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 May 2010 18:49:21 +0200 Subject: sched: correctly place paranoia memory barriers in synchronize_sched_expedited() The memory barriers must be in the SMP case, not in the !SMP case. Also add a barrier after the atomic_inc() in order to ensure that other CPUs see post-synchronize_sched_expedited() actions as following the expedited grace period. Signed-off-by: Paul E.
McKenney Signed-off-by: Tejun Heo --- kernel/sched.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e9c6d798831a..155a16d52146 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8931,6 +8931,15 @@ struct cgroup_subsys cpuacct_subsys = { #ifndef CONFIG_SMP void synchronize_sched_expedited(void) +{ +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) { /* * There must be a full memory barrier on each affected CPU @@ -8943,16 +8952,7 @@ void synchronize_sched_expedited(void) * necessary. Do smp_mb() anyway for documentation and * robustness against future implementation changes. */ - smp_mb(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ + smp_mb(); /* See above comment block. */ return 0; } @@ -8990,6 +8990,7 @@ void synchronize_sched_expedited(void) get_online_cpus(); } atomic_inc(&synchronize_sched_expedited_count); + smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ put_online_cpus(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v1.2.3-55-g7522 From d9f599e1e6d019968b35d2dc63074b9e8964fa69 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 20 Mar 2010 17:39:11 +0300 Subject: perf: Fix check at end of event search The original code doesn't work because "call" is never NULL there. Signed-off-by: Dan Carpenter LKML-Reference: <20100320143911.GF5331@bicker> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 88c0b6dbd7fe..58092d844a1f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1398,7 +1398,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, } err = -EINVAL; - if (!call) + if (&call->list == &ftrace_events) goto out_unlock; err = -EEXIST; -- cgit v1.2.3-55-g7522 From fc390cde362309f6892bb719194f242c466a978b Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 May 2010 11:42:52 -0700 Subject: rcu: need barrier() in UP synchronize_sched_expedited() If synchronize_sched_expedited() is ever to be called from within kernel/sched.c in a !SMP PREEMPT kernel, the !SMP implementation needs a barrier(). Signed-off-by: Paul E. McKenney Signed-off-by: Tejun Heo --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 155a16d52146..fbaf3128d010 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8932,6 +8932,7 @@ struct cgroup_subsys cpuacct_subsys = { void synchronize_sched_expedited(void) { + barrier(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -- cgit v1.2.3-55-g7522 From 4726f2a617ebd868a4fdeb5679613b897e5f1676 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Tue, 4 May 2010 14:16:48 +0800 Subject: lockdep: Reduce stack_trace usage When calling check_prevs_add(), if all validations passed, add_lock_to_list() will add the new lock to the dependency tree and allocate a stack_trace for each list_entry.
But at this time, we are always on the same stack, so the stack_trace for each list_entry has the same value. This is redundant and eats up lots of memory, which could trigger a warning when MAX_STACK_TRACE_ENTRIES is low. Use one copy of stack_trace instead. V2: As suggested by Peter Zijlstra, move save_trace() from check_prevs_add() to check_prev_add(). Add tracking for trylock dependencies, which are also redundant. Signed-off-by: Yong Zhang Cc: David S. Miller Signed-off-by: Peter Zijlstra LKML-Reference: <20100504065711.GC10784@windriver.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 9cf79858fd82..51080807dc8c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -805,7 +805,8 @@ static struct lock_list *alloc_list_entry(void) * Add a new dependency to the head of the list: */ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, int distance) + struct list_head *head, unsigned long ip, + int distance, struct stack_trace *trace) { struct lock_list *entry; /* @@ -816,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, if (!entry) return 0; - if (!save_trace(&entry->trace)) - return 0; - entry->class = this; entry->distance = distance; + entry->trace = *trace; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation @@ -1622,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance) + struct held_lock *next, int distance, int trylock_loop) { struct lock_list *entry; int ret; struct lock_list this; struct lock_list *uninitialized_var(target_entry); + /* + * Static variable, serialized by the graph_lock(). + * + * We use this static variable to save the stack trace in case + * we call into this function multiple times due to encountering + * trylocks in the held lock stack.
+ */ + static struct stack_trace trace; /* * Prove that the new <prev> -> <next> dependency would not @@ -1675,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } + if (!trylock_loop && !save_trace(&trace)) + return 0; + /* * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(prev)->locks_after, - next->acquire_ip, distance); + next->acquire_ip, distance, &trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(next)->locks_before, - next->acquire_ip, distance); + next->acquire_ip, distance, &trace); if (!ret) return 0; @@ -1718,6 +1728,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; + int trylock_loop = 0; struct held_lock *hlock; /* @@ -1743,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next, distance)) + if (!check_prev_add(curr, hlock, next, + distance, trylock_loop)) return 0; /* * Stop after the first non-trylock entry, @@ -1766,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; + trylock_loop = 1; } return 1; out_bug: -- cgit v1.2.3-55-g7522 From 27a9da6538ee18046d7bff8e36a9f783542c54c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2010 20:36:56 +0200 Subject: sched: Remove rq argument to the tracepoints struct rq isn't visible outside of sched.o so it's nearly useless to expose the pointer; also, there are no users of it, so remove it. Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <1272997616.1642.207.camel@laptop> Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 32 ++++++++++---------------------- kernel/sched.c | 8 ++++---- kernel/trace/ftrace.c | 3 +-- kernel/trace/trace_sched_switch.c | 5 ++--- kernel/trace/trace_sched_wakeup.c | 5 ++--- 5 files changed, 19 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cfceb0b73e20..4f733ecea46e 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -51,15 +51,12 @@ TRACE_EVENT(sched_kthread_stop_ret, /* * Tracepoint for waiting on task to unschedule: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) */ TRACE_EVENT(sched_wait_task, - TP_PROTO(struct rq *rq, struct task_struct *p), + TP_PROTO(struct task_struct *p), - TP_ARGS(rq, p), + TP_ARGS(p), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -79,15 +76,12 @@ TRACE_EVENT(sched_wait_task, /* * Tracepoint for waking up a task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin.
) */ DECLARE_EVENT_CLASS(sched_wakeup_template, - TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_PROTO(struct task_struct *p, int success), - TP_ARGS(rq, p, success), + TP_ARGS(p, success), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -111,31 +105,25 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, ); DEFINE_EVENT(sched_wakeup_template, sched_wakeup, - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - TP_ARGS(rq, p, success)); + TP_PROTO(struct task_struct *p, int success), + TP_ARGS(p, success)); /* * Tracepoint for waking up a new task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - TP_ARGS(rq, p, success)); + TP_PROTO(struct task_struct *p, int success), + TP_ARGS(p, success)); /* * Tracepoint for task switches, performed by the scheduler: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) */ TRACE_EVENT(sched_switch, - TP_PROTO(struct rq *rq, struct task_struct *prev, + TP_PROTO(struct task_struct *prev, struct task_struct *next), - TP_ARGS(rq, prev, next), + TP_ARGS(prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) diff --git a/kernel/sched.c b/kernel/sched.c index 4956ed092838..11ac0eb0bce7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2168,7 +2168,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * just go back and repeat. */ rq = task_rq_lock(p, &flags); - trace_sched_wait_task(rq, p); + trace_sched_wait_task(p); running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; @@ -2439,7 +2439,7 @@ out_activate: success = 1; out_running: - trace_sched_wakeup(rq, p, success); + trace_sched_wakeup(p, success); check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -2613,7 +2613,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); - trace_sched_wakeup_new(rq, p, 1); + trace_sched_wakeup_new(p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) @@ -2833,7 +2833,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(rq, prev, next); + trace_sched_switch(prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2404b59b3097..aa3a92b511e2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3212,8 +3212,7 @@ free: } static void -ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, - struct task_struct *next) +ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; int index; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5fca0f51fde4..a55fccfede5d 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr, } static void -probe_sched_switch(struct rq *__rq, struct task_struct *prev, - struct task_struct *next) +probe_sched_switch(struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; unsigned long flags; @@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, } static void 
-probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) +probe_sched_wakeup(struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0271742abb8d..8052446ceeaa 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -107,8 +107,7 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) } static void notrace -probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) +probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; cycle_t T0, T1, delta; @@ -200,7 +199,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(struct rq *rq, struct task_struct *p, int success) +probe_wakeup(struct task_struct *p, int success) { struct trace_array_cpu *data; int cpu = smp_processor_id(); -- cgit v1.2.3-55-g7522 From 4fd38e4595e2f6c9d27732c042a0e16b2753049c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2010 17:31:38 +0200 Subject: perf: Fix exit() vs PERF_FORMAT_GROUP Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work as expected if the task the counters were attached to quit before the read() call. The cause is that we unconditionally destroy the grouping when we remove counters from their context. Fix this by only doing this when we free the counter itself. Reported-by: Corey Ashford Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1273160566.5605.404.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/perf_event.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c8e375440403..bf8f3c003297 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -522,6 +522,7 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { + PERF_EVENT_STATE_FREE = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3d1552d3c12b..f13c3db765f4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -341,6 +341,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; + if (event->state > PERF_EVENT_STATE_FREE) + return; + /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -1856,6 +1859,8 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + event->state = PERF_EVENT_STATE_FREE; + WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_event_remove_from_context(event); -- cgit v1.2.3-55-g7522 From a0507c84bf47dfd204299774f45fd16da33f0619 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2010 15:42:53 +0200 Subject: perf: Annotate perf_event_read_group() vs perf_event_release_kernel() Stephane reported a lockdep warning while using PERF_FORMAT_GROUP. The issue is that perf_event_read_group() takes faults while holding the ctx->mutex, while perf_event_release_kernel() can be called from munmap(). Which makes for an AB-BA deadlock. 
Except we can never establish the deadlock because we'll only ever call perf_event_release_kernel() after all file descriptors are dead so there is no concurrency possible. Reported-by: Stephane Eranian Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 49d8be5a45e3..34659d4085c7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1867,7 +1867,19 @@ int perf_event_release_kernel(struct perf_event *event) event->state = PERF_EVENT_STATE_FREE; WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); perf_event_remove_from_context(event); mutex_unlock(&ctx->mutex); @@ -5305,7 +5317,7 @@ void perf_event_exit_task(struct task_struct *child) * * But since its the parent context it won't be the same instance. */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&child_ctx->mutex); again: list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, -- cgit v1.2.3-55-g7522 From 6bde9b6ce0127e2a56228a2071536d422be31336 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 23 Apr 2010 13:56:00 +0800 Subject: perf: Add group scheduling transactional APIs Add group scheduling transactional APIs to struct pmu. These APIs will be implemented in arch code, based on Peter's idea as below. > the idea behind hw_perf_group_sched_in() is to not perform > schedulability tests on each event in the group, but to add the group > as a whole and then perform one test. > > Of course, when that test fails, you'll have to roll-back the whole > group again. > > So start_txn (or a better name) would simply toggle a flag in the pmu > implementation that will make pmu::enable() not perform the > schedulablilty test. > > Then commit_txn() will perform the schedulability test (so note the > method has to have a !void return value. > > This will allow us to use the regular > kernel/perf_event.c::group_sched_in() and all the rollback code. > Currently each hw_perf_group_sched_in() implementation duplicates all > the rolllback code (with various bugs). ->start_txn: Start group events scheduling transaction, set a flag to make pmu::enable() not perform the schedulability test, it will be performed at commit time. ->commit_txn: Commit group events scheduling transaction, perform the group schedulability as a whole ->cancel_txn: Stop group events scheduling transaction, clear the flag so pmu::enable() will perform the schedulability test. 
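Before the diff, a hedged sketch of what an arch-side implementation of these hooks could look like. The per-cpu structure and example_schedule_events() are hypothetical names invented for illustration, not taken from any real driver; only the hook signatures and the PERF_EVENT_TXN_STARTED flag come from the patch itself:

#include <linux/percpu.h>
#include <linux/perf_event.h>

/*
 * Hypothetical per-cpu scheduling state for a sketched PMU backend.
 * group_sched_in() (see the diff below) invokes cancel_txn() itself on
 * both the success and the failure path of the commit, so commit_txn()
 * only runs the deferred schedulability test and never clears the flag.
 */
struct example_cpu_hw {
	int group_flag;		/* PERF_EVENT_TXN_STARTED while in a txn */
	int n_events;		/* events collected since start_txn() */
};
static DEFINE_PER_CPU(struct example_cpu_hw, example_cpu_hw);

static int example_schedule_events(struct example_cpu_hw *cpuc);	/* hypothetical helper */

static void example_pmu_start_txn(const struct pmu *pmu)
{
	/* make the sketched pmu->enable() skip its per-event test */
	__get_cpu_var(example_cpu_hw).group_flag |= PERF_EVENT_TXN_STARTED;
}

static void example_pmu_cancel_txn(const struct pmu *pmu)
{
	__get_cpu_var(example_cpu_hw).group_flag &= ~PERF_EVENT_TXN_STARTED;
}

static int example_pmu_commit_txn(const struct pmu *pmu)
{
	/* one schedulability test for the whole group */
	return example_schedule_events(&__get_cpu_var(example_cpu_hw));
}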
Reviewed-by: Stephane Eranian Reviewed-by: Frederic Weisbecker Signed-off-by: Lin Ming Cc: David Miller Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1272002160.5707.60.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 15 ++++++++++++--- kernel/perf_event.c | 33 ++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 23cd0057a681..4924c96d7e2d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -547,6 +547,8 @@ struct hw_perf_event { struct perf_event; +#define PERF_EVENT_TXN_STARTED 1 + /** * struct pmu - generic performance monitoring unit */ @@ -557,6 +559,16 @@ struct pmu { void (*stop) (struct perf_event *event); void (*read) (struct perf_event *event); void (*unthrottle) (struct perf_event *event); + + /* + * group events scheduling is treated as a transaction, + * add group events as a whole and perform one schedulability test. + * If test fails, roll back the whole group + */ + + void (*start_txn) (const struct pmu *pmu); + void (*cancel_txn) (const struct pmu *pmu); + int (*commit_txn) (const struct pmu *pmu); }; /** @@ -823,9 +835,6 @@ extern void perf_disable(void); extern void perf_enable(void); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); -extern int hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 34659d4085c7..bb06382f98e7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -83,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -int __weak -hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - return 0; -} - void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); @@ -644,15 +636,20 @@ group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - struct perf_event *event, *partial_group; + struct perf_event *event, *partial_group = NULL; + const struct pmu *pmu = group_event->pmu; + bool txn = false; int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); - if (ret) - return ret < 0 ? ret : 0; + /* Check if group transaction availabe */ + if (pmu->start_txn) + txn = true; + + if (txn) + pmu->start_txn(pmu); if (event_sched_in(group_event, cpuctx, ctx)) return -EAGAIN; @@ -667,9 +664,19 @@ group_sched_in(struct perf_event *group_event, } } - return 0; + if (txn) { + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + + return 0; + } + } group_error: + if (txn) + pmu->cancel_txn(pmu); + /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: -- cgit v1.2.3-55-g7522 From 3ee943728fff536edaf8f59faa58aaa1aa7366e3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Sat, 8 May 2010 01:57:52 -0700 Subject: ipv4: remove ip_rt_secret timer (v4) A while back there was a discussion regarding the rt_secret_interval timer. 
Given that we've had the ability to do emergency route cache rebuilds for awhile now, based on a statistical analysis of the various hash chain lengths in the cache, the use of the flush timer is somewhat redundant. This patch removes the rt_secret_interval sysctl, allowing us to rely solely on the statistical analysis mechanism to determine the need for route cache flushes. Signed-off-by: Neil Horman Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - kernel/sysctl_binary.c | 1 - net/ipv4/route.c | 108 ++++------------------------------------------- 3 files changed, 8 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ae07feec6446..d68c3f121774 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -55,7 +55,6 @@ struct netns_ipv4 { int sysctl_rt_cache_rebuild_count; int current_rt_cache_rebuild_count; - struct timer_list rt_secret_timer; atomic_t rt_genid; #ifdef CONFIG_IP_MROUTE diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 59030570f5ca..937d31dc8566 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -224,7 +224,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = { { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, {} }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a947428ef0ae..dea3f9264250 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; -static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; static int rt_chain_length_max __read_mostly = 20; static struct delayed_work expires_work; @@ -918,32 +917,11 @@ void rt_cache_flush_batch(void) rt_do_flush(!in_softirq()); } -/* - * We change rt_genid and let gc do the cleanup - */ -static void rt_secret_rebuild(unsigned long __net) -{ - struct net *net = (struct net *)__net; - rt_cache_invalidate(net); - mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); -} - -static void rt_secret_rebuild_oneshot(struct net *net) -{ - del_timer_sync(&net->ipv4.rt_secret_timer); - rt_cache_invalidate(net); - if (ip_rt_secret_interval) - mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); -} - static void rt_emergency_hash_rebuild(struct net *net) { - if (net_ratelimit()) { + if (net_ratelimit()) printk(KERN_WARNING "Route hash chain too long!\n"); - printk(KERN_WARNING "Adjust your secret_interval!\n"); - } - - rt_secret_rebuild_oneshot(net); + rt_cache_invalidate(net); } /* @@ -3101,48 +3079,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, return -EINVAL; } -static void rt_secret_reschedule(int old) -{ - struct net *net; - int new = ip_rt_secret_interval; - int diff = new - old; - - if (!diff) - return; - - rtnl_lock(); - for_each_net(net) { - int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); - long time; - - if (!new) - continue; - - if (deleted) { - time = net->ipv4.rt_secret_timer.expires - jiffies; - - if (time <= 0 || (time += diff) <= 0) - time = 0; - } else - time = new; - - mod_timer(&net->ipv4.rt_secret_timer, jiffies + time); - } - rtnl_unlock(); -} - -static int 
ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old = ip_rt_secret_interval; - int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); - - rt_secret_reschedule(old); - - return ret; -} - static ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", @@ -3251,13 +3187,6 @@ static ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "secret_interval", - .data = &ip_rt_secret_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ipv4_sysctl_rt_secret_interval, - }, { } }; @@ -3336,34 +3265,15 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { }; #endif - -static __net_init int rt_secret_timer_init(struct net *net) +static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->ipv4.rt_genid, - (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); - - net->ipv4.rt_secret_timer.function = rt_secret_rebuild; - net->ipv4.rt_secret_timer.data = (unsigned long)net; - init_timer_deferrable(&net->ipv4.rt_secret_timer); - - if (ip_rt_secret_interval) { - net->ipv4.rt_secret_timer.expires = - jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&net->ipv4.rt_secret_timer); - } + get_random_bytes(&net->ipv4.rt_genid, + sizeof(net->ipv4.rt_genid)); return 0; } -static __net_exit void rt_secret_timer_exit(struct net *net) -{ - del_timer_sync(&net->ipv4.rt_secret_timer); -} - -static __net_initdata struct pernet_operations rt_secret_timer_ops = { - .init = rt_secret_timer_init, - .exit = rt_secret_timer_exit, +static __net_initdata struct pernet_operations rt_genid_ops = { + .init = rt_genid_init, }; @@ -3424,9 +3334,6 @@ int __init ip_rt_init(void) schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (register_pernet_subsys(&rt_secret_timer_ops)) - printk(KERN_ERR "Unable to setup rt_secret_timer\n"); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM @@ -3438,6 +3345,7 @@ int __init ip_rt_init(void) #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif + register_pernet_subsys(&rt_genid_ops); return rc; } -- cgit v1.2.3-55-g7522 From 6e85158cf5a2385264316870256fb6ad681156a0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 8 May 2010 20:58:00 +1000 Subject: perf_event: Make software events work again Commit 6bde9b6ce0127e2a56228a2071536d422be31336 ("perf: Add group scheduling transactional APIs") added code to allow a group to be scheduled in a single transaction. However, it introduced a bug in handling events whose pmu does not implement transactions -- at the end of scheduling in the events in the group, in the non-transactional case the code now falls through to the group_error label, and proceeds to unschedule all the events in the group and return failure. This fixes it by returning 0 (success) in the non-transactional case. 
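As a hedged illustration of the transactional API the two perf patches above introduce (this sketch is not from either patch; the per-cpu state, the helper names, and the counter limit are all invented for the example), a pmu backend might implement the three hooks roughly like this:

#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/perf_event.h>

/* Hypothetical per-cpu backend state; everything below is assumed. */
#define EXAMPLE_NUM_COUNTERS	4	/* assumed hardware counter limit */

struct example_pmu_state {
	int in_txn;	/* PERF_EVENT_TXN_STARTED while a group is pending */
	int n_added;	/* events tentatively added since start_txn() */
};
static DEFINE_PER_CPU(struct example_pmu_state, example_pmu_state);

static void example_start_txn(const struct pmu *pmu)
{
	struct example_pmu_state *st = &__get_cpu_var(example_pmu_state);

	st->in_txn = PERF_EVENT_TXN_STARTED;
	st->n_added = 0;			/* nothing scheduled yet */
}

static void example_cancel_txn(const struct pmu *pmu)
{
	struct example_pmu_state *st = &__get_cpu_var(example_pmu_state);

	st->in_txn = 0;				/* roll back: forget the group */
	st->n_added = 0;
}

static int example_commit_txn(const struct pmu *pmu)
{
	struct example_pmu_state *st = &__get_cpu_var(example_pmu_state);

	/* the one schedulability test for the whole group */
	if (st->n_added > EXAMPLE_NUM_COUNTERS)
		return -EAGAIN;
	st->in_txn = 0;
	return 0;
}

With hooks like these wired into struct pmu, group_sched_in() starts a transaction, schedules the leader and its siblings, and then performs the single schedulability test via commit_txn(), undoing the group through the group_error path if it fails.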
Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Lin Ming Cc: Frederic Weisbecker Cc: eranian@gmail.com LKML-Reference: <20100508105800.GB10650@brick.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bb06382f98e7..180151ff8376 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -664,13 +664,13 @@ group_sched_in(struct perf_event *group_event, } } - if (txn) { - ret = pmu->commit_txn(pmu); - if (!ret) { - pmu->cancel_txn(pmu); + if (!txn) + return 0; - return 0; - } + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + return 0; } group_error: -- cgit v1.2.3-55-g7522 From bbf1bb3eee86f2eef2baa14e600be454d09109ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 May 2010 16:20:53 +0200 Subject: cpu_stop: add dummy implementation for UP When !CONFIG_SMP, the cpu_stop functions weren't defined at all, which could lead to build failures if UP code uses the cpu_stop facility. Add a dummy cpu_stop implementation for UP. The waiting variants execute the work function directly with preempt disabled and stop_one_cpu_nowait() schedules a workqueue work. The Makefile and the ifdefs around the stop_machine implementation are updated to accommodate the CONFIG_SMP && !CONFIG_STOP_MACHINE case. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- include/linux/stop_machine.h | 69 ++++++++++++++++++++++++++++++++++++++++---- kernel/Makefile | 2 +- kernel/stop_machine.c | 4 +++ 3 files changed, 68 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0e552e72a4c4..6b524a0d02e4 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -6,8 +6,6 @@ #include #include -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) - /* * stop_cpu[s]() is simplistic per-cpu maximum priority cpu * monopolization mechanism. The caller can specify a non-sleeping * function to be executed on a single or multiple cpus preempting all * other processes and monopolizing those cpus until it finishes. * * Resources for this mechanism are preallocated when a cpu is brought * up and requests are guaranteed to be served as long as the target * cpus are online.
*/ - typedef int (*cpu_stop_fn_t)(void *arg); +#ifdef CONFIG_SMP + struct cpu_stop_work { struct list_head list; /* cpu_stopper->works */ cpu_stop_fn_t fn; @@ -34,12 +33,70 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +#else /* CONFIG_SMP */ + +#include + +struct cpu_stop_work { + struct work_struct work; + cpu_stop_fn_t fn; + void *arg; +}; + +static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) +{ + int ret = -ENOENT; + preempt_disable(); + if (cpu == smp_processor_id()) + ret = fn(arg); + preempt_enable(); + return ret; +} + +static void stop_one_cpu_nowait_workfn(struct work_struct *work) +{ + struct cpu_stop_work *stwork = + container_of(work, struct cpu_stop_work, work); + preempt_disable(); + stwork->fn(stwork->arg); + preempt_enable(); +} + +static inline void stop_one_cpu_nowait(unsigned int cpu, + cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) +{ + if (cpu == smp_processor_id()) { + INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn); + work_buf->fn = fn; + work_buf->arg = arg; + schedule_work(&work_buf->work); + } +} + +static inline int stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + if (cpumask_test_cpu(raw_smp_processor_id(), cpumask)) + return stop_one_cpu(raw_smp_processor_id(), fn, arg); + return -ENOENT; +} + +static inline int try_stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + return stop_cpus(cpumask, fn, arg); +} + +#endif /* CONFIG_SMP */ + /* * stop_machine "Bogolock": stop the entire machine, disable * interrupts. This is a very heavy lock, which is equivalent to * grabbing every spinlock (and more). So the "read" side to such a * lock is anything which disables preempt. */ +#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) /** * stop_machine: freeze the machine on all CPUs and run this function @@ -67,7 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); */ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); -#else +#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ static inline int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) @@ -79,5 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data, return ret; } -#endif /* CONFIG_SMP */ -#endif /* _LINUX_STOP_MACHINE */ +#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#endif /* _LINUX_STOP_MACHINE */ diff --git a/kernel/Makefile b/kernel/Makefile index a987aa1676b5..149e18ef1ab1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o -obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 5b20141a5ec1..ef51d1fcf5e6 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -375,6 +375,8 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); +#ifdef CONFIG_STOP_MACHINE + /* This controls the threads on each CPU. */ enum stopmachine_state { /* Dummy starting state for thread.
*/ @@ -477,3 +479,5 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) return ret; } EXPORT_SYMBOL_GPL(stop_machine); + +#endif /* CONFIG_STOP_MACHINE */ -- cgit v1.2.3-55-g7522 From c0614829c16ab9d31f1b7d40516decfbf3d32102 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Apr 2010 18:33:12 -0400 Subject: kprobes: Move enable/disable_kprobe() out from debugfs code Move the enable/disable_kprobe() API out of the debugfs-related code, because these interfaces are not related to the debugfs interface. This fixes a compiler warning. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Acked-by: Tony Luck Cc: systemtap Cc: DLE LKML-Reference: <20100427223312.2322.60512.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 132 +++++++++++++++++++++++++++---------------------------- 1 file changed, 66 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0ed46f3e51e9..282035f3ae96 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +/* Disable one kprobe */ +int __kprobes disable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + /* If the probe is already disabled (or gone), just return */ + if (kprobe_disabled(kp)) + goto out; + + kp->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + /* When kp != p, p is always enabled. */ + try_to_disable_aggr_kprobe(p); + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + disarm_kprobe(p); +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(disable_kprobe); + +/* Enable one kprobe */ +int __kprobes enable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + if (kprobe_gone(kp)) { + /* This kprobe has gone, we couldn't enable it. */ + ret = -EINVAL; + goto out; + } + + if (p != kp) + kp->flags &= ~KPROBE_FLAG_DISABLED; + + if (!kprobes_all_disarmed && kprobe_disabled(p)) { + p->flags &= ~KPROBE_FLAG_DISABLED; + arm_kprobe(p); + } +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(enable_kprobe); + void __kprobes dump_kprobe(struct kprobe *kp) { printk(KERN_WARNING "Dumping kprobe:\n"); @@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -/* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - /* If the probe is already disabled (or gone), just return */ - if (kprobe_disabled(kp)) - goto out; - - kp->flags |= KPROBE_FLAG_DISABLED; - if (p != kp) - /* When kp != p, p is always enabled.
*/ - try_to_disable_aggr_kprobe(p); - - if (!kprobes_all_disarmed && kprobe_disabled(p)) - disarm_kprobe(p); -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(disable_kprobe); - -/* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - if (kprobe_gone(kp)) { - /* This kprobe has gone, we couldn't enable it. */ - ret = -EINVAL; - goto out; - } - - if (p != kp) - kp->flags &= ~KPROBE_FLAG_DISABLED; - - if (!kprobes_all_disarmed && kprobe_disabled(p)) { - p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); - } -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(enable_kprobe); - static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; -- cgit v1.2.3-55-g7522 From 883a2a3189dae9d2912c417e47152f51cb922a3f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:16:11 +0200 Subject: tracing: Drop lock_acquired waittime field Drop the waittime field from the lock_acquired event; we can calculate it by subtracting the matching lock_acquire event timestamp from the lock_acquired one. It is not needed and merely wastes space in the traces. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake Cc: Steven Rostedt --- include/trace/events/lock.h | 11 ++++------- kernel/lockdep.c | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index 5c1dcfc16c60..17ca287ae176 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -78,24 +78,21 @@ TRACE_EVENT(lock_contended, ); TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip, waittime), + TP_ARGS(lock, ip), TP_STRUCT__entry( __string(name, lock->name) - __field(s64, wait_nsec) __field(void *, lockdep_addr) ), TP_fast_assign( __assign_str(name, lock->name); - __entry->wait_nsec = waittime; __entry->lockdep_addr = lock; ), - TP_printk("%p %s (%llu ns)", __entry->lockdep_addr, - __get_str(name), - __entry->wait_nsec) + TP_printk("%p %s", __entry->lockdep_addr, + __get_str(name)) ); #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2594e1ce41cb..31e22e742368 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3380,7 +3380,7 @@ found_it: hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip, waittime); + trace_lock_acquired(lock, ip); stats = get_lock_stats(hlock_class(hlock)); if (waittime) { -- cgit v1.2.3-55-g7522 From 93135439459920c4d856f4ab8f068c030085c8df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:24:25 +0200 Subject: tracing: Drop the nested field from lock_release event Drop the nested field, as we don't use it. Every nested state can already be computed by a state machine in post-processing.
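To make the post-processing idea concrete, here is a hedged userspace sketch (not part of either patch; the record layout and names are invented for the example) that recovers the dropped waittime by pairing each lock_acquired record with the preceding lock_acquire on the same lockdep address:

#include <stdint.h>

/* Hypothetical flattened trace record; the fields are assumptions. */
struct lock_trace_rec {
	const void *lockdep_addr;	/* matches the events' lockdep_addr */
	uint64_t    ts_ns;		/* trace timestamp, in nanoseconds */
};

/* waittime == lock_acquired timestamp minus the matching lock_acquire one */
static uint64_t lock_wait_ns(const struct lock_trace_rec *acquire,
			     const struct lock_trace_rec *acquired)
{
	return acquired->ts_ns - acquire->ts_ns;
}

A post-processor would keep, per lockdep_addr, the timestamp of the last unmatched lock_acquire; nesting can likewise be tracked with a per-task depth counter that lock_acquire increments and lock_release decrements.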
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Hitoshi Mitake Cc: Steven Rostedt --- include/trace/events/lock.h | 4 ++-- kernel/lockdep.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h index 17ca287ae176..fde4c3853391 100644 --- a/include/trace/events/lock.h +++ b/include/trace/events/lock.h @@ -37,9 +37,9 @@ TRACE_EVENT(lock_acquire, TRACE_EVENT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, nested, ip), + TP_ARGS(lock, ip), TP_STRUCT__entry( __string(name, lock->name) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 31e22e742368..e9c759f06c1d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3227,7 +3227,7 @@ void lock_release(struct lockdep_map *lock, int nested, raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - trace_lock_release(lock, nested, ip); + trace_lock_release(lock, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); -- cgit v1.2.3-55-g7522 From b1f724c3055fa75a31d272222213647547a2d3d4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:22:08 -0700 Subject: sched: Add a comment to get_cpu_idle_time_us() The exported function get_cpu_idle_time_us() has no comment describing it; add a kerneldoc comment. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082208.7cb721f0@infradead.org> Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f25735a767af..358822ee32d5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -179,6 +179,20 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) return now; } +/** + * get_cpu_idle_time_us - get the total idle time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in + * + * Return the cumulative idle time (since boot) for a given + * CPU, in microseconds. The idle time returned includes + * the iowait time (unlike what "top" and co report). + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); -- cgit v1.2.3-55-g7522 From 595aac488b546c7185be7e29c8ae165a588b2a9f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:22:45 -0700 Subject: sched: Introduce a function to update the idle statistics Currently, two places update the idle statistics (and more to come later in this series). This patch creates a helper function for updating these statistics.
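In shape (a sketch only, restating what the diff below actually does), both tick_nohz_stop_idle() and tick_nohz_start_idle() end up calling one consolidated helper:

/* Sketch of the consolidation; see the actual hunk below. */
static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
{
	ts->idle_lastupdate = now;
	if (ts->idle_active) {
		ktime_t delta = ktime_sub(now, ts->idle_entrytime);

		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
	}
}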
Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082245.163e67ed@infradead.org> Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 358822ee32d5..59d8762c7e1d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -150,14 +150,25 @@ static void tick_nohz_update_jiffies(ktime_t now) touch_softlockup_watchdog(); } -static void tick_nohz_stop_idle(int cpu, ktime_t now) +/* + * Updates the per cpu time idle statistics counters + */ +static void update_ts_time_stats(struct tick_sched *ts, ktime_t now) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta; - delta = ktime_sub(now, ts->idle_entrytime); ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + if (ts->idle_active) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + } +} + +static void tick_nohz_stop_idle(int cpu, ktime_t now) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + update_ts_time_stats(ts, now); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); @@ -165,14 +176,12 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) static ktime_t tick_nohz_start_idle(struct tick_sched *ts) { - ktime_t now, delta; + ktime_t now; now = ktime_get(); - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - } + + update_ts_time_stats(ts, now); + ts->idle_entrytime = now; ts->idle_active = 1; sched_clock_idle_sleep_event(); -- cgit v1.2.3-55-g7522 From 8c7b09f43f4bf570654bcc458ce96819a932303c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:23:23 -0700 Subject: sched: Update the idle statistics in get_cpu_idle_time_us() Right now, get_cpu_idle_time_us() only reports the idle statistics up to the point the CPU last entered idle, not what is valid right now. This patch adds an update of the idle statistics to get_cpu_idle_time_us(), so that calling this function always returns statistics that are accurate at the point of the call. This includes resetting the start of the idle time for accounting purposes to avoid double accounting.
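A brief sketch of the double-accounting concern (the helper name is hypothetical; the logic mirrors the hunk below): once an in-progress idle slice has been folded into the running total, the slice's start marker must be advanced, otherwise a later update would add the same interval again:

/* Hypothetical helper name; fold the current idle slice exactly once. */
static void example_fold_idle_slice(struct tick_sched *ts, ktime_t now)
{
	if (ts->idle_active) {
		ktime_t delta = ktime_sub(now, ts->idle_entrytime);

		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
		ts->idle_entrytime = now;	/* restart the slice: no double count */
	}
}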
Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082323.2d2f1945@infradead.org> Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 59d8762c7e1d..f15d18d82c18 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -161,6 +161,7 @@ static void update_ts_time_stats(struct tick_sched *ts, ktime_t now) if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + ts->idle_entrytime = now; } } @@ -205,14 +206,18 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; if (!tick_nohz_enabled) return -1; + now = ktime_get(); + update_ts_time_stats(ts, now); + if (ts->idle_active) *last_update_time = ktime_to_us(ts->idle_lastupdate); else - *last_update_time = ktime_to_us(ktime_get()); + *last_update_time = ktime_to_us(now); return ktime_to_us(ts->idle_sleeptime); } -- cgit v1.2.3-55-g7522 From 8d63bf949e330588b80d30ca8f0a27a45297a9e9 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:24:03 -0700 Subject: sched: Fold updating of the last_update_time_info into update_ts_time_stats() This patch folds the updating of the last_update_time into the update_ts_time_stats() function, and updates the callers. This allows for further cleanups that are done in the next patch. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082403.60072967@infradead.org> Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f15d18d82c18..e86e1c6674d1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -153,7 +153,8 @@ static void tick_nohz_update_jiffies(ktime_t now) /* * Updates the per cpu time idle statistics counters */ -static void update_ts_time_stats(struct tick_sched *ts, ktime_t now) +static void +update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; @@ -163,13 +164,19 @@ static void update_ts_time_stats(struct tick_sched *ts, ktime_t now) ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; } + + if (ts->idle_active && last_update_time) + *last_update_time = ktime_to_us(ts->idle_lastupdate); + else + *last_update_time = ktime_to_us(now); + } static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - update_ts_time_stats(ts, now); + update_ts_time_stats(ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); @@ -181,7 +188,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) now = ktime_get(); - update_ts_time_stats(ts, now); + update_ts_time_stats(ts, now, NULL); ts->idle_entrytime = now; ts->idle_active = 1; @@ -206,18 +213,11 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now; if (!tick_nohz_enabled) return -1; - now = ktime_get(); - 
update_ts_time_stats(ts, now); - - if (ts->idle_active) - *last_update_time = ktime_to_us(ts->idle_lastupdate); - else - *last_update_time = ktime_to_us(now); + update_ts_time_stats(ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } -- cgit v1.2.3-55-g7522 From e0e37c200f1357db0dd986edb359c41c57d24f6e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:24:39 -0700 Subject: sched: Eliminate the ts->idle_lastupdate field Now that the only user of ts->idle_lastupdate is update_ts_time_stats(), the entire field can be eliminated. In update_ts_time_stats(), idle_lastupdate is first set to "now", and a few lines later, the only user is an if() statement that assigns a variable either to "now" or to ts->idle_lastupdate, which has the value of "now" at that point. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082439.2fab0b4f@infradead.org> Signed-off-by: Ingo Molnar --- include/linux/tick.h | 1 - kernel/time/tick-sched.c | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index d2ae79e21be3..0343eed40619 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -60,7 +60,6 @@ struct tick_sched { ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; - ktime_t idle_lastupdate; ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e86e1c6674d1..50953f4c42b2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -158,16 +158,13 @@ update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; - ts->idle_lastupdate = now; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; } - if (ts->idle_active && last_update_time) - *last_update_time = ktime_to_us(ts->idle_lastupdate); - else + if (last_update_time) *last_update_time = ktime_to_us(now); } -- cgit v1.2.3-55-g7522 From 0224cf4c5ee0d7faec83956b8e21f7d89e3df3bd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:25:23 -0700 Subject: sched: Introduce get_cpu_iowait_time_us() For the ondemand cpufreq governor, it is desired that the iowait time is microaccounted in a similar way as idle time is. This patch introduces the infrastructure to account and expose this information via the get_cpu_iowait_time_us() function.
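For illustration, a hedged sketch of how a governor-style caller might sample the pair of counters; the helper and its variable names are invented for the example, and only get_cpu_idle_time_us() and get_cpu_iowait_time_us() come from this series:

#include <linux/kernel.h>
#include <linux/tick.h>

/* Hypothetical sampling helper; not from the patch. */
static void example_sample_cpu(int cpu)
{
	u64 now_us, idle_us, iowait_us;

	idle_us = get_cpu_idle_time_us(cpu, &now_us);
	iowait_us = get_cpu_iowait_time_us(cpu, &now_us);

	if (idle_us == (u64)-1 || iowait_us == (u64)-1)
		return;		/* NOHZ disabled: fall back to tick-based stats */

	/* idle_us already includes iowait_us, per the kerneldoc above */
	pr_debug("cpu%d: idle=%llu us (iowait=%llu us) at %llu us\n", cpu,
		 (unsigned long long)idle_us,
		 (unsigned long long)iowait_us,
		 (unsigned long long)now_us);
}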
[akpm@linux-foundation.org: fix CONFIG_NO_HZ=n build] Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082523.284feab6@infradead.org> Signed-off-by: Ingo Molnar --- include/linux/tick.h | 4 ++++ kernel/time/tick-sched.c | 28 ++++++++++++++++++++++++++++ kernel/time/timer_list.c | 1 + 3 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 0343eed40619..b232ccc0ee29 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -42,6 +42,7 @@ enum tick_nohz_mode { * @idle_waketime: Time when the idle was interrupted * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @sleep_length: Duration of the current idle sleep * @do_timer_lst: CPU was the last one doing do_timer before going idle */ @@ -60,6 +61,7 @@ struct tick_sched { ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; @@ -123,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle); extern void tick_nohz_restart_sched_tick(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); +extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_stop_sched_tick(int inidle) { } static inline void tick_nohz_restart_sched_tick(void) { } @@ -133,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) return len; } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } +static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 50953f4c42b2..1d7b9bc1c034 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -161,6 +161,8 @@ update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + if (nr_iowait_cpu() > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); ts->idle_entrytime = now; } @@ -220,6 +222,32 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); +/* + * get_cpu_iowait_time_us - get the total iowait time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in + * + * Return the cumulative iowait time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled.
+ */ +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!tick_nohz_enabled) + return -1; + + update_ts_time_stats(ts, ktime_get(), last_update_time); + + return ktime_to_us(ts->iowait_sleeptime); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); + /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task * diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1a4a7dd78777..ab8f5e33fa92 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_waketime); P_ns(idle_exittime); P_ns(idle_sleeptime); + P_ns(iowait_sleeptime); P(last_jiffies); P(next_jiffies); P_ns(idle_expires); -- cgit v1.2.3-55-g7522 From af507ae8a0512a83728b17d8f8c5fa1561669f50 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 10 May 2010 11:24:27 +0800 Subject: sched: Remove a stale comment This comment should have been removed together with uids_mutex when removing user sched. Signed-off-by: Li Zefan Cc: Peter Zijlstra Cc: Dhaval Giani LKML-Reference: <4BE77C6B.5010402@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/user.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 8e1c8c0a496c..7e72614b736d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -136,7 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up, *new; - /* Make uid_hash_find() + uid_hash_insert() atomic. */ spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); -- cgit v1.2.3-55-g7522 From 29f87b793da421a6ab816d991dc8dbf909dfb66a Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 27 Apr 2010 14:12:15 -0700 Subject: posix-cpu-timers: Optimize run_posix_cpu_timers() We can optimize and simplify things by taking into account that signal->cputimer is always running when we have configured any process-wide CPU timer. In check_process_timers(), we don't have to check if the newly updated value of signal->cputime_expires is smaller, since we maintain the new first expiration time ({prof,virt,sched}_expires) in the code flow, and all other writes to the expiration cache are protected by sighand->siglock. Signed-off-by: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Hidetoshi Seto Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 80 +++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 799f360d1475..00bb252f29a2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1002,16 +1002,9 @@ static void stop_process_timers(struct signal_struct *sig) struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; - if (!cputimer->running) - return; - spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; spin_unlock_irqrestore(&cputimer->lock, flags); - - sig->cputime_expires.prof_exp = cputime_zero; - sig->cputime_expires.virt_exp = cputime_zero; - sig->cputime_expires.sched_exp = 0; } static u32 onecputick; @@ -1048,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, } } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1064,19 +1074,6 @@ static void check_process_timers(struct task_struct *tsk, struct task_cputime cputime; unsigned long soft; - /* - * Don't sample the current process CPU clocks if there are no timers. - */ - if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && - sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && - list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) { - stop_process_timers(sig); - return; - } - /* * Collect the current process totals. */ @@ -1166,18 +1163,11 @@ static void check_process_timers(struct task_struct *tsk, } } - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) - sig->cputime_expires.prof_exp = prof_expires; - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) - sig->cputime_expires.virt_exp = virt_expires; - if (sched_expires != 0 && - (sig->cputime_expires.sched_exp == 0 || - sig->cputime_expires.sched_exp > sched_expires)) - sig->cputime_expires.sched_exp = sched_expires; + sig->cputime_expires.prof_exp = prof_expires; + sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->cputime_expires)) + stop_process_timers(sig); } /* @@ -1249,23 +1239,6 @@ out: ++timer->it_requeue_pending; } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (cputime_eq(cputime->utime, cputime_zero) && - cputime_eq(cputime->stime, cputime_zero) && - cputime->sum_exec_runtime == 0) - return 1; - return 0; -} - /** * task_cputime_expired - Compare two task_cputime entities. * @@ -1322,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) } sig = tsk->signal; - if (!task_cputime_zero(&sig->cputime_expires)) { + if (sig->cputimer.running) { struct task_cputime group_sample; thread_group_cputimer(tsk, &group_sample); @@ -1359,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk) * put them on the firing list. */ check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * If there are any active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) cputimer must be running. + */ + if (tsk->signal->cputimer.running) + check_process_timers(tsk, &firing); /* * We must release these locks before taking any timer's lock. 
-- cgit v1.2.3-55-g7522 From d7e81c269db899b800e0963dc4aceece1f82a680 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 7 May 2010 18:07:38 -0700 Subject: clocksource: Add clocksource_register_hz/khz interface How to pick good mult/shift pairs has always been difficult to describe to folks writing clocksource drivers, since it requires careful tradeoffs in adjustment accuracy vs overflow limits. Now, with the clocks_calc_mult_shift function, it's much easier. However, not many clocksources have converted to using that function, and there is still the issue of the max interval length assumption being made by each clocksource driver independently. So this patch simplifies the registration process by having clocksources be registered with a hz/khz value and the registration function taking care of setting mult/shift. This should take most of the confusion out of writing a clocksource driver. Additionally it also keeps the shift size tradeoff (more accuracy vs longer possible nohz times) centralized so the timekeeping core can keep track of the assumptions being made. [ tglx: Coding style and comments fixed ] Signed-off-by: John Stultz LKML-Reference: <1273280858-30143-1-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 19 +++++++++++++++++- kernel/time/clocksource.c | 48 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 4bca8b60cdf7..5ea3c60c160c 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -273,7 +273,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) } -/* used to install a new clocksource */ extern int clocksource_register(struct clocksource*); extern void clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); @@ -287,6 +286,24 @@ extern void clocksource_mark_unstable(struct clocksource *cs); extern void clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); +/* + * Don't call __clocksource_register_scale directly, use + * clocksource_register_hz/khz + */ +extern int +__clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq); + +static inline int clocksource_register_hz(struct clocksource *cs, u32 hz) +{ + return __clocksource_register_scale(cs, 1, hz); +} + +static inline int clocksource_register_khz(struct clocksource *cs, u32 khz) +{ + return __clocksource_register_scale(cs, 1000, khz); +} + + static inline void clocksource_calc_mult_shift(struct clocksource *cs, u32 freq, u32 minsec) { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1f5dde637457..f08e99c1d561 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -625,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs) list_add(&cs->list, entry); } + +/* + * Maximum time we expect to go between ticks. This includes idle + * tickless time. It provides the trade off between selecting a + * mult/shift pair that is very precise but can only handle a short + * period of time, vs. a mult/shift pair that can handle long periods + * of time but isn't as precise. + * + * This is a subsystem constant, and actual hardware limitations + * may override it (ie: clocksources that wrap every 3 seconds).
+ */ +#define MAX_UPDATE_LENGTH 5 /* Seconds */ + +/** + * __clocksource_register_scale - Used to install new clocksources + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz() helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* + * Ideally we want to use some of the limits used in + * clocksource_max_deferment, to provide a more informed + * MAX_UPDATE_LENGTH. But for now this just gets the + * register interface working properly. + */ + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC/scale, + MAX_UPDATE_LENGTH*scale); + cs->max_idle_ns = clocksource_max_deferment(cs); + + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_select(); + clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__clocksource_register_scale); + + /** * clocksource_register - Used to install new clocksources * @t: clocksource to be registered -- cgit v1.2.3-55-g7522 From 2b3fc35f6919344e3cf722dde8308f47235c0b70 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 Apr 2010 16:23:07 +0800 Subject: rcu: optionally leave lockdep enabled after RCU lockdep splat There is no need to disable lockdep after an RCU lockdep splat, so remove the debug_locks_off() from lockdep_rcu_dereference(). To avoid repeated lockdep splats, use a static variable in the inlined rcu_dereference_check() and rcu_dereference_protected() macros so that a given instance splats only once, but so that multiple instances can be detected per boot. This is controlled by a new config variable CONFIG_PROVE_RCU_REPEATEDLY, which is disabled by default. This provides the normal lockdep behavior by default, but permits people who want to find multiple RCU-lockdep splats per boot to easily do so. Requested-by: Eric Paris Signed-off-by: Lai Jiangshan Tested-by: Eric Paris Signed-off-by: Paul E.
McKenney --- include/linux/rcupdate.h | 15 +++++++++++---- kernel/lockdep.c | 3 +++ lib/Kconfig.debug | 12 ++++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index db266bbed23f..4dca2752cfde 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -192,6 +192,15 @@ static inline int rcu_read_lock_sched_held(void) extern int rcu_my_thread_group_empty(void); +#define __do_rcu_dereference_check(c) \ + do { \ + static bool __warned; \ + if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ + __warned = true; \ + lockdep_rcu_dereference(__FILE__, __LINE__); \ + } \ + } while (0) + /** * rcu_dereference_check - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing @@ -221,8 +230,7 @@ extern int rcu_my_thread_group_empty(void); */ #define rcu_dereference_check(p, c) \ ({ \ - if (debug_lockdep_rcu_enabled() && !(c)) \ - lockdep_rcu_dereference(__FILE__, __LINE__); \ + __do_rcu_dereference_check(c); \ rcu_dereference_raw(p); \ }) @@ -239,8 +247,7 @@ extern int rcu_my_thread_group_empty(void); */ #define rcu_dereference_protected(p, c) \ ({ \ - if (debug_lockdep_rcu_enabled() && !(c)) \ - lockdep_rcu_dereference(__FILE__, __LINE__); \ + __do_rcu_dereference_check(c); \ (p); \ }) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2594e1ce41cb..3a756ba8d5d8 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3801,8 +3801,11 @@ void lockdep_rcu_dereference(const char *file, const int line) { struct task_struct *curr = current; +#ifndef CONFIG_PROVE_RCU_REPEATEDLY if (!debug_locks_off()) return; +#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ + /* Note: the following can be executed concurrently, so be careful. */ printk("\n===================================================\n"); printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); printk( "---------------------------------------------------\n"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 935248bdbc47..94090b4bb7d2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -512,6 +512,18 @@ config PROVE_RCU Say N if you are unsure. +config PROVE_RCU_REPEATEDLY + bool "RCU debugging: don't disable PROVE_RCU on first splat" + depends on PROVE_RCU + default n + help + By itself, PROVE_RCU will disable checking upon issuing the + first warning (or "splat"). This feature prevents such + disabling, allowing multiple RCU-lockdep warnings to be printed + on a single reboot. + + Say N if you are unsure. + config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT -- cgit v1.2.3-55-g7522 From d25eb9442bb2c38c1e742f0fa764d7132d72593f Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 18 Mar 2010 21:36:51 -0700 Subject: rcu: substitute set_need_resched for sending resched IPIs This patch adds a check to __rcu_pending() that does a local set_need_resched() if the current CPU is holding up the current grace period and if force_quiescent_state() will be called soon. The goal is to reduce the probability that force_quiescent_state() will need to do smp_send_reschedule(), which sends an IPI and is therefore more expensive on most architectures. Signed-off-by: "Paul E. 
McKenney" --- kernel/rcutree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3ec8160fc75f..e54c12351227 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1499,6 +1499,16 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rdp->qs_pending) { + + /* + * If force_quiescent_state() coming soon and this CPU + * needs a quiescent state, and this is either RCU-sched + * or RCU-bh, force a local reschedule. + */ + if (!rdp->preemptable && + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, + jiffies)) + set_need_resched(); rdp->n_rp_qs_pending++; return 1; } -- cgit v1.2.3-55-g7522 From f261414f0d56dd1a0e34888e27d1d4902ad052f3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 28 Mar 2010 11:15:20 +0800 Subject: rcu: make dead code really dead cleanup: make dead code really dead Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e54c12351227..6042fb859535 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1236,11 +1236,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: - - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) break; /* So gcc recognizes the dead code. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + /* Record dyntick-idle state. */ force_qs_rnp(rsp, dyntick_save_progress_counter); raw_spin_lock(&rnp->lock); /* irqs already disabled */ -- cgit v1.2.3-55-g7522 From 0c34029abdfdea64420cb4264c4e91a776b22157 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 28 Mar 2010 11:12:30 +0800 Subject: rcu: move some code from macro to function Shrink the RCU_INIT_FLAVOR() macro by moving all but the initialization of the ->rda[] array to rcu_init_one(). The call to rcu_init_one() can then be moved to the end of the RCU_INIT_FLAVOR() macro, which is required because rcu_boot_init_percpu_data(), which is now called from rcu_init_one(), depends on the initialization of the ->rda[] array. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6042fb859535..86bb9499aae6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1859,6 +1859,14 @@ static void __init rcu_init_one(struct rcu_state *rsp) INIT_LIST_HEAD(&rnp->blocked_tasks[3]); } } + + rnp = rsp->level[NUM_RCU_LVLS - 1]; + for_each_possible_cpu(i) { + if (i > rnp->grphi) + rnp++; + rsp->rda[i]->mynode = rnp; + rcu_boot_init_percpu_data(i, rsp); + } } /* @@ -1869,19 +1877,11 @@ static void __init rcu_init_one(struct rcu_state *rsp) #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ int i; \ - int j; \ - struct rcu_node *rnp; \ \ - rcu_init_one(rsp); \ - rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ - j = 0; \ for_each_possible_cpu(i) { \ - if (i > rnp[j].grphi) \ - j++; \ - per_cpu(rcu_data, i).mynode = &rnp[j]; \ (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - rcu_boot_init_percpu_data(i, rsp); \ } \ + rcu_init_one(rsp); \ } while (0) void __init rcu_init(void) -- cgit v1.2.3-55-g7522 From 5db356736acb9ba717df1aa9444e4e44cbb30a71 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 30 Mar 2010 18:40:36 +0800 Subject: rcu: ignore offline CPUs in last non-dyntick-idle CPU check Offline CPUs are not in nohz_cpu_mask, but can be ignored when checking for the last non-dyntick-idle CPU. This patch therefore only checks online CPUs for not being dyntick idle, allowing fast entry into full-system dyntick-idle state even when there are some offline CPUs. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 79b53bda8943..687c4e90722e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1016,7 +1016,7 @@ int rcu_needs_cpu(int cpu) /* Don't bother unless we are the last non-dyntick-idle CPU. */ for_each_cpu_not(thatcpu, nohz_cpu_mask) - if (thatcpu != cpu) { + if (cpu_online(thatcpu) && thatcpu != cpu) { per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; return rcu_needs_cpu_quick_check(cpu); -- cgit v1.2.3-55-g7522 From da848c47bc6e873a54a445ea1960423a495b6b32 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 30 Mar 2010 15:46:01 -0700 Subject: rcu: shrink rcutiny by making synchronize_rcu_bh() be inline Because synchronize_rcu_bh() is identical to synchronize_sched(), make the former a static inline invoking the latter, saving the overhead of an EXPORT_SYMBOL_GPL() and the duplicate code. Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 2 -- include/linux/rcutiny.h | 12 +++++++++++- include/linux/rcutree.h | 2 ++ kernel/rcutiny.c | 9 ++------- 4 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 02537a72aaa4..d8fb2abcf303 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -56,8 +56,6 @@ struct rcu_head { }; /* Exported common interfaces */ -extern void synchronize_rcu_bh(void); -extern void synchronize_sched(void); extern void rcu_barrier(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index a5195875480a..bbeb55b7709b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -74,7 +74,17 @@ static inline void rcu_sched_force_quiescent_state(void) { } -#define synchronize_rcu synchronize_sched +extern void synchronize_sched(void); + +static inline void synchronize_rcu(void) +{ + synchronize_sched(); +} + +static inline void synchronize_rcu_bh(void) +{ + synchronize_sched(); +} static inline void synchronize_rcu_expedited(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 42cc3a04779e..7484fe66a3aa 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -86,6 +86,8 @@ static inline void __rcu_read_unlock_bh(void) extern void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +extern void synchronize_rcu_bh(void); +extern void synchronize_sched(void); extern void synchronize_rcu_expedited(void); static inline void synchronize_rcu_bh_expedited(void) diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9f6d9ff2572c..272c6d21a75f 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -187,7 +187,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) * * Cool, huh? (Due to Josh Triplett.) * - * But we want to make this a static inline later. + * But we want to make this a static inline later. The cond_resched() + * currently makes this problematic. */ void synchronize_sched(void) { @@ -195,12 +196,6 @@ void synchronize_sched(void) } EXPORT_SYMBOL_GPL(synchronize_sched); -void synchronize_rcu_bh(void) -{ - synchronize_sched(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - /* * Helper function for call_rcu() and call_rcu_bh(). */ -- cgit v1.2.3-55-g7522 From 99652b54de1ee094236f7171485214071af4ef31 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 30 Mar 2010 15:50:01 -0700 Subject: rcu: rename rcutiny rcu_ctrlblk to rcu_sched_ctrlblk Make naming line up in preparation for CONFIG_TINY_PREEMPT_RCU. Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 272c6d21a75f..d9f8a623c9fa 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -44,9 +44,9 @@ struct rcu_ctrlblk { }; /* Definition for rcupdate control block. 
*/ -static struct rcu_ctrlblk rcu_ctrlblk = { - .donetail = &rcu_ctrlblk.rcucblist, - .curtail = &rcu_ctrlblk.rcucblist, +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { @@ -108,7 +108,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) */ void rcu_sched_qs(int cpu) { - if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) + if (rcu_qsctr_help(&rcu_sched_ctrlblk) + + rcu_qsctr_help(&rcu_bh_ctrlblk)) raise_softirq(RCU_SOFTIRQ); } @@ -173,7 +174,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_process_callbacks(struct softirq_action *unused) { - __rcu_process_callbacks(&rcu_ctrlblk); + __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); } @@ -221,7 +222,7 @@ static void __call_rcu(struct rcu_head *head, */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_ctrlblk); + __call_rcu(head, func, &rcu_sched_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu); -- cgit v1.2.3-55-g7522 From 25502a6c13745f4650cc59322bd198194f55e796 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 1 Apr 2010 17:37:01 -0700 Subject: rcu: refactor RCU's context-switch handling The addition of preemptible RCU to treercu resulted in a bit of confusion and inefficiency surrounding the handling of context switches for RCU-sched and for RCU-preempt. For RCU-sched, a context switch is a quiescent state, pure and simple, just like it always has been. For RCU-preempt, a context switch is in no way a quiescent state, but special handling is required when a task blocks in an RCU read-side critical section. However, the callout from the scheduler and the outer loop in ksoftirqd still call something named rcu_sched_qs(), whose name is no longer accurate. Furthermore, when rcu_check_callbacks() notes an RCU-sched quiescent state, it ends up unnecessarily (though harmlessly, aside from the performance hit) enqueuing the current task if it happens to be running in an RCU-preempt read-side critical section. This not only increases the maximum latency of scheduler_tick(), it also needlessly increases the overhead of the next outermost rcu_read_unlock() invocation. This patch addresses this situation by separating the notion of RCU's context-switch handling from that of RCU-sched's quiescent states. The context-switch handling is covered by rcu_note_context_switch() in general and by rcu_preempt_note_context_switch() for preemptible RCU. This permits rcu_sched_qs() to handle quiescent states and only quiescent states. It also reduces the maximum latency of scheduler_tick(), though probably by much less than a microsecond. Finally, it means that tasks within preemptible-RCU read-side critical sections avoid incurring the overhead of queuing unless there really is a context switch. Suggested-by: Lai Jiangshan Acked-by: Lai Jiangshan Signed-off-by: Paul E.
McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/rcutiny.h | 4 ++++ include/linux/rcutree.h | 1 + kernel/rcutree.c | 17 ++++++++++++----- kernel/rcutree_plugin.h | 11 +++++++---- kernel/sched.c | 2 +- kernel/softirq.c | 2 +- 6 files changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index bbeb55b7709b..ff22b97fb979 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -29,6 +29,10 @@ void rcu_sched_qs(int cpu); void rcu_bh_qs(int cpu); +static inline void rcu_note_context_switch(int cpu) +{ + rcu_sched_qs(cpu); +} #define __rcu_read_lock() preempt_disable() #define __rcu_read_unlock() preempt_enable() diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 7484fe66a3aa..b9f74606f320 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,6 +34,7 @@ struct notifier_block; extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); +extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); extern int rcu_expedited_torture_stats(char *page); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 86bb9499aae6..e33631354b69 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -97,25 +97,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(int cpu) { - struct rcu_data *rdp; + struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp = &per_cpu(rcu_sched_data, cpu); rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; - rcu_preempt_note_context_switch(cpu); } void rcu_bh_qs(int cpu) { - struct rcu_data *rdp; + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp = &per_cpu(rcu_bh_data, cpu); rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } +/* + * Note a context switch. This is a quiescent state for RCU-sched, + * and requires special handling for preemptible RCU. + */ +void rcu_note_context_switch(int cpu) +{ + rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); +} + #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = 1, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 687c4e90722e..f9bc83a047da 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -75,13 +75,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. */ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } /* @@ -144,9 +150,8 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. 
*/ - rcu_preempt_qs(cpu); local_irq_save(flags); - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -236,7 +241,6 @@ static void rcu_read_unlock_special(struct task_struct *t) */ special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; rcu_preempt_qs(smp_processor_id()); } @@ -473,7 +477,6 @@ static void rcu_preempt_check_callbacks(int cpu) struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; rcu_preempt_qs(cpu); return; } diff --git a/kernel/sched.c b/kernel/sched.c index 3c2a54f70ffe..d8a213ccdc3b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3706,7 +3706,7 @@ need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_sched_qs(cpu); + rcu_note_context_switch(cpu); prev = rq->curr; switch_count = &prev->nivcsw; diff --git a/kernel/softirq.c b/kernel/softirq.c index 7c1a67ef0274..0db913a5c60f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); - rcu_sched_qs((long)__bind_cpu); + rcu_note_context_switch((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3-55-g7522 From bbad937983147c017c25406860287cb94da9af7c Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Fri, 2 Apr 2010 16:17:17 -0700 Subject: rcu: slim down rcutiny by removing rcu_scheduler_active and friends TINY_RCU does not need rcu_scheduler_active unless CONFIG_DEBUG_LOCK_ALLOC. So conditionally compile rcu_scheduler_active in order to slim down rcutiny a bit more. Also gets rid of an EXPORT_SYMBOL_GPL, which is responsible for most of the slimming. Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 4 +--- include/linux/rcutiny.h | 13 +++++++++++++ include/linux/rcutree.h | 3 +++ kernel/rcupdate.c | 19 ------------------- kernel/rcutiny.c | 7 +++++++ kernel/rcutiny_plugin.h | 39 +++++++++++++++++++++++++++++++++++++++ kernel/rcutree.c | 19 +++++++++++++++++++ 7 files changed, 82 insertions(+), 22 deletions(-) create mode 100644 kernel/rcutiny_plugin.h (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d8fb2abcf303..23be3a702516 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -64,8 +64,6 @@ extern int sched_expedited_torture_stats(char *page); /* Internal to kernel */ extern void rcu_init(void); -extern int rcu_scheduler_active; -extern void rcu_scheduler_starting(void); #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include @@ -178,7 +176,7 @@ static inline int rcu_read_lock_bh_held(void) #ifdef CONFIG_PREEMPT static inline int rcu_read_lock_sched_held(void) { - return !rcu_scheduler_active || preempt_count() != 0 || irqs_disabled(); + return preempt_count() != 0 || irqs_disabled(); } #else /* #ifdef CONFIG_PREEMPT */ static inline int rcu_read_lock_sched_held(void) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ff22b97fb979..14e5a76b2c06 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -128,4 +128,17 @@ static inline int rcu_preempt_depth(void) return 0; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +extern int rcu_scheduler_active __read_mostly; +extern void rcu_scheduler_starting(void); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +static inline void rcu_scheduler_starting(void) +{ +} + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index b9f74606f320..48282055e83d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -123,4 +123,7 @@ static inline int rcu_blocking_is_gp(void) return num_online_cpus() == 1; } +extern void rcu_scheduler_starting(void); +extern int rcu_scheduler_active __read_mostly; + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 49d808e833b0..72a8dc9567f5 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -64,9 +63,6 @@ struct lockdep_map rcu_sched_lock_map = EXPORT_SYMBOL_GPL(rcu_sched_lock_map); #endif -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - #ifdef CONFIG_DEBUG_LOCK_ALLOC int debug_lockdep_rcu_enabled(void) @@ -96,21 +92,6 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -/* - * This function is invoked towards the end of the scheduler's initialization - * process. Before this is called, the idle task might contain - * RCU read-side critical sections (during which time, this idle - * task is booting the system). After this function is called, the - * idle tasks are prohibited from containing RCU read-side critical - * sections. - */ -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} - /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. 
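For orientation, a condensed sketch of the main consumer of rcu_scheduler_active (the real check is the debug_lockdep_rcu_enabled() visible in the rcupdate.c context above; its additional lockdep-recursion test is omitted here):

int debug_lockdep_rcu_enabled(void)
{
	/* Forgive RCU-lockdep complaints until the scheduler is up. */
	return rcu_scheduler_active && debug_locks;
}

This is why TINY_RCU needs the flag only under CONFIG_DEBUG_LOCK_ALLOC: with lockdep out of the picture, nothing reads it.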
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index d9f8a623c9fa..b1804ff83d5e 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -276,3 +281,5 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } + +#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h new file mode 100644 index 000000000000..d223a92bc742 --- /dev/null +++ b/kernel/rcutiny_plugin.h @@ -0,0 +1,39 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptable semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2009 + * + * Author: Paul E. McKenney + */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +#include + +/* + * During boot, we forgive RCU lockdep issues. After this function is + * invoked, we start taking RCU lockdep issues seriously. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e33631354b69..3623f8e10220 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "rcutree.h" @@ -80,6 +81,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -1783,6 +1787,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } +/* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. This function also enables RCU lockdep checking. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + /* * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 
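Both definitions of rcu_scheduler_starting() encode the same contract, readable straight from their WARN_ON()s: the function must run exactly once, before the very first context switch, while only the boot CPU is online. A caller-side sketch (the function name below is illustrative; the real call site is in early init code, not in this patch):

static void __init bring_up_scheduler(void)
{
	/* Still single-CPU; no context switch has happened yet. */
	rcu_scheduler_starting();
	/* From here on, RCU lockdep checking is live, and idle tasks
	 * may no longer contain RCU read-side critical sections. */
}

Note that the TINY_RCU copy omits the WARN_ON(num_online_cpus() != 1) test, which would be vacuous in a uniprocessor-only build.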
-- cgit v1.2.3-55-g7522 From c68de2097a8799549a3c3bf27cbfeea24a604284 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 15 Apr 2010 10:12:40 -0700 Subject: rcu: disable CPU stall warnings upon panic The current RCU CPU stall warnings remain enabled even after a panic occurs, which some people have found to be a bit counterproductive. This patch therefore uses a notifier to disable stall warnings once a panic occurs. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3623f8e10220..595fb83e9b7d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -449,6 +449,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_panicking __read_mostly; + static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; @@ -526,6 +528,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; + if (rcu_cpu_stall_panicking) + return; delta = jiffies - rsp->jiffies_stall; rnp = rdp->mynode; if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { @@ -540,6 +544,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) } } +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_panicking = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static void __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); +} + #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void record_gp_stall_check_time(struct rcu_state *rsp) @@ -550,6 +569,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +static void __init check_cpu_stall_init(void) +{ +} + #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -1934,6 +1957,7 @@ void __init rcu_init(void) cpu_notifier(rcu_cpu_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + check_cpu_stall_init(); } #include "rcutree_plugin.h" -- cgit v1.2.3-55-g7522 From 26845c2860cebebe6ce2d9d01ae3cb3db84b7e29 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 13 Apr 2010 14:19:23 -0700 Subject: rcu: print boot-time console messages if RCU configs out of ordinary Print boot-time messages if tracing is enabled, if fanout is set to non-default values, if exact fanout is specified, if accelerated dyntick-idle grace periods have been enabled, if RCU-lockdep is enabled, if rcutorture has been boot-time enabled, if the CPU stall detector has been disabled, or if four-level hierarchy has been enabled. This is all for TREE_RCU and TREE_PREEMPT_RCU. TINY_RCU will be handled separately, if at all. Suggested-by: Josh Triplett Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 6 ------ kernel/rcutree_plugin.h | 44 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 595fb83e9b7d..ec6196fcd1fa 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1938,12 +1938,6 @@ void __init rcu_init(void) int cpu; rcu_bootup_announce(); -#ifdef CONFIG_RCU_CPU_STALL_DETECTOR - printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -#if NUM_RCU_LVL_4 != 0 - printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n"); -#endif /* #if NUM_RCU_LVL_4 != 0 */ RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); __rcu_init_preempt(); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f9bc83a047da..0ae2339ab04d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -26,6 +26,45 @@ #include +/* + * Check the RCU kernel configuration parameters and print informative + * messages about anything out of the ordinary. If you like #ifdef, you + * will love this function. + */ +static void __init rcu_bootup_announce_oddness(void) +{ +#ifdef CONFIG_RCU_TRACE + printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); +#endif +#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) + printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + CONFIG_RCU_FANOUT); +#endif +#ifdef CONFIG_RCU_FANOUT_EXACT + printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); +#endif +#ifdef CONFIG_RCU_FAST_NO_HZ + printk(KERN_INFO + "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); +#endif +#ifdef CONFIG_PROVE_RCU + printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); +#endif +#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE + printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); +#endif +#ifndef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO + "\tRCU-based detection of stalled CPUs is disabled.\n"); +#endif +#ifndef CONFIG_RCU_CPU_STALL_VERBOSE + printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); +#endif +#if NUM_RCU_LVL_4 != 0 + printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); +#endif +} + #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); @@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); */ static void __init rcu_bootup_announce(void) { - printk(KERN_INFO - "Experimental preemptable hierarchical RCU implementation.\n"); + printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); } /* @@ -757,6 +796,7 @@ void exit_rcu(void) static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); } /* -- cgit v1.2.3-55-g7522 From 4300aa642cc9ecb35f2e0683dd294fb790ef028c Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 13 Apr 2010 16:18:22 -0700 Subject: rcu: improve RCU CPU stall-warning messages The existing RCU CPU stall-warning messages can be confusing, especially in the case where one CPU detects a single other stalled CPU. In addition, the console messages did not say which flavor of RCU detected the stall, which can make it difficult to work out exactly what is causing the stall. This commit improves these messages. 
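For example (CPU numbers and timings hypothetical, derived from the format strings in the diff below), the old output

	INFO: RCU detected CPU stalls: 3 (detected by 1, t=10008 jiffies)

becomes

	INFO: rcu_sched_state detected stalls on CPUs/tasks: { 3} (detected by 1, t=10008 jiffies)

where the flavor name comes from the new rcu_state.name field, filled in by stringifying the structure name in RCU_STATE_INITIALIZER().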
Requested-by: Dhaval Giani Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 20 +++++++++++--------- kernel/rcutree.h | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ec6196fcd1fa..f391886be8f0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -54,8 +54,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; -#define RCU_STATE_INITIALIZER(name) { \ - .level = { &name.node[0] }, \ +#define RCU_STATE_INITIALIZER(structname) { \ + .level = { &structname.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. */ \ NUM_RCU_LVL_1, \ @@ -66,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ .orphan_cbs_list = NULL, \ - .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_cbs_tail = &structname.orphan_cbs_list, \ .orphan_qlen = 0, \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ + .name = #structname, \ } struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); @@ -483,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* OK, time to rat on our buddy... */ - printk(KERN_ERR "INFO: RCU detected CPU stalls:"); + printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", + rsp->name); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); rcu_print_task_stall(rnp); @@ -494,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) if (rnp->qsmask & (1UL << cpu)) printk(" %d", rnp->grplo + cpu); } - printk(" (detected by %d, t=%ld jiffies)\n", + printk("} (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); trigger_all_cpu_backtrace(); @@ -510,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", - smp_processor_id(), jiffies - rsp->gp_start); + printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", + rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); raw_spin_lock_irqsave(&rnp->lock, flags); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4a525a30e08e..11f171121ad9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -326,6 +326,7 @@ struct rcu_state { unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + char *name; /* Name of structure. */ }; /* Return values for rcu_preempt_offline_tasks(). */ -- cgit v1.2.3-55-g7522 From 4a90a0681cf6cd21cd444184302aa045156486b3 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 14 Apr 2010 16:48:11 -0700 Subject: rcu: permit discontiguous cpu_possible_mask CPU numbering TREE_RCU assumes that CPU numbering is contiguous, but some users need large holes in the numbering to better map to hardware layout. This patch makes TREE_RCU (and TREE_PREEMPT_RCU) tolerate large holes in the CPU numbering. However, NR_CPUS must still be greater than the largest CPU number. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f391886be8f0..c60fd74e7ec9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1913,7 +1913,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp = rsp->level[NUM_RCU_LVLS - 1]; for_each_possible_cpu(i) { - if (i > rnp->grphi) + while (i > rnp->grphi) rnp++; rsp->rda[i]->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); -- cgit v1.2.3-55-g7522 From d21670acab9fcb4bc74a40b68a6941059234c55c Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 14 Apr 2010 17:39:26 -0700 Subject: rcu: reduce the number of spurious RCU_SOFTIRQ invocations Lai Jiangshan noted that up to 10% of the RCU_SOFTIRQ are spurious, and traced this down to the fact that the current grace-period machinery will uselessly raise RCU_SOFTIRQ when a given CPU needs to go through a quiescent state, but has not yet done so. In this situation, there might well be nothing that RCU_SOFTIRQ can do, and the overhead can be worth worrying about in the ksoftirqd case. This patch therefore avoids raising RCU_SOFTIRQ in this situation. Changes since v1 (http://lkml.org/lkml/2010/3/30/122 from Lai Jiangshan): o Omit the rcu_qs_pending() prechecks, as they aren't that much less expensive than the quiescent-state checks. o Merge with the set_need_resched() patch that reduces IPIs. o Add the new n_rp_report_qs field to the rcu_pending tracing output. o Update the tracing documentation accordingly. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 35 +++++++++++++++++++---------------- kernel/rcutree.c | 11 ++++++----- kernel/rcutree.h | 1 + kernel/rcutree_trace.c | 4 +++- 4 files changed, 29 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 8608fd85e921..efd8cc95c06b 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -256,23 +256,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct The output of "cat rcu/rcu_pending" looks as follows: rcu_sched: - 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 - 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 - 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 - 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 - 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 - 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 - 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 - 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 + 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 + 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 + 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 + 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 + 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 + 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 + 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 + 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 rcu_bh: - 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 - 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 
nn=143180 - 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 - 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 - 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 - 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 - 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 - 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 + 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 + 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 + 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 + 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 + 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 + 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 + 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 + 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 As always, this is once again split into "rcu_sched" and "rcu_bh" portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional @@ -284,6 +284,9 @@ o "np" is the number of times that __rcu_pending() has been invoked o "qsp" is the number of times that the RCU was waiting for a quiescent state from this CPU. +o "rpq" is the number of times that the CPU had passed through + a quiescent state, but not yet reported it to RCU. + o "cbr" is the number of times that this CPU had RCU callbacks that had passed through a grace period, and were thus ready to be invoked. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c60fd74e7ec9..ba6996943e28 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1161,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { - if (!rcu_pending(cpu)) - return; /* if nothing for RCU to do. */ if (user || (idle_cpu(cpu) && rcu_scheduler_active && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -1194,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_bh_qs(cpu); } rcu_preempt_check_callbacks(cpu); - raise_softirq(RCU_SOFTIRQ); + if (rcu_pending(cpu)) + raise_softirq(RCU_SOFTIRQ); } #ifdef CONFIG_SMP @@ -1534,18 +1533,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) { + if (rdp->qs_pending && !rdp->passed_quiesc) { /* * If force_quiescent_state() coming soon and this CPU * needs a quiescent state, and this is either RCU-sched * or RCU-bh, force a local reschedule. */ + rdp->n_rp_qs_pending++; if (!rdp->preemptable && ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, jiffies)) set_need_resched(); - rdp->n_rp_qs_pending++; + } else if (rdp->qs_pending && rdp->passed_quiesc) { + rdp->n_rp_report_qs++; return 1; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 11f171121ad9..14c040b18ed0 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -223,6 +223,7 @@ struct rcu_data { /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. 
*/ unsigned long n_rp_qs_pending; + unsigned long n_rp_report_qs; unsigned long n_rp_cb_ready; unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d45db2e35d27..36c95b45738e 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + "qsp=%ld rpq=%ld cbr=%ld cng=%ld " + "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->n_rcu_pending, rdp->n_rp_qs_pending, + rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp, rdp->n_rp_gp_completed, -- cgit v1.2.3-55-g7522 From 77e38ed347162423c6b72e06c865a121081c2bb6 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Sun, 25 Apr 2010 21:04:29 -0700 Subject: rcu: RCU_FAST_NO_HZ must check RCU dyntick state The current version of RCU_FAST_NO_HZ reproduces the old CLASSIC_RCU dyntick-idle bug, as it fails to detect CPUs that have interrupted or NMIed out of dyntick-idle mode. Fix this by making rcu_needs_cpu() check the state in the per-CPU rcu_dynticks variables, thus correctly detecting the dyntick-idle state from an RCU perspective. Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0ae2339ab04d..9b18227e86e8 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1051,6 +1051,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); int rcu_needs_cpu(int cpu) { int c = 0; + int snap; + int snap_nmi; int thatcpu; /* Check for being in the holdoff period. */ @@ -1058,12 +1060,18 @@ int rcu_needs_cpu(int cpu) return rcu_needs_cpu_quick_check(cpu); /* Don't bother unless we are the last non-dyntick-idle CPU. */ - for_each_cpu_not(thatcpu, nohz_cpu_mask) - if (cpu_online(thatcpu) && thatcpu != cpu) { + for_each_online_cpu(thatcpu) { + if (thatcpu == cpu) + continue; + snap = per_cpu(rcu_dynticks, thatcpu)->dynticks; + snap_nmi = per_cpu(rcu_dynticks, thatcpu)->dynticks_nmi; + smp_mb(); /* Order sampling of snap with end of grace period. */ + if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; return rcu_needs_cpu_quick_check(cpu); } + } /* Check and update the rcu_dyntick_drain sequencing. */ if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { -- cgit v1.2.3-55-g7522 From d822ed1094032ab524344a9a474c93128d9c2159 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Sat, 8 May 2010 19:58:22 -0700 Subject: rcu: fix build bug in RCU_FAST_NO_HZ builds Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 9b18227e86e8..ac7d80fa895c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1063,8 +1063,8 @@ int rcu_needs_cpu(int cpu) for_each_online_cpu(thatcpu) { if (thatcpu == cpu) continue; - snap = per_cpu(rcu_dynticks, thatcpu)->dynticks; - snap_nmi = per_cpu(rcu_dynticks, thatcpu)->dynticks_nmi; + snap = per_cpu(rcu_dynticks, thatcpu).dynticks; + snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; smp_mb(); /* Order sampling of snap with end of grace period. */ if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { per_cpu(rcu_dyntick_drain, cpu) = 0; -- cgit v1.2.3-55-g7522 From d3c1b24c50e8b2bbc840322caf26c7eada594d21 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 1 May 2010 23:52:02 +0200 Subject: PM / Hibernate: Snapshot cleanup Remove support for reads at an offset. This means snapshot_read_next()/snapshot_write_next() no longer accept a count parameter, which allows the functions and the snapshot handle to be cleaned up; the handle no longer needs to care about offsets. The /dev/snapshot handler is converted to simple_{read_from,write_to}_buffer, which take care of offsets. Signed-off-by: Jiri Slaby Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 18 +----- kernel/power/snapshot.c | 145 +++++++++++++++++++----------------------------- kernel/power/swap.c | 8 +-- kernel/power/user.c | 37 ++++++++---- 4 files changed, 88 insertions(+), 120 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 46c5a26630a3..b1e207dde1c2 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void); */ struct snapshot_handle { - loff_t offset; /* number of the last byte ready for reading - * or writing in the sequence - */ unsigned int cur; /* number of the block of PAGE_SIZE bytes the * next operation will refer to (ie. 
current) */ - unsigned int cur_offset; /* offset with respect to the current - * block (for the next operation) - */ - unsigned int prev; /* number of the block of PAGE_SIZE bytes that - * was the current one previously - */ void *buffer; /* address of the block to read from * or write to */ - unsigned int buf_offset; /* location to read from or write to, - * given as a displacement from 'buffer' - */ int sync_read; /* Set to one to notify the caller of * snapshot_write_next() that it may * need to call wait_on_bio_chain() @@ -125,12 +113,12 @@ struct snapshot_handle { * snapshot_read_next()/snapshot_write_next() is allowed to * read/write data after the function returns */ -#define data_of(handle) ((handle).buffer + (handle).buf_offset) +#define data_of(handle) ((handle).buffer) extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); -extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); -extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +extern int snapshot_read_next(struct snapshot_handle *handle); +extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index be861c26dda7..25ce010e9f8b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1604,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to read from the snapshot. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the read would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the end of data stream condition, * and a negative number is returned on error. In such cases the @@ -1619,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * any more. 
*/ -int snapshot_read_next(struct snapshot_handle *handle, size_t count) +int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) return 0; @@ -1630,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!buffer) return -ENOMEM; } - if (!handle->offset) { + if (!handle->cur) { int error; error = init_header((struct swsusp_info *)buffer); @@ -1639,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) handle->buffer = buffer; memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); - } - if (handle->prev < handle->cur) { - if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); - pack_pfns(buffer, &orig_bm); - } else { - struct page *page; + } else if (handle->cur <= nr_meta_pages) { + memset(buffer, 0, PAGE_SIZE); + pack_pfns(buffer, &orig_bm); + } else { + struct page *page; - page = pfn_to_page(memory_bm_next_pfn(©_bm)); - if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, - * because we can't return with a kmapped - * highmem page (we may not be called again). - */ - void *kaddr; + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). + */ + void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - handle->buffer = buffer; - } else { - handle->buffer = page_address(page); - } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buffer, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; - } else { - handle->cur_offset += count; } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -2133,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to write to the image. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the write would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the "end of file" condition, * and a negative number is returned on error. In such cases the @@ -2148,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * any more. 
*/ -int snapshot_write_next(struct snapshot_handle *handle, size_t count) +int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; int error = 0; /* Check if we have already loaded the entire image */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) return 0; - if (handle->offset == 0) { + handle->sync_read = 1; + + if (!handle->cur) { if (!buffer) /* This makes the buffer be freed by swsusp_free() */ buffer = get_image_page(GFP_ATOMIC, PG_ANY); @@ -2166,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return -ENOMEM; handle->buffer = buffer; - } - handle->sync_read = 1; - if (handle->prev < handle->cur) { - if (handle->prev == 0) { - error = load_header(buffer); - if (error) - return error; + } else if (handle->cur == 1) { + error = load_header(buffer); + if (error) + return error; - error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); - if (error) - return error; + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; + + } else if (handle->cur <= nr_meta_pages + 1) { + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; - } else if (handle->prev <= nr_meta_pages) { - error = unpack_orig_pfns(buffer, ©_bm); + if (handle->cur == nr_meta_pages + 1) { + error = prepare_image(&orig_bm, ©_bm); if (error) return error; - if (handle->prev == nr_meta_pages) { - error = prepare_image(&orig_bm, ©_bm); - if (error) - return error; - - chain_init(&ca, GFP_ATOMIC, PG_SAFE); - memory_bm_position_reset(&orig_bm); - restore_pblist = NULL; - handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; - if (IS_ERR(handle->buffer)) - return PTR_ERR(handle->buffer); - } - } else { - copy_last_highmem_page(); + chain_init(&ca, GFP_ATOMIC, PG_SAFE); + memory_bm_position_reset(&orig_bm); + restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); + handle->sync_read = 0; if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); - if (handle->buffer != buffer) - handle->sync_read = 0; } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; } else { - handle->cur_offset += count; + copy_last_highmem_page(); + handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); + if (handle->buffer != buffer) + handle->sync_read = 0; } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -2230,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); /* Free only if we have loaded the image entirely */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); free_highmem_data(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 66824d71983a..7f2a17e4067b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -431,7 +431,7 @@ static int save_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); while (1) { - ret = snapshot_read_next(snapshot, PAGE_SIZE); + ret = snapshot_read_next(snapshot); if (ret <= 0) break; ret = swap_write_page(handle, data_of(*snapshot), &bio); @@ -492,7 +492,7 @@ int swsusp_write(unsigned int flags) return error; } memset(&snapshot, 0, 
sizeof(struct snapshot_handle)); - error = snapshot_read_next(&snapshot, PAGE_SIZE); + error = snapshot_read_next(&snapshot); if (error < PAGE_SIZE) { if (error >= 0) error = -EFAULT; @@ -615,7 +615,7 @@ static int load_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot, PAGE_SIZE); + error = snapshot_write_next(snapshot); if (error <= 0) break; error = swap_read_page(handle, data_of(*snapshot), &bio); @@ -660,7 +660,7 @@ int swsusp_read(unsigned int *flags_p) *flags_p = swsusp_header->flags; memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_write_next(&snapshot, PAGE_SIZE); + error = snapshot_write_next(&snapshot); if (error < PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); diff --git a/kernel/power/user.c b/kernel/power/user.c index a8c96212bc1b..e819e17877ca 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; mutex_lock(&pm_mutex); @@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, res = -ENODATA; goto Unlock; } - res = snapshot_read_next(&data->handle, count); - if (res > 0) { - if (copy_to_user(buf, data_of(data->handle), res)) - res = -EFAULT; - else - *offp = data->handle.offset; + if (!pg_offp) { /* on page boundary? */ + res = snapshot_read_next(&data->handle); + if (res <= 0) + goto Unlock; + } else { + res = PAGE_SIZE - pg_offp; } + res = simple_read_from_buffer(buf, count, &pg_offp, + data_of(data->handle), res); + if (res > 0) + *offp += res; + Unlock: mutex_unlock(&pm_mutex); @@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; mutex_lock(&pm_mutex); data = filp->private_data; + + if (!pg_offp) { + res = snapshot_write_next(&data->handle); + if (res <= 0) + goto unlock; + } else { + res = PAGE_SIZE - pg_offp; } + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, + buf, count); + if (res > 0) + *offp += res; +unlock: mutex_unlock(&pm_mutex); return res; -- cgit v1.2.3-55-g7522 From 8a0d613fa12e1b7f7f71ca88ed7dc2a3de95121a Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 1 May 2010 23:52:34 +0200 Subject: PM / Hibernate: Separate block_io Move block I/O operations to a separate file, because they will later be used by more than just the swap writer. Signed-off-by: Jiri Slaby Signed-off-by: Rafael J. 
Wysocki --- kernel/power/Makefile | 3 +- kernel/power/block_io.c | 103 ++++++++++++++++++++++++++++++++++++ kernel/power/power.h | 9 ++++ kernel/power/swap.c | 136 +++++++++--------------------------------------- 4 files changed, 139 insertions(+), 112 deletions(-) create mode 100644 kernel/power/block_io.c (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 43191815f874..524e058dcf06 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ + block_io.o obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c new file mode 100644 index 000000000000..2a2f8aed8e59 --- /dev/null +++ b/kernel/power/block_io.c @@ -0,0 +1,103 @@ +/* + * This file provides functions for block I/O operations on swap/file. + * + * Copyright (C) 1998,2001-2005 Pavel Machek + * Copyright (C) 2006 Rafael J. Wysocki + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include + +#include "power.h" + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * @bio_chain: list of pending biod (for async reading) + * + * Straight from the textbook - allocate and initialize the bio. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait. + */ +static int submit(int rw, struct block_device *bdev, sector_t sector, + struct page *page, struct bio **bio_chain) +{ + const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + struct bio *bio; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = end_swap_bio_read; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", + sector); + bio_put(bio); + return -EFAULT; + } + + lock_page(page); + bio_get(bio); + + if (bio_chain == NULL) { + submit_bio(bio_rw, bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + if (rw == READ) + get_page(page); /* These pages are freed later */ + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(bio_rw, bio); + } + return 0; +} + +int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_wait_on_bio_chain(struct bio **bio_chain) +{ + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + if (bio == NULL) + return 0; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} diff --git a/kernel/power/power.h b/kernel/power/power.h index 
b1e207dde1c2..006270fe382d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -142,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(fmode_t); +/* kernel/power/block_io.c */ +extern struct block_device *hib_resume_bdev; + +extern int hib_bio_read_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_bio_write_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_wait_on_bio_chain(struct bio **bio_chain); + struct timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(struct timeval *, struct timeval *, diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7f2a17e4067b..1b1ab6fcf386 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -145,93 +145,7 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", - page_off); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, page_off, virt_to_page(addr), bio_chain); -} - -static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, page_off, virt_to_page(addr), bio_chain); -} - -static int wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} +struct block_device *hib_resume_bdev; /* * Saving part @@ -241,14 +155,14 @@ static int mark_swapfiles(sector_t start, unsigned int flags) { int error; - bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig,SWSUSP_SIG, 10); swsusp_header->image = start; swsusp_header->flags = flags; - 
error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -267,18 +181,18 @@ static int swsusp_swap_check(void) /* This is called before saving image */ int res; res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &resume_bdev); + &hib_resume_bdev); if (res < 0) return res; root_swap = res; - res = blkdev_get(resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE); if (res) return res; - res = set_blocksize(resume_bdev, PAGE_SIZE); + res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(resume_bdev, FMODE_WRITE); + blkdev_put(hib_resume_bdev, FMODE_WRITE); return res; } @@ -309,7 +223,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) } else { src = buf; } - return bio_write_page(offset, src, bio_chain); + return hib_bio_write_page(offset, src, bio_chain); } /* @@ -380,7 +294,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); + error = hib_wait_on_bio_chain(bio_chain); if (error) goto out; offset = alloc_swapdev_block(root_swap); @@ -441,7 +355,7 @@ static int save_image(struct swap_map_handle *handle, printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); if (!ret) ret = err2; @@ -553,7 +467,7 @@ static int get_swap_reader(struct swap_map_handle *handle, sector_t start) if (!handle->cur) return -ENOMEM; - error = bio_read_page(start, handle->cur, NULL); + error = hib_bio_read_page(start, handle->cur, NULL); if (error) { release_swap_reader(handle); return error; @@ -573,17 +487,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = bio_read_page(offset, buf, bio_chain); + error = hib_bio_read_page(offset, buf, bio_chain); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); + error = hib_wait_on_bio_chain(bio_chain); handle->k = 0; offset = handle->cur->next_swap; if (!offset) release_swap_reader(handle); else if (!error) - error = bio_read_page(offset, handle->cur, NULL); + error = hib_bio_read_page(offset, handle->cur, NULL); } return error; } @@ -622,14 +536,14 @@ static int load_image(struct swap_map_handle *handle, if (error) break; if (snapshot->sync_read) - error = wait_on_bio_chain(&bio); + error = hib_wait_on_bio_chain(&bio); if (error) break; if (!(nr_pages % m)) printk("\b\b\b\b%3d%%", nr_pages / m); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); if (!error) error = err2; @@ -686,11 +600,11 @@ int swsusp_check(void) { int error; - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); + hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + if (!IS_ERR(hib_resume_bdev)) { + set_blocksize(hib_resume_bdev, PAGE_SIZE); memset(swsusp_header, 0, PAGE_SIZE); - error = bio_read_page(swsusp_resume_block, + error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -698,7 +612,7 @@ int swsusp_check(void) if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, 
swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -706,11 +620,11 @@ int swsusp_check(void) put: if (error) - blkdev_put(resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ); else pr_debug("PM: Signature found, resuming\n"); } else { - error = PTR_ERR(resume_bdev); + error = PTR_ERR(hib_resume_bdev); } if (error) @@ -725,12 +639,12 @@ put: void swsusp_close(fmode_t mode) { - if (IS_ERR(resume_bdev)) { + if (IS_ERR(hib_resume_bdev)) { pr_debug("PM: Image device not initialised\n"); return; } - blkdev_put(resume_bdev, mode); + blkdev_put(hib_resume_bdev, mode); } static int swsusp_header_init(void) -- cgit v1.2.3-55-g7522 From 51fb352b2c586b29c7bba38178b3b5389a7fb074 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 1 May 2010 23:53:02 +0200 Subject: PM / Hibernate: Move the first_sector out of swsusp_write The first sector knowledge is swap-only specific. Move it into the swap handle. This will be needed for later non-swap specific code moving into snapshot.c. Signed-off-by: Jiri Slaby Acked-by: Pavel Machek Signed-off-by: "Rafael J. Wysocki" --- kernel/power/swap.c | 76 ++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 1b1ab6fcf386..63e80628a326 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -29,6 +29,40 @@ #define SWSUSP_SIG "S1SUSPEND" +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * These structures are stored on the swap and linked together with the + * help of the .next_swap member. + * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we also only need to use one swap_map_page structure + * at a time. 
+ */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) + +struct swap_map_page { + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; +}; + +/** + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + sector_t cur_swap; + sector_t first_sector; + unsigned int k; +}; + struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; sector_t image; @@ -151,7 +185,7 @@ struct block_device *hib_resume_bdev; * Saving part */ -static int mark_swapfiles(sector_t start, unsigned int flags) +static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; @@ -160,7 +194,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig,SWSUSP_SIG, 10); - swsusp_header->image = start; + swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); @@ -226,39 +260,6 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) return hib_bio_write_page(offset, src, bio_chain); } -/* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. - * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. - * - * During resume we also only need to use one swap_map_page structure - * at a time. - */ - -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) - -struct swap_map_page { - sector_t entries[MAP_PAGE_ENTRIES]; - sector_t next_swap; -}; - -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - sector_t cur_swap; - unsigned int k; -}; - static void release_swap_writer(struct swap_map_handle *handle) { if (handle->cur) @@ -277,6 +278,7 @@ static int get_swap_writer(struct swap_map_handle *handle) return -ENOSPC; } handle->k = 0; + handle->first_sector = handle->cur_swap; return 0; } @@ -421,8 +423,6 @@ int swsusp_write(unsigned int flags) } error = get_swap_writer(&handle); if (!error) { - sector_t start = handle.cur_swap; - error = swap_write_page(&handle, header, NULL); if (!error) error = save_image(&handle, &snapshot, @@ -431,7 +431,7 @@ if (!error) { flush_swap_writer(&handle); printk(KERN_INFO "PM: S"); - error = mark_swapfiles(start, flags); + error = mark_swapfiles(&handle, flags); printk("|\n"); } } -- cgit v1.2.3-55-g7522 From 6f612af57821c637b7eaca4374ac7b85f800d6e2 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Sat, 1 May 2010 23:54:02 +0200 Subject: PM / Hibernate: Group swap ops Move all the swap processing into one function, making it easier to call the swap code from non-swap code. Signed-off-by: Jiri Slaby Signed-off-by: Rafael J. 
Wysocki --- kernel/power/swap.c | 117 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 63e80628a326..b0bb21778391 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -208,9 +208,10 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) + * + * This is called before saving image */ - -static int swsusp_swap_check(void) /* This is called before saving image */ +static int swsusp_swap_check(void) { int res; @@ -269,17 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle) static int get_swap_writer(struct swap_map_handle *handle) { + int ret; + + ret = swsusp_swap_check(); + if (ret) { + if (ret != -ENOSPC) + printk(KERN_ERR "PM: Cannot find swap device, try " + "swapon -a.\n"); + return ret; + } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); - if (!handle->cur) - return -ENOMEM; + if (!handle->cur) { + ret = -ENOMEM; + goto err_close; + } handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { - release_swap_writer(handle); - return -ENOSPC; + ret = -ENOSPC; + goto err_rel; } handle->k = 0; handle->first_sector = handle->cur_swap; return 0; +err_rel: + release_swap_writer(handle); +err_close: + swsusp_close(FMODE_WRITE); + return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, @@ -322,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle) return -EINVAL; } +static int swap_writer_finish(struct swap_map_handle *handle, + unsigned int flags, int error) +{ + if (!error) { + flush_swap_writer(handle); + printk(KERN_INFO "PM: S"); + error = mark_swapfiles(handle, flags); + printk("|\n"); + } + + if (error) + free_all_swap_pages(root_swap); + release_swap_writer(handle); + swsusp_close(FMODE_WRITE); + + return error; +} + /** * save_image - save the suspend image data */ @@ -399,48 +434,34 @@ int swsusp_write(unsigned int flags) struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + unsigned long pages; int error; - error = swsusp_swap_check(); + pages = snapshot_get_image_size(); + error = get_swap_writer(&handle); if (error) { - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); + printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } + if (!enough_swap(pages)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); if (error < PAGE_SIZE) { if (error >= 0) error = -EFAULT; - goto out; + goto out_finish; } header = (struct swsusp_info *)data_of(snapshot); - if (!enough_swap(header->pages)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out; - } - error = get_swap_writer(&handle); - if (!error) { - error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, - header->pages - 1); - - if (!error) { - flush_swap_writer(&handle); - printk(KERN_INFO "PM: S"); - error = mark_swapfiles(&handle, flags); - printk("|\n"); - } - } - if (error) - free_all_swap_pages(root_swap); - - release_swap_writer(&handle); - out: - swsusp_close(FMODE_WRITE); + error = swap_write_page(&handle, header, NULL); + if (!error) + error = save_image(&handle, &snapshot, pages - 1); 
+out_finish: + error = swap_writer_finish(&handle, flags, error); return error; } @@ -456,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle) handle->cur = NULL; } -static int get_swap_reader(struct swap_map_handle *handle, sector_t start) +static int get_swap_reader(struct swap_map_handle *handle, + unsigned int *flags_p) { int error; - if (!start) + *flags_p = swsusp_header->flags; + + if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); if (!handle->cur) return -ENOMEM; - error = hib_bio_read_page(start, handle->cur, NULL); + error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); if (error) { release_swap_reader(handle); return error; @@ -502,6 +526,13 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, return error; } +static int swap_reader_finish(struct swap_map_handle *handle) +{ + release_swap_reader(handle); + + return 0; +} + /** * load_image - load the image using the swap map handle * @handle and the snapshot handle @snapshot @@ -571,20 +602,20 @@ int swsusp_read(unsigned int *flags_p) struct snapshot_handle snapshot; struct swsusp_info *header; - *flags_p = swsusp_header->flags; - memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_write_next(&snapshot); if (error < PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header->image); + error = get_swap_reader(&handle, flags_p); + if (error) + goto end; if (!error) error = swap_read_page(&handle, header, NULL); if (!error) error = load_image(&handle, &snapshot, header->pages - 1); - release_swap_reader(&handle); - + swap_reader_finish(&handle); +end: if (!error) pr_debug("PM: Image successfully loaded\n"); else -- cgit v1.2.3-55-g7522 From 0fef8b1e83c4ab08cf1304dbebcfd749caf4f187 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 May 2010 00:03:26 +0200 Subject: PM / Hibernate: Fix block_io.c printk warning Fix printk format warning in block_io.c: kernel/power/block_io.c:41: warning: format '%ld' expects type 'long int', but argument 2 has type 'sector_t' Signed-off-by: Randy Dunlap Signed-off-by: Rafael J. Wysocki --- kernel/power/block_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 2a2f8aed8e59..97024fd40cd5 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -37,8 +37,8 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, bio->bi_end_io = end_swap_bio_read; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", - sector); + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)sector); bio_put(bio); return -EFAULT; } -- cgit v1.2.3-55-g7522 From ed77134bfccf5e75b6cbadab268e559dbe6a4ebb Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 6 May 2010 01:59:26 +0200 Subject: PM QOS update This patch changes the string-based list management to a handle-based implementation to help with the hot-path use of pm-qos; it also renames much of the API to use "request" as opposed to "requirement", which was used in the initial implementation. I did this because "request" more accurately represents what the API actually does. Also, I added a string-based ABI for users wanting to use a string interface.
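As a rough user-space sketch of the resulting ABI (the device node name comes from the documentation update below; error handling is omitted and the latency value is illustrative):

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	int main(void)
	{
		int32_t us = 30;
		int fd = open("/dev/cpu_dma_latency", O_RDWR);	/* registers a default request */

		write(fd, &us, sizeof(us));	/* binary s32 update of this request */
		write(fd, "0x0000001e", 11);	/* or the 10-char hex string plus its NUL */
		/* the request stays registered for as long as the fd is held open */
		close(fd);			/* closing removes the request */
		return 0;
	}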
So if the user writes 0xDDDDDDDD formatted hex it will be accepted by the interface. (someone asked me for it and I don't think it hurts anything.) This patch updates some documentation input I got from Randy. Signed-off-by: markgross Signed-off-by: Rafael J. Wysocki --- Documentation/power/pm_qos_interface.txt | 48 +++---- drivers/acpi/processor_idle.c | 2 +- drivers/cpuidle/governors/ladder.c | 2 +- drivers/cpuidle/governors/menu.c | 2 +- drivers/net/e1000e/netdev.c | 22 ++-- drivers/net/igbvf/netdev.c | 6 +- drivers/net/wireless/ipw2x00/ipw2100.c | 11 +- include/linux/netdevice.h | 4 + include/linux/pm_qos_params.h | 14 +- include/sound/pcm.h | 3 +- kernel/pm_qos_params.c | 214 +++++++++++++++---------------- net/mac80211/mlme.c | 2 +- sound/core/pcm.c | 3 - sound/core/pcm_native.c | 14 +- 14 files changed, 176 insertions(+), 171 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index c40866e8b957..bfed898a03fc 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -18,44 +18,46 @@ and pm_qos_params.h. This is done because having the available parameters being runtime configurable or changeable from a driver was seen as too easy to abuse. -For each parameter a list of performance requirements is maintained along with +For each parameter a list of performance requests is maintained along with an aggregated target value. The aggregated target value is updated with -changes to the requirement list or elements of the list. Typically the -aggregated target value is simply the max or min of the requirement values held +changes to the request list or elements of the list. Typically the +aggregated target value is simply the max or min of the request values held in the parameter list elements. From kernel mode the use of this interface is simple: -pm_qos_add_requirement(param_id, name, target_value): -Will insert a named element in the list for that identified PM_QOS parameter -with the target value. Upon change to this list the new target is recomputed -and any registered notifiers are called only if the target value is now -different. -pm_qos_update_requirement(param_id, name, new_target_value): -Will search the list identified by the param_id for the named list element and -then update its target value, calling the notification tree if the aggregated -target is changed. with that name is already registered. +handle = pm_qos_add_request(param_class, target_value): +Will insert an element into the list for that identified PM_QOS class with the +target value. Upon change to this list the new target is recomputed and any +registered notifiers are called only if the target value is now different. +Clients of pm_qos need to save the returned handle. -pm_qos_remove_requirement(param_id, name): -Will search the identified list for the named element and remove it, after -removal it will update the aggregate target and call the notification tree if -the target was changed as a result of removing the named requirement. +void pm_qos_update_request(handle, new_target_value): +Will update the list element pointed to by the handle with the new target value +and recompute the new aggregated target, calling the notification tree if the +target is changed. + +void pm_qos_remove_request(handle): +Will remove the element. After removal it will update the aggregate target and +call the notification tree if the target was changed as a result of removing +the request. 
From user mode: -Only processes can register a pm_qos requirement. To provide for automatic -cleanup for process the interface requires the process to register its -parameter requirements in the following way: +Only processes can register a pm_qos request. To provide for automatic +cleanup of a process, the interface requires the process to register its +parameter requests in the following way: To register the default pm_qos target for the specific parameter, the process must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] As long as the device node is held open that process has a registered -requirement on the parameter. The name of the requirement is "process_" -derived from the current->pid from within the open system call. +request on the parameter. -To change the requested target value the process needs to write a s32 value to -the open device node. This translates to a pm_qos_update_requirement call. +To change the requested target value the process needs to write an s32 value to +the open device node. Alternatively the user mode program could write a hex +string for the value using 10 char long format e.g. "0x12345678". This +translates to a pm_qos_update_request call. To remove the user mode request for a target value simply close the device node. diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5939e7f7d8e9..c3817e1f32c7 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -698,7 +698,7 @@ static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset) "max_cstate: C%d\n" "maximum allowed latency: %d usec\n", pr->power.state ? pr->power.state - pr->power.states : 0, - max_cstate, pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)); + max_cstate, pm_qos_request(PM_QOS_CPU_DMA_LATENCY)); seq_puts(seq, "states:\n"); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 1c1ceb4f218f..12c98900dcf8 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -67,7 +67,7 @@ static int ladder_select_state(struct cpuidle_device *dev) struct ladder_device *ldev = &__get_cpu_var(ladder_devices); struct ladder_device_state *last_state; int last_residency, last_idx = ldev->last_state_idx; - int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY); + int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) { diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index f8e57c6303f2..b81ad9c731ae 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -182,7 +182,7 @@ static u64 div_round64(u64 dividend, u32 divisor) static int menu_select(struct cpuidle_device *dev) { struct menu_device *data = &__get_cpu_var(menu_devices); - int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY); + int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); int i; int multiplier; diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c index dbf81788bb40..d5d55c6a373f 100644 --- a/drivers/net/e1000e/netdev.c +++ b/drivers/net/e1000e/netdev.c @@ -2524,12 +2524,12 @@ static void e1000_configure_rx(struct e1000_adapter *adapter) * excessive C-state transition latencies result in * dropped transactions. 
*/ - pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, - adapter->netdev->name, 55); + pm_qos_update_request( + adapter->netdev->pm_qos_req, 55); } else { - pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, - adapter->netdev->name, - PM_QOS_DEFAULT_VALUE); + pm_qos_update_request( + adapter->netdev->pm_qos_req, + PM_QOS_DEFAULT_VALUE); } } @@ -2824,8 +2824,8 @@ int e1000e_up(struct e1000_adapter *adapter) /* DMA latency requirement to workaround early-receive/jumbo issue */ if (adapter->flags & FLAG_HAS_ERT) - pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, - adapter->netdev->name, + adapter->netdev->pm_qos_req = + pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY, PM_QOS_DEFAULT_VALUE); /* hardware has been reset, we need to reload some things */ @@ -2887,9 +2887,11 @@ void e1000e_down(struct e1000_adapter *adapter) e1000_clean_tx_ring(adapter); e1000_clean_rx_ring(adapter); - if (adapter->flags & FLAG_HAS_ERT) - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, - adapter->netdev->name); + if (adapter->flags & FLAG_HAS_ERT) { + pm_qos_remove_request( + adapter->netdev->pm_qos_req); + adapter->netdev->pm_qos_req = NULL; + } /* * TODO: for power management, we could drop the link and diff --git a/drivers/net/igbvf/netdev.c b/drivers/net/igbvf/netdev.c index 1b1edad1eb5e..f16e981812a9 100644 --- a/drivers/net/igbvf/netdev.c +++ b/drivers/net/igbvf/netdev.c @@ -48,6 +48,7 @@ #define DRV_VERSION "1.0.0-k0" char igbvf_driver_name[] = "igbvf"; const char igbvf_driver_version[] = DRV_VERSION; +struct pm_qos_request_list *igbvf_driver_pm_qos_req; static const char igbvf_driver_string[] = "Intel(R) Virtual Function Network Driver"; static const char igbvf_copyright[] = "Copyright (c) 2009 Intel Corporation."; @@ -2899,7 +2900,7 @@ static int __init igbvf_init_module(void) printk(KERN_INFO "%s\n", igbvf_copyright); ret = pci_register_driver(&igbvf_driver); - pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, igbvf_driver_name, + igbvf_driver_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY, PM_QOS_DEFAULT_VALUE); return ret; @@ -2915,7 +2916,8 @@ module_init(igbvf_init_module); static void __exit igbvf_exit_module(void) { pci_unregister_driver(&igbvf_driver); - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, igbvf_driver_name); + pm_qos_remove_request(igbvf_driver_pm_qos_req); + igbvf_driver_pm_qos_req = NULL; } module_exit(igbvf_exit_module); diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c index 9b72c45a7748..2b05fe5e994c 100644 --- a/drivers/net/wireless/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/ipw2x00/ipw2100.c @@ -174,6 +174,8 @@ that only one external action is invoked at a time. #define DRV_DESCRIPTION "Intel(R) PRO/Wireless 2100 Network Driver" #define DRV_COPYRIGHT "Copyright(c) 2003-2006 Intel Corporation" +struct pm_qos_request_list *ipw2100_pm_qos_req; + /* Debugging stuff */ #ifdef CONFIG_IPW2100_DEBUG #define IPW2100_RX_DEBUG /* Reception debugging */ @@ -1739,7 +1741,7 @@ static int ipw2100_up(struct ipw2100_priv *priv, int deferred) /* the ipw2100 hardware really doesn't want power management delays * longer than 175usec */ - pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", 175); + pm_qos_update_request(ipw2100_pm_qos_req, 175); /* If the interrupt is enabled, turn it off... 
*/ spin_lock_irqsave(&priv->low_lock, flags); @@ -1887,8 +1889,7 @@ static void ipw2100_down(struct ipw2100_priv *priv) ipw2100_disable_interrupts(priv); spin_unlock_irqrestore(&priv->low_lock, flags); - pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", - PM_QOS_DEFAULT_VALUE); + pm_qos_update_request(ipw2100_pm_qos_req, PM_QOS_DEFAULT_VALUE); /* We have to signal any supplicant if we are disassociating */ if (associated) @@ -6669,7 +6670,7 @@ static int __init ipw2100_init(void) if (ret) goto out; - pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", + ipw2100_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY, PM_QOS_DEFAULT_VALUE); #ifdef CONFIG_IPW2100_DEBUG ipw2100_debug_level = debug; @@ -6692,7 +6693,7 @@ static void __exit ipw2100_exit(void) &driver_attr_debug_level); #endif pci_unregister_driver(&ipw2100_pci_driver); - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100"); + pm_qos_remove_request(ipw2100_pm_qos_req); } module_init(ipw2100_init); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa8b47637997..3857517f1ca5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -31,6 +31,7 @@ #include #ifdef __KERNEL__ +#include #include #include #include @@ -711,6 +712,9 @@ struct net_device { * the interface. */ char name[IFNAMSIZ]; + + struct pm_qos_request_list *pm_qos_req; + /* device name hash chain */ struct hlist_node name_hlist; /* snmp alias */ diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h index d74f75ed1e47..8ba440e5eb7f 100644 --- a/include/linux/pm_qos_params.h +++ b/include/linux/pm_qos_params.h @@ -14,12 +14,14 @@ #define PM_QOS_NUM_CLASSES 4 #define PM_QOS_DEFAULT_VALUE -1 -int pm_qos_add_requirement(int qos, char *name, s32 value); -int pm_qos_update_requirement(int qos, char *name, s32 new_value); -void pm_qos_remove_requirement(int qos, char *name); +struct pm_qos_request_list; -int pm_qos_requirement(int qos); +struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value); +void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, + s32 new_value); +void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req); -int pm_qos_add_notifier(int qos, struct notifier_block *notifier); -int pm_qos_remove_notifier(int qos, struct notifier_block *notifier); +int pm_qos_request(int pm_qos_class); +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier); +int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier); diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 8b611a561985..dd76cdede64d 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -29,6 +29,7 @@ #include #include #include +#include #define snd_pcm_substream_chip(substream) ((substream)->private_data) #define snd_pcm_chip(pcm) ((pcm)->private_data) @@ -365,7 +366,7 @@ struct snd_pcm_substream { int number; char name[32]; /* substream name */ int stream; /* stream (direction) */ - char latency_id[20]; /* latency identifier */ + struct pm_qos_request_list *latency_pm_qos_req; /* pm_qos request */ size_t buffer_bytes_max; /* limit ring buffer size */ struct snd_dma_buffer dma_buffer; unsigned int dma_buf_id; diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 3db49b9ca374..a1aea040eb57 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -2,7 +2,7 @@ * This module exposes the interface to kernel space for specifying * QoS dependencies. 
It provides infrastructure for registration of: * - * Dependents on a QoS value : register requirements + * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. @@ -14,19 +14,21 @@ * timeout: usec <-- currently not used. * throughput: kbs (kilo byte / sec) * - * There are lists of pm_qos_objects each one wrapping requirements, notifiers + * There are lists of pm_qos_objects each one wrapping requests, notifiers * - * User mode requirements on a QOS parameter register themselves to the + * User mode requests on a QOS parameter register themselves to the * subsystem by opening the device node /dev/... and writing there request to * the node. As long as the process holds a file handle open to the node the * client continues to be accounted for. Upon file release the usermode - * requirement is removed and a new qos target is computed. This way when the - * requirement that the application has is cleaned up when closes the file + * request is removed and a new qos target is computed. This way when the + * request that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * * Mark Gross */ +/*#define DEBUG*/ + #include #include #include @@ -42,25 +44,25 @@ #include /* - * locking rule: all changes to requirements or notifiers lists + * locking rule: all changes to requests or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct requirement_list { +struct pm_qos_request_list { struct list_head list; union { s32 value; s32 usec; s32 kbps; }; - char *name; + int pm_qos_class; }; static s32 max_compare(s32 v1, s32 v2); static s32 min_compare(s32 v1, s32 v2); struct pm_qos_object { - struct requirement_list requirements; + struct pm_qos_request_list requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; @@ -72,7 +74,7 @@ struct pm_qos_object { static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, + .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, @@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, + .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, @@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requirements = - {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)}, + .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, @@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2) } -static void update_target(int target) +static void update_target(int pm_qos_class) { s32 extreme_value; - struct requirement_list *node; 
+ struct pm_qos_request_list *node; unsigned long flags; int call_notifier = 0; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[target]->default_value; + extreme_value = pm_qos_array[pm_qos_class]->default_value; list_for_each_entry(node, - &pm_qos_array[target]->requirements.list, list) { - extreme_value = pm_qos_array[target]->comparitor( + &pm_qos_array[pm_qos_class]->requests.list, list) { + extreme_value = pm_qos_array[pm_qos_class]->comparitor( extreme_value, node->value); } - if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { + if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != + extreme_value) { call_notifier = 1; - atomic_set(&pm_qos_array[target]->target_value, extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - atomic_read(&pm_qos_array[target]->target_value)); + atomic_set(&pm_qos_array[pm_qos_class]->target_value, + extreme_value); + pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, + atomic_read(&pm_qos_array[pm_qos_class]->target_value)); } spin_unlock_irqrestore(&pm_qos_lock, flags); if (call_notifier) - blocking_notifier_call_chain(pm_qos_array[target]->notifiers, - (unsigned long) extreme_value, NULL); + blocking_notifier_call_chain( + pm_qos_array[pm_qos_class]->notifiers, + (unsigned long) extreme_value, NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -185,125 +189,110 @@ static int find_pm_qos_object_by_minor(int minor) } /** - * pm_qos_requirement - returns current system wide qos expectation + * pm_qos_request - returns current system wide qos expectation * @pm_qos_class: identification of which qos value is requested * * This function returns the current target value in an atomic manner. */ -int pm_qos_requirement(int pm_qos_class) +int pm_qos_request(int pm_qos_class) { return atomic_read(&pm_qos_array[pm_qos_class]->target_value); } -EXPORT_SYMBOL_GPL(pm_qos_requirement); +EXPORT_SYMBOL_GPL(pm_qos_request); /** - * pm_qos_add_requirement - inserts new qos request into the list + * pm_qos_add_request - inserts new qos request into the list * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters. + * for the pm_qos_class of parameters, and returns the pm_qos_request list + * element as a handle for use in updating and removal. Call needs to save + * this handle for later use. 
*/ -int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) +struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) { - struct requirement_list *dep; + struct pm_qos_request_list *dep; unsigned long flags; - dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); + dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); if (dep) { if (value == PM_QOS_DEFAULT_VALUE) dep->value = pm_qos_array[pm_qos_class]->default_value; else dep->value = value; - dep->name = kstrdup(name, GFP_KERNEL); - if (!dep->name) - goto cleanup; + dep->pm_qos_class = pm_qos_class; spin_lock_irqsave(&pm_qos_lock, flags); list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requirements.list); + &pm_qos_array[pm_qos_class]->requests.list); spin_unlock_irqrestore(&pm_qos_lock, flags); update_target(pm_qos_class); - - return 0; } -cleanup: - kfree(dep); - return -ENOMEM; + return dep; } -EXPORT_SYMBOL_GPL(pm_qos_add_requirement); +EXPORT_SYMBOL_GPL(pm_qos_add_request); /** - * pm_qos_update_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_update_request - modifies an existing qos request + * @pm_qos_req : handle to list element holding a pm_qos request to use * @value: defines the qos request * - * Updates an existing qos requirement for the pm_qos_class of parameters along + * Updates an existing qos request for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the list then no change is made. + * Attempts are made to make this code callable on hot code paths. */ -int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) +void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, + s32 new_value) { unsigned long flags; - struct requirement_list *node; int pending_update = 0; + s32 temp; spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - if (new_value == PM_QOS_DEFAULT_VALUE) - node->value = - pm_qos_array[pm_qos_class]->default_value; - else - node->value = new_value; - pending_update = 1; - break; - } + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->value) { + pending_update = 1; + pm_qos_req->value = temp; } spin_unlock_irqrestore(&pm_qos_lock, flags); if (pending_update) - update_target(pm_qos_class); - - return 0; + update_target(pm_qos_req->pm_qos_class); } -EXPORT_SYMBOL_GPL(pm_qos_update_requirement); +EXPORT_SYMBOL_GPL(pm_qos_update_request); /** - * pm_qos_remove_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_remove_request - modifies an existing qos request + * @pm_qos_req: handle to request list element * - * Will remove named qos request from pm_qos_class list of parameters and - * recompute the current target value for the pm_qos_class. + * Will remove pm qos request from the list of requests and + * recompute the current target value for the pm_qos_class. Call this + * on slow code paths. 
*/ -void pm_qos_remove_requirement(int pm_qos_class, char *name) +void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { unsigned long flags; - struct requirement_list *node; - int pending_update = 0; + int qos_class; + + if (pm_qos_req == NULL) + return; + /* silent return to keep pcm code cleaner */ + qos_class = pm_qos_req->pm_qos_class; spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - kfree(node->name); - list_del(&node->list); - kfree(node); - pending_update = 1; - break; - } - } + list_del(&pm_qos_req->list); + kfree(pm_qos_req); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); + update_target(qos_class); } -EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); +EXPORT_SYMBOL_GPL(pm_qos_remove_request); /** * pm_qos_add_notifier - sets notification entry for changes to target value @@ -313,7 +302,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * will register the notifier into a notification chain that gets called * upon changes to the pm_qos_class target value. */ - int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { int retval; @@ -343,21 +332,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); -#define PID_NAME_LEN 32 - static int pm_qos_power_open(struct inode *inode, struct file *filp) { - int ret; long pm_qos_class; - char name[PID_NAME_LEN]; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *)pm_qos_class; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - ret = pm_qos_add_requirement(pm_qos_class, name, - PM_QOS_DEFAULT_VALUE); - if (ret >= 0) + filp->private_data = (void *) pm_qos_add_request(pm_qos_class, + PM_QOS_DEFAULT_VALUE); + + if (filp->private_data) return 0; } return -EPERM; @@ -365,32 +349,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) static int pm_qos_power_release(struct inode *inode, struct file *filp) { - int pm_qos_class; - char name[PID_NAME_LEN]; + struct pm_qos_request_list *req; - pm_qos_class = (long)filp->private_data; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - pm_qos_remove_requirement(pm_qos_class, name); + req = (struct pm_qos_request_list *)filp->private_data; + pm_qos_remove_request(req); return 0; } + static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - int pm_qos_class; - char name[PID_NAME_LEN]; - - pm_qos_class = (long)filp->private_data; - if (count != sizeof(s32)) + int x; + char ascii_value[11]; + struct pm_qos_request_list *pm_qos_req; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else if (count == 11) { /* len('0x12345678/0') */ + if (copy_from_user(ascii_value, buf, 11)) + return -EFAULT; + x = sscanf(ascii_value, "%x", &value); + if (x != 1) + return -EINVAL; + pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); + } else return -EINVAL; - if (copy_from_user(&value, buf, sizeof(s32))) - return -EFAULT; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - pm_qos_update_requirement(pm_qos_class, name, value); - return sizeof(s32); + pm_qos_req = (struct pm_qos_request_list *)filp->private_data; + pm_qos_update_request(pm_qos_req, 
value); + + return count; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 4aefa6dc3091..29de1965ff74 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -495,7 +495,7 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency) s32 beaconint_us; if (latency < 0) - latency = pm_qos_requirement(PM_QOS_NETWORK_LATENCY); + latency = pm_qos_request(PM_QOS_NETWORK_LATENCY); beaconint_us = ieee80211_tu_to_usec( found->vif.bss_conf.beacon_int); diff --git a/sound/core/pcm.c b/sound/core/pcm.c index 0d428d0896db..cbe815dfbdc8 100644 --- a/sound/core/pcm.c +++ b/sound/core/pcm.c @@ -648,9 +648,6 @@ int snd_pcm_new_stream(struct snd_pcm *pcm, int stream, int substream_count) substream->number = idx; substream->stream = stream; sprintf(substream->name, "subdevice #%i", idx); - snprintf(substream->latency_id, sizeof(substream->latency_id), - "ALSA-PCM%d-%d%c%d", pcm->card->number, pcm->device, - (stream ? 'c' : 'p'), idx); substream->buffer_bytes_max = UINT_MAX; if (prev == NULL) pstr->substream = substream; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 872887624030..605c86df71c5 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -481,11 +481,13 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, snd_pcm_timer_resolution_change(substream); runtime->status->state = SNDRV_PCM_STATE_SETUP; - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, - substream->latency_id); + if (substream->latency_pm_qos_req) { + pm_qos_remove_request(substream->latency_pm_qos_req); + substream->latency_pm_qos_req = NULL; + } if ((usecs = period_to_usecs(runtime)) >= 0) - pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, - substream->latency_id, usecs); + substream->latency_pm_qos_req = pm_qos_add_request( + PM_QOS_CPU_DMA_LATENCY, usecs); return 0; _error: /* hardware might be unuseable from this time, @@ -540,8 +542,8 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream) if (substream->ops->hw_free) result = substream->ops->hw_free(substream); runtime->status->state = SNDRV_PCM_STATE_OPEN; - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, - substream->latency_id); + pm_qos_remove_request(substream->latency_pm_qos_req); + substream->latency_pm_qos_req = NULL; return result; } -- cgit v1.2.3-55-g7522 From 8f77578cc2debaeb30a4ef6206f4ba10944bdcd8 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Mon, 10 May 2010 23:18:47 +0200 Subject: Freezer / cgroup freezer: Update stale locking comments Update stale comments regarding locking order and add a little more detail so it's easier to follow the locking between the cgroup freezer and the power management freezer code. Signed-off-by: Matt Helsley Signed-off-by: Rafael J. 
Wysocki --- kernel/cgroup_freezer.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e5c0244962b0..ce71ed53e88f 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys; /* Locks taken and their ordering * ------------------------------ - * css_set_lock * cgroup_mutex (AKA cgroup_lock) - * task->alloc_lock (AKA task_lock) * freezer->lock + * css_set_lock + * task->alloc_lock (AKA task_lock) * task->sighand->siglock * * cgroup code forces css_set_lock to be taken before task->alloc_lock @@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys; * freezer_create(), freezer_destroy(): * cgroup_mutex [ by cgroup core ] * - * can_attach(): - * cgroup_mutex + * freezer_can_attach(): + * cgroup_mutex (held by caller of can_attach) * - * cgroup_frozen(): + * cgroup_freezing_or_frozen(): * task->alloc_lock (to get task's cgroup) * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * task->alloc_lock (to get task's cgroup) * freezer->lock * sighand->siglock (if the cgroup is freezing) * * freezer_read(): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) * * freezer_write() (freeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock + * sighand->siglock (fake signal delivery inside freeze_task()) * * freezer_write() (unfreeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (to prevent races with freeze_task()) + * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, -- cgit v1.2.3-55-g7522 From e3174cfd2a1e28fff774681f00a0eef3d31da970 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 May 2010 08:31:49 +0200 Subject: Revert "perf: Fix exit() vs PERF_FORMAT_GROUP" This reverts commit 4fd38e4595e2f6c9d27732c042a0e16b2753049c. It causes various crashes and hangs when events are activated. The cause is not fully understood yet but we need to revert it because the effects are severe. 
Reported-by: Stephane Eranian Reported-by: Lin Ming Cc: Peter Zijlstra Cc: Corey Ashford Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 - kernel/perf_event.c | 5 ----- 2 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4924c96d7e2d..3fd5c82e0e18 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -575,7 +575,6 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { - PERF_EVENT_STATE_FREE = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 180151ff8376..a9047463fd83 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -334,9 +334,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; - if (event->state > PERF_EVENT_STATE_FREE) - return; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -1871,8 +1868,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - event->state = PERF_EVENT_STATE_FREE; - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: -- cgit v1.2.3-55-g7522 From 050735b08ca8a016bbace4445fa025b88fee770b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 May 2010 11:51:53 +0200 Subject: perf: Fix exit() vs PERF_FORMAT_GROUP Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work as expected if the task the counters were attached to quit before the read() call. The cause is that we unconditionally destroy the grouping when we remove counters from their context. Fix this by splitting off the group destroy from the list removal such that perf_event_remove_from_context() does not do this and change perf_event_release() to do so. Reported-by: Corey Ashford Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: # .34.x LKML-Reference: <1273571513.5605.3527.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a9047463fd83..c97e82518403 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -308,8 +308,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *sibling, *tmp; - if (list_empty(&event->group_entry)) return; ctx->nr_events--; @@ -333,6 +331,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; +} + +static void +perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; /* * If this was a group event with sibling events then @@ -1868,6 +1872,12 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. 
+ */ + perf_event_disable(event); + WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: @@ -1882,7 +1892,10 @@ int perf_event_release_kernel(struct perf_event *event) * to trigger the AB-BA case. */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - perf_event_remove_from_context(event); + raw_spin_lock_irq(&ctx->lock); + list_del_event(event, ctx); + perf_destroy_group(event, ctx); + raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); mutex_lock(&event->owner->perf_event_mutex); -- cgit v1.2.3-55-g7522 From 96c21a460a37880abfbc8445d5b098dbab958a29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 May 2010 16:19:10 +0200 Subject: perf: Fix exit() vs event-groups Corey reported that the value scale times of group siblings are not updated when the monitored task dies. The problem appears to be that we only update the group leader's time values; fix it by updating the whole group. Reported-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: # .34.x LKML-Reference: <1273588935.1810.6.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c97e82518403..a4fa381db3c2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -255,6 +255,18 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -320,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; - update_event_times(event); + update_group_times(event); /* * If event was in error state, then keep it @@ -501,18 +513,6 @@ retry: raw_spin_unlock_irq(&ctx->lock); } -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - /* * Cross CPU call to disable a performance event */ -- cgit v1.2.3-55-g7522 From a93d2f1744206827ccf416e2cdc5018aa503314e Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Fri, 7 May 2010 14:33:26 +0800 Subject: sched, wait: Use wrapper functions epoll should not touch flags in wait_queue_t. This patch introduces a new function, __add_wait_queue_exclusive(), for users who use the wait queue as a LIFO queue. __add_wait_queue_tail_exclusive() is also introduced, replacing add_wait_queue_exclusive_locked(). remove_wait_queue_locked() is removed, as it duplicates __remove_wait_queue(), is disliked by users, and has few users.
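As an illustrative caller-side sketch (the pattern is taken from the eventpoll hunk below):

	/* before: open-coded, pokes at wait_queue_t internals */
	init_waitqueue_entry(&wait, current);
	wait.flags |= WQ_FLAG_EXCLUSIVE;
	__add_wait_queue(&wq, &wait);

	/* after: the wrapper hides the flag manipulation */
	init_waitqueue_entry(&wait, current);
	__add_wait_queue_exclusive(&wq, &wait);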
Signed-off-by: Changli Gao Signed-off-by: Peter Zijlstra Cc: Alexander Viro Cc: Paul Menage Cc: Li Zefan Cc: Davide Libenzi Cc: LKML-Reference: <1273214006-2979-1-git-send-email-xiaosuo@gmail.com> Signed-off-by: Ingo Molnar --- fs/eventpoll.c | 3 +-- include/linux/wait.h | 35 +++++++++++++++-------------------- kernel/cgroup.c | 2 +- kernel/sched.c | 3 +-- 4 files changed, 18 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index bd056a5b4efc..3817149919cb 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1140,8 +1140,7 @@ retry: * ep_poll_callback() when events will become available. */ init_waitqueue_entry(&wait, current); - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue(&ep->wq, &wait); + __add_wait_queue_exclusive(&ep->wq, &wait); for (;;) { /* diff --git a/include/linux/wait.h b/include/linux/wait.h index a48e16b77d5e..76d96d035ea0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -127,12 +127,26 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) /* * Used for wake-one threads: */ +static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, + wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); +} + static inline void __add_wait_queue_tail(wait_queue_head_t *head, - wait_queue_t *new) + wait_queue_t *new) { list_add_tail(&new->task_list, &head->task_list); } +static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q, + wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(q, wait); +} + static inline void __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) { @@ -403,25 +417,6 @@ do { \ __ret; \ }) -/* - * Must be called with the spinlock in the wait_queue_head_t held. - */ -static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q, - wait_queue_t * wait) -{ - wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(q, wait); -} - -/* - * Must be called with the spinlock in the wait_queue_head_t held. - */ -static inline void remove_wait_queue_locked(wait_queue_head_t *q, - wait_queue_t * wait) -{ - __remove_wait_queue(q, wait); -} - /* * These are the old interfaces to sleep waiting for an event. * They are racy. DO NOT use them, use the wait_event* interfaces above. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2769e13980c..4a07d057a265 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3010,7 +3010,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { - remove_wait_queue_locked(event->wqh, &event->wait); + __remove_wait_queue(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); list_del(&event->list); spin_unlock(&cgrp->event_list_lock); diff --git a/kernel/sched.c b/kernel/sched.c index 39aa9c7e22c0..b531d7934083 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3983,8 +3983,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) if (!x->done) { DECLARE_WAITQUEUE(wait, current); - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); + __add_wait_queue_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; -- cgit v1.2.3-55-g7522 From 8b6d043b7ee2d1b819dc833d677ea2aead71a0c0 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 29 Mar 2010 19:38:00 +0200 Subject: resource: shared I/O region support SuperIO devices share regions and use lock/unlock operations to chip select. 
We therefore need to be able to request a resource and wait for it to be freed by whichever other SuperIO device currently hogs it. Right now you have to poll which is horrible. Add a MUXED field to IO port resources. If the MUXED field is set on the resource and on the request (via request_muxed_region) then we block until the previous owner of the muxed resource releases their region. This allows us to implement proper resource sharing and locking for superio chips using code of the form enable_my_superio_dev() { request_muxed_region(0x44, 0x02, "superio:watchdog"); outb() ..sequence to enable chip } disable_my_superio_dev() { outb() .. sequence of disable chip release_region(0x44, 0x02); } Signed-off-by: Giel van Schijndel Signed-off-by: Alan Cox Signed-off-by: Jesse Barnes --- include/linux/ioport.h | 4 +++- kernel/resource.c | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 26fad187d661..b22790268b64 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -52,6 +52,7 @@ struct resource_list { #define IORESOURCE_MEM_64 0x00100000 #define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */ +#define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */ #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 @@ -143,7 +144,8 @@ static inline unsigned long resource_type(const struct resource *res) } /* Convenience shorthand with allocation */ -#define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) +#define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) +#define request_muxed_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED) #define __request_mem_region(start,n,name, excl) __request_region(&iomem_resource, (start), (n), (name), excl) #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name), 0) #define request_mem_region_exclusive(start,n,name) \ diff --git a/kernel/resource.c b/kernel/resource.c index 9c358e263534..7b36976e5dea 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -681,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res) * release_region releases a matching busy region. */ +static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait); + /** * __request_region - create a new busy resource region * @parent: parent resource descriptor @@ -693,6 +696,7 @@ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { + DECLARE_WAITQUEUE(wait, current); struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (!res) @@ -717,7 +721,15 @@ struct resource * __request_region(struct resource *parent, if (!(conflict->flags & IORESOURCE_BUSY)) continue; } - + if (conflict->flags & flags & IORESOURCE_MUXED) { + add_wait_queue(&muxed_resource_wait, &wait); + write_unlock(&resource_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + remove_wait_queue(&muxed_resource_wait, &wait); + write_lock(&resource_lock); + continue; + } /* Uhhuh, that didn't work out.. 
*/ kfree(res); res = NULL; @@ -791,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start, break; *p = res->sibling; write_unlock(&resource_lock); + if (res->flags & IORESOURCE_MUXED) + wake_up(&muxed_resource_wait); kfree(res); return; } -- cgit v1.2.3-55-g7522 From 72d5a9f7a9542f88397558c65bcfc3b115a65e34 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Mon, 10 May 2010 17:12:17 -0700 Subject: rcu: remove all rcu head initializations, except on_stack initializations Remove all rcu head inits. We don't care about the RCU head state before passing it to call_rcu() anyway. Only leave the "on_stack" variants so debugobjects can keep track of objects on stack. Signed-off-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 1 - kernel/rcutiny.c | 6 ++++++ kernel/rcutorture.c | 2 ++ kernel/rcutree.c | 4 ++++ kernel/rcutree_plugin.h | 2 ++ 5 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b1ed1cd8e2a8..7996fc2c9ba9 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -49,7 +49,6 @@ extern struct group_info init_groups; { .first = &init_task.pids[PIDTYPE_PGID].node }, \ { .first = &init_task.pids[PIDTYPE_SID].node }, \ }, \ - .rcu = RCU_HEAD_INIT, \ .level = 0, \ .numbers = { { \ .nr = 0, \ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index b1804ff83d5e..38729d3cd236 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -245,11 +245,13 @@ void rcu_barrier(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier); @@ -257,11 +259,13 @@ void rcu_barrier_bh(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_bh(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier_bh); @@ -269,11 +273,13 @@ void rcu_barrier_sched(void) { struct rcu_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(rcu_barrier_sched); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 58df55bf83ed..077defb34571 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void) { struct rcu_bh_torture_synchronize rcu; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } static struct rcu_torture_ops rcu_bh_ops = { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ba6996943e28..d4437345706f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1484,11 +1484,13 @@ void synchronize_sched(void) if (rcu_blocking_is_gp()) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); /* Wait for it. 
*/ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -1508,11 +1510,13 @@ void synchronize_rcu_bh(void) if (rcu_blocking_is_gp()) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_bh(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ac7d80fa895c..0e4f420245d9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -557,11 +557,13 @@ void synchronize_rcu(void) if (!rcu_scheduler_active) return; + init_rcu_head_on_stack(&rcu.head); init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); /* Wait for it. */ wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); } EXPORT_SYMBOL_GPL(synchronize_rcu); -- cgit v1.2.3-55-g7522 From 34441427aab4bdb3069a4ffcda69a99357abcb2e Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 11 May 2010 14:06:46 -0700 Subject: revert "procfs: provide stack information for threads" and its fixup commits Originally, commit d899bf7b ("procfs: provide stack information for threads") attempted to introduce a new feature for showing where the threadstack was located and how many pages are being utilized by the stack. Commit c44972f1 ("procfs: disable per-task stack usage on NOMMU") was applied to fix the NO_MMU case. Commit 89240ba0 ("x86, fs: Fix x86 procfs stack information for threads on 64-bit") was applied to fix a bug in ia32 executables being loaded. Commit 9ebd4eba7 ("procfs: fix /proc/<pid>/stat stack pointer for kernel threads") was applied to fix a bug which had kernel threads printing a userland stack address. Commit 1306d603f ('proc: partially revert "procfs: provide stack information for threads"') was then applied to revert the reporting of stack page usage, in order to resolve a significant performance regression. This patch nearly undoes the effect of all these patches. The reason for reverting these is that they provide an unusable value in field 28. For x86_64, a fork will result in the task->stack_start value being updated to the current user top of stack and not the stack start address. This unpredictability of the stack_start value makes it worthless. That includes its intended use of showing how much stack space a thread has. Other architectures will get different values. As an example, ia64 gets 0. The do_fork() and copy_process() functions appear to treat the stack_start and stack_size parameters as architecture-specific. I only partially reverted c44972f1 ("procfs: disable per-task stack usage on NOMMU"). If I had completely reverted it, I would have had to change mm/Makefile to only build pagewalk.o when CONFIG_PROC_PAGE_MONITOR is configured. Since I could not test the builds without significant effort, I decided not to change mm/Makefile. I only partially reverted 89240ba0 ("x86, fs: Fix x86 procfs stack information for threads on 64-bit"). I left the KSTK_ESP() change in place as that seemed worthwhile.
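A hypothetical user-space probe of the affected field (illustrative only; startstack is the 28th space-separated field of /proc/<pid>/stat, and after this revert it once again reports mm->start_stack, as the fs/proc/array.c hunk below shows):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[1024], *tok;
		int i;
		FILE *f = fopen("/proc/self/stat", "r");

		if (f && fgets(buf, sizeof(buf), f)) {
			tok = strtok(buf, " ");
			for (i = 1; i < 28 && tok; i++)
				tok = strtok(NULL, " ");	/* skip to field 28 */
			printf("startstack: %s\n", tok ? tok : "?");
		}
		if (f)
			fclose(f);
		return 0;
	}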
Signed-off-by: Robin Holt Cc: Stefani Seibold Cc: KOSAKI Motohiro Cc: Michal Simek Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +-- fs/compat.c | 2 -- fs/exec.c | 2 -- fs/proc/array.c | 3 +-- fs/proc/task_mmu.c | 19 ------------------- include/linux/sched.h | 1 - kernel/fork.c | 2 -- 7 files changed, 2 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a4f30faa4f1f..1e359b62c40a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -316,7 +316,7 @@ address perms offset dev inode pathname 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] a7cb1000-a7cb2000 ---p 00000000 00:00 0 -a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] +a7cb2000-a7eb2000 rw-p 00000000 00:00 0 a7eb2000-a7eb3000 ---p 00000000 00:00 0 a7eb3000-a7ed5000 rw-p 00000000 00:00 0 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 @@ -352,7 +352,6 @@ is not associated with a file: [stack] = the stack of the main process [vdso] = the "virtual dynamic shared object", the kernel system call handler - [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size or if empty, the mapping is anonymous. diff --git a/fs/compat.c b/fs/compat.c index 4b6ed03cc478..05448730f840 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1531,8 +1531,6 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; - current->stack_start = current->mm->start_stack; - /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/exec.c b/fs/exec.c index 49cdaa19e5b9..e6e94c626c2c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1387,8 +1387,6 @@ int do_execve(char * filename, if (retval < 0) goto out; - current->stack_start = current->mm->start_stack; - /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/proc/array.c b/fs/proc/array.c index e51f2ec2c5e5..885ab5513ac5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,7 +81,6 @@ #include #include #include -#include #include #include @@ -495,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - (permitted && mm) ? task->stack_start : 0, + (permitted && mm) ? mm->start_stack : 0, esp, eip, /* The signal information here is obsolete. 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 070553427dd5..47f5b145f56e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -247,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) } else if (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack) { name = "[stack]"; - } else { - unsigned long stack_start; - struct proc_maps_private *pmp; - - pmp = m->private; - stack_start = pmp->task->stack_start; - - if (vma->vm_start <= stack_start && - vma->vm_end >= stack_start) { - pad_len_spaces(m, len); - seq_printf(m, - "[threadstack:%08lx]", -#ifdef CONFIG_STACK_GROWSUP - vma->vm_end - stack_start -#else - stack_start - vma->vm_start -#endif - ); - } } } else { name = "[vdso]"; diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf7..2b7b81df78b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1497,7 +1497,6 @@ struct task_struct { /* bitmask of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ - unsigned long stack_start; #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ struct memcg_batch_info { int do_batch; /* incremented when batch uncharge started */ diff --git a/kernel/fork.c b/kernel/fork.c index 44b0791b0a2e..4c14942a0ee3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1114,8 +1114,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->bts = NULL; - p->stack_start = stack_start; - /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); -- cgit v1.2.3-55-g7522 From 475f9aa6aa538befcbd0fa95bdebada600f247cd Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Tue, 11 May 2010 14:06:51 -0700 Subject: kexec: fix OOPS in crash_kernel_shrink Writing "echo 0 > /sys/kernel/kexec_crash_size" twice OOPSes the kernel. Also, the content of this file is invalid after the first shrink to zero: it shows 1 instead of 0. This scenario is unlikely to happen often (root privs, valid crashkernel= in cmdline, dump-capture kernel not loaded); I hit it only by chance. This patch fixes it. Signed-off-by: Vitaly Mayatskikh Cc: Cong Wang Cc: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 87ebe8adc474..474a84715eac 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1134,11 +1134,9 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) { - crashk_res.end = end; + if (start == end) release_resource(&crashk_res); - } else - crashk_res.end = end - 1; + crashk_res.end = end - 1; unlock: mutex_unlock(&kexec_mutex); -- cgit v1.2.3-55-g7522 From 11cad320a4f4bc53d3585c85600c782faa12b99e Mon Sep 17 00:00:00 2001 From: Vitaliy Gusev Date: Tue, 11 May 2010 14:06:56 -0700 Subject: bsdacct: use del_timer_sync() in acct_exit_ns() acct_exit_ns() --> acct_file_reopen() deletes the timer without checking whether the timer handler is still running on other CPUs, so acct_timeout() can end up writing to unmapped memory.
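The teardown pattern the patch switches to is worth spelling out. A minimal sketch, with a hypothetical context struct standing in for bsd_acct_struct:

#include <linux/timer.h>
#include <linux/slab.h>

struct acct_ctx {			/* hypothetical, for illustration */
	struct timer_list timer;
	/* ... accounting state ... */
};

static void acct_ctx_destroy(struct acct_ctx *ctx)
{
	/*
	 * del_timer() only removes a pending timer; a handler that is
	 * already running on another CPU keeps dereferencing *ctx.
	 * del_timer_sync() additionally waits for such a handler to
	 * finish, so the kfree() below cannot race with the callback.
	 */
	del_timer_sync(&ctx->timer);
	kfree(ctx);
}

Note how the patch below calls del_timer_sync() before taking acct_lock rather than under it.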
Signed-off-by: Vitaliy Gusev Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 24f8c81fc48d..e4c0e1fee9b0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -353,17 +353,18 @@ restart: void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct; + struct bsd_acct_struct *acct = ns->bacct; - spin_lock(&acct_lock); - acct = ns->bacct; - if (acct != NULL) { - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); + if (acct == NULL) + return; - kfree(acct); - } + del_timer_sync(&acct->timer); + spin_lock(&acct_lock); + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); spin_unlock(&acct_lock); + + kfree(acct); } /* -- cgit v1.2.3-55-g7522 From 7f0f15464185a92f9d8791ad231bcd7bf6df54e4 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 11 May 2010 14:06:58 -0700 Subject: memcg: fix css_id() RCU locking for real Commit ad4ba375373937817404fd92239ef4cadbded23b ("memcg: css_id() must be called under rcu_read_lock()") modifies memcontrol.c to fix an RCU check message. But Andrew Morton pointed out that the fix doesn't seem sane and was just hiding lockdep messages. This patch does things properly. Checking again, all the places accessing css_id() without rcu_read_lock() that the commit fixed were intentional: all callers of css_id() hold a reference count on it, so it's not necessary to be under rcu_read_lock(). Considering again, we can use rcu_dereference_check() for css_id(). We know css->id is valid if css->refcnt > 0. (css->id never changes and is freed only after css->refcnt drops to 0.) This patch makes use of rcu_dereference_check() in css_id/depth and removes the unnecessary rcu_read_lock() calls added by that commit. Signed-off-by: KAMEZAWA Hiroyuki Cc: "Paul E. McKenney" Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 15 +++++++++++++-- mm/memcontrol.c | 19 +++++-------------- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3a53c771e503..6db8b7f297a1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4435,7 +4435,15 @@ __setup("cgroup_disable=", cgroup_disable); */ unsigned short css_id(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + /* + * This css_id() can return correct value when someone has refcnt + * on this or this is under rcu_read_lock(). Once css->id is allocated, + * it's unchanged until freed.
+ */ + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->id; @@ -4445,7 +4453,10 @@ EXPORT_SYMBOL_GPL(css_id); unsigned short css_depth(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->depth; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f711c213d2e..595d03f33b2c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2314,9 +2314,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) /* record memcg information */ if (do_swap_account && swapout && memcg) { - rcu_read_lock(); swap_cgroup_record(ent, css_id(&memcg->css)); - rcu_read_unlock(); mem_cgroup_get(memcg); } if (swapout && memcg) @@ -2373,10 +2371,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, { unsigned short old_id, new_id; - rcu_read_lock(); old_id = css_id(&from->css); new_id = css_id(&to->css); - rcu_read_unlock(); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { mem_cgroup_swap_statistics(from, false); @@ -4044,16 +4040,11 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, put_page(page); } /* throught */ - if (ent.val && do_swap_account && !ret) { - unsigned short id; - rcu_read_lock(); - id = css_id(&mc.from->css); - rcu_read_unlock(); - if (id == lookup_swap_cgroup(ent)) { - ret = MC_TARGET_SWAP; - if (target) - target->ent = ent; - } + if (ent.val && do_swap_account && !ret && + css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + ret = MC_TARGET_SWAP; + if (target) + target->ent = ent; } return ret; } -- cgit v1.2.3-55-g7522 From 747388d78a0ae768fd82b55c4ed38aa646a72364 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 11 May 2010 14:06:59 -0700 Subject: memcg: fix css_is_ancestor() RCU locking Some callers (in memcontrol.c) call css_is_ancestor() without rcu_read_lock. Because css_is_ancestor() has to access RCU protected data, it should be under rcu_read_lock(). This makes css_is_ancestor() itself do safe access to the RCU protected area. (At least, "root" can have refcnt==0 if it's not an ancestor of "child". So, we need rcu_read_lock().) Signed-off-by: KAMEZAWA Hiroyuki Cc: "Paul E. McKenney" Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 31 ++++++++++++++++++++++++++----- mm/memcontrol.c | 4 ---- 2 files changed, 26 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6db8b7f297a1..6d870f2d1228 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4464,15 +4464,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css) } EXPORT_SYMBOL_GPL(css_depth); +/** + * css_is_ancestor - test whether "root" css is an ancestor of "child" + * @child: the css to be tested. + * @root: the css supposed to be an ancestor of the child. + * + * Returns true if "root" is an ancestor of "child" in its hierarchy. Because + * this function reads css->id, this uses rcu_dereference() and rcu_read_lock(). + * But, considering usual usage, the csses should be valid objects after the test. + * Assuming that the caller will do some action to the child if this + * returns true, the caller must take "child"'s reference count. + * If "child" is a valid object and this returns true, "root" is valid, too.
+ */ + bool css_is_ancestor(struct cgroup_subsys_state *child, const struct cgroup_subsys_state *root) { - struct css_id *child_id = rcu_dereference(child->id); - struct css_id *root_id = rcu_dereference(root->id); + struct css_id *child_id; + struct css_id *root_id; + bool ret = true; - if (!child_id || !root_id || (child_id->depth < root_id->depth)) - return false; - return child_id->stack[root_id->depth] == root_id->id; + rcu_read_lock(); + child_id = rcu_dereference(child->id); + root_id = rcu_dereference(root->id); + if (!child_id + || !root_id + || (child_id->depth < root_id->depth) + || (child_id->stack[root_id->depth] != root_id->id)) + ret = false; + rcu_read_unlock(); + return ret; } static void __free_css_id_cb(struct rcu_head *head) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 595d03f33b2c..8a79a6f0f029 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -811,12 +811,10 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) * enabled in "curr" and "curr" is a child of "mem" in *cgroup* * hierarchy(even if use_hierarchy is disabled in "mem"). */ - rcu_read_lock(); if (mem->use_hierarchy) ret = css_is_ancestor(&curr->css, &mem->css); else ret = (curr == mem); - rcu_read_unlock(); css_put(&curr->css); return ret; } @@ -1603,7 +1601,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * There is a small race that "from" or "to" can be * freed by rmdir, so we use css_tryget(). */ - rcu_read_lock(); from = mc.from; to = mc.to; if (from && css_tryget(&from->css)) { @@ -1624,7 +1621,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, do_continue = (to == mem_over_limit); css_put(&to->css); } - rcu_read_unlock(); if (do_continue) { DEFINE_WAIT(wait); prepare_to_wait(&mc.waitq, &wait, -- cgit v1.2.3-55-g7522 From 4308ad801193f14ff42cb746da37cf07e35f0d08 Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Wed, 5 May 2010 13:56:42 -0700 Subject: genirq: Clear CPU mask in affinity_hint when none is provided When an interrupt is disabled and torn down, the CPU mask returned through affinity_hint right now is all CPUs. Also, for drivers that don't provide an affinity_hint mask, this can be misleading. There should be no hint at all, meaning an empty CPU mask. 
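The fix relies on the allocation itself providing the empty default. A simplified sketch of the reporting path after the patch (condensed from irq_affinity_hint_proc_show(); the helper name is ours):

#include <linux/cpumask.h>
#include <linux/seq_file.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int show_affinity_hint(struct seq_file *m, const struct cpumask *hint)
{
	cpumask_var_t mask;

	/* zalloc_cpumask_var() hands back a zeroed mask, not garbage */
	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	if (hint)
		cpumask_copy(mask, hint);
	/* no else branch: an absent hint is shown as the empty set */
	seq_cpumask(m, mask);
	free_cpumask_var(mask);
	return 0;
}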
[ tglx: use zalloc_cpumask_var instead of clearing it under the lock ] Signed-off-by: Peter P Waskiewicz Jr Cc: davem@davemloft.net Cc: arjan@linux.jf.intel.com Cc: bhutchings@solarflare.com LKML-Reference: <20100505205638.5426.87189.stgit@ppwaskie-hc2.jf.intel.com> Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4f9427a30e14..09a2ee540bd2 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -38,14 +38,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) unsigned long flags; cpumask_var_t mask; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; raw_spin_lock_irqsave(&desc->lock, flags); if (desc->affinity_hint) cpumask_copy(mask, desc->affinity_hint); - else - cpumask_setall(mask); raw_spin_unlock_irqrestore(&desc->lock, flags); seq_cpumask(m, mask); -- cgit v1.2.3-55-g7522 From 16a2164bb03612efe79a76c73da6da44445b9287 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 14 May 2010 19:44:10 -0700 Subject: profile: fix stats and data leakage If the kernel is large or the profiling step small, /proc/profile leaks data and readprofile shows silly stats, until readprofile -r has reset the buffer: clear the prof_buffer when it is vmalloc()ed. Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/profile.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index a55d3a367ae8..dfadc5b729f1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -127,8 +127,10 @@ int __ref profile_init(void) return 0; prof_buffer = vmalloc(buffer_bytes); - if (prof_buffer) + if (prof_buffer) { + memset(prof_buffer, 0, buffer_bytes); return 0; + } free_cpumask_var(prof_cpu_mask); return -ENOMEM; -- cgit v1.2.3-55-g7522 From 00b7c3395aec3df43de5bd02a3c5a099ca51169f Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 5 May 2010 00:26:45 +0000 Subject: sysctl: refactor integer handling proc code (Based on Octavian's work, and I modified a lot.) As we are about to add another integer handling proc function a little bit of cleanup is in order: add a few helper functions to improve code readability and decrease code duplication. In the process a bug is also fixed: if the user specifies a number with more than 20 digits it will be interpreted as two integers (e.g. 10000...13 will be interpreted as 100.... and 13). Behavior for EFAULT handling was changed as well. Prior to this patch, when an EFAULT error occurred in the middle of a write operation, although some of the elements were set, that was not acknowledged to the user (by doing a short write and returning the number of bytes accepted). EFAULT is now treated just like any other error, by acknowledging the number of bytes accepted. Signed-off-by: Octavian Purdila Signed-off-by: WANG Cong Cc: Eric W. Biederman Signed-off-by: David S.
Miller --- kernel/sysctl.c | 390 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 229 insertions(+), 161 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..4a976208de29 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2040,8 +2040,122 @@ int proc_dostring(struct ctl_table *table, int write, buffer, lenp, ppos); } +static size_t proc_skip_spaces(char **buf) +{ + size_t ret; + char *tmp = skip_spaces(*buf); + ret = tmp - *buf; + *buf = tmp; + return ret; +} + +#define TMPBUFLEN 22 +/** + * proc_get_long - reads an ASCII formated integer from a user buffer + * + * @buf - a kernel buffer + * @size - size of the kernel buffer + * @val - this is where the number will be stored + * @neg - set to %TRUE if number is negative + * @perm_tr - a vector which contains the allowed trailers + * @perm_tr_len - size of the perm_tr vector + * @tr - pointer to store the trailer character + * + * In case of success 0 is returned and buf and size are updated with + * the amount of bytes read. If tr is non NULL and a trailing + * character exist (size is non zero after returning from this + * function) tr is updated with the trailing character. + */ +static int proc_get_long(char **buf, size_t *size, + unsigned long *val, bool *neg, + const char *perm_tr, unsigned perm_tr_len, char *tr) +{ + int len; + char *p, tmp[TMPBUFLEN]; + + if (!*size) + return -EINVAL; + + len = *size; + if (len > TMPBUFLEN - 1) + len = TMPBUFLEN - 1; + + memcpy(tmp, *buf, len); + + tmp[len] = 0; + p = tmp; + if (*p == '-' && *size > 1) { + *neg = true; + p++; + } else + *neg = false; + if (!isdigit(*p)) + return -EINVAL; + + *val = simple_strtoul(p, &p, 0); + + len = p - tmp; + + /* We don't know if the next char is whitespace thus we may accept + * invalid integers (e.g. 1234...a) or two integers instead of one + * (e.g. 123...1). So lets not allow such large numbers. */ + if (len == TMPBUFLEN - 1) + return -EINVAL; + + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) + return -EINVAL; -static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, + if (tr && (len < *size)) + *tr = *p; + + *buf += len; + *size -= len; + + return 0; +} + +/** + * proc_put_long - coverts an integer to a decimal ASCII formated string + * + * @buf - the user buffer + * @size - the size of the user buffer + * @val - the integer to be converted + * @neg - sign of the number, %TRUE for negative + * + * In case of success 0 is returned and buf and size are updated with + * the amount of bytes read. + */ +static int proc_put_long(void __user **buf, size_t *size, unsigned long val, + bool neg) +{ + int len; + char tmp[TMPBUFLEN], *p = tmp; + + sprintf(p, "%s%lu", neg ? 
"-" : "", val); + len = strlen(tmp); + if (len > *size) + len = *size; + if (copy_to_user(*buf, tmp, len)) + return -EFAULT; + *size -= len; + *buf += len; + return 0; +} +#undef TMPBUFLEN + +static int proc_put_char(void __user **buf, size_t *size, char c) +{ + if (*size) { + char __user **buffer = (char __user **)buf; + if (put_user(c, *buffer)) + return -EFAULT; + (*size)--, (*buffer)++; + *buf = *buffer; + } + return 0; +} + +static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2050,33 +2164,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } else { int val = *valp; if (val < 0) { - *negp = -1; + *negp = true; *lvalp = (unsigned long)-val; } else { - *negp = 0; + *negp = false; *lvalp = (unsigned long)val; } } return 0; } +static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; + static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { -#define TMPBUFLEN 21 - int *i, vleft, first = 1, neg; - unsigned long lval; - size_t left, len; - - char buf[TMPBUFLEN], *p; - char __user *s = buffer; + int *i, vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; - if (!tbl_data || !table->maxlen || !*lenp || - (*ppos && !write)) { + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } @@ -2088,89 +2200,69 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!conv) conv = do_proc_dointvec_conv; + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + for (; left && vleft--; i++, first=0) { - if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } - if (!left) - break; - neg = 0; - len = left; - if (len > sizeof(buf) - 1) - len = sizeof(buf) - 1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') - break; + unsigned long lval; + bool neg; - lval = simple_strtoul(p, &p, 0); + if (write) { + left -= proc_skip_spaces(&kbuf); - len = p-buf; - if ((len < left) && *p && !isspace(*p)) + err = proc_get_long(&kbuf, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) break; - s += len; - left -= len; - - if (conv(&neg, &lval, i, 1, data)) + if (conv(&neg, &lval, i, 1, data)) { + err = -EINVAL; break; + } } else { - p = buf; + if (conv(&neg, &lval, i, 0, data)) { + err = -EINVAL; + break; + } if (!first) - *p++ = '\t'; - - if (conv(&neg, &lval, i, 0, data)) + err = proc_put_char(&buffer, &left, '\t'); + if (err) + break; + err = proc_put_long(&buffer, &left, lval, neg); + if (err) break; - - sprintf(p, "%s%lu", neg ? 
"-" : "", lval); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; } } - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&kbuf); +free: if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } + free_page(page); + if (first) + return err ? : -EINVAL; } - if (write && first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; -#undef TMPBUFLEN + return err; } static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { @@ -2238,8 +2330,8 @@ struct do_proc_dointvec_minmax_conv_param { int *max; }; -static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, - int *valp, +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) { struct do_proc_dointvec_minmax_conv_param *param = data; @@ -2252,10 +2344,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, } else { int val = *valp; if (val < 0) { - *negp = -1; + *negp = true; *lvalp = (unsigned long)-val; } else { - *negp = 0; + *negp = false; *lvalp = (unsigned long)val; } } @@ -2295,102 +2387,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int unsigned long convmul, unsigned long convdiv) { -#define TMPBUFLEN 21 - unsigned long *i, *min, *max, val; - int vleft, first=1, neg; - size_t len, left; - char buf[TMPBUFLEN], *p; - char __user *s = buffer; - - if (!data || !table->maxlen || !*lenp || - (*ppos && !write)) { + unsigned long *i, *min, *max; + int vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; + + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; - + + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + for (; left && vleft--; i++, min++, max++, first=0) { + unsigned long val; + if (write) { - while (left) { - char c; - if (get_user(c, s)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - s++; - } - if (!left) - break; - neg = 0; - len = left; - if (len > TMPBUFLEN-1) - len = TMPBUFLEN-1; - if (copy_from_user(buf, s, len)) - return -EFAULT; - buf[len] = 0; - p = buf; - if (*p == '-' && left > 1) { - neg = 1; - p++; - } - if (*p < '0' || *p > '9') - break; - val = simple_strtoul(p, &p, 0) * convmul / convdiv ; - len = p-buf; - if ((len < left) && *p && !isspace(*p)) + bool neg; + + left -= proc_skip_spaces(&kbuf); + + err = proc_get_long(&kbuf, &left, &val, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) break; if (neg) - val = -val; - s += len; - left -= len; - - if(neg) continue; if ((min && val < *min) || (max && val > *max)) continue; *i = val; } else { - p = buf; + val = convdiv * (*i) / convmul; if 
(!first) - *p++ = '\t'; - sprintf(p, "%lu", convdiv * (*i) / convmul); - len = strlen(buf); - if (len > left) - len = left; - if(copy_to_user(s, buf, len)) - return -EFAULT; - left -= len; - s += len; + err = proc_put_char(&buffer, &left, '\t'); + err = proc_put_long(&buffer, &left, val, false); + if (err) + break; } } - if (!write && !first && left) { - if(put_user('\n', s)) - return -EFAULT; - left--, s++; - } + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&kbuf); +free: if (write) { - while (left) { - char c; - if (get_user(c, s++)) - return -EFAULT; - if (!isspace(c)) - break; - left--; - } + free_page(page); + if (first) + return err ? : -EINVAL; } - if (write && first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; -#undef TMPBUFLEN + return err; } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, @@ -2451,7 +2519,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, } -static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2463,10 +2531,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = lval / HZ; @@ -2474,7 +2542,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2486,10 +2554,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_clock_t(lval); @@ -2497,7 +2565,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { @@ -2507,10 +2575,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, int val = *valp; unsigned long lval; if (val < 0) { - *negp = -1; + *negp = true; lval = (unsigned long)-val; } else { - *negp = 0; + *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_msecs(lval); -- cgit v1.2.3-55-g7522 From 9f977fb7ae9ddf565b4800854212fb9a1ed6c2ea Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 5 May 2010 00:26:55 +0000 Subject: sysctl: add proc_do_large_bitmap The new function can be used to read/write large bitmaps via /proc. A comma separated range format is used for compact output and input (e.g. 1,3-4,10-10). Writing into the file will first reset the bitmap then update it based on the given input. Signed-off-by: Octavian Purdila Signed-off-by: WANG Cong Cc: Eric W. Biederman Signed-off-by: David S. 
Miller --- include/linux/sysctl.h | 2 + kernel/sysctl.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f66014c90c9f..7bb5cb64f3b8 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -980,6 +980,8 @@ extern int proc_doulongvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void __user *, size_t *, loff_t *); +extern int proc_do_large_bitmap(struct ctl_table *, int, + void __user *, size_t *, loff_t *); /* * Register a set of sysctl names by calling register_sysctl_table diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4a976208de29..bcfb79e94ec7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2049,6 +2049,16 @@ static size_t proc_skip_spaces(char **buf) return ret; } +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formated integer from a user buffer @@ -2675,6 +2685,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, return 0; } +/** + * proc_do_large_bitmap - read/write from/to a large bitmap + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * The bitmap is stored at table->data and the bitmap length (in bits) + * in table->maxlen. + * + * We use a range comma separated format (e.g. 1,3-4,10-10) so that + * large bitmaps may be represented in a compact manner. Writing into + * the file will clear the bitmap then update it with the given input. + * + * Returns 0 on success. 
+ */ +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + unsigned long page = 0; + char *kbuf; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + free_page(page); + return -EFAULT; + } + kbuf[left] = 0; + + tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + GFP_KERNEL); + if (!tmp_bitmap) { + free_page(page); + return -ENOMEM; + } + proc_skip_char(&kbuf, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + + err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + kbuf++; + left--; + } + + if (c == '-') { + err = proc_get_long(&kbuf, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + kbuf++; + left--; + } + } + + while (val_a <= val_b) + set_bit(val_a++, tmp_bitmap); + + first = 0; + proc_skip_char(&kbuf, &left, '\n'); + } + free_page(page); + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) { + err = proc_put_char(&buffer, &left, ','); + if (err) + break; + } + err = proc_put_long(&buffer, &left, bit_a, false); + if (err) + break; + if (bit_a != bit_b) { + err = proc_put_char(&buffer, &left, '-'); + if (err) + break; + err = proc_put_long(&buffer, &left, bit_b, false); + if (err) + break; + } + + first = 0; bit_b++; + } + if (!err) + err = proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + memcpy(bitmap, tmp_bitmap, + BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + } + kfree(tmp_bitmap); + *lenp -= left; + *ppos += *lenp; + return 0; + } else { + kfree(tmp_bitmap); + return err; + } +} + #else /* CONFIG_PROC_FS */ int proc_dostring(struct ctl_table *table, int write, -- cgit v1.2.3-55-g7522 From 25f3a5a2854dce8b8413fd24cc9d5b9e3632be54 Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Mon, 17 May 2010 00:21:03 +0200 Subject: PM: PM QOS update fix This update handles a use case where pm_qos update requests need to silently fail if the update is being sent to a handle that is NULL. The problem was that the original pm_qos silently fails when a request update is passed to a parameter that has not been added to the list yet. This update restores that behavior. Signed-off-by: markgross Signed-off-by: Rafael J. 
Wysocki --- kernel/pm_qos_params.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index a1aea040eb57..f42d3f737a33 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -252,19 +252,21 @@ void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, int pending_update = 0; s32 temp; - spin_lock_irqsave(&pm_qos_lock, flags); - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->value) { - pending_update = 1; - pm_qos_req->value = temp; + if (pm_qos_req) { /* guard against callers passing in null */ + spin_lock_irqsave(&pm_qos_lock, flags); + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->value) { + pending_update = 1; + pm_qos_req->value = temp; + } + spin_unlock_irqrestore(&pm_qos_lock, flags); + if (pending_update) + update_target(pm_qos_req->pm_qos_class); } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_req->pm_qos_class); } EXPORT_SYMBOL_GPL(pm_qos_update_request); -- cgit v1.2.3-55-g7522 From ab3c68ee5fd329ba48094d3417fd60e30ea14a87 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 17 May 2010 10:00:21 +0200 Subject: [S390] debug: enable exception-trace debug facility The exception-trace facility on x86 and other architectures prints traces to dmesg whenever a user space application crashes. s390 has had such a feature for ages; however, it is called userprocess_debug and is enabled differently. This patch makes sure that whenever one of the two procfs files /proc/sys/kernel/userprocess_debug /proc/sys/debug/exception-trace is modified, the contents of the other one change as well. That way we keep backwards compatibility but also support the same interface as other architectures do. Besides that, the output of the traces is improved since it will now also contain the corresponding filename of the vma (when available) where the process caused a fault or trap. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 7 ------- arch/s390/kernel/traps.c | 31 +++++++++++++------------------ arch/s390/mm/fault.c | 32 +++++++++++++++++--------------- kernel/sysctl.c | 5 +++-- 4 files changed, 33 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0d8cd9bbe101..79d0ca086820 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -444,13 +444,6 @@ config FORCE_MAX_ZONEORDER int default "9" -config PROCESS_DEBUG - bool "Show crashed user process info" - help - Say Y to print all process fault locations to the console. This is - a debugging option; you probably do not want to set it unless you - are an S390 port maintainer.
- config PFAULT bool "Pseudo page fault support" help diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index e605f070610c..5d8f0f3d0250 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -46,13 +46,7 @@ pgm_check_handler_t *pgm_check_table[128]; -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_PROCESS_DEBUG -int sysctl_userprocess_debug = 1; -#else -int sysctl_userprocess_debug = 0; -#endif -#endif +int show_unhandled_signals; extern pgm_check_handler_t do_protection_exception; extern pgm_check_handler_t do_dat_exception; @@ -315,18 +309,19 @@ void die(const char * str, struct pt_regs * regs, long err) do_exit(SIGSEGV); } -static void inline -report_user_fault(long interruption_code, struct pt_regs *regs) +static void inline report_user_fault(struct pt_regs *regs, long int_code, + int signr) { -#if defined(CONFIG_SYSCTL) - if (!sysctl_userprocess_debug) + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; -#endif -#if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG) - printk("User process fault: interruption code 0x%lX\n", - interruption_code); + if (!unhandled_signal(current, signr)) + return; + if (!printk_ratelimit()) + return; + printk("User process fault: interruption code 0x%lX ", int_code); + print_vma_addr("in ", regs->psw.addr & PSW_ADDR_INSN); + printk("\n"); show_regs(regs); -#endif } int is_valid_bugaddr(unsigned long addr) @@ -354,7 +349,7 @@ static void __kprobes inline do_trap(long interruption_code, int signr, tsk->thread.trap_no = interruption_code & 0xffff; force_sig_info(signr, info, tsk); - report_user_fault(interruption_code, regs); + report_user_fault(regs, interruption_code, signr); } else { const struct exception_table_entry *fixup; fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); @@ -390,7 +385,7 @@ static void default_trap_handler(struct pt_regs * regs, long interruption_code) { if (regs->psw.mask & PSW_MASK_PSTATE) { local_irq_enable(); - report_user_fault(interruption_code, regs); + report_user_fault(regs, interruption_code, SIGSEGV); do_exit(SIGSEGV); } else die("Unknown program exception", regs, interruption_code); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 3040d7c78fe0..2505b2ea0ef1 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -48,10 +48,6 @@ #define __PF_RES_FIELD 0x8000000000000000ULL #endif /* CONFIG_64BIT */ -#ifdef CONFIG_SYSCTL -extern int sysctl_userprocess_debug; -#endif - #define VM_FAULT_BADCONTEXT 0x010000 #define VM_FAULT_BADMAP 0x020000 #define VM_FAULT_BADACCESS 0x040000 @@ -120,6 +116,22 @@ static inline int user_space_fault(unsigned long trans_exc_code) return trans_exc_code != 3; } +static inline void report_user_fault(struct pt_regs *regs, long int_code, + int signr, unsigned long address) +{ + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) + return; + if (!unhandled_signal(current, signr)) + return; + if (!printk_ratelimit()) + return; + printk("User process fault: interruption code 0x%lX ", int_code); + print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN); + printk("\n"); + printk("failing address: %lX\n", address); + show_regs(regs); +} + /* * Send SIGSEGV to task. This is an external routine * to keep the stack usage of do_page_fault small. 
@@ -133,17 +145,7 @@ static noinline void do_sigsegv(struct pt_regs *regs, long int_code, address = trans_exc_code & __FAIL_ADDR_MASK; current->thread.prot_addr = address; current->thread.trap_no = int_code; -#if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG) -#if defined(CONFIG_SYSCTL) - if (sysctl_userprocess_debug) -#endif - { - printk("User process fault: interruption code 0x%lX\n", - int_code); - printk("failing address: %lX\n", address); - show_regs(regs); - } -#endif + report_user_fault(regs, int_code, SIGSEGV, address); si.si_signo = SIGSEGV; si.si_code = si_code; si.si_addr = (void __user *) address; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..90f536d84643 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "userprocess_debug", - .data = &sysctl_userprocess_debug, + .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1431,7 +1431,8 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) +#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ + defined(CONFIG_S390) { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.2.3-55-g7522 From 9c6f7e43b4e02c161b53e97ba913855246876c61 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 18 May 2010 00:17:44 +0200 Subject: stop_machine: Move local variable closer to the usage site in cpu_stop_cpu_callback() This addresses the following compiler warning: kernel/stop_machine.c: In function 'cpu_stop_cpu_callback': kernel/stop_machine.c:297: warning: unused variable 'work' Cc: Tejun Heo Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ef51d1fcf5e6..b4e7431e7c78 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -294,7 +294,6 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; unsigned int cpu = (unsigned long)hcpu; struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - struct cpu_stop_work *work; struct task_struct *p; switch (action & ~CPU_TASKS_FROZEN) { @@ -323,6 +322,9 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_DEAD: + { + struct cpu_stop_work *work; + /* kill the stopper */ kthread_stop(stopper->thread); /* drain remaining works */ @@ -335,6 +337,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, put_task_struct(stopper->thread); stopper->thread = NULL; break; + } #endif } -- cgit v1.2.3-55-g7522 From b2be05273a1744d175bf4b67f6665637bb9ac7a8 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 3 Apr 2010 19:34:56 +0100 Subject: panic: Allow warnings to set different taint flags WARN() is used in some places to report firmware or hardware bugs that are then worked-around. These bugs do not affect the stability of the kernel and should not set the flag for TAINT_WARN. To allow for this, add WARN_TAINT() and WARN_TAINT_ONCE() macros that take a taint number as argument. Architectures that implement warnings using trap instructions instead of calls to warn_slowpath_*() now implement __WARN_TAINT(taint) instead of __WARN(). 
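As a usage illustration (hypothetical driver code, not from this patch; TAINT_FIRMWARE_WORKAROUND is only introduced by the next patch in this series):

#include <linux/kernel.h>
#include <linux/bug.h>

static void check_bios_table(const void *table)
{
	/* Warn and taint with a specific flag instead of TAINT_WARN. */
	WARN_TAINT(table == NULL, TAINT_FIRMWARE_WORKAROUND,
		   "BIOS left the table unset, using defaults\n");
}

Like WARN(), WARN_TAINT(condition, taint, format...) evaluates to the truth value of the condition, so it can also be used directly inside an if ().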
Signed-off-by: Ben Hutchings Acked-by: Helge Deller Tested-by: Paul Mundt Signed-off-by: David Woodhouse --- Documentation/oops-tracing.txt | 1 + arch/parisc/include/asm/bug.h | 8 ++++---- arch/powerpc/include/asm/bug.h | 6 +++--- arch/s390/include/asm/bug.h | 8 ++++---- arch/sh/include/asm/bug.h | 4 ++-- include/asm-generic/bug.h | 34 ++++++++++++++++++++++++++++++++-- kernel/panic.c | 24 ++++++++++++++++++++---- lib/bug.c | 2 +- 8 files changed, 67 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index c10c022b911c..069fab3ea4d6 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -256,6 +256,7 @@ characters, each representing a particular tainted value. 9: 'A' if the ACPI table has been overridden. 10: 'W' if a warning has previously been issued by the kernel. + (Though some warnings may set more specific taint flags.) 11: 'C' if a staging driver has been loaded. diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h index 75e46c557a16..72cfdb0cfdd1 100644 --- a/arch/parisc/include/asm/bug.h +++ b/arch/parisc/include/asm/bug.h @@ -44,7 +44,7 @@ #endif #ifdef CONFIG_DEBUG_BUGVERBOSE -#define __WARN() \ +#define __WARN_TAINT(taint) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ @@ -54,11 +54,11 @@ "\t.org 2b+%c3\n" \ "\t.popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_WARNING), \ + "i" (BUGFLAG_TAINT(taint)), \ "i" (sizeof(struct bug_entry)) ); \ } while(0) #else -#define __WARN() \ +#define __WARN_TAINT(taint) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ @@ -67,7 +67,7 @@ "\t.short %c0\n" \ "\t.org 2b+%c1\n" \ "\t.popsection" \ - : : "i" (BUGFLAG_WARNING), \ + : : "i" (BUGFLAG_TAINT(taint)), \ "i" (sizeof(struct bug_entry)) ); \ } while(0) #endif diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 2c15212e1700..065c590c991d 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -85,12 +85,12 @@ } \ } while (0) -#define __WARN() do { \ +#define __WARN_TAINT(taint) do { \ __asm__ __volatile__( \ "1: twi 31,0,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_WARNING), \ + "i" (BUGFLAG_TAINT(taint)), \ "i" (sizeof(struct bug_entry))); \ } while (0) @@ -104,7 +104,7 @@ "1: "PPC_TLNEI" %4,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_WARNING), \ + "i" (BUGFLAG_TAINT(TAINT_WARN)), \ "i" (sizeof(struct bug_entry)), \ "r" (__ret_warn_on)); \ } \ diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 9beeb9db9b23..bf90d1fd97a5 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -46,18 +46,18 @@ unreachable(); \ } while (0) -#define __WARN() do { \ - __EMIT_BUG(BUGFLAG_WARNING); \ +#define __WARN_TAINT(taint) do { \ + __EMIT_BUG(BUGFLAG_TAINT(taint)); \ } while (0) #define WARN_ON(x) ({ \ int __ret_warn_on = !!(x); \ if (__builtin_constant_p(__ret_warn_on)) { \ if (__ret_warn_on) \ - __EMIT_BUG(BUGFLAG_WARNING); \ + __WARN(); \ } else { \ if (unlikely(__ret_warn_on)) \ - __EMIT_BUG(BUGFLAG_WARNING); \ + __WARN(); \ } \ unlikely(__ret_warn_on); \ }) diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h index d02c01b3e6b9..6323f864d111 100644 --- a/arch/sh/include/asm/bug.h +++ b/arch/sh/include/asm/bug.h @@ -48,7 +48,7 @@ do { \ "i" (sizeof(struct bug_entry))); \ } while (0) -#define __WARN() \ +#define __WARN_TAINT(taint) \ do { \ 
__asm__ __volatile__ ( \ "1:\t.short %O0\n" \ @@ -57,7 +57,7 @@ do { \ : "n" (TRAPA_BUG_OPCODE), \ "i" (__FILE__), \ "i" (__LINE__), \ - "i" (BUGFLAG_WARNING), \ + "i" (BUGFLAG_TAINT(taint)), \ "i" (sizeof(struct bug_entry))); \ } while (0) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 18c435d7c082..c2c9ba032d46 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -25,7 +25,10 @@ struct bug_entry { }; #endif /* __ASSEMBLY__ */ -#define BUGFLAG_WARNING (1<<0) +#define BUGFLAG_WARNING (1 << 0) +#define BUGFLAG_TAINT(taint) (BUGFLAG_WARNING | ((taint) << 8)) +#define BUG_GET_TAINT(bug) ((bug)->flags >> 8) + #endif /* CONFIG_GENERIC_BUG */ /* @@ -56,17 +59,25 @@ struct bug_entry { * appear at runtime. Use the versions with printk format strings * to provide better diagnostics. */ -#ifndef __WARN +#ifndef __WARN_TAINT #ifndef __ASSEMBLY__ extern void warn_slowpath_fmt(const char *file, const int line, const char *fmt, ...) __attribute__((format(printf, 3, 4))); +extern void warn_slowpath_fmt_taint(const char *file, const int line, + unsigned taint, const char *fmt, ...) + __attribute__((format(printf, 4, 5))); extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #endif #define __WARN() warn_slowpath_null(__FILE__, __LINE__) #define __WARN_printf(arg...) warn_slowpath_fmt(__FILE__, __LINE__, arg) +#define __WARN_printf_taint(taint, arg...) \ + warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg) #else +#define __WARN() __WARN_TAINT(TAINT_WARN) #define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0) +#define __WARN_printf_taint(taint, arg...) \ + do { printk(arg); __WARN_TAINT(taint); } while (0) #endif #ifndef WARN_ON @@ -87,6 +98,13 @@ extern void warn_slowpath_null(const char *file, const int line); }) #endif +#define WARN_TAINT(condition, taint, format...) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_printf_taint(taint, format); \ + unlikely(__ret_warn_on); \ +}) + #else /* !CONFIG_BUG */ #ifndef HAVE_ARCH_BUG #define BUG() do {} while(0) @@ -110,6 +128,8 @@ extern void warn_slowpath_null(const char *file, const int line); }) #endif +#define WARN_TAINT(condition, taint, format...) WARN_ON(condition) + #endif #define WARN_ON_ONCE(condition) ({ \ @@ -132,6 +152,16 @@ extern void warn_slowpath_null(const char *file, const int line); unlikely(__ret_warn_once); \ }) +#define WARN_TAINT_ONCE(condition, taint, format...) ({ \ + static bool __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once)) \ + if (WARN_TAINT(!__warned, taint, format)) \ + __warned = true; \ + unlikely(__ret_warn_once); \ +}) + #define WARN_ON_RATELIMIT(condition, state) \ WARN_ON((condition) && __ratelimit(state)) diff --git a/kernel/panic.c b/kernel/panic.c index 13d966b4c14a..8b821bce66e6 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -365,7 +365,8 @@ struct slowpath_args { va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) +static void warn_slowpath_common(const char *file, int line, void *caller, + unsigned taint, struct slowpath_args *args) { const char *board; @@ -381,7 +382,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc print_modules(); dump_stack(); print_oops_end_marker(); - add_taint(TAINT_WARN); + add_taint(taint); } void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 
@@ -390,14 +391,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), &args); + warn_slowpath_common(file, line, __builtin_return_address(0), + TAINT_WARN, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); +void warn_slowpath_fmt_taint(const char *file, int line, + unsigned taint, const char *fmt, ...) +{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), + taint, &args); + va_end(args.args); +} +EXPORT_SYMBOL(warn_slowpath_fmt_taint); + void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), NULL); + warn_slowpath_common(file, line, __builtin_return_address(0), + TAINT_WARN, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/lib/bug.c b/lib/bug.c index 300e41afbf97..f13daf435211 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -165,7 +165,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) (void *)bugaddr); show_regs(regs); - add_taint(TAINT_WARN); + add_taint(BUG_GET_TAINT(bug)); return BUG_TRAP_TYPE_WARN; } -- cgit v1.2.3-55-g7522 From 92946bc72f2e74c3281b7fc12be9704d455fb3ed Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 3 Apr 2010 19:36:42 +0100 Subject: panic: Add taint flag TAINT_FIRMWARE_WORKAROUND ('I') This taint flag will initially be used when warning about invalid ACPI DMAR tables. Signed-off-by: Ben Hutchings Signed-off-by: David Woodhouse --- Documentation/oops-tracing.txt | 3 +++ include/linux/kernel.h | 1 + kernel/panic.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index 069fab3ea4d6..6fe9001b9263 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -260,6 +260,9 @@ characters, each representing a particular tainted value. 11: 'C' if a staging driver has been loaded. + 12: 'I' if the kernel is working around a severe bug in the platform + firmware (BIOS or similar). + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. Tainting is permanent: even if an offending module is diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7f0707463360..7f2f7b34dd08 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -344,6 +344,7 @@ extern enum system_states { #define TAINT_OVERRIDDEN_ACPI_TABLE 8 #define TAINT_WARN 9 #define TAINT_CRAP 10 +#define TAINT_FIRMWARE_WORKAROUND 11 extern void dump_stack(void) __cold; diff --git a/kernel/panic.c b/kernel/panic.c index 8b821bce66e6..dbe13dbb057a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -178,6 +178,7 @@ static const struct tnt tnts[] = { { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, { TAINT_WARN, 'W', ' ' }, { TAINT_CRAP, 'C', ' ' }, + { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, }; /** @@ -194,6 +195,7 @@ static const struct tnt tnts[] = { * 'A' - ACPI table overridden. * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. + * 'I' - Working around severe firmware bug. * * The string is overwritten by the next call to print_tainted(). 
*/ -- cgit v1.2.3-55-g7522 From 480b02df3aa9f07d1c7df0cd8be7a5ca73893455 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 19 May 2010 17:33:39 -0600 Subject: module: drop the lock while waiting for module to complete initialization. This fixes "gave up waiting for init of module libcrc32c." which happened at boot time due to multiple parallel module loads. The problem was a deadlock: we wait for a module to finish initializing, but we keep the module_lock mutex so it can't complete. In particular, this could reasonably happen if a module does a request_module() in its initialization routine. So we change use_module() to return an errno rather than a bool, and if it's -EBUSY we drop the lock and wait in the caller, then reaquire the lock. Reported-by: Brandon Philips Signed-off-by: Rusty Russell Tested-by: Brandon Philips --- kernel/module.c | 59 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e2564580f3f1..970d773aec62 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -561,33 +561,26 @@ int use_module(struct module *a, struct module *b) struct module_use *use; int no_warn, err; - if (b == NULL || already_uses(a, b)) return 1; - - /* If we're interrupted or time out, we fail. */ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); + if (b == NULL || already_uses(a, b)) return 0; - } - /* If strong_try_module_get() returned a different error, we fail. */ + /* If we're interrupted or time out, we fail. */ + err = strong_try_module_get(b); if (err) - return 0; + return err; DEBUGP("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { printk("%s: out of memory loading\n", a->name); module_put(b); - return 0; + return -ENOMEM; } use->module_which_uses = a; list_add(&use->list, &b->modules_which_use_me); no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); - return 1; + return 0; } EXPORT_SYMBOL_GPL(use_module); @@ -880,7 +873,7 @@ static inline void module_unload_free(struct module *mod) int use_module(struct module *a, struct module *b) { - return strong_try_module_get(b) == 0; + return strong_try_module_get(b); } EXPORT_SYMBOL_GPL(use_module); @@ -1051,17 +1044,39 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, struct module *owner; const struct kernel_symbol *sym; const unsigned long *crc; + DEFINE_WAIT(wait); + int err; + long timeleft = 30 * HZ; +again: sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc, owner) - || !use_module(mod, owner)) - sym = NULL; - } - return sym; + if (!sym) + return NULL; + + if (!check_version(sechdrs, versindex, name, mod, crc, owner)) + return NULL; + + prepare_to_wait(&module_wq, &wait, TASK_INTERRUPTIBLE); + err = use_module(mod, owner); + if (likely(!err) || err != -EBUSY || signal_pending(current)) { + finish_wait(&module_wq, &wait); + return err ? NULL : sym; + } + + /* Module is still loading. Drop lock and wait. */ + mutex_unlock(&module_mutex); + timeleft = schedule_timeout(timeleft); + mutex_lock(&module_mutex); + finish_wait(&module_wq, &wait); + + /* Module might be gone entirely, or replaced. 
Re-lookup. */
+	if (timeleft)
+		goto again;
+
+	printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
+	       mod->name, owner->name);
+	return NULL;
 }

 /*
-- cgit v1.2.3-55-g7522


From fa9dc265ace9774e62f0e31108e5f47911124bda Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Wed, 19 May 2010 09:37:41 +0900
Subject: cpumask: fix compat getaffinity

Commit a45185d2d "cpumask: convert kernel/compat.c" broke libnuma, which
abuses sched_getaffinity to find out NR_CPUS in order to parse
/sys/devices/system/node/node*/cpumap.

On NUMA systems with fewer than 32 possible CPUs, the current
compat_sys_sched_getaffinity now returns '4' instead of the actual
NR_CPUS/8, which makes libnuma bail out when parsing the cpumap.

libnuma calls sched_getaffinity(0, bitmap, 4096) first. That is, it
expects the return value of sched_getaffinity() to be either the len
argument or NR_CPUS; it does not expect nr_cpu_ids to be returned.

Strictly speaking, the userland requirements are:

1) Glibc assumes the return value is the number of initialized bytes
   in the mask argument; e.g. if sched_getaffinity(1024) returns 128,
   glibc zero-fills the remaining 896 bytes.
2) Libnuma assumes the return value can be used to guess NR_CPUS in
   the kernel. It assumes the len-arg

Acked-by: Rusty Russell
Acked-by: Arnd Bergmann
Reported-by: Ken Werner
Cc: stable@kernel.org
Cc: Andi Kleen
Signed-off-by: Linus Torvalds
---
 kernel/compat.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 7f40e9275fd9..5adab05a3172 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -495,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
 {
 	int ret;
 	cpumask_var_t mask;
-	unsigned long *k;
-	unsigned int min_length = cpumask_size();
-
-	if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
-		min_length = sizeof(compat_ulong_t);
-	if (len < min_length)
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(compat_ulong_t)-1))
 		return -EINVAL;

 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 		return -ENOMEM;

 	ret = sched_getaffinity(pid, mask);
-	if (ret < 0)
-		goto out;
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());

-	k = cpumask_bits(mask);
-	ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
-	if (ret == 0)
-		ret = min_length;
-
-out:
+		if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
 	free_cpumask_var(mask);
+
 	return ret;
 }
-- cgit v1.2.3-55-g7522


From 22c43c81a51e05f61e90445ceb59d486c12fd921 Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz
Date: Wed, 5 May 2010 12:53:11 +0200
Subject: wait_event_interruptible_locked() interface

New wait_event_interruptible{,_exclusive}_locked{,_irq} macros added.
They work just like the versions without the _locked* suffix but
require the wait queue's lock to be held. Also __wake_up_locked() is
now exported so that it can be paired with the above macros.

The use case of this new facility is when one uses the wait queue's
lock to protect a data structure. This may be advantageous if the
structure needs to be protected by a spinlock anyway. In particular,
with an additional spinlock the following code has to be used to wait
for a condition:

spin_lock(&data.lock);
...
for (ret = 0; !ret && !(condition); ) {
	spin_unlock(&data.lock);
	ret = wait_event_interruptible(data.wqh, (condition));
	spin_lock(&data.lock);
}
...
spin_unlock(&data.lock);

This looks bizarre, plus wait_event_interruptible() locks the wait
queue's lock anyway, so there is an unlock+lock sequence where it could
be avoided.

To avoid those problems and benefit from the wait queue's lock, code
similar to the following should be used:

/* Waiting */
spin_lock(&data.wqh.lock);
...
ret = wait_event_interruptible_locked(data.wqh, (condition));
...
spin_unlock(&data.wqh.lock);

/* Waiting exclusively */
spin_lock(&data.wqh.lock);
...
ret = wait_event_interruptible_exclusive_locked(data.wqh, (condition));
...
spin_unlock(&data.wqh.lock);

/* Waking up */
spin_lock(&data.wqh.lock);
...
wake_up_locked(&data.wqh);
...
spin_unlock(&data.wqh.lock);

When spin_lock_irq() is used, the matching *_locked_irq() versions of
the macros need to be used.

Signed-off-by: Michal Nazarewicz
Cc: Kyungmin Park
Cc: Marek Szyprowski
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Takashi Iwai
Cc: David Howells
Cc: Andreas Herrmann
Cc: Thomas Gleixner
Cc: Mike Galbraith
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/wait.h | 149 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched.c | 1 +
 2 files changed, 150 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b77d5e..fc3c040e5e3a 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -362,6 +362,155 @@ do { \
 	__ret; \
 })
+
+#define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \
+({ \
+	int __ret = 0; \
+	DEFINE_WAIT(__wait); \
+	if (exclusive) \
+		__wait.flags |= WQ_FLAG_EXCLUSIVE; \
+	do { \
+		if (likely(list_empty(&__wait.task_list))) \
+			__add_wait_queue_tail(&(wq), &__wait); \
+		set_current_state(TASK_INTERRUPTIBLE); \
+		if (signal_pending(current)) { \
+			__ret = -ERESTARTSYS; \
+			break; \
+		} \
+		if (irq) \
+			spin_unlock_irq(&(wq).lock); \
+		else \
+			spin_unlock(&(wq).lock); \
+		schedule(); \
+		if (irq) \
+			spin_lock_irq(&(wq).lock); \
+		else \
+			spin_lock(&(wq).lock); \
+	} while (!(condition)); \
+	__remove_wait_queue(&(wq), &__wait); \
+	__set_current_state(TASK_RUNNING); \
+	__ret; \
+})
+
+
+/**
+ * wait_event_interruptible_locked - sleep until a condition becomes true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * It must be called with wq.lock held. This spinlock is unlocked
+ * while sleeping, but @condition testing is done while the lock is
+ * held, and the lock is held again when this macro exits.
+ *
+ * The lock is locked/unlocked using the spin_lock()/spin_unlock()
+ * functions, which must match the way they are locked/unlocked outside
+ * of this macro.
+ *
+ * wake_up_locked() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_locked(wq, condition) \
+	((condition) \
+	 ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 0))
+
+/**
+ * wait_event_interruptible_locked_irq - sleep until a condition becomes true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
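+ *
+ * For example (an illustrative pairing only; the "data" structure,
+ * its wqh wait queue head and its count field are assumptions, not
+ * part of this interface):
+ *
+ *	spin_lock_irq(&data.wqh.lock);
+ *	err = wait_event_interruptible_locked_irq(data.wqh, data.count);
+ *	if (!err)
+ *		data.count--;
+ *	spin_unlock_irq(&data.wqh.lock);
+ *
+ * with a producer on the other side doing:
+ *
+ *	spin_lock_irq(&data.wqh.lock);
+ *	data.count++;
+ *	wake_up_locked(&data.wqh);
+ *	spin_unlock_irq(&data.wqh.lock);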
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * It must be called with wq.lock held. This spinlock is unlocked
+ * while sleeping, but @condition testing is done while the lock is
+ * held, and the lock is held again when this macro exits.
+ *
+ * The lock is locked/unlocked using the spin_lock_irq()/spin_unlock_irq()
+ * functions, which must match the way they are locked/unlocked outside
+ * of this macro.
+ *
+ * wake_up_locked() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_locked_irq(wq, condition) \
+	((condition) \
+	 ? 0 : __wait_event_interruptible_locked(wq, condition, 0, 1))
+
+/**
+ * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition becomes true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * It must be called with wq.lock held. This spinlock is unlocked
+ * while sleeping, but @condition testing is done while the lock is
+ * held, and the lock is held again when this macro exits.
+ *
+ * The lock is locked/unlocked using the spin_lock()/spin_unlock()
+ * functions, which must match the way they are locked/unlocked outside
+ * of this macro.
+ *
+ * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
+ * set, so if other processes are also waiting on the list, a wake-up
+ * that reaches this process does not consider any further processes.
+ *
+ * wake_up_locked() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_exclusive_locked(wq, condition) \
+	((condition) \
+	 ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 0))
+
+/**
+ * wait_event_interruptible_exclusive_locked_irq - sleep exclusively until a condition becomes true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * It must be called with wq.lock held. This spinlock is unlocked
+ * while sleeping, but @condition testing is done while the lock is
+ * held, and the lock is held again when this macro exits.
+ *
+ * The lock is locked/unlocked using the spin_lock_irq()/spin_unlock_irq()
+ * functions, which must match the way they are locked/unlocked outside
+ * of this macro.
+ *
+ * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
+ * set, so if other processes are also waiting on the list, a wake-up
+ * that reaches this process does not consider any further processes.
+ *
+ * wake_up_locked() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function will return -ERESTARTSYS if it was interrupted by a
+ * signal and 0 if @condition evaluated to true.
+ */
+#define wait_event_interruptible_exclusive_locked_irq(wq, condition) \
+	((condition) \
+	 ?
0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) + + + #define __wait_event_killable(wq, condition, ret) \ do { \ DEFINE_WAIT(__wait); \ diff --git a/kernel/sched.c b/kernel/sched.c index 3c2a54f70ffe..9584b66c249a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3950,6 +3950,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { __wake_up_common(q, mode, 1, 0, NULL); } +EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { -- cgit v1.2.3-55-g7522 From c433820971ffa854feda6adc17f5f24201354f11 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 19 May 2009 07:49:32 -0500 Subject: Move kernel/kgdb.c to kernel/debug/debug_core.c Move kgdb.c in preparation to separate the gdbstub from the debug core and exception handling. CC: Ingo Molnar Signed-off-by: Jason Wessel --- kernel/Makefile | 2 +- kernel/debug/Makefile | 6 + kernel/debug/debug_core.c | 1764 +++++++++++++++++++++++++++++++++++++++++++++ kernel/kgdb.c | 1764 --------------------------------------------- 4 files changed, 1771 insertions(+), 1765 deletions(-) create mode 100644 kernel/debug/Makefile create mode 100644 kernel/debug/debug_core.c delete mode 100644 kernel/kgdb.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 149e18ef1ab1..057472fbc272 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -75,7 +75,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_KGDB) += kgdb.o +obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile new file mode 100644 index 000000000000..5d7850415266 --- /dev/null +++ b/kernel/debug/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the linux kernel debugger +# + +obj-$(CONFIG_KGDB) += debug_core.o + diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c new file mode 100644 index 000000000000..11f3515ca83f --- /dev/null +++ b/kernel/debug/debug_core.c @@ -0,0 +1,1764 @@ +/* + * KGDB stub. + * + * Maintainer: Jason Wessel + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale + * Copyright (C) 2004 Pavel Machek + * Copyright (C) 2004-2006 Tom Rini + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2008 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe , + * Tigran Aivazian + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static int kgdb_break_asap; + +#define KGDB_MAX_THREAD_QUERY 17 +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + unsigned long thr_query; + unsigned long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +/* Exception state values */ +#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ +#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ +#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_SSTEP 0x8 /* CPU is single stepping */ + +static struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; + int exception_state; +} kgdb_info[NR_CPUS]; + +/** + * kgdb_connected - Is a host GDB connected to us? + */ +int kgdb_connected; +EXPORT_SYMBOL_GPL(kgdb_connected); + +/* All the KGDB handlers are installed */ +static int kgdb_io_module_registered; + +/* Guard for recursive entry */ +static int exception_level; + +static struct kgdb_io *kgdb_io_ops; +static DEFINE_SPINLOCK(kgdb_registration_lock); + +/* kgdb console driver is loaded */ +static int kgdb_con_registered; +/* determine if kgdb console output should be used */ +static int kgdb_use_con; + +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + +module_param(kgdb_use_con, int, 0644); + +/* + * Holds information about breakpoints in a kernel. These breakpoints are + * added and removed by gdb. + */ +static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { + [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } +}; + +/* + * The CPU# of the active CPU, or -1 if none: + */ +atomic_t kgdb_active = ATOMIC_INIT(-1); + +/* + * We use NR_CPUs not PERCPU, in case kgdb is used to debug early + * bootup code (which might not have percpu set up yet): + */ +static atomic_t passive_cpu_wait[NR_CPUS]; +static atomic_t cpu_in_kgdb[NR_CPUS]; +atomic_t kgdb_setting_breakpoint; + +struct task_struct *kgdb_usethread; +struct task_struct *kgdb_contthread; + +int kgdb_single_step; +pid_t kgdb_sstep_pid; + +/* Our I/O buffers. */ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. */ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* to keep track of the CPU which is doing the single stepping*/ +atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); + +/* + * If you are debugging a problem where roundup (the collection of + * all other CPUs) is a problem [this should be extremely rare], + * then use the nokgdbroundup option to avoid roundup. 
In that case + * the other CPUs might interfere with your debugging context, so + * use this with care: + */ +static int kgdb_do_roundup = 1; + +static int __init opt_nokgdbroundup(char *str) +{ + kgdb_do_roundup = 0; + + return 0; +} + +early_param("nokgdbroundup", opt_nokgdbroundup); + +/* + * Finally, some KGDB code :-) + */ + +/* + * Weak aliases for breakpoint management, + * can be overriden by architectures when needed: + */ +int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +{ + int err; + + err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + if (err) + return err; + + return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +{ + return probe_kernel_write((char *)addr, + (char *)bundle, BREAK_INSTR_SIZE); +} + +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + int err; + /* Validate setting the breakpoint and then removing it. In the + * remove fails, the kernel needs to emit a bad message because we + * are deep trouble not being able to put things back the way we + * found them. + */ + err = kgdb_arch_set_breakpoint(addr, tmp_variable); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + +unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int __weak kgdb_arch_init(void) +{ + return 0; +} + +int __weak kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return 0; +} + +void __weak +kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) +{ + return; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +void __weak kgdb_disable_hw_debug(struct pt_regs *regs) +{ +} + +/* + * GDB remote protocol parser: + */ + +static int hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return ch - 'a' + 10; + if ((ch >= '0') && (ch <= '9')) + return ch - '0'; + if ((ch >= 'A') && (ch <= 'F')) + return ch - 'A' + 10; + return -1; +} + +/* scan for the sequence $# */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (kgdb_io_ops->read_char())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = kgdb_io_ops->read_char(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(kgdb_io_ops->read_char()) << 4; + xmitcsum += hex(kgdb_io_ops->read_char()); + + if (checksum != xmitcsum) + /* failed checksum */ + kgdb_io_ops->write_char('-'); + else + /* successful transfer */ + kgdb_io_ops->write_char('+'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. 
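+ * Framing example: a reply of "OK" goes out on the wire as "$OK#9a",
+ * since the checksum is ('O' + 'K') & 0xff == (0x4f + 0x4b) & 0xff
+ * == 0x9a.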
+ */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $#. + */ + while (1) { + kgdb_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + kgdb_io_ops->write_char(ch); + checksum += ch; + count++; + } + + kgdb_io_ops->write_char('#'); + kgdb_io_ops->write_char(hex_asc_hi(checksum)); + kgdb_io_ops->write_char(hex_asc_lo(checksum)); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + + /* Now see what we get in reply. */ + ch = kgdb_io_ops->read_char(); + + if (ch == 3) + ch = kgdb_io_ops->read_char(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + kgdb_io_ops->write_char('-'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + return; + } + } +} + +/* + * Convert the memory pointed to by mem into hex, placing result in buf. + * Return a pointer to the last char put in buf (null). May return an error. + */ +int kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. + */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (!err) { + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + + *buf = 0; + } + + return err; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. + * The input buf is overwitten with the result to write to mem. + */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int size = 0; + char *c = buf; + + while (count-- > 0) { + c[size] = *buf++; + if (c[size] == 0x7d) + c[size] = *buf++ ^ 0x20; + size++; + } + + return probe_kernel_write(mem, c, size); +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in mem. + * Return a pointer to the character AFTER the last byte written. + * May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex(*tmp_hex--); + *tmp_raw |= hex(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, unsigned long *long_val) +{ + int hex_val; + int num = 0; + int negate = 0; + + *long_val = 0; + + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } + while (**ptr) { + hex_val = hex(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + if (negate) + *long_val = -*long_val; + + return num; +} + +/* Write memory due to an 'M' or 'X' packet. 
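+ * The payload has the form "<addr>,<length>:<data>"; e.g. a body of
+ * "c01a0de0,2:ffff" (the address is an arbitrary example) writes two
+ * 0xff bytes at 0xc01a0de0. For 'M' the data is hex encoded, for 'X'
+ * it is escaped binary, hence the kgdb_hex2mem()/kgdb_ebin2mem()
+ * split below.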
*/ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hex_asc[(error / 10)]; + pkt[2] = hex_asc[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. + */ + +#define BUF_THREAD_ID_SIZE 16 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + char *limit; + + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *id++); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + unsigned char *scan; + int i = 4; + + scan = (unsigned char *)id; + while (i--) + *scan++ = 0; + put_unaligned_be32(value, scan); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped to the cpu shadow information + */ + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < -1 && tid > -NR_CPUS - 2) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + +/* + * Some architectures need cache flushes when we set/clear a + * breakpoint: + */ +static void kgdb_flush_swbreak_addr(unsigned long addr) +{ + if (!CACHE_FLUSH_IS_SAFE) + return; + + if (current->mm && current->mm->mmap_cache) { + flush_cache_range(current->mm->mmap_cache, + addr, addr + BREAK_INSTR_SIZE); + } + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); +} + +/* + * SW breakpoint management: + */ +static int kgdb_activate_sw_breakpoints(void) +{ + unsigned long addr; + int error; + int ret = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_set_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) { + ret = error; + printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + continue; + } + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_ACTIVE; + } + return ret; +} + +static int kgdb_set_sw_break(unsigned long addr) +{ + int err = kgdb_validate_break_address(addr); + int breakno = -1; + int i; + + if (err) + return err; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) + return -EEXIST; + } + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_REMOVED && + kgdb_break[i].bpt_addr == addr) { + breakno = i; + break; + } + } + + if (breakno == -1) { + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_UNDEFINED) { + breakno = 
i; + break; + } + } + } + + if (breakno == -1) + return -E2BIG; + + kgdb_break[breakno].state = BP_SET; + kgdb_break[breakno].type = BP_BREAKPOINT; + kgdb_break[breakno].bpt_addr = addr; + + return 0; +} + +static int kgdb_deactivate_sw_breakpoints(void) +{ + unsigned long addr; + int error; + int ret = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + continue; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) { + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + ret = error; + } + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_SET; + } + return ret; +} + +static int kgdb_remove_sw_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) { + kgdb_break[i].state = BP_REMOVED; + return 0; + } + } + return -ENOENT; +} + +int kgdb_isremovedbreak(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_REMOVED) && + (kgdb_break[i].bpt_addr == addr)) + return 1; + } + return 0; +} + +static int remove_all_break(void) +{ + unsigned long addr; + int error; + int i; + + /* Clear memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + goto setundefined; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + addr); +setundefined: + kgdb_break[i].state = BP_UNDEFINED; + } + + /* Clear hardware breakpoints. */ + if (arch_kgdb_ops.remove_all_hw_break) + arch_kgdb_ops.remove_all_hw_break(); + + return 0; +} + +/* + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -raw_smp_processor_id() - 2; +} + +static char gdbmsgbuf[BUFMAX + 1]; + +static void kgdb_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... */ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Return true if there is a valid kgdb I/O module. Also if no + * debugger is attached a message can be printed to the console about + * waiting for the debugger to attach. + * + * The print_wait argument is only to be true when called from inside + * the core kgdb_handle_exception, because it will wait for the + * debugger to attach. + */ +static int kgdb_io_ready(int print_wait) +{ + if (!kgdb_io_ops) + return 0; + if (kgdb_connected) + return 1; + if (atomic_read(&kgdb_setting_breakpoint)) + return 1; + if (print_wait) + printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + return 1; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' 
status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. + */ + remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for_each_online_cpu(i) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. + */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. + */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (err) + error_packet(remcom_out_buffer, err); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT 
"Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct task_struct *g; + struct task_struct *p; + unsigned char thref[8]; + char *ptr; + int i; + int cpu; + int finished = 0; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + + i = 0; + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + i++; + } + } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, "QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if ((int)ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since 
GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. */ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. */ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = kgdb_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = kgdb_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type - '0'); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type - '0'); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + kgdb_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command procesing + */ +static int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + if (kgdb_connected) { + unsigned char thref[8]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + while (1) { + error = 0; + + /* Clear the out buffer. 
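+		 * Each pass below reads one gdbserial packet, dispatches
+		 * on its first byte in the switch, and normally emits a
+		 * single reply via put_packet() at the bottom of the loop.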
*/ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break; + /* kill or detach. KGDB should treat this like a + * continue. + */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. + */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} + +static int kgdb_reenter_check(struct kgdb_state *ks) +{ + unsigned long addr; + + if (atomic_read(&kgdb_active) != raw_smp_processor_id()) + return 0; + + /* Panic on recursive debugger calls: */ + exception_level++; + addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kgdb_deactivate_sw_breakpoints(); + + /* + * If the break point removed ok at the place exception + * occurred, try to recover and print a warning to the end + * user because the user planted a breakpoint in a place that + * KGDB needs in order to function. 
+ */ + if (kgdb_remove_sw_break(addr) == 0) { + exception_level = 0; + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + kgdb_activate_sw_breakpoints(); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", + addr); + WARN_ON_ONCE(1); + + return 1; + } + remove_all_break(); + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + + if (exception_level > 1) { + dump_stack(); + panic("Recursive entry to debugger"); + } + + printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + dump_stack(); + panic("Recursive entry to debugger"); + + return 1; +} + +static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) +{ + unsigned long flags; + int sstep_tries = 100; + int error = 0; + int i, cpu; + int trace_on = 0; +acquirelock: + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. + */ + local_irq_save(flags); + + cpu = ks->cpu; + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + /* + * Make sure the above info reaches the primary CPU before + * our cpu_in_kgdb[] flag setting does: + */ + atomic_inc(&cpu_in_kgdb[cpu]); + + /* + * CPU will loop if it is a slave or request to become a kgdb + * master cpu and acquire the kgdb_active lock: + */ + while (1) { + if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { + if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) + break; + } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { + if (!atomic_read(&passive_cpu_wait[cpu])) + goto return_normal; + } else { +return_normal: + /* Return to normal operation by executing any + * hw breakpoint fixup. + */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + if (trace_on) + tracing_on(); + atomic_dec(&cpu_in_kgdb[cpu]); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + return 0; + } + cpu_relax(); + } + + /* + * For single stepping, try to only enter on the processor + * that was single stepping. To gaurd against a deadlock, the + * kernel will only try for the value of sstep_tries before + * giving up and continuing on. + */ + if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && + (kgdb_info[cpu].task && + kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { + atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + goto acquirelock; + } + + if (!kgdb_io_ready(1)) { + error = 1; + goto kgdb_restore; /* No I/O connection, so resume the system */ + } + + /* + * Don't enter if we have hit a removed breakpoint. 
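+	 * (GDB may have removed the breakpoint while the trap was
+	 * already in flight; kgdb_skipexception() gives the
+	 * architecture a chance to step past the stale trap.)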
+ */ + if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) + goto kgdb_restore; + + /* Call the I/O driver's pre_exception routine */ + if (kgdb_io_ops->pre_exception) + kgdb_io_ops->pre_exception(); + + kgdb_disable_hw_debug(ks->linux_regs); + + /* + * Get the passive CPU lock which will hold all the non-primary + * CPU in a spin state while the debugger is active + */ + if (!kgdb_single_step) { + for (i = 0; i < NR_CPUS; i++) + atomic_inc(&passive_cpu_wait[i]); + } + +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + + /* + * Wait for the other CPUs to be notified and be waiting for us: + */ + for_each_online_cpu(i) { + while (!atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + + /* + * At this point the primary processor is completely + * in the debugger and all secondary CPUs are quiescent + */ + kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); + kgdb_deactivate_sw_breakpoints(); + kgdb_single_step = 0; + kgdb_contthread = current; + exception_level = 0; + trace_on = tracing_is_on(); + if (trace_on) + tracing_off(); + + /* Talk to debugger with gdbserial protocol */ + error = gdb_serial_stub(ks); + + /* Call the I/O driver's post_exception routine */ + if (kgdb_io_ops->post_exception) + kgdb_io_ops->post_exception(); + + atomic_dec(&cpu_in_kgdb[ks->cpu]); + + if (!kgdb_single_step) { + for (i = NR_CPUS-1; i >= 0; i--) + atomic_dec(&passive_cpu_wait[i]); + /* + * Wait till all the CPUs have quit + * from the debugger. + */ + for_each_online_cpu(i) { + while (atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + } + +kgdb_restore: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); + if (kgdb_info[sstep_cpu].task) + kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; + else + kgdb_sstep_pid = 0; + } + if (trace_on) + tracing_on(); + /* Free kgdb_active */ + atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + return error; +} + +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + int ret; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->ex_vector = evector; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! 
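	 * (a recursive trap inside an active debug session, handled by
	 * kgdb_reenter_check() above)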
*/ + kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; + ret = kgdb_cpu_enter(ks, regs); + kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER; + return ret; +} + +int kgdb_nmicallback(int cpu, void *regs) +{ +#ifdef CONFIG_SMP + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + + memset(ks, 0, sizeof(struct kgdb_state)); + ks->cpu = cpu; + ks->linux_regs = regs; + + if (!atomic_read(&cpu_in_kgdb[cpu]) && + atomic_read(&kgdb_active) != -1 && + atomic_read(&kgdb_active) != cpu) { + kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; + kgdb_cpu_enter(ks, regs); + kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; + return 0; + } +#endif + return 1; +} + +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) +{ + unsigned long flags; + + /* If we're debugging, or KGDB has not connected, don't try + * and print. */ + if (!kgdb_connected || atomic_read(&kgdb_active) != -1) + return; + + local_irq_save(flags); + kgdb_msg_write(s, count); + local_irq_restore(flags); +} + +static struct console kgdbcons = { + .name = "kgdb", + .write = kgdb_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_gdb(int key, struct tty_struct *tty) +{ + if (!kgdb_io_ops) { + printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + return; + } + if (!kgdb_connected) + printk(KERN_CRIT "Entering KGDB\n"); + + kgdb_breakpoint(); +} + +static struct sysrq_key_op sysrq_gdb_op = { + .handler = sysrq_handle_gdb, + .help_msg = "debug(G)", + .action_msg = "DEBUG", +}; +#endif + +static void kgdb_register_callbacks(void) +{ + if (!kgdb_io_module_registered) { + kgdb_io_module_registered = 1; + kgdb_arch_init(); +#ifdef CONFIG_MAGIC_SYSRQ + register_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_use_con && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + } +} + +static void kgdb_unregister_callbacks(void) +{ + /* + * When this routine is called KGDB should unregister from the + * panic handler and clean up, making sure it is not handling any + * break exceptions at the time. + */ + if (kgdb_io_module_registered) { + kgdb_io_module_registered = 0; + kgdb_arch_exit(); +#ifdef CONFIG_MAGIC_SYSRQ + unregister_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_con_registered) { + unregister_console(&kgdbcons); + kgdb_con_registered = 0; + } + } +} + +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + +/** + * kgdb_register_io_module - register KGDB IO module + * @new_kgdb_io_ops: the io ops vector + * + * Register it with the KGDB core. + */ +int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) +{ + int err; + + spin_lock(&kgdb_registration_lock); + + if (kgdb_io_ops) { + spin_unlock(&kgdb_registration_lock); + + printk(KERN_ERR "kgdb: Another I/O driver is already " + "registered with KGDB.\n"); + return -EBUSY; + } + + if (new_kgdb_io_ops->init) { + err = new_kgdb_io_ops->init(); + if (err) { + spin_unlock(&kgdb_registration_lock); + return err; + } + } + + kgdb_io_ops = new_kgdb_io_ops; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", + new_kgdb_io_ops->name); + + /* Arm KGDB now. 
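+	 * (register the sysrq 'g' hook and, if requested, the kgdb
+	 * console, then honor a pending kgdbwait by breaking in
+	 * immediately)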
*/ + kgdb_register_callbacks(); + + if (kgdb_break_asap) + kgdb_initial_breakpoint(); + + return 0; +} +EXPORT_SYMBOL_GPL(kgdb_register_io_module); + +/** + * kkgdb_unregister_io_module - unregister KGDB IO module + * @old_kgdb_io_ops: the io ops vector + * + * Unregister it with the KGDB core. + */ +void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) +{ + BUG_ON(kgdb_connected); + + /* + * KGDB is no longer able to communicate out, so + * unregister our callbacks and reset state. + */ + kgdb_unregister_callbacks(); + + spin_lock(&kgdb_registration_lock); + + WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); + kgdb_io_ops = NULL; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO + "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + old_kgdb_io_ops->name); +} +EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); + +/** + * kgdb_breakpoint - generate breakpoint exception + * + * This function will generate a breakpoint exception. It is used at the + * beginning of a program to sync up with a debugger and can be used + * otherwise as a quick means to stop program execution and "break" into + * the debugger. + */ +void kgdb_breakpoint(void) +{ + atomic_inc(&kgdb_setting_breakpoint); + wmb(); /* Sync point before breakpoint */ + arch_kgdb_breakpoint(); + wmb(); /* Sync point after breakpoint */ + atomic_dec(&kgdb_setting_breakpoint); +} +EXPORT_SYMBOL_GPL(kgdb_breakpoint); + +static int __init opt_kgdb_wait(char *str) +{ + kgdb_break_asap = 1; + + if (kgdb_io_module_registered) + kgdb_initial_breakpoint(); + + return 0; +} + +early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/kgdb.c b/kernel/kgdb.c deleted file mode 100644 index 11f3515ca83f..000000000000 --- a/kernel/kgdb.c +++ /dev/null @@ -1,1764 +0,0 @@ -/* - * KGDB stub. - * - * Maintainer: Jason Wessel - * - * Copyright (C) 2000-2001 VERITAS Software Corporation. - * Copyright (C) 2002-2004 Timesys Corporation - * Copyright (C) 2003-2004 Amit S. Kale - * Copyright (C) 2004 Pavel Machek - * Copyright (C) 2004-2006 Tom Rini - * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. - * Copyright (C) 2005-2008 Wind River Systems, Inc. - * Copyright (C) 2007 MontaVista Software, Inc. - * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar - * - * Contributors at various stages not listed above: - * Jason Wessel ( jason.wessel@windriver.com ) - * George Anzinger - * Anurekh Saxena (anurekh.saxena@timesys.com) - * Lake Stevens Instrument Division (Glenn Engel) - * Jim Kingdon, Cygnus Support. - * - * Original KGDB stub: David Grothe , - * Tigran Aivazian - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -static int kgdb_break_asap; - -#define KGDB_MAX_THREAD_QUERY 17 -struct kgdb_state { - int ex_vector; - int signo; - int err_code; - int cpu; - int pass_exception; - unsigned long thr_query; - unsigned long threadid; - long kgdb_usethreadid; - struct pt_regs *linux_regs; -}; - -/* Exception state values */ -#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ -#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ -#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ -#define DCPU_SSTEP 0x8 /* CPU is single stepping */ - -static struct debuggerinfo_struct { - void *debuggerinfo; - struct task_struct *task; - int exception_state; -} kgdb_info[NR_CPUS]; - -/** - * kgdb_connected - Is a host GDB connected to us? - */ -int kgdb_connected; -EXPORT_SYMBOL_GPL(kgdb_connected); - -/* All the KGDB handlers are installed */ -static int kgdb_io_module_registered; - -/* Guard for recursive entry */ -static int exception_level; - -static struct kgdb_io *kgdb_io_ops; -static DEFINE_SPINLOCK(kgdb_registration_lock); - -/* kgdb console driver is loaded */ -static int kgdb_con_registered; -/* determine if kgdb console output should be used */ -static int kgdb_use_con; - -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - -module_param(kgdb_use_con, int, 0644); - -/* - * Holds information about breakpoints in a kernel. These breakpoints are - * added and removed by gdb. - */ -static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { - [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } -}; - -/* - * The CPU# of the active CPU, or -1 if none: - */ -atomic_t kgdb_active = ATOMIC_INIT(-1); - -/* - * We use NR_CPUs not PERCPU, in case kgdb is used to debug early - * bootup code (which might not have percpu set up yet): - */ -static atomic_t passive_cpu_wait[NR_CPUS]; -static atomic_t cpu_in_kgdb[NR_CPUS]; -atomic_t kgdb_setting_breakpoint; - -struct task_struct *kgdb_usethread; -struct task_struct *kgdb_contthread; - -int kgdb_single_step; -pid_t kgdb_sstep_pid; - -/* Our I/O buffers. */ -static char remcom_in_buffer[BUFMAX]; -static char remcom_out_buffer[BUFMAX]; - -/* Storage for the registers, in GDB format. */ -static unsigned long gdb_regs[(NUMREGBYTES + - sizeof(unsigned long) - 1) / - sizeof(unsigned long)]; - -/* to keep track of the CPU which is doing the single stepping*/ -atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); - -/* - * If you are debugging a problem where roundup (the collection of - * all other CPUs) is a problem [this should be extremely rare], - * then use the nokgdbroundup option to avoid roundup. 
In that case - * the other CPUs might interfere with your debugging context, so - * use this with care: - */ -static int kgdb_do_roundup = 1; - -static int __init opt_nokgdbroundup(char *str) -{ - kgdb_do_roundup = 0; - - return 0; -} - -early_param("nokgdbroundup", opt_nokgdbroundup); - -/* - * Finally, some KGDB code :-) - */ - -/* - * Weak aliases for breakpoint management, - * can be overridden by architectures when needed: - */ -int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) -{ - int err; - - err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); - if (err) - return err; - - return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); -} - -int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) -{ - return probe_kernel_write((char *)addr, - (char *)bundle, BREAK_INSTR_SIZE); -} - -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - int err; - /* Validate setting the breakpoint and then removing it. If the - * remove fails, the kernel needs to emit a bad message because we - * are in deep trouble not being able to put things back the way we - * found them. - */ - err = kgdb_arch_set_breakpoint(addr, tmp_variable); - if (err) - return err; - err = kgdb_arch_remove_breakpoint(addr, tmp_variable); - if (err) - printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " - "memory destroyed at: %lx", addr); - return err; -} - -unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) -{ - return instruction_pointer(regs); -} - -int __weak kgdb_arch_init(void) -{ - return 0; -} - -int __weak kgdb_skipexception(int exception, struct pt_regs *regs) -{ - return 0; -} - -void __weak -kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - return; -} - -/** - * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb. - * @regs: Current &struct pt_regs. - * - * This function will be called if the particular architecture must - * disable hardware debugging while it is processing gdb packets or - * handling an exception. - */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} - -/* - * GDB remote protocol parser: - */ - -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - -/* scan for the sequence $# */ -static void get_packet(char *buffer) -{ - unsigned char checksum; - unsigned char xmitcsum; - int count; - char ch; - - do { - /* - * Spin and wait around for the start character, ignore all - * other characters: - */ - while ((ch = (kgdb_io_ops->read_char())) != '$') - /* nothing */; - - kgdb_connected = 1; - checksum = 0; - xmitcsum = -1; - - count = 0; - - /* - * now, read until a # or end of buffer is found: - */ - while (count < (BUFMAX - 1)) { - ch = kgdb_io_ops->read_char(); - if (ch == '#') - break; - checksum = checksum + ch; - buffer[count] = ch; - count = count + 1; - } - buffer[count] = 0; - - if (ch == '#') { - xmitcsum = hex(kgdb_io_ops->read_char()) << 4; - xmitcsum += hex(kgdb_io_ops->read_char()); - - if (checksum != xmitcsum) - /* failed checksum */ - kgdb_io_ops->write_char('-'); - else - /* successful transfer */ - kgdb_io_ops->write_char('+'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - } - } while (checksum != xmitcsum); -} - -/* - * Send the packet in buffer. - * Check for gdb connection if asked for.
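An aside before put_packet(): the framing that get_packet() just validated is simple enough to show end to end. A user-space sketch of the wire format (illustration only; the checksum is the modulo-256 sum of the payload bytes, sent as two hex digits):

#include <stdio.h>
#include <string.h>

static void frame_packet(const char *payload, char *out, size_t outlen)
{
        unsigned char csum = 0;
        const char *p;

        /* Sum the payload modulo 256, exactly as get_packet() does. */
        for (p = payload; *p; p++)
                csum += (unsigned char)*p;
        snprintf(out, outlen, "$%s#%02x", payload, csum);
}

frame_packet("g", buf, sizeof(buf)) yields "$g#67", the get-registers request; the stub acknowledges with '+' and answers with its own packet in the same format.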
- */ -static void put_packet(char *buffer) -{ - unsigned char checksum; - int count; - char ch; - - /* - * $#. - */ - while (1) { - kgdb_io_ops->write_char('$'); - checksum = 0; - count = 0; - - while ((ch = buffer[count])) { - kgdb_io_ops->write_char(ch); - checksum += ch; - count++; - } - - kgdb_io_ops->write_char('#'); - kgdb_io_ops->write_char(hex_asc_hi(checksum)); - kgdb_io_ops->write_char(hex_asc_lo(checksum)); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - - /* Now see what we get in reply. */ - ch = kgdb_io_ops->read_char(); - - if (ch == 3) - ch = kgdb_io_ops->read_char(); - - /* If we get an ACK, we are done. */ - if (ch == '+') - return; - - /* - * If we get the start of another packet, this means - * that GDB is attempting to reconnect. We will NAK - * the packet being sent, and stop trying to send this - * packet. - */ - if (ch == '$') { - kgdb_io_ops->write_char('-'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - return; - } - } -} - -/* - * Convert the memory pointed to by mem into hex, placing result in buf. - * Return a pointer to the last char put in buf (null). May return an error. - */ -int kgdb_mem2hex(char *mem, char *buf, int count) -{ - char *tmp; - int err; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory copy. Hex conversion will work against this one. - */ - tmp = buf + count; - - err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; - } - - return err; -} - -/* - * Copy the binary array pointed to by buf into mem. Fix $, #, and - * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. - * The input buf is overwritten with the result to write to mem. - */ -static int kgdb_ebin2mem(char *buf, char *mem, int count) -{ - int size = 0; - char *c = buf; - - while (count-- > 0) { - c[size] = *buf++; - if (c[size] == 0x7d) - c[size] = *buf++ ^ 0x20; - size++; - } - - return probe_kernel_write(mem, c, size); -} - -/* - * Convert the hex array pointed to by buf into binary to be placed in mem. - * Return a pointer to the character AFTER the last byte written. - * May return an error. - */ -int kgdb_hex2mem(char *buf, char *mem, int count) -{ - char *tmp_raw; - char *tmp_hex; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory that is converted from hex. - */ - tmp_raw = buf + count * 2; - - tmp_hex = tmp_raw - 1; - while (tmp_hex >= buf) { - tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; - } - - return probe_kernel_write(mem, tmp_raw, count); -} - -/* - * While we find nice hex chars, build a long_val. - * Return number of chars processed. - */ -int kgdb_hex2long(char **ptr, unsigned long *long_val) -{ - int hex_val; - int num = 0; - int negate = 0; - - *long_val = 0; - - if (**ptr == '-') { - negate = 1; - (*ptr)++; - } - while (**ptr) { - hex_val = hex(**ptr); - if (hex_val < 0) - break; - - *long_val = (*long_val << 4) | hex_val; - num++; - (*ptr)++; - } - - if (negate) - *long_val = -*long_val; - - return num; -} - -/* Write memory due to an 'M' or 'X' packet.
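A worked example of the parsing these helpers enable, as write_mem_msg() below applies it to the body of an 'M' packet (illustration; kgdb_hex2long() stops at the first non-hex character and returns the count of digits consumed):

        char buf[] = "c0100000,4:deadbeef";  /* body of "M c0100000,4:deadbeef" */
        char *ptr = buf;
        unsigned long addr, length;

        kgdb_hex2long(&ptr, &addr);     /* addr = 0xc0100000, *ptr is now ',' */
        ptr++;                          /* step over the ',' */
        kgdb_hex2long(&ptr, &length);   /* length = 0x4, *ptr is now ':' */
        /* ptr + 1 points at the hex-encoded payload "deadbeef" */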
*/ -static int write_mem_msg(int binary) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long addr; - unsigned long length; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && - kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { - if (binary) - err = kgdb_ebin2mem(ptr, (char *)addr, length); - else - err = kgdb_hex2mem(ptr, (char *)addr, length); - if (err) - return err; - if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length); - return 0; - } - - return -EINVAL; -} - -static void error_packet(char *pkt, int error) -{ - error = -error; - pkt[0] = 'E'; - pkt[1] = hex_asc[(error / 10)]; - pkt[2] = hex_asc[(error % 10)]; - pkt[3] = '\0'; -} - -/* - * Thread ID accessors. We represent a flat TID space to GDB, where - * the per CPU idle threads (which under Linux all have PID 0) are - * remapped to negative TIDs. - */ - -#define BUF_THREAD_ID_SIZE 16 - -static char *pack_threadid(char *pkt, unsigned char *id) -{ - char *limit; - - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); - - return pkt; -} - -static void int_to_threadref(unsigned char *id, int value) -{ - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); -} - -static struct task_struct *getthread(struct pt_regs *regs, int tid) -{ - /* - * Non-positive TIDs are remapped to the cpu shadow information - */ - if (tid == 0 || tid == -1) - tid = -atomic_read(&kgdb_active) - 2; - if (tid < -1 && tid > -NR_CPUS - 2) { - if (kgdb_info[-tid - 2].task) - return kgdb_info[-tid - 2].task; - else - return idle_task(-tid - 2); - } - if (tid <= 0) { - printk(KERN_ERR "KGDB: Internal thread select error\n"); - dump_stack(); - return NULL; - } - - /* - * find_task_by_pid_ns() does not take the tasklist lock anymore - * but is nicely RCU locked - hence is a pretty resilient - * thing to use: - */ - return find_task_by_pid_ns(tid, &init_pid_ns); -} - -/* - * Some architectures need cache flushes when we set/clear a - * breakpoint: - */ -static void kgdb_flush_swbreak_addr(unsigned long addr) -{ - if (!CACHE_FLUSH_IS_SAFE) - return; - - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); - } - /* Force flush instruction cache if it was outside the mm */ - flush_icache_range(addr, addr + BREAK_INSTR_SIZE); -} - -/* - * SW breakpoint management: - */ -static int kgdb_activate_sw_breakpoints(void) -{ - unsigned long addr; - int error; - int ret = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_SET) - continue; - - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_set_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) { - ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", addr); - continue; - } - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_ACTIVE; - } - return ret; -} - -static int kgdb_set_sw_break(unsigned long addr) -{ - int err = kgdb_validate_break_address(addr); - int breakno = -1; - int i; - - if (err) - return err; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) - return -EEXIST; - } - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_REMOVED && - kgdb_break[i].bpt_addr == addr) { - breakno = i; - break; - } - } - - if (breakno == -1) { - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state == BP_UNDEFINED) { - breakno = 
i; - break; - } - } - } - - if (breakno == -1) - return -E2BIG; - - kgdb_break[breakno].state = BP_SET; - kgdb_break[breakno].type = BP_BREAKPOINT; - kgdb_break[breakno].bpt_addr = addr; - - return 0; -} - -static int kgdb_deactivate_sw_breakpoints(void) -{ - unsigned long addr; - int error; - int ret = 0; - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); - ret = error; - } - - kgdb_flush_swbreak_addr(addr); - kgdb_break[i].state = BP_SET; - } - return ret; -} - -static int kgdb_remove_sw_break(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_SET) && - (kgdb_break[i].bpt_addr == addr)) { - kgdb_break[i].state = BP_REMOVED; - return 0; - } - } - return -ENOENT; -} - -int kgdb_isremovedbreak(unsigned long addr) -{ - int i; - - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if ((kgdb_break[i].state == BP_REMOVED) && - (kgdb_break[i].bpt_addr == addr)) - return 1; - } - return 0; -} - -static int remove_all_break(void) -{ - unsigned long addr; - int error; - int i; - - /* Clear memory breakpoints. */ - for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_ACTIVE) - goto setundefined; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); - if (error) - printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", - addr); -setundefined: - kgdb_break[i].state = BP_UNDEFINED; - } - - /* Clear hardware breakpoints. */ - if (arch_kgdb_ops.remove_all_hw_break) - arch_kgdb_ops.remove_all_hw_break(); - - return 0; -} - -/* - * Remap normal tasks to their real PID, - * CPU shadow threads are mapped to -CPU - 2 - */ -static inline int shadow_pid(int realpid) -{ - if (realpid) - return realpid; - - return -raw_smp_processor_id() - 2; -} - -static char gdbmsgbuf[BUFMAX + 1]; - -static void kgdb_msg_write(const char *s, int len) -{ - char *bufptr; - int wcount; - int i; - - /* 'O'utput */ - gdbmsgbuf[0] = 'O'; - - /* Fill and send buffers... */ - while (len > 0) { - bufptr = gdbmsgbuf + 1; - - /* Calculate how many this time */ - if ((len << 1) > (BUFMAX - 2)) - wcount = (BUFMAX - 2) >> 1; - else - wcount = len; - - /* Pack in hex chars */ - for (i = 0; i < wcount; i++) - bufptr = pack_hex_byte(bufptr, s[i]); - *bufptr = '\0'; - - /* Move up */ - s += wcount; - len -= wcount; - - /* Write packet */ - put_packet(gdbmsgbuf); - } -} - -/* - * Return true if there is a valid kgdb I/O module. Also if no - * debugger is attached a message can be printed to the console about - * waiting for the debugger to attach. - * - * The print_wait argument is only to be true when called from inside - * the core kgdb_handle_exception, because it will wait for the - * debugger to attach. - */ -static int kgdb_io_ready(int print_wait) -{ - if (!kgdb_io_ops) - return 0; - if (kgdb_connected) - return 1; - if (atomic_read(&kgdb_setting_breakpoint)) - return 1; - if (print_wait) - printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); - return 1; -} - -/* - * All the functions that start with gdb_cmd are the various - * operations to implement the handlers for the gdbserial protocol - * where KGDB is communicating with an external debugger - */ - -/* Handle the '?' 
status packets */ -static void gdb_cmd_status(struct kgdb_state *ks) -{ - /* - * We know that this packet is only sent - * during initial connect. So to be safe, - * we clear out our breakpoints now in case - * GDB is reconnecting. - */ - remove_all_break(); - - remcom_out_buffer[0] = 'S'; - pack_hex_byte(&remcom_out_buffer[1], ks->signo); -} - -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) -{ - struct task_struct *thread; - void *local_debuggerinfo; - int i; - - thread = kgdb_usethread; - if (!thread) { - thread = kgdb_info[ks->cpu].task; - local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; - } else { - local_debuggerinfo = NULL; - for_each_online_cpu(i) { - /* - * Try to find the task on some other - * or possibly this node if we do not - * find the matching task then we try - * to approximate the results. - */ - if (thread == kgdb_info[i].task) - local_debuggerinfo = kgdb_info[i].debuggerinfo; - } - } - - /* - * All threads that don't have debuggerinfo should be - * in schedule() sleeping, since all other CPUs - * are in kgdb_wait, and thus have debuggerinfo. - */ - if (local_debuggerinfo) { - pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); - } else { - /* - * Pull stuff saved during switch_to; nothing - * else is accessible (or even particularly - * relevant). - * - * This should be enough for a stack trace. - */ - sleeping_thread_to_gdb_regs(gdb_regs, thread); - } - kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); -} - -/* Handle the 'G' set registers request */ -static void gdb_cmd_setregs(struct kgdb_state *ks) -{ - kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); - - if (kgdb_usethread && kgdb_usethread != current) { - error_packet(remcom_out_buffer, -EINVAL); - } else { - gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); - } -} - -/* Handle the 'm' memory read bytes */ -static void gdb_cmd_memread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long length; - unsigned long addr; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && - kgdb_hex2long(&ptr, &length) > 0) { - err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); - } else { - error_packet(remcom_out_buffer, -EINVAL); - } -} - -/* Handle the 'M' memory write bytes */ -static void gdb_cmd_memwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(0); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'X' memory binary write bytes */ -static void gdb_cmd_binwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(1); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'D' or 'k', detach or kill packets */ -static void gdb_cmd_detachkill(struct kgdb_state *ks) -{ - int error; - - /* The detach case */ - if (remcom_in_buffer[0] == 'D') { - error = remove_all_break(); - if (error < 0) { - error_packet(remcom_out_buffer, error); - } else { - strcpy(remcom_out_buffer, "OK"); - kgdb_connected = 0; - } - put_packet(remcom_out_buffer); - } else { - /* - * Assume the kill case, with no exit code checking, - * trying to force detach the debugger: - */ - remove_all_break(); - kgdb_connected = 0; - } -} - -/* Handle the 'R' reboot packets */ -static int gdb_cmd_reboot(struct kgdb_state *ks) -{ - /* For now, only honor R0 */ - if (strcmp(remcom_in_buffer, "R0") == 0) { - printk(KERN_CRIT 
"Executing emergency reboot\n"); - strcpy(remcom_out_buffer, "OK"); - put_packet(remcom_out_buffer); - - /* - * Execution should not return from - * machine_emergency_restart() - */ - machine_emergency_restart(); - kgdb_connected = 0; - - return 1; - } - return 0; -} - -/* Handle the 'q' query packets */ -static void gdb_cmd_query(struct kgdb_state *ks) -{ - struct task_struct *g; - struct task_struct *p; - unsigned char thref[8]; - char *ptr; - int i; - int cpu; - int finished = 0; - - switch (remcom_in_buffer[1]) { - case 's': - case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - - i = 0; - remcom_out_buffer[0] = 'm'; - ptr = remcom_out_buffer + 1; - if (remcom_in_buffer[1] == 'f') { - /* Each cpu is a shadow thread */ - for_each_online_cpu(cpu) { - ks->thr_query = 0; - int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - i++; - } - } - - do_each_thread(g, p) { - if (i >= ks->thr_query && !finished) { - int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - ks->thr_query++; - if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) - finished = 1; - } - i++; - } while_each_thread(g, p); - - *(--ptr) = '\0'; - break; - - case 'C': - /* Current thread id */ - strcpy(remcom_out_buffer, "QC"); - ks->threadid = shadow_pid(current->pid); - int_to_threadref(thref, ks->threadid); - pack_threadid(remcom_out_buffer + 2, thref); - break; - case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - ks->threadid = 0; - ptr = remcom_in_buffer + 17; - kgdb_hex2long(&ptr, &ks->threadid); - if (!getthread(ks->linux_regs, ks->threadid)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - if ((int)ks->threadid > 0) { - kgdb_mem2hex(getthread(ks->linux_regs, - ks->threadid)->comm, - remcom_out_buffer, 16); - } else { - static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - - sprintf(tmpstr, "shadowCPU%d", - (int)(-ks->threadid - 2)); - kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); - } - break; - } -} - -/* Handle the 'H' task query packets */ -static void gdb_cmd_task(struct kgdb_state *ks) -{ - struct task_struct *thread; - char *ptr; - - switch (remcom_in_buffer[1]) { - case 'g': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_usethread = thread; - ks->kgdb_usethreadid = ks->threadid; - strcpy(remcom_out_buffer, "OK"); - break; - case 'c': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - if (!ks->threadid) { - kgdb_contthread = NULL; - } else { - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_contthread = thread; - } - strcpy(remcom_out_buffer, "OK"); - break; - } -} - -/* Handle the 'T' thread query packets */ -static void gdb_cmd_thread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - struct task_struct *thread; - - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, -EINVAL); -} - -/* Handle the 'z' or 'Z' breakpoint remove or set packets */ -static void gdb_cmd_break(struct kgdb_state *ks) -{ - /* - * Since 
GDB-5.3, it's been drafted that '0' is a software - * breakpoint, '1' is a hardware breakpoint, so let's do that. - */ - char *bpt_type = &remcom_in_buffer[1]; - char *ptr = &remcom_in_buffer[2]; - unsigned long addr; - unsigned long length; - int error = 0; - - if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { - /* Unsupported */ - if (*bpt_type > '4') - return; - } else { - if (*bpt_type != '0' && *bpt_type != '1') - /* Unsupported. */ - return; - } - - /* - * Test if this is a hardware breakpoint, and - * if we support it: - */ - if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) - /* Unsupported. */ - return; - - if (*(ptr++) != ',') { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (!kgdb_hex2long(&ptr, &addr)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (*(ptr++) != ',' || - !kgdb_hex2long(&ptr, &length)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - - if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') - error = kgdb_set_sw_break(addr); - else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') - error = kgdb_remove_sw_break(addr); - else if (remcom_in_buffer[0] == 'Z') - error = arch_kgdb_ops.set_hw_breakpoint(addr, - (int)length, *bpt_type - '0'); - else if (remcom_in_buffer[0] == 'z') - error = arch_kgdb_ops.remove_hw_breakpoint(addr, - (int) length, *bpt_type - '0'); - - if (error == 0) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, error); -} - -/* Handle the 'C' signal / exception passing packets */ -static int gdb_cmd_exception_pass(struct kgdb_state *ks) -{ - /* C09 == pass exception - * C15 == detach kgdb, pass exception - */ - if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'c'; - - } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'D'; - remove_all_break(); - kgdb_connected = 0; - return 1; - - } else { - kgdb_msg_write("KGDB only knows signal 9 (pass)" - " and 15 (pass and disconnect)\n" - "Executing a continue without signal passing\n", 0); - remcom_in_buffer[0] = 'c'; - } - - /* Indicate fall through */ - return -1; -} - -/* - * This function performs all gdbserial command processing - */ -static int gdb_serial_stub(struct kgdb_state *ks) -{ - int error = 0; - int tmp; - - /* Clear the out buffer. */ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - if (kgdb_connected) { - unsigned char thref[8]; - char *ptr; - - /* Reply to host that an exception has occurred */ - ptr = remcom_out_buffer; - *ptr++ = 'T'; - ptr = pack_hex_byte(ptr, ks->signo); - ptr += strlen(strcpy(ptr, "thread:")); - int_to_threadref(thref, shadow_pid(current->pid)); - ptr = pack_threadid(ptr, thref); - *ptr++ = ';'; - put_packet(remcom_out_buffer); - } - - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - - while (1) { - error = 0; - - /* Clear the out buffer.
*/ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - get_packet(remcom_in_buffer); - - switch (remcom_in_buffer[0]) { - case '?': /* gdbserial status */ - gdb_cmd_status(ks); - break; - case 'g': /* return the value of the CPU registers */ - gdb_cmd_getregs(ks); - break; - case 'G': /* set the value of the CPU registers - return OK */ - gdb_cmd_setregs(ks); - break; - case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ - gdb_cmd_memread(ks); - break; - case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_memwrite(ks); - break; - case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_binwrite(ks); - break; - /* kill or detach. KGDB should treat this like a - * continue. - */ - case 'D': /* Debugger detach */ - case 'k': /* Debugger detach via kill */ - gdb_cmd_detachkill(ks); - goto default_handle; - case 'R': /* Reboot */ - if (gdb_cmd_reboot(ks)) - goto default_handle; - break; - case 'q': /* query command */ - gdb_cmd_query(ks); - break; - case 'H': /* task related */ - gdb_cmd_task(ks); - break; - case 'T': /* Query thread status */ - gdb_cmd_thread(ks); - break; - case 'z': /* Break point remove */ - case 'Z': /* Break point set */ - gdb_cmd_break(ks); - break; - case 'C': /* Exception passing */ - tmp = gdb_cmd_exception_pass(ks); - if (tmp > 0) - goto default_handle; - if (tmp == 0) - break; - /* Fall through on tmp < 0 */ - case 'c': /* Continue packet */ - case 's': /* Single step packet */ - if (kgdb_contthread && kgdb_contthread != current) { - /* Can't switch threads in kgdb */ - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_activate_sw_breakpoints(); - /* Fall through to default processing */ - default: -default_handle: - error = kgdb_arch_handle_exception(ks->ex_vector, - ks->signo, - ks->err_code, - remcom_in_buffer, - remcom_out_buffer, - ks->linux_regs); - /* - * Leave cmd processing on error, detach, - * kill, continue, or single step. - */ - if (error >= 0 || remcom_in_buffer[0] == 'D' || - remcom_in_buffer[0] == 'k') { - error = 0; - goto kgdb_exit; - } - - } - - /* reply to the request */ - put_packet(remcom_out_buffer); - } - -kgdb_exit: - if (ks->pass_exception) - error = 1; - return error; -} - -static int kgdb_reenter_check(struct kgdb_state *ks) -{ - unsigned long addr; - - if (atomic_read(&kgdb_active) != raw_smp_processor_id()) - return 0; - - /* Panic on recursive debugger calls: */ - exception_level++; - addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); - kgdb_deactivate_sw_breakpoints(); - - /* - * If the break point removed ok at the place exception - * occurred, try to recover and print a warning to the end - * user because the user planted a breakpoint in a place that - * KGDB needs in order to function. 
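Stepping back to the command loop above: the resume-class packets ('c', 's', detach, kill) all fall through to kgdb_arch_handle_exception(), whose return value decides whether the stub keeps processing packets. A hedged sketch of that contract (real backends live in arch/*/kernel/kgdb.c and do considerably more, e.g. arming hardware single-step):

int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
                               char *remcom_in_buffer,
                               char *remcom_out_buffer,
                               struct pt_regs *linux_regs)
{
        switch (remcom_in_buffer[0]) {
        case 'c':       /* continue */
        case 's':       /* single step */
                /* A backend may parse an optional resume address here
                 * and patch it into linux_regs before returning. */
                return 0;       /* >= 0: leave the stub and resume */
        }
        return -1;              /* < 0: unhandled; the stub sends the
                                 * (possibly empty) reply itself */
}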
- */ - if (kgdb_remove_sw_break(addr) == 0) { - exception_level = 0; - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - kgdb_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", - addr); - WARN_ON_ONCE(1); - - return 1; - } - remove_all_break(); - kgdb_skipexception(ks->ex_vector, ks->linux_regs); - - if (exception_level > 1) { - dump_stack(); - panic("Recursive entry to debugger"); - } - - printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); - dump_stack(); - panic("Recursive entry to debugger"); - - return 1; -} - -static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) -{ - unsigned long flags; - int sstep_tries = 100; - int error = 0; - int i, cpu; - int trace_on = 0; -acquirelock: - /* - * Interrupts will be restored by the 'trap return' code, except when - * single stepping. - */ - local_irq_save(flags); - - cpu = ks->cpu; - kgdb_info[cpu].debuggerinfo = regs; - kgdb_info[cpu].task = current; - /* - * Make sure the above info reaches the primary CPU before - * our cpu_in_kgdb[] flag setting does: - */ - atomic_inc(&cpu_in_kgdb[cpu]); - - /* - * CPU will loop if it is a slave or requests to become a kgdb - * master cpu and acquire the kgdb_active lock: - */ - while (1) { - if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { - if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) - break; - } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { - if (!atomic_read(&passive_cpu_wait[cpu])) - goto return_normal; - } else { -return_normal: - /* Return to normal operation by executing any - * hw breakpoint fixup. - */ - if (arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - if (trace_on) - tracing_on(); - atomic_dec(&cpu_in_kgdb[cpu]); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - return 0; - } - cpu_relax(); - } - - /* - * For single stepping, try to only enter on the processor - * that was single stepping. To guard against a deadlock, the - * kernel will only try for the value of sstep_tries before - * giving up and continuing on. - */ - if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - (kgdb_info[cpu].task && - kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { - atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - - goto acquirelock; - } - - if (!kgdb_io_ready(1)) { - error = 1; - goto kgdb_restore; /* No I/O connection, so resume the system */ - } - - /* - * Don't enter if we have hit a removed breakpoint.
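On the master election above: stripped of the slave-CPU bookkeeping, acquiring the debugger is a spinlock built from atomic_cmpxchg() on kgdb_active. The essential shape (an illustration of the idea, not the exact loop above):

        /* Only the CPU that moves kgdb_active from -1 to its own id
         * proceeds as the master; everyone else spins. */
        while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
                cpu_relax();

        /* ... debug session runs on the master ... */

        atomic_set(&kgdb_active, -1);   /* release, as the kgdb_restore path does */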
- */ - if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) - goto kgdb_restore; - - /* Call the I/O driver's pre_exception routine */ - if (kgdb_io_ops->pre_exception) - kgdb_io_ops->pre_exception(); - - kgdb_disable_hw_debug(ks->linux_regs); - - /* - * Get the passive CPU lock which will hold all the non-primary - * CPU in a spin state while the debugger is active - */ - if (!kgdb_single_step) { - for (i = 0; i < NR_CPUS; i++) - atomic_inc(&passive_cpu_wait[i]); - } - -#ifdef CONFIG_SMP - /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step) && kgdb_do_roundup) - kgdb_roundup_cpus(flags); -#endif - - /* - * Wait for the other CPUs to be notified and be waiting for us: - */ - for_each_online_cpu(i) { - while (!atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } - - /* - * At this point the primary processor is completely - * in the debugger and all secondary CPUs are quiescent - */ - kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); - kgdb_deactivate_sw_breakpoints(); - kgdb_single_step = 0; - kgdb_contthread = current; - exception_level = 0; - trace_on = tracing_is_on(); - if (trace_on) - tracing_off(); - - /* Talk to debugger with gdbserial protocol */ - error = gdb_serial_stub(ks); - - /* Call the I/O driver's post_exception routine */ - if (kgdb_io_ops->post_exception) - kgdb_io_ops->post_exception(); - - atomic_dec(&cpu_in_kgdb[ks->cpu]); - - if (!kgdb_single_step) { - for (i = NR_CPUS-1; i >= 0; i--) - atomic_dec(&passive_cpu_wait[i]); - /* - * Wait till all the CPUs have quit - * from the debugger. - */ - for_each_online_cpu(i) { - while (atomic_read(&cpu_in_kgdb[i])) - cpu_relax(); - } - } - -kgdb_restore: - if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { - int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); - if (kgdb_info[sstep_cpu].task) - kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; - else - kgdb_sstep_pid = 0; - } - if (trace_on) - tracing_on(); - /* Free kgdb_active */ - atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog_sync(); - clocksource_touch_watchdog(); - local_irq_restore(flags); - - return error; -} - -/* - * kgdb_handle_exception() - main entry point from a kernel exception - * - * Locking hierarchy: - * interface locks, if any (begin_session) - * kgdb lock (kgdb_active) - */ -int -kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) -{ - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; - int ret; - - ks->cpu = raw_smp_processor_id(); - ks->ex_vector = evector; - ks->signo = signo; - ks->ex_vector = evector; - ks->err_code = ecode; - ks->kgdb_usethreadid = 0; - ks->linux_regs = regs; - - if (kgdb_reenter_check(ks)) - return 0; /* Ouch, double exception ! 
*/ - kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; - ret = kgdb_cpu_enter(ks, regs); - kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER; - return ret; -} - -int kgdb_nmicallback(int cpu, void *regs) -{ -#ifdef CONFIG_SMP - struct kgdb_state kgdb_var; - struct kgdb_state *ks = &kgdb_var; - - memset(ks, 0, sizeof(struct kgdb_state)); - ks->cpu = cpu; - ks->linux_regs = regs; - - if (!atomic_read(&cpu_in_kgdb[cpu]) && - atomic_read(&kgdb_active) != -1 && - atomic_read(&kgdb_active) != cpu) { - kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; - kgdb_cpu_enter(ks, regs); - kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE; - return 0; - } -#endif - return 1; -} - -static void kgdb_console_write(struct console *co, const char *s, - unsigned count) -{ - unsigned long flags; - - /* If we're debugging, or KGDB has not connected, don't try - * and print. */ - if (!kgdb_connected || atomic_read(&kgdb_active) != -1) - return; - - local_irq_save(flags); - kgdb_msg_write(s, count); - local_irq_restore(flags); -} - -static struct console kgdbcons = { - .name = "kgdb", - .write = kgdb_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED, - .index = -1, -}; - -#ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_gdb(int key, struct tty_struct *tty) -{ - if (!kgdb_io_ops) { - printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); - return; - } - if (!kgdb_connected) - printk(KERN_CRIT "Entering KGDB\n"); - - kgdb_breakpoint(); -} - -static struct sysrq_key_op sysrq_gdb_op = { - .handler = sysrq_handle_gdb, - .help_msg = "debug(G)", - .action_msg = "DEBUG", -}; -#endif - -static void kgdb_register_callbacks(void) -{ - if (!kgdb_io_module_registered) { - kgdb_io_module_registered = 1; - kgdb_arch_init(); -#ifdef CONFIG_MAGIC_SYSRQ - register_sysrq_key('g', &sysrq_gdb_op); -#endif - if (kgdb_use_con && !kgdb_con_registered) { - register_console(&kgdbcons); - kgdb_con_registered = 1; - } - } -} - -static void kgdb_unregister_callbacks(void) -{ - /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any - * break exceptions at the time. - */ - if (kgdb_io_module_registered) { - kgdb_io_module_registered = 0; - kgdb_arch_exit(); -#ifdef CONFIG_MAGIC_SYSRQ - unregister_sysrq_key('g', &sysrq_gdb_op); -#endif - if (kgdb_con_registered) { - unregister_console(&kgdbcons); - kgdb_con_registered = 0; - } - } -} - -static void kgdb_initial_breakpoint(void) -{ - kgdb_break_asap = 0; - - printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); - kgdb_breakpoint(); -} - -/** - * kgdb_register_io_module - register KGDB IO module - * @new_kgdb_io_ops: the io ops vector - * - * Register it with the KGDB core. - */ -int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) -{ - int err; - - spin_lock(&kgdb_registration_lock); - - if (kgdb_io_ops) { - spin_unlock(&kgdb_registration_lock); - - printk(KERN_ERR "kgdb: Another I/O driver is already " - "registered with KGDB.\n"); - return -EBUSY; - } - - if (new_kgdb_io_ops->init) { - err = new_kgdb_io_ops->init(); - if (err) { - spin_unlock(&kgdb_registration_lock); - return err; - } - } - - kgdb_io_ops = new_kgdb_io_ops; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_kgdb_io_ops->name); - - /* Arm KGDB now. 
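For driver authors, the registration above is the whole API surface. A hedged sketch of a minimal polled I/O driver, using only the struct kgdb_io hooks this file invokes (name, read_char, write_char, flush and init are the relevant members; the my_uart_* functions are hypothetical hardware-polling helpers, and the exact member signatures should be checked against include/linux/kgdb.h):

static int my_uart_read_char(void)
{
        return my_uart_poll_rx();       /* hypothetical: spin until a byte arrives */
}

static void my_uart_write_char(u8 c)
{
        my_uart_poll_tx(c);             /* hypothetical: busy-wait transmit */
}

static struct kgdb_io my_uart_kgdb_io = {
        .name           = "my_uart_kgdbio",
        .read_char      = my_uart_read_char,
        .write_char     = my_uart_write_char,
};

A driver would then call kgdb_register_io_module(&my_uart_kgdb_io) from its init path and kgdb_unregister_io_module() on teardown.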
*/ - kgdb_register_callbacks(); - - if (kgdb_break_asap) - kgdb_initial_breakpoint(); - - return 0; -} -EXPORT_SYMBOL_GPL(kgdb_register_io_module); - -/** - * kgdb_unregister_io_module - unregister KGDB IO module - * @old_kgdb_io_ops: the io ops vector - * - * Unregister it with the KGDB core. - */ -void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) -{ - BUG_ON(kgdb_connected); - - /* - * KGDB is no longer able to communicate out, so - * unregister our callbacks and reset state. - */ - kgdb_unregister_callbacks(); - - spin_lock(&kgdb_registration_lock); - - WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); - kgdb_io_ops = NULL; - - spin_unlock(&kgdb_registration_lock); - - printk(KERN_INFO - "kgdb: Unregistered I/O driver %s, debugger disabled.\n", - old_kgdb_io_ops->name); -} -EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); - -/** - * kgdb_breakpoint - generate breakpoint exception - * - * This function will generate a breakpoint exception. It is used at the - * beginning of a program to sync up with a debugger and can be used - * otherwise as a quick means to stop program execution and "break" into - * the debugger. - */ -void kgdb_breakpoint(void) -{ - atomic_inc(&kgdb_setting_breakpoint); - wmb(); /* Sync point before breakpoint */ - arch_kgdb_breakpoint(); - wmb(); /* Sync point after breakpoint */ - atomic_dec(&kgdb_setting_breakpoint); -} -EXPORT_SYMBOL_GPL(kgdb_breakpoint); - -static int __init opt_kgdb_wait(char *str) -{ - kgdb_break_asap = 1; - - if (kgdb_io_module_registered) - kgdb_initial_breakpoint(); - - return 0; -} - -early_param("kgdbwait", opt_kgdb_wait); -- cgit v1.2.3-55-g7522 From 53197fc49549240f6c6a963b2713a4cd9517964b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 2 Apr 2010 11:48:03 -0500 Subject: Separate the gdbstub from the debug core Split the former kernel/kgdb.c into debug_core.c, which contains the kernel debugger exception logic, and gdbstub.c, which contains the logic for allowing gdb to talk to the debug core. This also creates a private include file called debug_core.h which contains all the definitions to glue the debug_core to any other debugger connections.
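The glue is deliberately small. An abridged sketch of what the new kernel/debug/debug_core.h must carry, reconstructed from the symbols this patch un-statics and renames in the diff below (illustrative, not the verbatim header):

struct debuggerinfo_struct {
        void                    *debuggerinfo;
        struct task_struct      *task;
        int                     exception_state;
};

extern struct debuggerinfo_struct kgdb_info[];

/* SW breakpoint management, now shared between the core and gdbstub.c: */
extern int dbg_activate_sw_breakpoints(void);
extern int dbg_set_sw_break(unsigned long addr);
extern int dbg_remove_sw_break(unsigned long addr);
extern int dbg_remove_all_break(void);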
CC: Ingo Molnar Signed-off-by: Jason Wessel --- include/linux/kgdb.h | 1 + kernel/debug/Makefile | 3 +- kernel/debug/debug_core.c | 994 ++-------------------------------------------- kernel/debug/debug_core.h | 55 +++ kernel/debug/gdbstub.c | 934 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1030 insertions(+), 957 deletions(-) create mode 100644 kernel/debug/debug_core.h create mode 100644 kernel/debug/gdbstub.c (limited to 'kernel') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 19ec41a183f5..4830142ec339 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -264,6 +264,7 @@ extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs); extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); +extern struct kgdb_io *dbg_io_ops; extern int kgdb_hex2long(char **ptr, unsigned long *long_val); extern int kgdb_mem2hex(char *mem, char *buf, int count); diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile index 5d7850415266..fd4dc6e7782c 100644 --- a/kernel/debug/Makefile +++ b/kernel/debug/Makefile @@ -2,5 +2,4 @@ # Makefile for the linux kernel debugger # -obj-$(CONFIG_KGDB) += debug_core.o - +obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 11f3515ca83f..7e03969330bc 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1,5 +1,5 @@ /* - * KGDB stub. + * Kernel Debug Core * * Maintainer: Jason Wessel * @@ -9,7 +9,7 @@ * Copyright (C) 2004 Pavel Machek * Copyright (C) 2004-2006 Tom Rini * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. - * Copyright (C) 2005-2008 Wind River Systems, Inc. + * Copyright (C) 2005-2009 Wind River Systems, Inc. * Copyright (C) 2007 MontaVista Software, Inc. * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar * @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -52,34 +51,12 @@ #include #include #include -#include -static int kgdb_break_asap; - -#define KGDB_MAX_THREAD_QUERY 17 -struct kgdb_state { - int ex_vector; - int signo; - int err_code; - int cpu; - int pass_exception; - unsigned long thr_query; - unsigned long threadid; - long kgdb_usethreadid; - struct pt_regs *linux_regs; -}; +#include "debug_core.h" -/* Exception state values */ -#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ -#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ -#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ -#define DCPU_SSTEP 0x8 /* CPU is single stepping */ +static int kgdb_break_asap; -static struct debuggerinfo_struct { - void *debuggerinfo; - struct task_struct *task; - int exception_state; -} kgdb_info[NR_CPUS]; +struct debuggerinfo_struct kgdb_info[NR_CPUS]; /** * kgdb_connected - Is a host GDB connected to us? @@ -93,7 +70,7 @@ static int kgdb_io_module_registered; /* Guard for recursive entry */ static int exception_level; -static struct kgdb_io *kgdb_io_ops; +struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); /* kgdb console driver is loaded */ @@ -136,16 +113,7 @@ struct task_struct *kgdb_usethread; struct task_struct *kgdb_contthread; int kgdb_single_step; -pid_t kgdb_sstep_pid; - -/* Our I/O buffers. */ -static char remcom_in_buffer[BUFMAX]; -static char remcom_out_buffer[BUFMAX]; - -/* Storage for the registers, in GDB format. 
*/ -static unsigned long gdb_regs[(NUMREGBYTES + - sizeof(unsigned long) - 1) / - sizeof(unsigned long)]; +static pid_t kgdb_sstep_pid; /* to keep track of the CPU which is doing the single stepping*/ atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); @@ -246,324 +214,6 @@ void __weak kgdb_disable_hw_debug(struct pt_regs *regs) { } -/* - * GDB remote protocol parser: - */ - -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - -/* scan for the sequence $# */ -static void get_packet(char *buffer) -{ - unsigned char checksum; - unsigned char xmitcsum; - int count; - char ch; - - do { - /* - * Spin and wait around for the start character, ignore all - * other characters: - */ - while ((ch = (kgdb_io_ops->read_char())) != '$') - /* nothing */; - - kgdb_connected = 1; - checksum = 0; - xmitcsum = -1; - - count = 0; - - /* - * now, read until a # or end of buffer is found: - */ - while (count < (BUFMAX - 1)) { - ch = kgdb_io_ops->read_char(); - if (ch == '#') - break; - checksum = checksum + ch; - buffer[count] = ch; - count = count + 1; - } - buffer[count] = 0; - - if (ch == '#') { - xmitcsum = hex(kgdb_io_ops->read_char()) << 4; - xmitcsum += hex(kgdb_io_ops->read_char()); - - if (checksum != xmitcsum) - /* failed checksum */ - kgdb_io_ops->write_char('-'); - else - /* successful transfer */ - kgdb_io_ops->write_char('+'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - } - } while (checksum != xmitcsum); -} - -/* - * Send the packet in buffer. - * Check for gdb connection if asked for. - */ -static void put_packet(char *buffer) -{ - unsigned char checksum; - int count; - char ch; - - /* - * $#. - */ - while (1) { - kgdb_io_ops->write_char('$'); - checksum = 0; - count = 0; - - while ((ch = buffer[count])) { - kgdb_io_ops->write_char(ch); - checksum += ch; - count++; - } - - kgdb_io_ops->write_char('#'); - kgdb_io_ops->write_char(hex_asc_hi(checksum)); - kgdb_io_ops->write_char(hex_asc_lo(checksum)); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - - /* Now see what we get in reply. */ - ch = kgdb_io_ops->read_char(); - - if (ch == 3) - ch = kgdb_io_ops->read_char(); - - /* If we get an ACK, we are done. */ - if (ch == '+') - return; - - /* - * If we get the start of another packet, this means - * that GDB is attempting to reconnect. We will NAK - * the packet being sent, and stop trying to send this - * packet. - */ - if (ch == '$') { - kgdb_io_ops->write_char('-'); - if (kgdb_io_ops->flush) - kgdb_io_ops->flush(); - return; - } - } -} - -/* - * Convert the memory pointed to by mem into hex, placing result in buf. - * Return a pointer to the last char put in buf (null). May return an error. - */ -int kgdb_mem2hex(char *mem, char *buf, int count) -{ - char *tmp; - int err; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory copy. Hex conversion will work against this one. - */ - tmp = buf + count; - - err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; - } - - return err; -} - -/* - * Copy the binary array pointed to by buf into mem. Fix $, #, and - * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. - * The input buf is overwritten with the result to write to mem.
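For reference while reading the decoder that follows: in the 'X' binary-write packet, GDB escapes the reserved bytes '$' (0x24), '#' (0x23) and '}' (0x7d) by sending 0x7d followed by the byte XORed with 0x20. The matching encoder, which the kernel itself never needs (sketch for illustration):

static int ebin_escape(unsigned char c, unsigned char out[2])
{
        if (c == 0x23 || c == 0x24 || c == 0x7d) {
                out[0] = 0x7d;          /* escape marker */
                out[1] = c ^ 0x20;      /* what kgdb_ebin2mem() undoes */
                return 2;
        }
        out[0] = c;
        return 1;
}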
- */ -static int kgdb_ebin2mem(char *buf, char *mem, int count) -{ - int size = 0; - char *c = buf; - - while (count-- > 0) { - c[size] = *buf++; - if (c[size] == 0x7d) - c[size] = *buf++ ^ 0x20; - size++; - } - - return probe_kernel_write(mem, c, size); -} - -/* - * Convert the hex array pointed to by buf into binary to be placed in mem. - * Return a pointer to the character AFTER the last byte written. - * May return an error. - */ -int kgdb_hex2mem(char *buf, char *mem, int count) -{ - char *tmp_raw; - char *tmp_hex; - - /* - * We use the upper half of buf as an intermediate buffer for the - * raw memory that is converted from hex. - */ - tmp_raw = buf + count * 2; - - tmp_hex = tmp_raw - 1; - while (tmp_hex >= buf) { - tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; - } - - return probe_kernel_write(mem, tmp_raw, count); -} - -/* - * While we find nice hex chars, build a long_val. - * Return number of chars processed. - */ -int kgdb_hex2long(char **ptr, unsigned long *long_val) -{ - int hex_val; - int num = 0; - int negate = 0; - - *long_val = 0; - - if (**ptr == '-') { - negate = 1; - (*ptr)++; - } - while (**ptr) { - hex_val = hex(**ptr); - if (hex_val < 0) - break; - - *long_val = (*long_val << 4) | hex_val; - num++; - (*ptr)++; - } - - if (negate) - *long_val = -*long_val; - - return num; -} - -/* Write memory due to an 'M' or 'X' packet. */ -static int write_mem_msg(int binary) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long addr; - unsigned long length; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && - kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { - if (binary) - err = kgdb_ebin2mem(ptr, (char *)addr, length); - else - err = kgdb_hex2mem(ptr, (char *)addr, length); - if (err) - return err; - if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length); - return 0; - } - - return -EINVAL; -} - -static void error_packet(char *pkt, int error) -{ - error = -error; - pkt[0] = 'E'; - pkt[1] = hex_asc[(error / 10)]; - pkt[2] = hex_asc[(error % 10)]; - pkt[3] = '\0'; -} - -/* - * Thread ID accessors. We represent a flat TID space to GDB, where - * the per CPU idle threads (which under Linux all have PID 0) are - * remapped to negative TIDs. 
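Concretely: CPU0's idle thread is presented to GDB as TID -2, CPU1's as -3, and so on, while every other task keeps its real PID. The two directions of the mapping, restated as standalone helpers (illustrative; the file's own shadow_pid() and getthread() implement the same arithmetic):

static inline int tid_for_task(int realpid, int cpu)
{
        return realpid ? realpid : -cpu - 2;    /* all idle threads have PID 0 */
}

static inline int cpu_for_shadow_tid(int tid)
{
        return -tid - 2;                        /* only meaningful for tid <= -2 */
}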
- */ - -#define BUF_THREAD_ID_SIZE 16 - -static char *pack_threadid(char *pkt, unsigned char *id) -{ - char *limit; - - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); - - return pkt; -} - -static void int_to_threadref(unsigned char *id, int value) -{ - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); -} - -static struct task_struct *getthread(struct pt_regs *regs, int tid) -{ - /* - * Non-positive TIDs are remapped to the cpu shadow information - */ - if (tid == 0 || tid == -1) - tid = -atomic_read(&kgdb_active) - 2; - if (tid < -1 && tid > -NR_CPUS - 2) { - if (kgdb_info[-tid - 2].task) - return kgdb_info[-tid - 2].task; - else - return idle_task(-tid - 2); - } - if (tid <= 0) { - printk(KERN_ERR "KGDB: Internal thread select error\n"); - dump_stack(); - return NULL; - } - - /* - * find_task_by_pid_ns() does not take the tasklist lock anymore - * but is nicely RCU locked - hence is a pretty resilient - * thing to use: - */ - return find_task_by_pid_ns(tid, &init_pid_ns); -} - /* * Some architectures need cache flushes when we set/clear a * breakpoint: @@ -584,7 +234,7 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) /* * SW breakpoint management: */ -static int kgdb_activate_sw_breakpoints(void) +int dbg_activate_sw_breakpoints(void) { unsigned long addr; int error; @@ -610,7 +260,7 @@ static int kgdb_activate_sw_breakpoints(void) return ret; } -static int kgdb_set_sw_break(unsigned long addr) +int dbg_set_sw_break(unsigned long addr) { int err = kgdb_validate_break_address(addr); int breakno = -1; @@ -675,7 +325,7 @@ static int kgdb_deactivate_sw_breakpoints(void) return ret; } -static int kgdb_remove_sw_break(unsigned long addr) +int dbg_remove_sw_break(unsigned long addr) { int i; @@ -701,7 +351,7 @@ int kgdb_isremovedbreak(unsigned long addr) return 0; } -static int remove_all_break(void) +int dbg_remove_all_break(void) { unsigned long addr; int error; @@ -728,53 +378,6 @@ setundefined: return 0; } -/* - * Remap normal tasks to their real PID, - * CPU shadow threads are mapped to -CPU - 2 - */ -static inline int shadow_pid(int realpid) -{ - if (realpid) - return realpid; - - return -raw_smp_processor_id() - 2; -} - -static char gdbmsgbuf[BUFMAX + 1]; - -static void kgdb_msg_write(const char *s, int len) -{ - char *bufptr; - int wcount; - int i; - - /* 'O'utput */ - gdbmsgbuf[0] = 'O'; - - /* Fill and send buffers... */ - while (len > 0) { - bufptr = gdbmsgbuf + 1; - - /* Calculate how many this time */ - if ((len << 1) > (BUFMAX - 2)) - wcount = (BUFMAX - 2) >> 1; - else - wcount = len; - - /* Pack in hex chars */ - for (i = 0; i < wcount; i++) - bufptr = pack_hex_byte(bufptr, s[i]); - *bufptr = '\0'; - - /* Move up */ - s += wcount; - len -= wcount; - - /* Write packet */ - put_packet(gdbmsgbuf); - } -} - /* * Return true if there is a valid kgdb I/O module. Also if no * debugger is attached a message can be printed to the console about @@ -786,7 +389,7 @@ static void kgdb_msg_write(const char *s, int len) */ static int kgdb_io_ready(int print_wait) { - if (!kgdb_io_ops) + if (!dbg_io_ops) return 0; if (kgdb_connected) return 1; @@ -797,525 +400,6 @@ static int kgdb_io_ready(int print_wait) return 1; } -/* - * All the functions that start with gdb_cmd are the various - * operations to implement the handlers for the gdbserial protocol - * where KGDB is communicating with an external debugger - */ - -/* Handle the '?' 
status packets */ -static void gdb_cmd_status(struct kgdb_state *ks) -{ - /* - * We know that this packet is only sent - * during initial connect. So to be safe, - * we clear out our breakpoints now in case - * GDB is reconnecting. - */ - remove_all_break(); - - remcom_out_buffer[0] = 'S'; - pack_hex_byte(&remcom_out_buffer[1], ks->signo); -} - -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) -{ - struct task_struct *thread; - void *local_debuggerinfo; - int i; - - thread = kgdb_usethread; - if (!thread) { - thread = kgdb_info[ks->cpu].task; - local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; - } else { - local_debuggerinfo = NULL; - for_each_online_cpu(i) { - /* - * Try to find the task on some other - * or possibly this node if we do not - * find the matching task then we try - * to approximate the results. - */ - if (thread == kgdb_info[i].task) - local_debuggerinfo = kgdb_info[i].debuggerinfo; - } - } - - /* - * All threads that don't have debuggerinfo should be - * in schedule() sleeping, since all other CPUs - * are in kgdb_wait, and thus have debuggerinfo. - */ - if (local_debuggerinfo) { - pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); - } else { - /* - * Pull stuff saved during switch_to; nothing - * else is accessible (or even particularly - * relevant). - * - * This should be enough for a stack trace. - */ - sleeping_thread_to_gdb_regs(gdb_regs, thread); - } - kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); -} - -/* Handle the 'G' set registers request */ -static void gdb_cmd_setregs(struct kgdb_state *ks) -{ - kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); - - if (kgdb_usethread && kgdb_usethread != current) { - error_packet(remcom_out_buffer, -EINVAL); - } else { - gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); - strcpy(remcom_out_buffer, "OK"); - } -} - -/* Handle the 'm' memory read bytes */ -static void gdb_cmd_memread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - unsigned long length; - unsigned long addr; - int err; - - if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && - kgdb_hex2long(&ptr, &length) > 0) { - err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); - } else { - error_packet(remcom_out_buffer, -EINVAL); - } -} - -/* Handle the 'M' memory write bytes */ -static void gdb_cmd_memwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(0); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'X' memory binary write bytes */ -static void gdb_cmd_binwrite(struct kgdb_state *ks) -{ - int err = write_mem_msg(1); - - if (err) - error_packet(remcom_out_buffer, err); - else - strcpy(remcom_out_buffer, "OK"); -} - -/* Handle the 'D' or 'k', detach or kill packets */ -static void gdb_cmd_detachkill(struct kgdb_state *ks) -{ - int error; - - /* The detach case */ - if (remcom_in_buffer[0] == 'D') { - error = remove_all_break(); - if (error < 0) { - error_packet(remcom_out_buffer, error); - } else { - strcpy(remcom_out_buffer, "OK"); - kgdb_connected = 0; - } - put_packet(remcom_out_buffer); - } else { - /* - * Assume the kill case, with no exit code checking, - * trying to force detach the debugger: - */ - remove_all_break(); - kgdb_connected = 0; - } -} - -/* Handle the 'R' reboot packets */ -static int gdb_cmd_reboot(struct kgdb_state *ks) -{ - /* For now, only honor R0 */ - if (strcmp(remcom_in_buffer, "R0") == 0) { - printk(KERN_CRIT 
"Executing emergency reboot\n"); - strcpy(remcom_out_buffer, "OK"); - put_packet(remcom_out_buffer); - - /* - * Execution should not return from - * machine_emergency_restart() - */ - machine_emergency_restart(); - kgdb_connected = 0; - - return 1; - } - return 0; -} - -/* Handle the 'q' query packets */ -static void gdb_cmd_query(struct kgdb_state *ks) -{ - struct task_struct *g; - struct task_struct *p; - unsigned char thref[8]; - char *ptr; - int i; - int cpu; - int finished = 0; - - switch (remcom_in_buffer[1]) { - case 's': - case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - - i = 0; - remcom_out_buffer[0] = 'm'; - ptr = remcom_out_buffer + 1; - if (remcom_in_buffer[1] == 'f') { - /* Each cpu is a shadow thread */ - for_each_online_cpu(cpu) { - ks->thr_query = 0; - int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - i++; - } - } - - do_each_thread(g, p) { - if (i >= ks->thr_query && !finished) { - int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; - *(ptr++) = ','; - ks->thr_query++; - if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) - finished = 1; - } - i++; - } while_each_thread(g, p); - - *(--ptr) = '\0'; - break; - - case 'C': - /* Current thread id */ - strcpy(remcom_out_buffer, "QC"); - ks->threadid = shadow_pid(current->pid); - int_to_threadref(thref, ks->threadid); - pack_threadid(remcom_out_buffer + 2, thref); - break; - case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - ks->threadid = 0; - ptr = remcom_in_buffer + 17; - kgdb_hex2long(&ptr, &ks->threadid); - if (!getthread(ks->linux_regs, ks->threadid)) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - if ((int)ks->threadid > 0) { - kgdb_mem2hex(getthread(ks->linux_regs, - ks->threadid)->comm, - remcom_out_buffer, 16); - } else { - static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - - sprintf(tmpstr, "shadowCPU%d", - (int)(-ks->threadid - 2)); - kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); - } - break; - } -} - -/* Handle the 'H' task query packets */ -static void gdb_cmd_task(struct kgdb_state *ks) -{ - struct task_struct *thread; - char *ptr; - - switch (remcom_in_buffer[1]) { - case 'g': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_usethread = thread; - ks->kgdb_usethreadid = ks->threadid; - strcpy(remcom_out_buffer, "OK"); - break; - case 'c': - ptr = &remcom_in_buffer[2]; - kgdb_hex2long(&ptr, &ks->threadid); - if (!ks->threadid) { - kgdb_contthread = NULL; - } else { - thread = getthread(ks->linux_regs, ks->threadid); - if (!thread && ks->threadid > 0) { - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_contthread = thread; - } - strcpy(remcom_out_buffer, "OK"); - break; - } -} - -/* Handle the 'T' thread query packets */ -static void gdb_cmd_thread(struct kgdb_state *ks) -{ - char *ptr = &remcom_in_buffer[1]; - struct task_struct *thread; - - kgdb_hex2long(&ptr, &ks->threadid); - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, -EINVAL); -} - -/* Handle the 'z' or 'Z' breakpoint remove or set packets */ -static void gdb_cmd_break(struct kgdb_state *ks) -{ - /* - * Since 
GDB-5.3, it's been drafted that '0' is a software - * breakpoint, '1' is a hardware breakpoint, so let's do that. - */ - char *bpt_type = &remcom_in_buffer[1]; - char *ptr = &remcom_in_buffer[2]; - unsigned long addr; - unsigned long length; - int error = 0; - - if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { - /* Unsupported */ - if (*bpt_type > '4') - return; - } else { - if (*bpt_type != '0' && *bpt_type != '1') - /* Unsupported. */ - return; - } - - /* - * Test if this is a hardware breakpoint, and - * if we support it: - */ - if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) - /* Unsupported. */ - return; - - if (*(ptr++) != ',') { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (!kgdb_hex2long(&ptr, &addr)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - if (*(ptr++) != ',' || - !kgdb_hex2long(&ptr, &length)) { - error_packet(remcom_out_buffer, -EINVAL); - return; - } - - if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') - error = kgdb_set_sw_break(addr); - else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') - error = kgdb_remove_sw_break(addr); - else if (remcom_in_buffer[0] == 'Z') - error = arch_kgdb_ops.set_hw_breakpoint(addr, - (int)length, *bpt_type - '0'); - else if (remcom_in_buffer[0] == 'z') - error = arch_kgdb_ops.remove_hw_breakpoint(addr, - (int) length, *bpt_type - '0'); - - if (error == 0) - strcpy(remcom_out_buffer, "OK"); - else - error_packet(remcom_out_buffer, error); -} - -/* Handle the 'C' signal / exception passing packets */ -static int gdb_cmd_exception_pass(struct kgdb_state *ks) -{ - /* C09 == pass exception - * C15 == detach kgdb, pass exception - */ - if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'c'; - - } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { - - ks->pass_exception = 1; - remcom_in_buffer[0] = 'D'; - remove_all_break(); - kgdb_connected = 0; - return 1; - - } else { - kgdb_msg_write("KGDB only knows signal 9 (pass)" - " and 15 (pass and disconnect)\n" - "Executing a continue without signal passing\n", 0); - remcom_in_buffer[0] = 'c'; - } - - /* Indicate fall through */ - return -1; -} - -/* - * This function performs all gdbserial command procesing - */ -static int gdb_serial_stub(struct kgdb_state *ks) -{ - int error = 0; - int tmp; - - /* Clear the out buffer. */ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - if (kgdb_connected) { - unsigned char thref[8]; - char *ptr; - - /* Reply to host that an exception has occurred */ - ptr = remcom_out_buffer; - *ptr++ = 'T'; - ptr = pack_hex_byte(ptr, ks->signo); - ptr += strlen(strcpy(ptr, "thread:")); - int_to_threadref(thref, shadow_pid(current->pid)); - ptr = pack_threadid(ptr, thref); - *ptr++ = ';'; - put_packet(remcom_out_buffer); - } - - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - - while (1) { - error = 0; - - /* Clear the out buffer. 
*/ - memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); - - get_packet(remcom_in_buffer); - - switch (remcom_in_buffer[0]) { - case '?': /* gdbserial status */ - gdb_cmd_status(ks); - break; - case 'g': /* return the value of the CPU registers */ - gdb_cmd_getregs(ks); - break; - case 'G': /* set the value of the CPU registers - return OK */ - gdb_cmd_setregs(ks); - break; - case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ - gdb_cmd_memread(ks); - break; - case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_memwrite(ks); - break; - case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ - gdb_cmd_binwrite(ks); - break; - /* kill or detach. KGDB should treat this like a - * continue. - */ - case 'D': /* Debugger detach */ - case 'k': /* Debugger detach via kill */ - gdb_cmd_detachkill(ks); - goto default_handle; - case 'R': /* Reboot */ - if (gdb_cmd_reboot(ks)) - goto default_handle; - break; - case 'q': /* query command */ - gdb_cmd_query(ks); - break; - case 'H': /* task related */ - gdb_cmd_task(ks); - break; - case 'T': /* Query thread status */ - gdb_cmd_thread(ks); - break; - case 'z': /* Break point remove */ - case 'Z': /* Break point set */ - gdb_cmd_break(ks); - break; - case 'C': /* Exception passing */ - tmp = gdb_cmd_exception_pass(ks); - if (tmp > 0) - goto default_handle; - if (tmp == 0) - break; - /* Fall through on tmp < 0 */ - case 'c': /* Continue packet */ - case 's': /* Single step packet */ - if (kgdb_contthread && kgdb_contthread != current) { - /* Can't switch threads in kgdb */ - error_packet(remcom_out_buffer, -EINVAL); - break; - } - kgdb_activate_sw_breakpoints(); - /* Fall through to default processing */ - default: -default_handle: - error = kgdb_arch_handle_exception(ks->ex_vector, - ks->signo, - ks->err_code, - remcom_in_buffer, - remcom_out_buffer, - ks->linux_regs); - /* - * Leave cmd processing on error, detach, - * kill, continue, or single step. - */ - if (error >= 0 || remcom_in_buffer[0] == 'D' || - remcom_in_buffer[0] == 'k') { - error = 0; - goto kgdb_exit; - } - - } - - /* reply to the request */ - put_packet(remcom_out_buffer); - } - -kgdb_exit: - if (ks->pass_exception) - error = 1; - return error; -} - static int kgdb_reenter_check(struct kgdb_state *ks) { unsigned long addr; @@ -1334,17 +418,17 @@ static int kgdb_reenter_check(struct kgdb_state *ks) * user because the user planted a breakpoint in a place that * KGDB needs in order to function. 
*/ - if (kgdb_remove_sw_break(addr) == 0) { + if (dbg_remove_sw_break(addr) == 0) { exception_level = 0; kgdb_skipexception(ks->ex_vector, ks->linux_regs); - kgdb_activate_sw_breakpoints(); + dbg_activate_sw_breakpoints(); printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", addr); WARN_ON_ONCE(1); return 1; } - remove_all_break(); + dbg_remove_all_break(); kgdb_skipexception(ks->ex_vector, ks->linux_regs); if (exception_level > 1) { @@ -1430,7 +514,7 @@ return_normal: if (!kgdb_io_ready(1)) { error = 1; - goto kgdb_restore; /* No I/O connection, so resume the system */ + goto kgdb_restore; /* No I/O connection, resume the system */ } /* @@ -1440,8 +524,8 @@ return_normal: goto kgdb_restore; /* Call the I/O driver's pre_exception routine */ - if (kgdb_io_ops->pre_exception) - kgdb_io_ops->pre_exception(); + if (dbg_io_ops->pre_exception) + dbg_io_ops->pre_exception(); kgdb_disable_hw_debug(ks->linux_regs); @@ -1485,8 +569,8 @@ return_normal: error = gdb_serial_stub(ks); /* Call the I/O driver's post_exception routine */ - if (kgdb_io_ops->post_exception) - kgdb_io_ops->post_exception(); + if (dbg_io_ops->post_exception) + dbg_io_ops->post_exception(); atomic_dec(&cpu_in_kgdb[ks->cpu]); @@ -1585,7 +669,7 @@ static void kgdb_console_write(struct console *co, const char *s, return; local_irq_save(flags); - kgdb_msg_write(s, count); + gdbstub_msg_write(s, count); local_irq_restore(flags); } @@ -1597,9 +681,9 @@ static struct console kgdbcons = { }; #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_gdb(int key, struct tty_struct *tty) +static void sysrq_handle_dbg(int key, struct tty_struct *tty) { - if (!kgdb_io_ops) { + if (!dbg_io_ops) { printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); return; } @@ -1609,8 +693,8 @@ static void sysrq_handle_gdb(int key, struct tty_struct *tty) kgdb_breakpoint(); } -static struct sysrq_key_op sysrq_gdb_op = { - .handler = sysrq_handle_gdb, +static struct sysrq_key_op sysrq_dbg_op = { + .handler = sysrq_handle_dbg, .help_msg = "debug(G)", .action_msg = "DEBUG", }; @@ -1622,7 +706,7 @@ static void kgdb_register_callbacks(void) kgdb_io_module_registered = 1; kgdb_arch_init(); #ifdef CONFIG_MAGIC_SYSRQ - register_sysrq_key('g', &sysrq_gdb_op); + register_sysrq_key('g', &sysrq_dbg_op); #endif if (kgdb_use_con && !kgdb_con_registered) { register_console(&kgdbcons); @@ -1642,7 +726,7 @@ static void kgdb_unregister_callbacks(void) kgdb_io_module_registered = 0; kgdb_arch_exit(); #ifdef CONFIG_MAGIC_SYSRQ - unregister_sysrq_key('g', &sysrq_gdb_op); + unregister_sysrq_key('g', &sysrq_dbg_op); #endif if (kgdb_con_registered) { unregister_console(&kgdbcons); @@ -1661,17 +745,17 @@ static void kgdb_initial_breakpoint(void) /** * kgdb_register_io_module - register KGDB IO module - * @new_kgdb_io_ops: the io ops vector + * @new_dbg_io_ops: the io ops vector * * Register it with the KGDB core. 
*/ -int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) +int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) { int err; spin_lock(&kgdb_registration_lock); - if (kgdb_io_ops) { + if (dbg_io_ops) { spin_unlock(&kgdb_registration_lock); printk(KERN_ERR "kgdb: Another I/O driver is already " @@ -1679,20 +763,20 @@ int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) return -EBUSY; } - if (new_kgdb_io_ops->init) { - err = new_kgdb_io_ops->init(); + if (new_dbg_io_ops->init) { + err = new_dbg_io_ops->init(); if (err) { spin_unlock(&kgdb_registration_lock); return err; } } - kgdb_io_ops = new_kgdb_io_ops; + dbg_io_ops = new_dbg_io_ops; spin_unlock(&kgdb_registration_lock); printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", - new_kgdb_io_ops->name); + new_dbg_io_ops->name); /* Arm KGDB now. */ kgdb_register_callbacks(); @@ -1706,11 +790,11 @@ EXPORT_SYMBOL_GPL(kgdb_register_io_module); /** * kgdb_unregister_io_module - unregister KGDB IO module - * @old_kgdb_io_ops: the io ops vector + * @old_dbg_io_ops: the io ops vector * * Unregister it with the KGDB core. */ -void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) +void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) { BUG_ON(kgdb_connected); @@ -1722,14 +806,14 @@ void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) spin_lock(&kgdb_registration_lock); - WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); - kgdb_io_ops = NULL; + WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops); + dbg_io_ops = NULL; spin_unlock(&kgdb_registration_lock); printk(KERN_INFO "kgdb: Unregistered I/O driver %s, debugger disabled.\n", - old_kgdb_io_ops->name); + old_dbg_io_ops->name); } EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h new file mode 100644 index 000000000000..db554f9be51d --- /dev/null +++ b/kernel/debug/debug_core.h @@ -0,0 +1,55 @@ +/* + * Created by: Jason Wessel + * + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#ifndef _DEBUG_CORE_H_ +#define _DEBUG_CORE_H_ +/* + * These are the private implementation headers between the kernel + * debugger core and the debugger front end code. 
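As a minimal sketch of the registration interface above (a hypothetical driver; uart_poll_getc() and uart_poll_putc() stand in for real polled UART accessors and are not part of the patch), an I/O module plugs in like this:

static int hypothetical_getc(void)
{
	return uart_poll_getc();	/* assumed helper: returns -1 when no byte is pending */
}

static void hypothetical_putc(u8 c)
{
	uart_poll_putc(c);		/* assumed helper: busy-waits until the byte is out */
}

static struct kgdb_io hypothetical_io_ops = {
	.name		= "hypothetical_uart",
	.read_char	= hypothetical_getc,
	.write_char	= hypothetical_putc,
};

static int __init hypothetical_io_init(void)
{
	/* fails with -EBUSY if another driver already owns dbg_io_ops */
	return kgdb_register_io_module(&hypothetical_io_ops);
}

Unregistering with kgdb_unregister_io_module() reverses this and disables the debugger, as the code above shows.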
+ */ + +/* kernel debug core data structures */ +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + unsigned long thr_query; + unsigned long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +/* Exception state values */ +#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ +#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ +#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_SSTEP 0x8 /* CPU is single stepping */ + +struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; + int exception_state; +}; + +extern struct debuggerinfo_struct kgdb_info[]; + +/* kernel debug core break point routines */ +extern int dbg_remove_all_break(void); +extern int dbg_set_sw_break(unsigned long addr); +extern int dbg_remove_sw_break(unsigned long addr); +extern int dbg_activate_sw_breakpoints(void); + +/* gdbstub interface functions */ +extern int gdb_serial_stub(struct kgdb_state *ks); +extern void gdbstub_msg_write(const char *s, int len); + +#endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c new file mode 100644 index 000000000000..ccdf0929f12d --- /dev/null +++ b/kernel/debug/gdbstub.c @@ -0,0 +1,934 @@ +/* + * Kernel Debug Core + * + * Maintainer: Jason Wessel + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale + * Copyright (C) 2004 Pavel Machek + * Copyright (C) 2004-2006 Tom Rini + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2009 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe , + * Tigran Aivazian + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include "debug_core.h" + +#define KGDB_MAX_THREAD_QUERY 17 + +/* Our I/O buffers. */ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. 
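The gdb_regs scratch buffer declared just below is sized in whole longs using the usual ceiling-division idiom, equivalent to the kernel's DIV_ROUND_UP(). A worked example, assuming a hypothetical NUMREGBYTES of 170 on a target with 8-byte longs:

#define EXAMPLE_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* EXAMPLE_DIV_ROUND_UP(170, 8) == 22: twenty-two 8-byte longs
 * (176 bytes) are reserved, the smallest whole-long area that
 * holds all 170 register bytes. */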
*/ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* + * GDB remote protocol parser: + */ + +static int hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return ch - 'a' + 10; + if ((ch >= '0') && (ch <= '9')) + return ch - '0'; + if ((ch >= 'A') && (ch <= 'F')) + return ch - 'A' + 10; + return -1; +} + +/* scan for the sequence $# */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (dbg_io_ops->read_char())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = dbg_io_ops->read_char(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(dbg_io_ops->read_char()) << 4; + xmitcsum += hex(dbg_io_ops->read_char()); + + if (checksum != xmitcsum) + /* failed checksum */ + dbg_io_ops->write_char('-'); + else + /* successful transfer */ + dbg_io_ops->write_char('+'); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. + */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $#. + */ + while (1) { + dbg_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + dbg_io_ops->write_char(ch); + checksum += ch; + count++; + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + + /* Now see what we get in reply. */ + ch = dbg_io_ops->read_char(); + + if (ch == 3) + ch = dbg_io_ops->read_char(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + dbg_io_ops->write_char('-'); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + return; + } + } +} + +static char gdbmsgbuf[BUFMAX + 1]; + +void gdbstub_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... */ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Convert the memory pointed to by mem into hex, placing result in + * buf. Return a pointer to the last char put in buf (null). May + * return an error. + */ +int kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. 
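For orientation, the framing that get_packet() and put_packet() above implement is $<payload>#<checksum>, where the checksum is the modulo-256 sum of the payload bytes, transmitted as two hex digits. A freestanding sketch of the arithmetic:

static unsigned char example_gdb_checksum(const char *payload)
{
	unsigned char sum = 0;

	/* sum only the payload; '$' and '#' are framing, not data */
	while (*payload)
		sum += *payload++;
	return sum;
}

/* example_gdb_checksum("g") == 0x67, so the "read registers"
 * request travels as "$g#67"; the receiver ACKs with '+' or
 * NAKs with '-' exactly as get_packet() does above. */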
+ */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (!err) { + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + + *buf = 0; + } + + return err; +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in + * mem. Return a pointer to the character AFTER the last byte + * written. May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex(*tmp_hex--); + *tmp_raw |= hex(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, unsigned long *long_val) +{ + int hex_val; + int num = 0; + int negate = 0; + + *long_val = 0; + + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } + while (**ptr) { + hex_val = hex(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + if (negate) + *long_val = -*long_val; + + return num; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. + * The input buf is overwritten with the result to write to mem. + */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int size = 0; + char *c = buf; + + while (count-- > 0) { + c[size] = *buf++; + if (c[size] == 0x7d) + c[size] = *buf++ ^ 0x20; + size++; + } + + return probe_kernel_write(mem, c, size); +} + +/* Write memory due to an 'M' or 'X' packet. */ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hex_asc[(error / 10)]; + pkt[2] = hex_asc[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. 
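Concretely, the remapping described in the comment above works out as follows (an illustration only; shadow_pid() and getthread() below are the real implementation):

static int example_cpu_to_shadow_tid(int cpu)
{
	return -cpu - 2;	/* CPU 0 -> TID -2, CPU 1 -> TID -3, ... */
}

static int example_shadow_tid_to_cpu(int tid)
{
	return -tid - 2;	/* TID -5 -> CPU 3 */
}

Real tasks keep their PID as the TID; only PID 0 (the per cpu idle tasks) needs the shadow range, which is why shadow_pid() passes nonzero PIDs through untouched.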
+ */ + +#define BUF_THREAD_ID_SIZE 16 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + char *limit; + + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *id++); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + unsigned char *scan; + int i = 4; + + scan = (unsigned char *)id; + while (i--) + *scan++ = 0; + put_unaligned_be32(value, scan); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped to the cpu shadow information + */ + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < -1 && tid > -NR_CPUS - 2) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + + +/* + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -raw_smp_processor_id() - 2; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. + */ + dbg_remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for_each_online_cpu(i) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. + */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. 
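gdb_cmd_status() above builds the short 'S' stop reply. A sketch of the same construction, assuming SIGTRAP (signal 5) so the wire form is "S05":

static void example_status_reply(char *pkt, int signo)
{
	pkt[0] = 'S';
	pack_hex_byte(&pkt[1], signo);	/* writes "05" when signo == 5 */
	pkt[3] = '\0';
}

The richer "T05thread:<tid>;" form, which also names the stopped thread, is composed in gdb_serial_stub() further down.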
+ */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (err) + error_packet(remcom_out_buffer, err); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = dbg_remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + dbg_remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT "Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct task_struct *g; + struct task_struct *p; + unsigned char thref[8]; + char *ptr; + int i; + int cpu; + int finished = 0; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + + i = 0; + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + i++; + } + } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, 
"QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if ((int)ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. */ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. 
*/ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = dbg_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = dbg_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type - '0'); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type - '0'); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + dbg_remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + gdbstub_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command processing + */ +int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + if (kgdb_connected) { + unsigned char thref[8]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + while (1) { + error = 0; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break;
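For reference, the requests that gdb_cmd_break() above parses have the shape Z<type>,<addr>,<length> to set and z<type>,<addr>,<length> to clear. A sketch of composing one from the debugger's side (illustrative only, not taken from the patch):

static void example_make_break_packet(char *pkt, int set, int type,
				      unsigned long addr, int len)
{
	/* e.g. "Z0,c01a2b3c,4": set a software breakpoint */
	sprintf(pkt, "%c%d,%lx,%x", set ? 'Z' : 'z', type, addr, len);
}

The stub answers "OK", "Exx" on error, or an empty packet when the breakpoint type is unsupported (the bare return paths above).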
+ /* kill or detach. KGDB should treat this like a + * continue. + */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + dbg_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. + */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} -- cgit v1.2.3-55-g7522 From 5d5314d6795f3c1c0f415348ff8c51f7de042b77 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:20 -0500 Subject: kdb: core for kgdb back end (1 of 2) This patch contains only the kdb core. Because the change set was large, it was split. The next patch in the series includes the instrumentation into the core kernel, which consists mainly of helper functions for kdb. This work is directly derived from kdb v4.4 found at: ftp://oss.sgi.com/projects/kdb/download/v4.4/ The kdb internals have been re-organized to make them mostly platform independent and to connect everything to the debug core which is used by gdbstub (which has long been known as kgdb). The original version of kdb was 58,000 lines worth of changes to support x86. From that implementation only the kdb shell and basic commands for memory access, runcontrol, lsmod, and dmesg were carried forward. This is a generic implementation which aims to cover all the current architectures using the kgdb core: ppc, arm, x86, mips, sparc, sh and blackfin. More architectures can be added by implementing the architecture specific kgdb functions. [mort@sgi.com: Compile fix with hugepages enabled] [mort@sgi.com: Clean breakpoint code renaming kdba_ -> kdb_] [mort@sgi.com: fix new line after printing registers] [mort@sgi.com: Remove the concept of global vs. 
local breakpoints] [mort@sgi.com: Rework kdb_si_swapinfo to use more generic name] [mort@sgi.com: fix the information dump macros, remove 'arch' from the names] [sfr@canb.auug.org.au: include fixup to include linux/slab.h] CC: linux-arch@vger.kernel.org Signed-off-by: Jason Wessel Signed-off-by: Martin Hicks --- include/linux/kdb.h | 113 ++ kernel/debug/Makefile | 1 + kernel/debug/kdb/.gitignore | 1 + kernel/debug/kdb/Makefile | 24 + kernel/debug/kdb/kdb_bp.c | 564 ++++++++ kernel/debug/kdb/kdb_bt.c | 208 +++ kernel/debug/kdb/kdb_cmds | 35 + kernel/debug/kdb/kdb_debugger.c | 159 +++ kernel/debug/kdb/kdb_io.c | 789 +++++++++++ kernel/debug/kdb/kdb_main.c | 2845 +++++++++++++++++++++++++++++++++++++++ kernel/debug/kdb/kdb_private.h | 301 +++++ kernel/debug/kdb/kdb_support.c | 927 +++++++++++++ 12 files changed, 5967 insertions(+) create mode 100644 include/linux/kdb.h create mode 100644 kernel/debug/kdb/.gitignore create mode 100644 kernel/debug/kdb/Makefile create mode 100644 kernel/debug/kdb/kdb_bp.c create mode 100644 kernel/debug/kdb/kdb_bt.c create mode 100644 kernel/debug/kdb/kdb_cmds create mode 100644 kernel/debug/kdb/kdb_debugger.c create mode 100644 kernel/debug/kdb/kdb_io.c create mode 100644 kernel/debug/kdb/kdb_main.c create mode 100644 kernel/debug/kdb/kdb_private.h create mode 100644 kernel/debug/kdb/kdb_support.c (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h new file mode 100644 index 000000000000..4d93790faec3 --- /dev/null +++ b/include/linux/kdb.h @@ -0,0 +1,113 @@ +#ifndef _KDB_H +#define _KDB_H + +/* + * Kernel Debugger Architecture Independent Global Headers + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2000-2007 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (C) 2000 Stephane Eranian + * Copyright (C) 2009 Jason Wessel + */ + +#ifdef CONFIG_KGDB_KDB +#include +#include +#include + +#define KDB_POLL_FUNC_MAX 5 + +/* + * kdb_initial_cpu is initialized to -1, and is set to the cpu + * number whenever the kernel debugger is entered. + */ +extern int kdb_initial_cpu; +extern atomic_t kdb_event; + +/* + * kdb_diemsg + * + * Contains a pointer to the last string supplied to the + * kernel 'die' panic function. + */ +extern const char *kdb_diemsg; + +#define KDB_FLAG_EARLYKDB (1 << 0) /* set from boot parameter kdb=early */ +#define KDB_FLAG_CATASTROPHIC (1 << 1) /* A catastrophic event has occurred */ +#define KDB_FLAG_CMD_INTERRUPT (1 << 2) /* Previous command was interrupted */ +#define KDB_FLAG_NOIPI (1 << 3) /* Do not send IPIs */ +#define KDB_FLAG_ONLY_DO_DUMP (1 << 4) /* Only do a dump, used when + * kdb is off */ +#define KDB_FLAG_NO_CONSOLE (1 << 5) /* No console is available, + * kdb is disabled */ +#define KDB_FLAG_NO_VT_CONSOLE (1 << 6) /* No VT console is available, do + * not use keyboard */ +#define KDB_FLAG_NO_I8042 (1 << 7) /* No i8042 chip is available, do + * not use keyboard */ + +extern int kdb_flags; /* Global flags, see kdb_state for per cpu state */ + +extern void kdb_save_flags(void); +extern void kdb_restore_flags(void); + +#define KDB_FLAG(flag) (kdb_flags & KDB_FLAG_##flag) +#define KDB_FLAG_SET(flag) ((void)(kdb_flags |= KDB_FLAG_##flag)) +#define KDB_FLAG_CLEAR(flag) ((void)(kdb_flags &= ~KDB_FLAG_##flag)) + +/* + * External entry point for the kernel debugger. 
The pt_regs + * at the time of entry are supplied along with the reason for + * entry to the kernel debugger. + */ + +typedef enum { + KDB_REASON_ENTER = 1, /* KDB_ENTER() trap/fault - regs valid */ + KDB_REASON_ENTER_SLAVE, /* KDB_ENTER_SLAVE() trap/fault - regs valid */ + KDB_REASON_BREAK, /* Breakpoint inst. - regs valid */ + KDB_REASON_DEBUG, /* Debug Fault - regs valid */ + KDB_REASON_OOPS, /* Kernel Oops - regs valid */ + KDB_REASON_SWITCH, /* CPU switch - regs valid*/ + KDB_REASON_KEYBOARD, /* Keyboard entry - regs valid */ + KDB_REASON_NMI, /* Non-maskable interrupt; regs valid */ + KDB_REASON_RECURSE, /* Recursive entry to kdb; + * regs probably valid */ + KDB_REASON_SSTEP, /* Single Step trap. - regs valid */ +} kdb_reason_t; + +extern int kdb_printf(const char *, ...) + __attribute__ ((format (printf, 1, 2))); +typedef int (*kdb_printf_t)(const char *, ...) + __attribute__ ((format (printf, 1, 2))); + +extern void kdb_init(int level); + +/* Access to kdb specific polling devices */ +typedef int (*get_char_func)(void); +extern get_char_func kdb_poll_funcs[]; +extern int kdb_get_kbd_char(void); + +static inline +int kdb_process_cpu(const struct task_struct *p) +{ + unsigned int cpu = task_thread_info(p)->cpu; + if (cpu > num_possible_cpus()) + cpu = 0; + return cpu; +} + +/* kdb access to register set for stack dumping */ +extern struct pt_regs *kdb_current_regs; + +#else /* ! CONFIG_KGDB_KDB */ +#define kdb_printf(...) +#define kdb_init(x) +#endif /* CONFIG_KGDB_KDB */ +enum { + KDB_NOT_INITIALIZED, + KDB_INIT_EARLY, + KDB_INIT_FULL, +}; +#endif /* !_KDB_H */ diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile index fd4dc6e7782c..a85edc339985 100644 --- a/kernel/debug/Makefile +++ b/kernel/debug/Makefile @@ -3,3 +3,4 @@ # obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o +obj-$(CONFIG_KGDB_KDB) += kdb/ diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore new file mode 100644 index 000000000000..396d12eda9e8 --- /dev/null +++ b/kernel/debug/kdb/.gitignore @@ -0,0 +1 @@ +gen-kdb_cmds.c diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile new file mode 100644 index 000000000000..d1e925eddbcd --- /dev/null +++ b/kernel/debug/kdb/Makefile @@ -0,0 +1,24 @@ +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. +# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
+# + +CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') +obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o + +clean-files := gen-kdb_cmds.c + +quiet_cmd_gen-kdb = GENKDB $@ + cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include "; print "\#include "} \ + /^\#/{next} \ + /^[ \t]*$$/{next} \ + {gsub(/"/, "\\\"", $$0); \ + print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \ + END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \ + $(filter-out %/Makefile,$^) > $@# + +$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile + $(call cmd,gen-kdb) diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c new file mode 100644 index 000000000000..75bd9b3ebbb7 --- /dev/null +++ b/kernel/debug/kdb/kdb_bp.c @@ -0,0 +1,564 @@ +/* + * Kernel Debugger Architecture Independent Breakpoint Handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + +/* + * Table of kdb_breakpoints + */ +kdb_bp_t kdb_breakpoints[KDB_MAXBPT]; + +static void kdb_setsinglestep(struct pt_regs *regs) +{ + KDB_STATE_SET(DOING_SS); +} + +static char *kdb_rwtypes[] = { + "Instruction(i)", + "Instruction(Register)", + "Data Write", + "I/O", + "Data Access" +}; + +static char *kdb_bptype(kdb_bp_t *bp) +{ + if (bp->bp_type < 0 || bp->bp_type > 4) + return ""; + + return kdb_rwtypes[bp->bp_type]; +} + +static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) +{ + int nextarg = *nextargp; + int diag; + + bp->bph_length = 1; + if ((argc + 1) != nextarg) { + if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) + bp->bp_type = BP_ACCESS_WATCHPOINT; + else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) + bp->bp_type = BP_WRITE_WATCHPOINT; + else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) + bp->bp_type = BP_HARDWARE_BREAKPOINT; + else + return KDB_ARGCOUNT; + + bp->bph_length = 1; + + nextarg++; + + if ((argc + 1) != nextarg) { + unsigned long len; + + diag = kdbgetularg((char *)argv[nextarg], + &len); + if (diag) + return diag; + + + if (len > 8) + return KDB_BADLENGTH; + + bp->bph_length = len; + nextarg++; + } + + if ((argc + 1) != nextarg) + return KDB_ARGCOUNT; + } + + *nextargp = nextarg; + return 0; +} + +static int _kdb_bp_remove(kdb_bp_t *bp) +{ + int ret = 1; + if (!bp->bp_installed) + return ret; + if (!bp->bp_type) + ret = dbg_remove_sw_break(bp->bp_addr); + else + ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr, + bp->bph_length, + bp->bp_type); + if (ret == 0) + bp->bp_installed = 0; + return ret; +} + +static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp) +{ + if (KDB_DEBUG(BP)) + kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs)); + + /* + * Setup single step + */ + kdb_setsinglestep(regs); + + /* + * Reset delay attribute + */ + bp->bp_delay = 0; + bp->bp_delayed = 1; +} + +static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) +{ + int ret; + /* + * Install the breakpoint, if it is not already installed. 
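For illustration, the awk rule in the Makefile above turns every non-blank, non-comment line of the kdb_cmds file (shown later in this patch) into an __initdata string and finishes with a NULL-terminated table. Its generated gen-kdb_cmds.c would look roughly like this; the #include targets printed by the BEGIN block are not visible above, so linux/stddef.h and linux/init.h are an assumption:

#include <linux/stddef.h>
#include <linux/init.h>
static __initdata char kdb_cmd0[] = "defcmd dumpcommon \"\" \"Common kdb debugging\"\n";
static __initdata char kdb_cmd1[] = "  set BTAPROMPT 0\n";
/* ... one string per remaining line of kdb_cmds ... */
extern char *kdb_cmds[];
char __initdata *kdb_cmds[] = {
 kdb_cmd0,
 kdb_cmd1,
 NULL
};

Note how the gsub() in the rule escapes embedded double quotes so each command line survives as a C string literal.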
+ */ + + if (KDB_DEBUG(BP)) + kdb_printf("%s: bp_installed %d\n", + __func__, bp->bp_installed); + if (!KDB_STATE(SSBPT)) + bp->bp_delay = 0; + if (bp->bp_installed) + return 1; + if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) { + if (KDB_DEBUG(BP)) + kdb_printf("%s: delayed bp\n", __func__); + kdb_handle_bp(regs, bp); + return 0; + } + if (!bp->bp_type) + ret = dbg_set_sw_break(bp->bp_addr); + else + ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr, + bp->bph_length, + bp->bp_type); + if (ret == 0) { + bp->bp_installed = 1; + } else { + kdb_printf("%s: failed to set breakpoint at 0x%lx\n", + __func__, bp->bp_addr); + return 1; + } + return 0; +} + +/* + * kdb_bp_install + * + * Install kdb_breakpoints prior to returning from the + * kernel debugger. This allows the kdb_breakpoints to be set + * upon functions that are used internally by kdb, such as + * printk(). This function is only called once per kdb session. + */ +void kdb_bp_install(struct pt_regs *regs) +{ + int i; + + for (i = 0; i < KDB_MAXBPT; i++) { + kdb_bp_t *bp = &kdb_breakpoints[i]; + + if (KDB_DEBUG(BP)) { + kdb_printf("%s: bp %d bp_enabled %d\n", + __func__, i, bp->bp_enabled); + } + if (bp->bp_enabled) + _kdb_bp_install(regs, bp); + } +} + +/* + * kdb_bp_remove + * + * Remove kdb_breakpoints upon entry to the kernel debugger. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ +void kdb_bp_remove(void) +{ + int i; + + for (i = KDB_MAXBPT - 1; i >= 0; i--) { + kdb_bp_t *bp = &kdb_breakpoints[i]; + + if (KDB_DEBUG(BP)) { + kdb_printf("%s: bp %d bp_enabled %d\n", + __func__, i, bp->bp_enabled); + } + if (bp->bp_enabled) + _kdb_bp_remove(bp); + } +} + + +/* + * kdb_printbp + * + * Internal function to format and print a breakpoint entry. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +static void kdb_printbp(kdb_bp_t *bp, int i) +{ + kdb_printf("%s ", kdb_bptype(bp)); + kdb_printf("BP #%d at ", i); + kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); + + if (bp->bp_enabled) + kdb_printf("\n is enabled"); + else + kdb_printf("\n is disabled"); + + kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", + bp->bp_addr, bp->bp_type, bp->bp_installed); + + kdb_printf("\n"); +} + +/* + * kdb_bp + * + * Handle the bp commands. + * + * [bp|bph] <addr-expression> [DATAR|DATAW] + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Locking: + * None. + * Remarks: + * + * bp Set breakpoint on all cpus. Only use hardware assist if needed. + * bph Set breakpoint on all cpus. 
Force hardware register + */ + +static int kdb_bp(int argc, const char **argv) +{ + int i, bpno; + kdb_bp_t *bp, *bp_check; + int diag; + int free; + char *symname = NULL; + long offset = 0ul; + int nextarg; + kdb_bp_t template = {0}; + + if (argc == 0) { + /* + * Display breakpoint table + */ + for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; + bpno++, bp++) { + if (bp->bp_free) + continue; + kdb_printbp(bp, bpno); + } + + return 0; + } + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr, + &offset, &symname); + if (diag) + return diag; + if (!template.bp_addr) + return KDB_BADINT; + + /* + * Find an empty bp structure to allocate + */ + free = KDB_MAXBPT; + for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { + if (bp->bp_free) + break; + } + + if (bpno == KDB_MAXBPT) + return KDB_TOOMANYBPT; + + if (strcmp(argv[0], "bph") == 0) { + template.bp_type = BP_HARDWARE_BREAKPOINT; + diag = kdb_parsebp(argc, argv, &nextarg, &template); + if (diag) + return diag; + } else { + template.bp_type = BP_BREAKPOINT; + } + + /* + * Check for clashing breakpoints. + * + * Note, in this design we can't have hardware breakpoints + * enabled for both read and write on the same address. + */ + for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT; + i++, bp_check++) { + if (!bp_check->bp_free && + bp_check->bp_addr == template.bp_addr) { + kdb_printf("You already have a breakpoint at " + kdb_bfd_vma_fmt0 "\n", template.bp_addr); + return KDB_DUPBPT; + } + } + + template.bp_enabled = 1; + + /* + * Actually allocate the breakpoint found earlier + */ + *bp = template; + bp->bp_free = 0; + + kdb_printbp(bp, bpno); + + return 0; +} + +/* + * kdb_bc + * + * Handles the 'bc', 'be', and 'bd' commands + * + * [bd|bc|be] + * [bd|bc|be] * + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic for failure + * Locking: + * None. + * Remarks: + */ +static int kdb_bc(int argc, const char **argv) +{ + unsigned long addr; + kdb_bp_t *bp = NULL; + int lowbp = KDB_MAXBPT; + int highbp = 0; + int done = 0; + int i; + int diag = 0; + + int cmd; /* KDBCMD_B? */ +#define KDBCMD_BC 0 +#define KDBCMD_BE 1 +#define KDBCMD_BD 2 + + if (strcmp(argv[0], "be") == 0) + cmd = KDBCMD_BE; + else if (strcmp(argv[0], "bd") == 0) + cmd = KDBCMD_BD; + else + cmd = KDBCMD_BC; + + if (argc != 1) + return KDB_ARGCOUNT; + + if (strcmp(argv[1], "*") == 0) { + lowbp = 0; + highbp = KDB_MAXBPT; + } else { + diag = kdbgetularg(argv[1], &addr); + if (diag) + return diag; + + /* + * For addresses less than the maximum breakpoint number, + * assume that the breakpoint number is desired. + */ + if (addr < KDB_MAXBPT) { + bp = &kdb_breakpoints[addr]; + lowbp = highbp = addr; + highbp++; + } else { + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; + i++, bp++) { + if (bp->bp_addr == addr) { + lowbp = highbp = i; + highbp++; + break; + } + } + } + } + + /* + * Now operate on the set of breakpoints matching the input + * criteria (either '*' for all, or an individual breakpoint). 
+ */ + for (bp = &kdb_breakpoints[lowbp], i = lowbp; + i < highbp; + i++, bp++) { + if (bp->bp_free) + continue; + + done++; + + switch (cmd) { + case KDBCMD_BC: + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " cleared\n", + i, bp->bp_addr); + + bp->bp_addr = 0; + bp->bp_free = 1; + + break; + case KDBCMD_BE: + bp->bp_enabled = 1; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " enabled", + i, bp->bp_addr); + + kdb_printf("\n"); + break; + case KDBCMD_BD: + if (!bp->bp_enabled) + break; + + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " disabled\n", + i, bp->bp_addr); + + break; + } + if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) { + bp->bp_delay = 0; + KDB_STATE_CLEAR(SSBPT); + } + } + + return (!done) ? KDB_BPTNOTFOUND : 0; +} + +/* + * kdb_ss + * + * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) + * commands. + * + * ss + * ssb + * + * Parameters: + * argc Argument count + * argv Argument vector + * Outputs: + * None. + * Returns: + * KDB_CMD_SS[B] for success, a kdb error if failure. + * Locking: + * None. + * Remarks: + * + * Set the arch specific option to trigger a debug trap after the next + * instruction. + * + * For 'ssb', set the trace flag in the debug trap handler + * after printing the current insn and return directly without + * invoking the kdb command processor, until a branch instruction + * is encountered. + */ + +static int kdb_ss(int argc, const char **argv) +{ + int ssb = 0; + + ssb = (strcmp(argv[0], "ssb") == 0); + if (argc != 0) + return KDB_ARGCOUNT; + /* + * Set trace flag and go. + */ + KDB_STATE_SET(DOING_SS); + if (ssb) { + KDB_STATE_SET(DOING_SSB); + return KDB_CMD_SSB; + } + return KDB_CMD_SS; +} + +/* Initialize the breakpoint table and register breakpoint commands. */ + +void __init kdb_initbptab(void) +{ + int i; + kdb_bp_t *bp; + + /* + * First time initialization. + */ + memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints)); + + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) + bp->bp_free = 1; + + kdb_register_repeat("bp", kdb_bp, "[]", + "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("bl", kdb_bp, "[]", + "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) + kdb_register_repeat("bph", kdb_bp, "[]", + "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("bc", kdb_bc, "", + "Clear Breakpoint", 0, KDB_REPEAT_NONE); + kdb_register_repeat("be", kdb_bc, "", + "Enable Breakpoint", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bd", kdb_bc, "", + "Disable Breakpoint", 0, KDB_REPEAT_NONE); + + kdb_register_repeat("ss", kdb_ss, "", + "Single Step", 1, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("ssb", kdb_ss, "", + "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); + /* + * Architecture dependent initialization. + */ +} diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c new file mode 100644 index 000000000000..483fa4e7aaac --- /dev/null +++ b/kernel/debug/kdb/kdb_bt.c @@ -0,0 +1,208 @@ +/* + * Kernel Debugger Architecture Independent Stack Traceback + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
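The kdb_register_repeat() helper used by kdb_initbptab() above is the general way commands enter the kdb shell. A hypothetical registration, with all names invented for illustration:

static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from kdb\n");
	return 0;
}

/* from an init path: */
kdb_register_repeat("hello", kdb_hello, "",
		    "Print a greeting", 0, KDB_REPEAT_NONE);

The trailing arguments mirror the calls above: the minimum match length for the command name, and whether pressing enter on an empty line repeats the command (and with which arguments).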
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + + +static void kdb_show_stack(struct task_struct *p, void *addr) +{ + int old_lvl = console_loglevel; + console_loglevel = 15; + kdb_set_current_task(p); + if (addr) { + show_stack((struct task_struct *)p, addr); + } else if (kdb_current_regs) { +#ifdef CONFIG_X86 + show_stack(p, &kdb_current_regs->sp); +#else + show_stack(p, NULL); +#endif + } else { + show_stack(p, NULL); + } + console_loglevel = old_lvl; +} + +/* + * kdb_bt + * + * This function implements the 'bt' command. Print a stack + * traceback. + * + * bt [] (addr-exp is for alternate stacks) + * btp Kernel stack for + * btt Kernel stack for task structure at + * + * bta [DRSTCZEUIMA] All useful processes, optionally + * filtered by state + * btc [] The current process on one cpu, + * default is all cpus + * + * bt refers to a address on the stack, that location + * is assumed to contain a return address. + * + * btt refers to the address of a struct task. + * + * Inputs: + * argc argument count + * argv argument vector + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Backtrack works best when the code uses frame pointers. But even + * without frame pointers we should get a reasonable trace. + * + * mds comes in handy when examining the stack to do a manual traceback or + * to get a starting point for bt . + */ + +static int +kdb_bt1(struct task_struct *p, unsigned long mask, + int argcount, int btaprompt) +{ + char buffer[2]; + if (kdb_getarea(buffer[0], (unsigned long)p) || + kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) + return KDB_BADADDR; + if (!kdb_task_state(p, mask)) + return 0; + kdb_printf("Stack traceback for pid %d\n", p->pid); + kdb_ps1(p); + kdb_show_stack(p, NULL); + if (btaprompt) { + kdb_getstr(buffer, sizeof(buffer), + "Enter to end, to continue:"); + if (buffer[0] == 'q') { + kdb_printf("\n"); + return 1; + } + } + touch_nmi_watchdog(); + return 0; +} + +int +kdb_bt(int argc, const char **argv) +{ + int diag; + int argcount = 5; + int btaprompt = 1; + int nextarg; + unsigned long addr; + long offset; + + kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ + kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each + * proc in bta */ + + if (strcmp(argv[0], "bta") == 0) { + struct task_struct *g, *p; + unsigned long cpu; + unsigned long mask = kdb_task_state_string(argc ? 
argv[1] : + NULL); + if (argc == 0) + kdb_ps_suppressed(); + /* Run the active tasks first */ + for_each_online_cpu(cpu) { + p = kdb_curr_task(cpu); + if (kdb_bt1(p, mask, argcount, btaprompt)) + return 0; + } + /* Now the inactive tasks */ + kdb_do_each_thread(g, p) { + if (task_curr(p)) + continue; + if (kdb_bt1(p, mask, argcount, btaprompt)) + return 0; + } kdb_while_each_thread(g, p); + } else if (strcmp(argv[0], "btp") == 0) { + struct task_struct *p; + unsigned long pid; + if (argc != 1) + return KDB_ARGCOUNT; + diag = kdbgetularg((char *)argv[1], &pid); + if (diag) + return diag; + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (p) { + kdb_set_current_task(p); + return kdb_bt1(p, ~0UL, argcount, 0); + } + kdb_printf("No process with pid == %ld found\n", pid); + return 0; + } else if (strcmp(argv[0], "btt") == 0) { + if (argc != 1) + return KDB_ARGCOUNT; + diag = kdbgetularg((char *)argv[1], &addr); + if (diag) + return diag; + kdb_set_current_task((struct task_struct *)addr); + return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0); + } else if (strcmp(argv[0], "btc") == 0) { + unsigned long cpu = ~0; + struct task_struct *save_current_task = kdb_current_task; + char buf[80]; + if (argc > 1) + return KDB_ARGCOUNT; + if (argc == 1) { + diag = kdbgetularg((char *)argv[1], &cpu); + if (diag) + return diag; + } + /* Recursive use of kdb_parse, do not use argv after + * this point */ + argv = NULL; + if (cpu != ~0) { + if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { + kdb_printf("no process for cpu %ld\n", cpu); + return 0; + } + sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + kdb_parse(buf); + return 0; + } + kdb_printf("btc: cpu status: "); + kdb_parse("cpu\n"); + for_each_online_cpu(cpu) { + sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + kdb_parse(buf); + touch_nmi_watchdog(); + } + kdb_set_current_task(save_current_task); + return 0; + } else { + if (argc) { + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, + &offset, NULL); + if (diag) + return diag; + kdb_show_stack(kdb_current_task, (void *)addr); + return 0; + } else { + return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); + } + } + + /* NOTREACHED */ + return 0; +} diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds new file mode 100644 index 000000000000..56c88e4db309 --- /dev/null +++ b/kernel/debug/kdb/kdb_cmds @@ -0,0 +1,35 @@ +# Initial commands for kdb, alter to suit your needs. +# These commands are executed in kdb_init() context, no SMP, no +# processes. Commands that require process data (including stack or +# registers) are not reliable this early. set and bp commands should +# be safe. Global breakpoint commands affect each cpu as it is booted. + +# Standard debugging information for first level support, just type archkdb +# or archkdbcpu or archkdbshort at the kdb prompt. + +defcmd dumpcommon "" "Common kdb debugging" + set BTAPROMPT 0 + set LINES 10000 + -summary + -cpu + -ps + -dmesg 600 + -bt +endefcmd + +defcmd dumpall "" "First line debugging" + set BTSYMARG 1 + set BTARGS 9 + pid R + -dumpcommon + -bta +endefcmd + +defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" + set BTSYMARG 1 + set BTARGS 9 + pid R + -dumpcommon + -btc +endefcmd + diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c new file mode 100644 index 000000000000..f024c0c4b8c4 --- /dev/null +++ b/kernel/debug/kdb/kdb_debugger.c @@ -0,0 +1,159 @@ +/* + * Created by: Jason Wessel + * + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
+ * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include "kdb_private.h" +#include "../debug_core.h" + +/* + * KDB interface to KGDB internals + */ +get_char_func kdb_poll_funcs[] = { + dbg_io_get_char, + NULL, +}; + +int kdb_stub(struct kgdb_state *ks) +{ + int error = 0; + kdb_bp_t *bp; + unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kdb_reason_t reason = KDB_REASON_OOPS; + kdb_dbtrap_t db_result = KDB_DB_NOBPT; + int i; + + if (KDB_STATE(REENTRY)) { + reason = KDB_REASON_SWITCH; + KDB_STATE_CLEAR(REENTRY); + addr = instruction_pointer(ks->linux_regs); + } + ks->pass_exception = 0; + if (atomic_read(&kgdb_setting_breakpoint)) + reason = KDB_REASON_KEYBOARD; + + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { + if ((bp->bp_enabled) && (bp->bp_addr == addr)) { + reason = KDB_REASON_BREAK; + db_result = KDB_DB_BPT; + if (addr != instruction_pointer(ks->linux_regs)) + kgdb_arch_set_pc(ks->linux_regs, addr); + break; + } + } + if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) { + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { + if (bp->bp_free) + continue; + if (bp->bp_addr == addr) { + bp->bp_delay = 1; + bp->bp_delayed = 1; + /* + * SSBPT is set when the kernel debugger must single step a + * task in order to re-establish an instruction breakpoint + * which uses the instruction replacement mechanism. It is + * cleared by any action that removes the need to single-step + * the breakpoint. + */ + reason = KDB_REASON_BREAK; + db_result = KDB_DB_BPT; + KDB_STATE_SET(SSBPT); + break; + } + } + } + + if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 && + ks->signo == SIGTRAP) { + reason = KDB_REASON_SSTEP; + db_result = KDB_DB_BPT; + } + /* Set initial kdb state variables */ + KDB_STATE_CLEAR(KGDB_TRANS); + kdb_initial_cpu = ks->cpu; + kdb_current_task = kgdb_info[ks->cpu].task; + kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + /* Remove any breakpoints as needed by kdb and clear single step */ + kdb_bp_remove(); + KDB_STATE_CLEAR(DOING_SS); + KDB_STATE_CLEAR(DOING_SSB); + /* zero out any offline cpu data */ + for_each_present_cpu(i) { + if (!cpu_online(i)) { + kgdb_info[i].debuggerinfo = NULL; + kgdb_info[i].task = NULL; + } + } + if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) { + ks->pass_exception = 1; + KDB_FLAG_SET(CATASTROPHIC); + } + kdb_initial_cpu = ks->cpu; + if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { + KDB_STATE_CLEAR(SSBPT); + KDB_STATE_CLEAR(DOING_SS); + } else { + /* Start kdb main loop */ + error = kdb_main_loop(KDB_REASON_ENTER, reason, + ks->err_code, db_result, ks->linux_regs); + } + /* + * Upon exit from the kdb main loop setup break points and restart + * the system based on the requested continue state + */ + kdb_initial_cpu = -1; + kdb_current_task = NULL; + kdb_current_regs = NULL; + kdbnearsym_cleanup(); + if (error == KDB_CMD_KGDB) { + if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { + /* + * This inteface glue which allows kdb to transition in into + * the gdb stub. In order to do this the '?' or '' gdb serial + * packet response is processed here. And then control is + * passed to the gdbstub. 
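+ *
+ * A sketch of the handshake, assuming a gdb client sits on the
+ * other end of the I/O driver: typing the serial packet "$?#3f"
+ * at the kdb prompt makes kdb_read() substitute the "kgdb"
+ * command and set DOING_KGDB; the code below then answers the
+ * still-pending '?' packet via gdbstub_state(ks, "?") before
+ * handing the event loop over to the gdbstub proper.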
+ */ + if (KDB_STATE(DOING_KGDB)) + gdbstub_state(ks, "?"); + else + gdbstub_state(ks, ""); + KDB_STATE_CLEAR(DOING_KGDB); + KDB_STATE_CLEAR(DOING_KGDB2); + } + return DBG_PASS_EVENT; + } + kdb_bp_install(ks->linux_regs); + dbg_activate_sw_breakpoints(); + /* Set the exit state to a single step or a continue */ + if (KDB_STATE(DOING_SS)) + gdbstub_state(ks, "s"); + else + gdbstub_state(ks, "c"); + + KDB_FLAG_CLEAR(CATASTROPHIC); + + /* Invoke arch specific exception handling prior to system resume */ + kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e"); + if (ks->pass_exception) + kgdb_info[ks->cpu].ret_state = 1; + if (error == KDB_CMD_CPU) { + KDB_STATE_SET(REENTRY); + /* + * Force clear the single step bit because kdb emulates this + * differently vs the gdbstub + */ + kgdb_single_step = 0; + dbg_deactivate_sw_breakpoints(); + return DBG_SWITCH_CPU_EVENT; + } + return kgdb_info[ks->cpu].ret_state; +} + diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c new file mode 100644 index 000000000000..9e3cec7a925c --- /dev/null +++ b/kernel/debug/kdb/kdb_io.c @@ -0,0 +1,789 @@ +/* + * Kernel Debugger Architecture Independent Console I/O handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + +#define CMD_BUFLEN 256 +char kdb_prompt_str[CMD_BUFLEN]; + + +static void kgdb_transition_check(char *buffer) +{ + int slen = strlen(buffer); + if (strncmp(buffer, "$?#3f", slen) != 0 && + strncmp(buffer, "$qSupported#37", slen) != 0 && + strncmp(buffer, "+$qSupported#37", slen) != 0) { + KDB_STATE_SET(KGDB_TRANS); + kdb_printf("%s", buffer); + } +} + +static int kdb_read_get_key(char *buffer, size_t bufsize) +{ +#define ESCAPE_UDELAY 1000 +#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ + char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ + char *ped = escape_data; + int escape_delay = 0; + get_char_func *f, *f_escape = NULL; + int key; + + for (f = &kdb_poll_funcs[0]; ; ++f) { + if (*f == NULL) { + /* Reset NMI watchdog once per poll loop */ + touch_nmi_watchdog(); + f = &kdb_poll_funcs[0]; + } + if (escape_delay == 2) { + *ped = '\0'; + ped = escape_data; + --escape_delay; + } + if (escape_delay == 1) { + key = *ped++; + if (!*ped) + --escape_delay; + break; + } + key = (*f)(); + if (key == -1) { + if (escape_delay) { + udelay(ESCAPE_UDELAY); + --escape_delay; + } + continue; + } + if (bufsize <= 2) { + if (key == '\r') + key = '\n'; + *buffer++ = key; + *buffer = '\0'; + return -1; + } + if (escape_delay == 0 && key == '\e') { + escape_delay = ESCAPE_DELAY; + ped = escape_data; + f_escape = f; + } + if (escape_delay) { + *ped++ = key; + if (f_escape != f) { + escape_delay = 2; + continue; + } + if (ped - escape_data == 1) { + /* \e */ + continue; + } else if (ped - escape_data == 2) { + /* \e */ + if (key != '[') + escape_delay = 2; + continue; + } else if (ped - escape_data == 3) { + /* \e[ */ + int mapkey = 0; + switch (key) { + case 'A': /* \e[A, up arrow */ + mapkey = 16; + break; + case 'B': /* \e[B, down arrow */ + mapkey = 14; + break; + case 'C': /* \e[C, right arrow */ + mapkey = 6; + 
break; + case 'D': /* \e[D, left arrow */ + mapkey = 2; + break; + case '1': /* dropthrough */ + case '3': /* dropthrough */ + /* \e[<1,3,4>], may be home, del, end */ + case '4': + mapkey = -1; + break; + } + if (mapkey != -1) { + if (mapkey > 0) { + escape_data[0] = mapkey; + escape_data[1] = '\0'; + } + escape_delay = 2; + } + continue; + } else if (ped - escape_data == 4) { + /* \e[<1,3,4> */ + int mapkey = 0; + if (key == '~') { + switch (escape_data[2]) { + case '1': /* \e[1~, home */ + mapkey = 1; + break; + case '3': /* \e[3~, del */ + mapkey = 4; + break; + case '4': /* \e[4~, end */ + mapkey = 5; + break; + } + } + if (mapkey > 0) { + escape_data[0] = mapkey; + escape_data[1] = '\0'; + } + escape_delay = 2; + continue; + } + } + break; /* A key to process */ + } + return key; +} + +/* + * kdb_read + * + * This function reads a string of characters, terminated by + * a newline, or by reaching the end of the supplied buffer, + * from the current kernel debugger console device. + * Parameters: + * buffer - Address of character buffer to receive input characters. + * bufsize - size, in bytes, of the character buffer + * Returns: + * Returns a pointer to the buffer containing the received + * character string. This string will be terminated by a + * newline character. + * Locking: + * No locks are required to be held upon entry to this + * function. It is not reentrant - it relies on the fact + * that while kdb is running on only one "master debug" cpu. + * Remarks: + * + * The buffer size must be >= 2. A buffer size of 2 means that the caller only + * wants a single key. + * + * An escape key could be the start of a vt100 control sequence such as \e[D + * (left arrow) or it could be a character in its own right. The standard + * method for detecting the difference is to wait for 2 seconds to see if there + * are any other characters. kdb is complicated by the lack of a timer service + * (interrupts are off), by multiple input sources and by the need to sometimes + * return after just one key. Escape sequence processing has to be done as + * states in the polling loop. 
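+ *
+ * A worked example, assuming a vt100-style console: pressing the
+ * up arrow delivers the bytes \e, '[' and 'A' on successive polls.
+ * kdb_read_get_key() parks them in escape_data, recognizes the
+ * complete sequence and hands back the single mapped key 16
+ * (ctrl-P), which ultimately reaches handle_ctrl_cmd() and recalls
+ * the previous history entry.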
+ */ + +static char *kdb_read(char *buffer, size_t bufsize) +{ + char *cp = buffer; + char *bufend = buffer+bufsize-2; /* Reserve space for newline + * and null byte */ + char *lastchar; + char *p_tmp; + char tmp; + static char tmpbuffer[CMD_BUFLEN]; + int len = strlen(buffer); + int len_tmp; + int tab = 0; + int count; + int i; + int diag, dtab_count; + int key; + + + diag = kdbgetintenv("DTABCOUNT", &dtab_count); + if (diag) + dtab_count = 30; + + if (len > 0) { + cp += len; + if (*(buffer+len-1) == '\n') + cp--; + } + + lastchar = cp; + *cp = '\0'; + kdb_printf("%s", buffer); +poll_again: + key = kdb_read_get_key(buffer, bufsize); + if (key == -1) + return buffer; + if (key != 9) + tab = 0; + switch (key) { + case 8: /* backspace */ + if (cp > buffer) { + if (cp < lastchar) { + memcpy(tmpbuffer, cp, lastchar - cp); + memcpy(cp-1, tmpbuffer, lastchar - cp); + } + *(--lastchar) = '\0'; + --cp; + kdb_printf("\b%s \r", cp); + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } + break; + case 13: /* enter */ + *lastchar++ = '\n'; + *lastchar++ = '\0'; + kdb_printf("\n"); + return buffer; + case 4: /* Del */ + if (cp < lastchar) { + memcpy(tmpbuffer, cp+1, lastchar - cp - 1); + memcpy(cp, tmpbuffer, lastchar - cp - 1); + *(--lastchar) = '\0'; + kdb_printf("%s \r", cp); + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } + break; + case 1: /* Home */ + if (cp > buffer) { + kdb_printf("\r"); + kdb_printf(kdb_prompt_str); + cp = buffer; + } + break; + case 5: /* End */ + if (cp < lastchar) { + kdb_printf("%s", cp); + cp = lastchar; + } + break; + case 2: /* Left */ + if (cp > buffer) { + kdb_printf("\b"); + --cp; + } + break; + case 14: /* Down */ + memset(tmpbuffer, ' ', + strlen(kdb_prompt_str) + (lastchar-buffer)); + *(tmpbuffer+strlen(kdb_prompt_str) + + (lastchar-buffer)) = '\0'; + kdb_printf("\r%s\r", tmpbuffer); + *lastchar = (char)key; + *(lastchar+1) = '\0'; + return lastchar; + case 6: /* Right */ + if (cp < lastchar) { + kdb_printf("%c", *cp); + ++cp; + } + break; + case 16: /* Up */ + memset(tmpbuffer, ' ', + strlen(kdb_prompt_str) + (lastchar-buffer)); + *(tmpbuffer+strlen(kdb_prompt_str) + + (lastchar-buffer)) = '\0'; + kdb_printf("\r%s\r", tmpbuffer); + *lastchar = (char)key; + *(lastchar+1) = '\0'; + return lastchar; + case 9: /* Tab */ + if (tab < 2) + ++tab; + p_tmp = buffer; + while (*p_tmp == ' ') + p_tmp++; + if (p_tmp > cp) + break; + memcpy(tmpbuffer, p_tmp, cp-p_tmp); + *(tmpbuffer + (cp-p_tmp)) = '\0'; + p_tmp = strrchr(tmpbuffer, ' '); + if (p_tmp) + ++p_tmp; + else + p_tmp = tmpbuffer; + len = strlen(p_tmp); + count = kallsyms_symbol_complete(p_tmp, + sizeof(tmpbuffer) - + (p_tmp - tmpbuffer)); + if (tab == 2 && count > 0) { + kdb_printf("\n%d symbols are found.", count); + if (count > dtab_count) { + count = dtab_count; + kdb_printf(" But only first %d symbols will" + " be printed.\nYou can change the" + " environment variable DTABCOUNT.", + count); + } + kdb_printf("\n"); + for (i = 0; i < count; i++) { + if (kallsyms_symbol_next(p_tmp, i) < 0) + break; + kdb_printf("%s ", p_tmp); + *(p_tmp + len) = '\0'; + } + if (i >= dtab_count) + kdb_printf("..."); + kdb_printf("\n"); + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + } else if (tab != 2 && count > 0) { + len_tmp = strlen(p_tmp); + strncpy(p_tmp+len_tmp, cp, lastchar-cp+1); + len_tmp = strlen(p_tmp); + strncpy(cp, p_tmp+len, len_tmp-len + 1); + len = len_tmp - len; + kdb_printf("%s", cp); + cp += len; + 
lastchar += len; + } + kdb_nextline = 1; /* reset output line number */ + break; + default: + if (key >= 32 && lastchar < bufend) { + if (cp < lastchar) { + memcpy(tmpbuffer, cp, lastchar - cp); + memcpy(cp+1, tmpbuffer, lastchar - cp); + *++lastchar = '\0'; + *cp = key; + kdb_printf("%s\r", cp); + ++cp; + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } else { + *++lastchar = '\0'; + *cp++ = key; + /* The kgdb transition check will hide + * printed characters if we think that + * kgdb is connecting, until the check + * fails */ + if (!KDB_STATE(KGDB_TRANS)) + kgdb_transition_check(buffer); + else + kdb_printf("%c", key); + } + /* Special escape to kgdb */ + if (lastchar - buffer >= 5 && + strcmp(lastchar - 5, "$?#3f") == 0) { + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB); + return buffer; + } + if (lastchar - buffer >= 14 && + strcmp(lastchar - 14, "$qSupported#37") == 0) { + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB2); + return buffer; + } + } + break; + } + goto poll_again; +} + +/* + * kdb_getstr + * + * Print the prompt string and read a command from the + * input device. + * + * Parameters: + * buffer Address of buffer to receive command + * bufsize Size of buffer in bytes + * prompt Pointer to string to use as prompt string + * Returns: + * Pointer to command buffer. + * Locking: + * None. + * Remarks: + * For SMP kernels, the processor number will be + * substituted for %d, %x or %o in the prompt. + */ + +char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) +{ + if (prompt && kdb_prompt_str != prompt) + strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + kdb_printf(kdb_prompt_str); + kdb_nextline = 1; /* Prompt and input resets line number */ + return kdb_read(buffer, bufsize); +} + +/* + * kdb_input_flush + * + * Get rid of any buffered console input. + * + * Parameters: + * none + * Returns: + * nothing + * Locking: + * none + * Remarks: + * Call this function whenever you want to flush input. If there is any + * outstanding input, it ignores all characters until there has been no + * data for approximately 1ms. + */ + +static void kdb_input_flush(void) +{ + get_char_func *f; + int res; + int flush_delay = 1; + while (flush_delay) { + flush_delay--; +empty: + touch_nmi_watchdog(); + for (f = &kdb_poll_funcs[0]; *f; ++f) { + res = (*f)(); + if (res != -1) { + flush_delay = 1; + goto empty; + } + } + if (flush_delay) + mdelay(1); + } +} + +/* + * kdb_printf + * + * Print a string to the output device(s). + * + * Parameters: + * printf-like format and optional args. + * Returns: + * 0 + * Locking: + * None. + * Remarks: + * use 'kdbcons->write()' to avoid polluting 'log_buf' with + * kdb output. + * + * If the user is doing a cmd args | grep srch + * then kdb_grepping_flag is set. + * In that case we need to accumulate full lines (ending in \n) before + * searching for the pattern. 
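+ *
+ * Example (illustrative): after
+ *
+ *	kdb> ps | grep bash
+ *
+ * kdb_parse() has set kdb_grepping_flag and copied "bash" into
+ * kdb_grep_string, so each buffered line that does not contain
+ * the pattern is dropped before it reaches the consoles.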
+ */ + +static char kdb_buffer[256]; /* A bit too big to go on stack */ +static char *next_avail = kdb_buffer; +static int size_avail; +static int suspend_grep; + +/* + * search arg1 to see if it contains arg2 + * (kdmain.c provides flags for ^pat and pat$) + * + * return 1 for found, 0 for not found + */ +static int kdb_search_string(char *searched, char *searchfor) +{ + char firstchar, *cp; + int len1, len2; + + /* not counting the newline at the end of "searched" */ + len1 = strlen(searched)-1; + len2 = strlen(searchfor); + if (len1 < len2) + return 0; + if (kdb_grep_leading && kdb_grep_trailing && len1 != len2) + return 0; + if (kdb_grep_leading) { + if (!strncmp(searched, searchfor, len2)) + return 1; + } else if (kdb_grep_trailing) { + if (!strncmp(searched+len1-len2, searchfor, len2)) + return 1; + } else { + firstchar = *searchfor; + cp = searched; + while ((cp = strchr(cp, firstchar))) { + if (!strncmp(cp, searchfor, len2)) + return 1; + cp++; + } + } + return 0; +} + +int kdb_printf(const char *fmt, ...) +{ + va_list ap; + int diag; + int linecount; + int logging, saved_loglevel = 0; + int got_printf_lock = 0; + int retlen = 0; + int fnd, len; + char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; + char *moreprompt = "more> "; + struct console *c = console_drivers; + static DEFINE_SPINLOCK(kdb_printf_lock); + unsigned long uninitialized_var(flags); + + preempt_disable(); + /* Serialize kdb_printf if multiple cpus try to write at once. + * But if any cpu goes recursive in kdb, just print the output, + * even if it is interleaved with any other text. + */ + if (!KDB_STATE(PRINTF_LOCK)) { + KDB_STATE_SET(PRINTF_LOCK); + spin_lock_irqsave(&kdb_printf_lock, flags); + got_printf_lock = 1; + atomic_inc(&kdb_event); + } else { + __acquire(kdb_printf_lock); + } + + diag = kdbgetintenv("LINES", &linecount); + if (diag || linecount <= 1) + linecount = 24; + + diag = kdbgetintenv("LOGGING", &logging); + if (diag) + logging = 0; + + if (!kdb_grepping_flag || suspend_grep) { + /* normally, every vsnprintf starts a new buffer */ + next_avail = kdb_buffer; + size_avail = sizeof(kdb_buffer); + } + va_start(ap, fmt); + vsnprintf(next_avail, size_avail, fmt, ap); + va_end(ap); + + /* + * If kdb_parse() found that the command was cmd xxx | grep yyy + * then kdb_grepping_flag is set, and kdb_grep_string contains yyy + * + * Accumulate the print data up to a newline before searching it. + * (vsnprintf does null-terminate the string that it generates) + */ + + /* skip the search if prints are temporarily unconditional */ + if (!suspend_grep && kdb_grepping_flag) { + cp = strchr(kdb_buffer, '\n'); + if (!cp) { + /* + * Special cases that don't end with newlines + * but should be written without one: + * The "[nn]kdb> " prompt should + * appear at the front of the buffer. + * + * The "[nn]more " prompt should also be + * (MOREPROMPT -> moreprompt) + * written * but we print that ourselves, + * we set the suspend_grep flag to make + * it unconditional. + * + */ + if (next_avail == kdb_buffer) { + /* + * these should occur after a newline, + * so they will be at the front of the + * buffer + */ + cp2 = kdb_buffer; + len = strlen(kdb_prompt_str); + if (!strncmp(cp2, kdb_prompt_str, len)) { + /* + * We're about to start a new + * command, so we can go back + * to normal mode. 
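+ *
+ * (Seeing our own prompt at the start of the buffer is the
+ * signal that the piped command has finished; the grep filter
+ * must not leak into the next command's output.)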
+ */ + kdb_grepping_flag = 0; + goto kdb_printit; + } + } + /* no newline; don't search/write the buffer + until one is there */ + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + goto kdb_print_out; + } + + /* + * The newline is present; print through it or discard + * it, depending on the results of the search. + */ + cp++; /* to byte after the newline */ + replaced_byte = *cp; /* remember what/where it was */ + cphold = cp; + *cp = '\0'; /* end the string for our search */ + + /* + * We now have a newline at the end of the string + * Only continue with this output if it contains the + * search string. + */ + fnd = kdb_search_string(kdb_buffer, kdb_grep_string); + if (!fnd) { + /* + * At this point the complete line at the start + * of kdb_buffer can be discarded, as it does + * not contain what the user is looking for. + * Shift the buffer left. + */ + *cphold = replaced_byte; + strcpy(kdb_buffer, cphold); + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + goto kdb_print_out; + } + /* + * at this point the string is a full line and + * should be printed, up to the null. + */ + } +kdb_printit: + + /* + * Write to all consoles. + */ + retlen = strlen(kdb_buffer); + while (c) { + c->write(c, kdb_buffer, retlen); + touch_nmi_watchdog(); + c = c->next; + } + if (logging) { + saved_loglevel = console_loglevel; + console_loglevel = 0; + printk(KERN_INFO "%s", kdb_buffer); + } + + if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) + kdb_nextline++; + + /* check for having reached the LINES number of printed lines */ + if (kdb_nextline == linecount) { + char buf1[16] = ""; +#if defined(CONFIG_SMP) + char buf2[32]; +#endif + + /* Watch out for recursion here. Any routine that calls + * kdb_printf will come back through here. And kdb_read + * uses kdb_printf to echo on serial consoles ... + */ + kdb_nextline = 1; /* In case of recursion */ + + /* + * Pause until cr. + */ + moreprompt = kdbgetenv("MOREPROMPT"); + if (moreprompt == NULL) + moreprompt = "more> "; + +#if defined(CONFIG_SMP) + if (strchr(moreprompt, '%')) { + sprintf(buf2, moreprompt, get_cpu()); + put_cpu(); + moreprompt = buf2; + } +#endif + + kdb_input_flush(); + c = console_drivers; + + while (c) { + c->write(c, moreprompt, strlen(moreprompt)); + touch_nmi_watchdog(); + c = c->next; + } + + if (logging) + printk("%s", moreprompt); + + kdb_read(buf1, 2); /* '2' indicates to return + * immediately after getting one key. 
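+ * The key returned here drives the
+ * pager tests just below: q/Q aborts,
+ * <space> prints another page, <cr>
+ * advances a single line.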
*/ + kdb_nextline = 1; /* Really set output line 1 */ + + /* empty and reset the buffer: */ + kdb_buffer[0] = '\0'; + next_avail = kdb_buffer; + size_avail = sizeof(kdb_buffer); + if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { + /* user hit q or Q */ + KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ + KDB_STATE_CLEAR(PAGER); + /* end of command output; back to normal mode */ + kdb_grepping_flag = 0; + kdb_printf("\n"); + } else if (buf1[0] == ' ') { + kdb_printf("\n"); + suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] == '\n') { + kdb_nextline = linecount - 1; + kdb_printf("\r"); + suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] && buf1[0] != '\n') { + /* user hit something other than enter */ + suspend_grep = 1; /* for this recursion */ + kdb_printf("\nOnly 'q' or 'Q' are processed at more " + "prompt, input ignored\n"); + } else if (kdb_grepping_flag) { + /* user hit enter */ + suspend_grep = 1; /* for this recursion */ + kdb_printf("\n"); + } + kdb_input_flush(); + } + + /* + * For grep searches, shift the printed string left. + * replaced_byte contains the character that was overwritten with + * the terminating null, and cphold points to the null. + * Then adjust the notion of available space in the buffer. + */ + if (kdb_grepping_flag && !suspend_grep) { + *cphold = replaced_byte; + strcpy(kdb_buffer, cphold); + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + } + +kdb_print_out: + suspend_grep = 0; /* end of what may have been a recursive call */ + if (logging) + console_loglevel = saved_loglevel; + if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { + got_printf_lock = 0; + spin_unlock_irqrestore(&kdb_printf_lock, flags); + KDB_STATE_CLEAR(PRINTF_LOCK); + atomic_dec(&kdb_event); + } else { + __release(kdb_printf_lock); + } + preempt_enable(); + return retlen; +} diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c new file mode 100644 index 000000000000..64ef9ac14ba9 --- /dev/null +++ b/kernel/debug/kdb/kdb_main.c @@ -0,0 +1,2845 @@ +/* + * Kernel Debugger Architecture Independent Main Code + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (C) 2000 Stephane Eranian + * Xscale (R) modifications copyright (C) 2003 Intel Corporation. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + +#define GREP_LEN 256 +char kdb_grep_string[GREP_LEN]; +int kdb_grepping_flag; +EXPORT_SYMBOL(kdb_grepping_flag); +int kdb_grep_leading; +int kdb_grep_trailing; + +/* + * Kernel debugger state flags + */ +int kdb_flags; +atomic_t kdb_event; + +/* + * kdb_lock protects updates to kdb_initial_cpu. Used to + * single thread processors through the kernel debugger. 
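+ *
+ * The intended discipline, in outline: the first cpu into the
+ * debugger records itself in kdb_initial_cpu and owns the prompt;
+ * every other cpu spins in kdb_main_loop() with HOLD_CPU set until
+ * the owner types 'go' or switches cpus, and kdb_initial_cpu is
+ * reset to -1 on the way out (see kdb_stub).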
+ */ +int kdb_initial_cpu = -1; /* cpu number that owns kdb */ +int kdb_nextline = 1; +int kdb_state; /* General KDB state */ + +struct task_struct *kdb_current_task; +EXPORT_SYMBOL(kdb_current_task); +struct pt_regs *kdb_current_regs; + +const char *kdb_diemsg; +static int kdb_go_count; +#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC +static unsigned int kdb_continue_catastrophic = + CONFIG_KDB_CONTINUE_CATASTROPHIC; +#else +static unsigned int kdb_continue_catastrophic; +#endif + +/* kdb_commands describes the available commands. */ +static kdbtab_t *kdb_commands; +#define KDB_BASE_CMD_MAX 50 +static int kdb_max_commands = KDB_BASE_CMD_MAX; +static kdbtab_t kdb_base_commands[50]; +#define for_each_kdbcmd(cmd, num) \ + for ((cmd) = kdb_base_commands, (num) = 0; \ + num < kdb_max_commands; \ + num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) + +typedef struct _kdbmsg { + int km_diag; /* kdb diagnostic */ + char *km_msg; /* Corresponding message text */ +} kdbmsg_t; + +#define KDBMSG(msgnum, text) \ + { KDB_##msgnum, text } + +static kdbmsg_t kdbmsgs[] = { + KDBMSG(NOTFOUND, "Command Not Found"), + KDBMSG(ARGCOUNT, "Improper argument count, see usage."), + KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, " + "8 is only allowed on 64 bit systems"), + KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"), + KDBMSG(NOTENV, "Cannot find environment variable"), + KDBMSG(NOENVVALUE, "Environment variable should have value"), + KDBMSG(NOTIMP, "Command not implemented"), + KDBMSG(ENVFULL, "Environment full"), + KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), +#ifdef CONFIG_CPU_XSCALE + KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), +#else + KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"), +#endif + KDBMSG(DUPBPT, "Duplicate breakpoint address"), + KDBMSG(BPTNOTFOUND, "Breakpoint not found"), + KDBMSG(BADMODE, "Invalid IDMODE"), + KDBMSG(BADINT, "Illegal numeric value"), + KDBMSG(INVADDRFMT, "Invalid symbolic address format"), + KDBMSG(BADREG, "Invalid register name"), + KDBMSG(BADCPUNUM, "Invalid cpu number"), + KDBMSG(BADLENGTH, "Invalid length field"), + KDBMSG(NOBP, "No Breakpoint exists"), + KDBMSG(BADADDR, "Invalid address"), +}; +#undef KDBMSG + +static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); + + +/* + * Initial environment. This is all kept static and local to + * this file. We don't want to rely on the memory allocation + * mechanisms in the kernel, so we use a very limited allocate-only + * heap for new and altered environment variables. The entire + * environment is limited to a fixed number of entries (add more + * to __env[] if required) and a fixed amount of heap (add more to + * KDB_ENVBUFSIZE if required). 
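+ *
+ * For example (illustrative): every "set LINES=50" typed at the
+ * prompt burns a fresh "LINES=50" string out of the KDB_ENVBUFSIZE
+ * heap in kdballocenv(); nothing is ever freed, so a long session
+ * of set commands can eventually report KDB_ENVBUFFULL.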
+ */ + +static char *__env[] = { +#if defined(CONFIG_SMP) + "PROMPT=[%d]kdb> ", + "MOREPROMPT=[%d]more> ", +#else + "PROMPT=kdb> ", + "MOREPROMPT=more> ", +#endif + "RADIX=16", + "MDCOUNT=8", /* lines of md output */ + "BTARGS=9", /* 9 possible args in bt */ + KDB_PLATFORM_ENV, + "DTABCOUNT=30", + "NOSECT=1", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, +}; + +static const int __nenv = (sizeof(__env) / sizeof(char *)); + +struct task_struct *kdb_curr_task(int cpu) +{ + struct task_struct *p = curr_task(cpu); +#ifdef _TIF_MCA_INIT + if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu)) + p = krp->p; +#endif + return p; +} + +/* + * kdbgetenv - This function will return the character string value of + * an environment variable. + * Parameters: + * match A character string representing an environment variable. + * Returns: + * NULL No environment variable matches 'match' + * char* Pointer to string value of environment variable. + */ +char *kdbgetenv(const char *match) +{ + char **ep = __env; + int matchlen = strlen(match); + int i; + + for (i = 0; i < __nenv; i++) { + char *e = *ep++; + + if (!e) + continue; + + if ((strncmp(match, e, matchlen) == 0) + && ((e[matchlen] == '\0') + || (e[matchlen] == '='))) { + char *cp = strchr(e, '='); + return cp ? ++cp : ""; + } + } + return NULL; +} + +/* + * kdballocenv - This function is used to allocate bytes for + * environment entries. + * Parameters: + * match A character string representing a numeric value + * Outputs: + * *value the unsigned long representation of the env variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + * Remarks: + * We use a static environment buffer (envbuffer) to hold the values + * of dynamically generated environment variables (see kdb_set). Buffer + * space once allocated is never free'd, so over time, the amount of space + * (currently 512 bytes) will be exhausted if env variables are changed + * frequently. + */ +static char *kdballocenv(size_t bytes) +{ +#define KDB_ENVBUFSIZE 512 + static char envbuffer[KDB_ENVBUFSIZE]; + static int envbufsize; + char *ep = NULL; + + if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { + ep = &envbuffer[envbufsize]; + envbufsize += bytes; + } + return ep; +} + +/* + * kdbgetulenv - This function will return the value of an unsigned + * long-valued environment variable. + * Parameters: + * match A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of the env variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + */ +static int kdbgetulenv(const char *match, unsigned long *value) +{ + char *ep; + + ep = kdbgetenv(match); + if (!ep) + return KDB_NOTENV; + if (strlen(ep) == 0) + return KDB_NOENVVALUE; + + *value = simple_strtoul(ep, NULL, 0); + + return 0; +} + +/* + * kdbgetintenv - This function will return the value of an + * integer-valued environment variable. + * Parameters: + * match A character string representing an integer-valued env variable + * Outputs: + * *value the integer representation of the environment variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. 
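+ * Example:
+ *	Typical use inside kdb itself (as in kdb_printf):
+ *
+ *		diag = kdbgetintenv("LINES", &linecount);
+ *		if (diag || linecount <= 1)
+ *			linecount = 24;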
+ */ +int kdbgetintenv(const char *match, int *value) +{ + unsigned long val; + int diag; + + diag = kdbgetulenv(match, &val); + if (!diag) + *value = (int) val; + return diag; +} + +/* + * kdbgetularg - This function will convert a numeric string into an + * unsigned long value. + * Parameters: + * arg A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of arg. + * Returns: + * Zero on success, a kdb diagnostic on failure. + */ +int kdbgetularg(const char *arg, unsigned long *value) +{ + char *endp; + unsigned long val; + + val = simple_strtoul(arg, &endp, 0); + + if (endp == arg) { + /* + * Try base 16, for us folks too lazy to type the + * leading 0x... + */ + val = simple_strtoul(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + +/* + * kdb_set - This function implements the 'set' command. Alter an + * existing environment variable or create a new one. + */ +int kdb_set(int argc, const char **argv) +{ + int i; + char *ep; + size_t varlen, vallen; + + /* + * we can be invoked two ways: + * set var=value argv[1]="var", argv[2]="value" + * set var = value argv[1]="var", argv[2]="=", argv[3]="value" + * - if the latter, shift 'em down. + */ + if (argc == 3) { + argv[2] = argv[3]; + argc--; + } + + if (argc != 2) + return KDB_ARGCOUNT; + + /* + * Check for internal variables + */ + if (strcmp(argv[1], "KDBDEBUG") == 0) { + unsigned int debugflags; + char *cp; + + debugflags = simple_strtoul(argv[2], &cp, 0); + if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) { + kdb_printf("kdb: illegal debug flags '%s'\n", + argv[2]); + return 0; + } + kdb_flags = (kdb_flags & + ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + | (debugflags << KDB_DEBUG_FLAG_SHIFT); + + return 0; + } + + /* + * Tokenizer squashed the '=' sign. argv[1] is variable + * name, argv[2] = value. + */ + varlen = strlen(argv[1]); + vallen = strlen(argv[2]); + ep = kdballocenv(varlen + vallen + 2); + if (ep == (char *)0) + return KDB_ENVBUFFULL; + + sprintf(ep, "%s=%s", argv[1], argv[2]); + + ep[varlen+vallen+1] = '\0'; + + for (i = 0; i < __nenv; i++) { + if (__env[i] + && ((strncmp(__env[i], argv[1], varlen) == 0) + && ((__env[i][varlen] == '\0') + || (__env[i][varlen] == '=')))) { + __env[i] = ep; + return 0; + } + } + + /* + * Wasn't existing variable. Fit into slot. + */ + for (i = 0; i < __nenv-1; i++) { + if (__env[i] == (char *)0) { + __env[i] = ep; + return 0; + } + } + + return KDB_ENVFULL; +} + +static int kdb_check_regs(void) +{ + if (!kdb_current_regs) { + kdb_printf("No current kdb registers." + " You may need to select another task\n"); + return KDB_BADREG; + } + return 0; +} + +/* + * kdbgetaddrarg - This function is responsible for parsing an + * address-expression and returning the value of the expression, + * symbol name, and offset to the caller. + * + * The argument may consist of a numeric value (decimal or + * hexidecimal), a symbol name, a register name (preceeded by the + * percent sign), an environment variable with a numeric value + * (preceeded by a dollar sign) or a simple arithmetic expression + * consisting of a symbol name, +/-, and a numeric constant value + * (offset). 
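+ *
+ *	Examples of expressions this accepts (symbol names are
+ *	illustrative):
+ *
+ *		schedule		plain symbol
+ *		schedule+0x10		symbol plus constant offset
+ *		0xc0254010		raw address
+ *		$BTARGS			numeric environment variable
+ *
+ *	A %register form is parsed but currently returns KDB_NOTIMP.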
+ * Parameters: + * argc - count of arguments in argv + * argv - argument vector + * *nextarg - index to next unparsed argument in argv[] + * regs - Register state at time of KDB entry + * Outputs: + * *value - receives the value of the address-expression + * *offset - receives the offset specified, if any + * *name - receives the symbol name, if any + * *nextarg - index to next unparsed argument in argv[] + * Returns: + * zero is returned on success, a kdb diagnostic code is + * returned on error. + */ +int kdbgetaddrarg(int argc, const char **argv, int *nextarg, + unsigned long *value, long *offset, + char **name) +{ + unsigned long addr; + unsigned long off = 0; + int positive; + int diag; + int found = 0; + char *symname; + char symbol = '\0'; + char *cp; + kdb_symtab_t symtab; + + /* + * Process arguments which follow the following syntax: + * + * symbol | numeric-address [+/- numeric-offset] + * %register + * $environment-variable + */ + + if (*nextarg > argc) + return KDB_ARGCOUNT; + + symname = (char *)argv[*nextarg]; + + /* + * If there is no whitespace between the symbol + * or address and the '+' or '-' symbols, we + * remember the character and replace it with a + * null so the symbol/value can be properly parsed + */ + cp = strpbrk(symname, "+-"); + if (cp != NULL) { + symbol = *cp; + *cp++ = '\0'; + } + + if (symname[0] == '$') { + diag = kdbgetulenv(&symname[1], &addr); + if (diag) + return diag; + } else if (symname[0] == '%') { + diag = kdb_check_regs(); + if (diag) + return diag; + /* Implement register values with % at a later time as it is + * arch optional. + */ + return KDB_NOTIMP; + } else { + found = kdbgetsymval(symname, &symtab); + if (found) { + addr = symtab.sym_start; + } else { + diag = kdbgetularg(argv[*nextarg], &addr); + if (diag) + return diag; + } + } + + if (!found) + found = kdbnearsym(addr, &symtab); + + (*nextarg)++; + + if (name) + *name = symname; + if (value) + *value = addr; + if (offset && name && *name) + *offset = addr - symtab.sym_start; + + if ((*nextarg > argc) + && (symbol == '\0')) + return 0; + + /* + * check for +/- and offset + */ + + if (symbol == '\0') { + if ((argv[*nextarg][0] != '+') + && (argv[*nextarg][0] != '-')) { + /* + * Not our argument. Return. + */ + return 0; + } else { + positive = (argv[*nextarg][0] == '+'); + (*nextarg)++; + } + } else + positive = (symbol == '+'); + + /* + * Now there must be an offset! + */ + if ((*nextarg > argc) + && (symbol == '\0')) { + return KDB_INVADDRFMT; + } + + if (!symbol) { + cp = (char *)argv[*nextarg]; + (*nextarg)++; + } + + diag = kdbgetularg(cp, &off); + if (diag) + return diag; + + if (!positive) + off = -off; + + if (offset) + *offset += off; + + if (value) + *value += off; + + return 0; +} + +static void kdb_cmderror(int diag) +{ + int i; + + if (diag >= 0) { + kdb_printf("no error detected (diagnostic is %d)\n", diag); + return; + } + + for (i = 0; i < __nkdb_err; i++) { + if (kdbmsgs[i].km_diag == diag) { + kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg); + return; + } + } + + kdb_printf("Unknown diag %d\n", -diag); +} + +/* + * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd' + * command which defines one command as a set of other commands, + * terminated by endefcmd. kdb_defcmd processes the initial + * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for + * the following commands until 'endefcmd'. 
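+ *
+ *	For example (mirroring the kdb_cmds file above):
+ *
+ *		kdb> defcmd diag "" "First pass diagnostics"
+ *		[defcmd]kdb> summary
+ *		[defcmd]kdb> ps
+ *		[defcmd]kdb> endefcmd
+ *
+ *	after which typing 'diag' replays both saved lines through
+ *	kdb_exec_defcmd/kdb_parse.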
+ * Inputs: + * argc argument count + * argv argument vector + * Returns: + * zero for success, a kdb diagnostic if error + */ +struct defcmd_set { + int count; + int usable; + char *name; + char *usage; + char *help; + char **command; +}; +static struct defcmd_set *defcmd_set; +static int defcmd_set_count; +static int defcmd_in_progress; + +/* Forward references */ +static int kdb_exec_defcmd(int argc, const char **argv); + +static int kdb_defcmd2(const char *cmdstr, const char *argv0) +{ + struct defcmd_set *s = defcmd_set + defcmd_set_count - 1; + char **save_command = s->command; + if (strcmp(argv0, "endefcmd") == 0) { + defcmd_in_progress = 0; + if (!s->count) + s->usable = 0; + if (s->usable) + kdb_register(s->name, kdb_exec_defcmd, + s->usage, s->help, 0); + return 0; + } + if (!s->usable) + return KDB_NOTIMP; + s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); + if (!s->command) { + kdb_printf("Could not allocate new kdb_defcmd table for %s\n", + cmdstr); + s->usable = 0; + return KDB_NOTIMP; + } + memcpy(s->command, save_command, s->count * sizeof(*(s->command))); + s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB); + kfree(save_command); + return 0; +} + +static int kdb_defcmd(int argc, const char **argv) +{ + struct defcmd_set *save_defcmd_set = defcmd_set, *s; + if (defcmd_in_progress) { + kdb_printf("kdb: nested defcmd detected, assuming missing " + "endefcmd\n"); + kdb_defcmd2("endefcmd", "endefcmd"); + } + if (argc == 0) { + int i; + for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) { + kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, + s->usage, s->help); + for (i = 0; i < s->count; ++i) + kdb_printf("%s", s->command[i]); + kdb_printf("endefcmd\n"); + } + return 0; + } + if (argc != 3) + return KDB_ARGCOUNT; + defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), + GFP_KDB); + if (!defcmd_set) { + kdb_printf("Could not allocate new defcmd_set entry for %s\n", + argv[1]); + defcmd_set = save_defcmd_set; + return KDB_NOTIMP; + } + memcpy(defcmd_set, save_defcmd_set, + defcmd_set_count * sizeof(*defcmd_set)); + kfree(save_defcmd_set); + s = defcmd_set + defcmd_set_count; + memset(s, 0, sizeof(*s)); + s->usable = 1; + s->name = kdb_strdup(argv[1], GFP_KDB); + s->usage = kdb_strdup(argv[2], GFP_KDB); + s->help = kdb_strdup(argv[3], GFP_KDB); + if (s->usage[0] == '"') { + strcpy(s->usage, s->usage+1); + s->usage[strlen(s->usage)-1] = '\0'; + } + if (s->help[0] == '"') { + strcpy(s->help, s->help+1); + s->help[strlen(s->help)-1] = '\0'; + } + ++defcmd_set_count; + defcmd_in_progress = 1; + return 0; +} + +/* + * kdb_exec_defcmd - Execute the set of commands associated with this + * defcmd name. 
+ * Inputs: + * argc argument count + * argv argument vector + * Returns: + * zero for success, a kdb diagnostic if error + */ +static int kdb_exec_defcmd(int argc, const char **argv) +{ + int i, ret; + struct defcmd_set *s; + if (argc != 0) + return KDB_ARGCOUNT; + for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) { + if (strcmp(s->name, argv[0]) == 0) + break; + } + if (i == defcmd_set_count) { + kdb_printf("kdb_exec_defcmd: could not find commands for %s\n", + argv[0]); + return KDB_NOTIMP; + } + for (i = 0; i < s->count; ++i) { + /* Recursive use of kdb_parse, do not use argv after + * this point */ + argv = NULL; + kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]); + ret = kdb_parse(s->command[i]); + if (ret) + return ret; + } + return 0; +} + +/* Command history */ +#define KDB_CMD_HISTORY_COUNT 32 +#define CMD_BUFLEN 200 /* kdb_printf: max printline + * size == 256 */ +static unsigned int cmd_head, cmd_tail; +static unsigned int cmdptr; +static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN]; +static char cmd_cur[CMD_BUFLEN]; + +/* + * The "str" argument may point to something like | grep xyz + */ +static void parse_grep(const char *str) +{ + int len; + char *cp = (char *)str, *cp2; + + /* sanity check: we should have been called with the \ first */ + if (*cp != '|') + return; + cp++; + while (isspace(*cp)) + cp++; + if (strncmp(cp, "grep ", 5)) { + kdb_printf("invalid 'pipe', see grephelp\n"); + return; + } + cp += 5; + while (isspace(*cp)) + cp++; + cp2 = strchr(cp, '\n'); + if (cp2) + *cp2 = '\0'; /* remove the trailing newline */ + len = strlen(cp); + if (len == 0) { + kdb_printf("invalid 'pipe', see grephelp\n"); + return; + } + /* now cp points to a nonzero length search string */ + if (*cp == '"') { + /* allow it be "x y z" by removing the "'s - there must + be two of them */ + cp++; + cp2 = strchr(cp, '"'); + if (!cp2) { + kdb_printf("invalid quoted string, see grephelp\n"); + return; + } + *cp2 = '\0'; /* end the string where the 2nd " was */ + } + kdb_grep_leading = 0; + if (*cp == '^') { + kdb_grep_leading = 1; + cp++; + } + len = strlen(cp); + kdb_grep_trailing = 0; + if (*(cp+len-1) == '$') { + kdb_grep_trailing = 1; + *(cp+len-1) = '\0'; + } + len = strlen(cp); + if (!len) + return; + if (len >= GREP_LEN) { + kdb_printf("search string too long\n"); + return; + } + strcpy(kdb_grep_string, cp); + kdb_grepping_flag++; + return; +} + +/* + * kdb_parse - Parse the command line, search the command table for a + * matching command and invoke the command function. This + * function may be called recursively, if it is, the second call + * will overwrite argv and cbuf. It is the caller's + * responsibility to save their argv if they recursively call + * kdb_parse(). + * Parameters: + * cmdstr The input command line to be parsed. + * regs The registers at the time kdb was entered. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Remarks: + * Limited to 20 tokens. + * + * Real rudimentary tokenization. Basically only whitespace + * is considered a token delimeter (but special consideration + * is taken of the '=' sign as used by the 'set' command). + * + * The algorithm used to tokenize the input string relies on + * there being at least one whitespace (or otherwise useless) + * character between tokens as the character immediately following + * the token is altered in-place to a null-byte to terminate the + * token string. 
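+ *
+ *	For example, "md4c16 schedule+0x8" tokenizes in place to
+ *	argv[0]="md4c16", argv[1]="schedule+0x8" (the '+' is left for
+ *	kdbgetaddrarg to handle), while "set LINES=50" becomes
+ *	argv[0]="set", argv[1]="LINES", argv[2]="50" because an
+ *	unquoted '=' also ends a token.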
+ */ + +#define MAXARGC 20 + +int kdb_parse(const char *cmdstr) +{ + static char *argv[MAXARGC]; + static int argc; + static char cbuf[CMD_BUFLEN+2]; + char *cp; + char *cpp, quoted; + kdbtab_t *tp; + int i, escaped, ignore_errors = 0, check_grep; + + /* + * First tokenize the command string. + */ + cp = (char *)cmdstr; + kdb_grepping_flag = check_grep = 0; + + if (KDB_FLAG(CMD_INTERRUPT)) { + /* Previous command was interrupted, newline must not + * repeat the command */ + KDB_FLAG_CLEAR(CMD_INTERRUPT); + KDB_STATE_SET(PAGER); + argc = 0; /* no repeat */ + } + + if (*cp != '\n' && *cp != '\0') { + argc = 0; + cpp = cbuf; + while (*cp) { + /* skip whitespace */ + while (isspace(*cp)) + cp++; + if ((*cp == '\0') || (*cp == '\n') || + (*cp == '#' && !defcmd_in_progress)) + break; + /* special case: check for | grep pattern */ + if (*cp == '|') { + check_grep++; + break; + } + if (cpp >= cbuf + CMD_BUFLEN) { + kdb_printf("kdb_parse: command buffer " + "overflow, command ignored\n%s\n", + cmdstr); + return KDB_NOTFOUND; + } + if (argc >= MAXARGC - 1) { + kdb_printf("kdb_parse: too many arguments, " + "command ignored\n%s\n", cmdstr); + return KDB_NOTFOUND; + } + argv[argc++] = cpp; + escaped = 0; + quoted = '\0'; + /* Copy to next unquoted and unescaped + * whitespace or '=' */ + while (*cp && *cp != '\n' && + (escaped || quoted || !isspace(*cp))) { + if (cpp >= cbuf + CMD_BUFLEN) + break; + if (escaped) { + escaped = 0; + *cpp++ = *cp++; + continue; + } + if (*cp == '\\') { + escaped = 1; + ++cp; + continue; + } + if (*cp == quoted) + quoted = '\0'; + else if (*cp == '\'' || *cp == '"') + quoted = *cp; + *cpp = *cp++; + if (*cpp == '=' && !quoted) + break; + ++cpp; + } + *cpp++ = '\0'; /* Squash a ws or '=' character */ + } + } + if (!argc) + return 0; + if (check_grep) + parse_grep(cp); + if (defcmd_in_progress) { + int result = kdb_defcmd2(cmdstr, argv[0]); + if (!defcmd_in_progress) { + argc = 0; /* avoid repeat on endefcmd */ + *(argv[0]) = '\0'; + } + return result; + } + if (argv[0][0] == '-' && argv[0][1] && + (argv[0][1] < '0' || argv[0][1] > '9')) { + ignore_errors = 1; + ++argv[0]; + } + + for_each_kdbcmd(tp, i) { + if (tp->cmd_name) { + /* + * If this command is allowed to be abbreviated, + * check to see if this is it. + */ + + if (tp->cmd_minlen + && (strlen(argv[0]) <= tp->cmd_minlen)) { + if (strncmp(argv[0], + tp->cmd_name, + tp->cmd_minlen) == 0) { + break; + } + } + + if (strcmp(argv[0], tp->cmd_name) == 0) + break; + } + } + + /* + * If we don't find a command by this name, see if the first + * few characters of this match any of the known commands. + * e.g., md1c20 should match md. + */ + if (i == kdb_max_commands) { + for_each_kdbcmd(tp, i) { + if (tp->cmd_name) { + if (strncmp(argv[0], + tp->cmd_name, + strlen(tp->cmd_name)) == 0) { + break; + } + } + } + } + + if (i < kdb_max_commands) { + int result; + KDB_STATE_SET(CMD); + result = (*tp->cmd_func)(argc-1, (const char **)argv); + if (result && ignore_errors && result > KDB_CMD_GO) + result = 0; + KDB_STATE_CLEAR(CMD); + switch (tp->cmd_repeat) { + case KDB_REPEAT_NONE: + argc = 0; + if (argv[0]) + *(argv[0]) = '\0'; + break; + case KDB_REPEAT_NO_ARGS: + argc = 1; + if (argv[1]) + *(argv[1]) = '\0'; + break; + case KDB_REPEAT_WITH_ARGS: + break; + } + return result; + } + + /* + * If the input with which we were presented does not + * map to an existing command, attempt to parse it as an + * address argument and display the result. 
Useful for + * obtaining the address of a variable, or the nearest symbol + * to an address contained in a register. + */ + { + unsigned long value; + char *name = NULL; + long offset; + int nextarg = 0; + + if (kdbgetaddrarg(0, (const char **)argv, &nextarg, + &value, &offset, &name)) { + return KDB_NOTFOUND; + } + + kdb_printf("%s = ", argv[0]); + kdb_symbol_print(value, NULL, KDB_SP_DEFAULT); + kdb_printf("\n"); + return 0; + } +} + + +static int handle_ctrl_cmd(char *cmd) +{ +#define CTRL_P 16 +#define CTRL_N 14 + + /* initial situation */ + if (cmd_head == cmd_tail) + return 0; + switch (*cmd) { + case CTRL_P: + if (cmdptr != cmd_tail) + cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT; + strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + return 1; + case CTRL_N: + if (cmdptr != cmd_head) + cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT; + strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN); + return 1; + } + return 0; +} + +/* + * kdb_reboot - This function implements the 'reboot' command. Reboot + * the system immediately, or loop for ever on failure. + */ +static int kdb_reboot(int argc, const char **argv) +{ + emergency_restart(); + kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n"); + while (1) + cpu_relax(); + /* NOTREACHED */ + return 0; +} + +static void kdb_dumpregs(struct pt_regs *regs) +{ + int old_lvl = console_loglevel; + console_loglevel = 15; + show_regs(regs); + kdb_printf("\n"); + console_loglevel = old_lvl; +} + +void kdb_set_current_task(struct task_struct *p) +{ + kdb_current_task = p; + + if (kdb_task_has_cpu(p)) { + kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p)); + return; + } + kdb_current_regs = NULL; +} + +/* + * kdb_local - The main code for kdb. This routine is invoked on a + * specific processor, it is not global. The main kdb() routine + * ensures that only one processor at a time is in this routine. + * This code is called with the real reason code on the first + * entry to a kdb session, thereafter it is called with reason + * SWITCH, even if the user goes back to the original cpu. + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * regs The exception frame at time of fault/breakpoint. + * db_result Result code from the break or debug point. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + * KDB_CMD_GO User typed 'go'. + * KDB_CMD_CPU User switched to another cpu. + * KDB_CMD_SS Single step. + * KDB_CMD_SSB Single step until branch. + */ +static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, + kdb_dbtrap_t db_result) +{ + char *cmdbuf; + int diag; + struct task_struct *kdb_current = + kdb_curr_task(raw_smp_processor_id()); + + KDB_DEBUG_STATE("kdb_local 1", reason); + kdb_go_count = 0; + if (reason == KDB_REASON_DEBUG) { + /* special case below */ + } else { + kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", + kdb_current, kdb_current->pid); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", raw_smp_processor_id()); +#endif + } + + switch (reason) { + case KDB_REASON_DEBUG: + { + /* + * If re-entering kdb after a single step + * command, don't print the message. 
+ */ + switch (db_result) { + case KDB_DB_BPT: + kdb_printf("\nEntering kdb (0x%p, pid %d) ", + kdb_current, kdb_current->pid); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", raw_smp_processor_id()); +#endif + kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + break; + case KDB_DB_SSB: + /* + * In the midst of ssb command. Just return. + */ + KDB_DEBUG_STATE("kdb_local 3", reason); + return KDB_CMD_SSB; /* Continue with SSB command */ + + break; + case KDB_DB_SS: + break; + case KDB_DB_SSBPT: + KDB_DEBUG_STATE("kdb_local 4", reason); + return 1; /* kdba_db_trap did the work */ + default: + kdb_printf("kdb: Bad result from kdba_db_trap: %d\n", + db_result); + break; + } + + } + break; + case KDB_REASON_ENTER: + if (KDB_STATE(KEYBOARD)) + kdb_printf("due to Keyboard Entry\n"); + else + kdb_printf("due to KDB_ENTER()\n"); + break; + case KDB_REASON_KEYBOARD: + KDB_STATE_SET(KEYBOARD); + kdb_printf("due to Keyboard Entry\n"); + break; + case KDB_REASON_ENTER_SLAVE: + /* drop through, slaves only get released via cpu switch */ + case KDB_REASON_SWITCH: + kdb_printf("due to cpu switch\n"); + break; + case KDB_REASON_OOPS: + kdb_printf("Oops: %s\n", kdb_diemsg); + kdb_printf("due to oops @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + kdb_dumpregs(regs); + break; + case KDB_REASON_NMI: + kdb_printf("due to NonMaskable Interrupt @ " + kdb_machreg_fmt "\n", + instruction_pointer(regs)); + kdb_dumpregs(regs); + break; + case KDB_REASON_SSTEP: + case KDB_REASON_BREAK: + kdb_printf("due to %s @ " kdb_machreg_fmt "\n", + reason == KDB_REASON_BREAK ? + "Breakpoint" : "SS trap", instruction_pointer(regs)); + /* + * Determine if this breakpoint is one that we + * are interested in. + */ + if (db_result != KDB_DB_BPT) { + kdb_printf("kdb: error return from kdba_bp_trap: %d\n", + db_result); + KDB_DEBUG_STATE("kdb_local 6", reason); + return 0; /* Not for us, dismiss it */ + } + break; + case KDB_REASON_RECURSE: + kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + break; + default: + kdb_printf("kdb: unexpected reason code: %d\n", reason); + KDB_DEBUG_STATE("kdb_local 8", reason); + return 0; /* Not for us, dismiss it */ + } + + while (1) { + /* + * Initialize pager context. + */ + kdb_nextline = 1; + KDB_STATE_CLEAR(SUPPRESS); + + cmdbuf = cmd_cur; + *cmdbuf = '\0'; + *(cmd_hist[cmd_head]) = '\0'; + + if (KDB_FLAG(ONLY_DO_DUMP)) { + /* kdb is off but a catastrophic error requires a dump. + * Take the dump and reboot. + * Turn on logging so the kdb output appears in the log + * buffer in the dump. 
+ */ + const char *setargs[] = { "set", "LOGGING", "1" }; + kdb_set(2, setargs); + kdb_reboot(0, NULL); + /*NOTREACHED*/ + } + +do_full_getstr: +#if defined(CONFIG_SMP) + snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), + raw_smp_processor_id()); +#else + snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT")); +#endif + if (defcmd_in_progress) + strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); + + /* + * Fetch command from keyboard + */ + cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str); + if (*cmdbuf != '\n') { + if (*cmdbuf < 32) { + if (cmdptr == cmd_head) { + strncpy(cmd_hist[cmd_head], cmd_cur, + CMD_BUFLEN); + *(cmd_hist[cmd_head] + + strlen(cmd_hist[cmd_head])-1) = '\0'; + } + if (!handle_ctrl_cmd(cmdbuf)) + *(cmd_cur+strlen(cmd_cur)-1) = '\0'; + cmdbuf = cmd_cur; + goto do_full_getstr; + } else { + strncpy(cmd_hist[cmd_head], cmd_cur, + CMD_BUFLEN); + } + + cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT; + if (cmd_head == cmd_tail) + cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT; + } + + cmdptr = cmd_head; + diag = kdb_parse(cmdbuf); + if (diag == KDB_NOTFOUND) { + kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); + diag = 0; + } + if (diag == KDB_CMD_GO + || diag == KDB_CMD_CPU + || diag == KDB_CMD_SS + || diag == KDB_CMD_SSB + || diag == KDB_CMD_KGDB) + break; + + if (diag) + kdb_cmderror(diag); + } + KDB_DEBUG_STATE("kdb_local 9", diag); + return diag; +} + + +/* + * kdb_print_state - Print the state data for the current processor + * for debugging. + * Inputs: + * text Identifies the debug point + * value Any integer value to be printed, e.g. reason code. + */ +void kdb_print_state(const char *text, int value) +{ + kdb_printf("state: %s cpu %d value %d initial %d state %x\n", + text, raw_smp_processor_id(), value, kdb_initial_cpu, + kdb_state); +} + +/* + * kdb_main_loop - After initial setup and assignment of the + * controlling cpu, all cpus are in this loop. One cpu is in + * control and will issue the kdb prompt, the others will spin + * until 'go' or cpu switch. + * + * To get a consistent view of the kernel stacks for all + * processes, this routine is invoked from the main kdb code via + * an architecture specific routine. kdba_main_loop is + * responsible for making the kernel stacks consistent for all + * processes, there should be no difference between a blocked + * process and a running process as far as kdb is concerned. + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * reason2 kdb's current reason code. + * Initially error but can change + * acording to kdb state. + * db_result Result code from break or debug point. + * regs The exception frame at time of fault/breakpoint. + * should always be valid. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + */ +int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, + kdb_dbtrap_t db_result, struct pt_regs *regs) +{ + int result = 1; + /* Stay in kdb() until 'go', 'ss[b]' or an error */ + while (1) { + /* + * All processors except the one that is in control + * will spin here. + */ + KDB_DEBUG_STATE("kdb_main_loop 1", reason); + while (KDB_STATE(HOLD_CPU)) { + /* state KDB is turned off by kdb_cpu to see if the + * other cpus are still live, each cpu in this loop + * turns it back on. 
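+ *
+ * This doubles as a liveness probe: kdb_cpu clears KDB on a
+ * target cpu, and only a cpu still spinning here will set it
+ * again, proving the cpu is responsive.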
+ */ + if (!KDB_STATE(KDB)) + KDB_STATE_SET(KDB); + } + + KDB_STATE_CLEAR(SUPPRESS); + KDB_DEBUG_STATE("kdb_main_loop 2", reason); + if (KDB_STATE(LEAVING)) + break; /* Another cpu said 'go' */ + /* Still using kdb, this processor is in control */ + result = kdb_local(reason2, error, regs, db_result); + KDB_DEBUG_STATE("kdb_main_loop 3", result); + + if (result == KDB_CMD_CPU) + break; + + if (result == KDB_CMD_SS) { + KDB_STATE_SET(DOING_SS); + break; + } + + if (result == KDB_CMD_SSB) { + KDB_STATE_SET(DOING_SS); + KDB_STATE_SET(DOING_SSB); + break; + } + + if (result == KDB_CMD_KGDB) { + if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) + kdb_printf("Entering please attach debugger " + "or use $D#44+ or $3#33\n"); + break; + } + if (result && result != 1 && result != KDB_CMD_GO) + kdb_printf("\nUnexpected kdb_local return code %d\n", + result); + KDB_DEBUG_STATE("kdb_main_loop 4", reason); + break; + } + if (KDB_STATE(DOING_SS)) + KDB_STATE_CLEAR(SSBPT); + + return result; +} + +/* + * kdb_mdr - This function implements the guts of the 'mdr', memory + * read command. + * mdr , + * Inputs: + * addr Start address + * count Number of bytes + * Returns: + * Always 0. Any errors are detected and printed by kdb_getarea. + */ +static int kdb_mdr(unsigned long addr, unsigned int count) +{ + unsigned char c; + while (count--) { + if (kdb_getarea(c, addr)) + return 0; + kdb_printf("%02x", c); + addr++; + } + kdb_printf("\n"); + return 0; +} + +/* + * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4', + * 'md8' 'mdr' and 'mds' commands. + * + * md|mds [ [ []]] + * mdWcN [ [ []]] + * where W = is the width (1, 2, 4 or 8) and N is the count. + * for eg., md1c20 reads 20 bytes, 1 at a time. + * mdr , + */ +static void kdb_md_line(const char *fmtstr, unsigned long addr, + int symbolic, int nosect, int bytesperword, + int num, int repeat, int phys) +{ + /* print just one line of data */ + kdb_symtab_t symtab; + char cbuf[32]; + char *c = cbuf; + int i; + unsigned long word; + + memset(cbuf, '\0', sizeof(cbuf)); + if (phys) + kdb_printf("phys " kdb_machreg_fmt0 " ", addr); + else + kdb_printf(kdb_machreg_fmt0 " ", addr); + + for (i = 0; i < num && repeat--; i++) { + if (phys) { + if (kdb_getphysword(&word, addr, bytesperword)) + break; + } else if (kdb_getword(&word, addr, bytesperword)) + break; + kdb_printf(fmtstr, word); + if (symbolic) + kdbnearsym(word, &symtab); + else + memset(&symtab, 0, sizeof(symtab)); + if (symtab.sym_name) { + kdb_symbol_print(word, &symtab, 0); + if (!nosect) { + kdb_printf("\n"); + kdb_printf(" %s %s " + kdb_machreg_fmt " " + kdb_machreg_fmt " " + kdb_machreg_fmt, symtab.mod_name, + symtab.sec_name, symtab.sec_start, + symtab.sym_start, symtab.sym_end); + } + addr += bytesperword; + } else { + union { + u64 word; + unsigned char c[8]; + } wc; + unsigned char *cp; +#ifdef __BIG_ENDIAN + cp = wc.c + 8 - bytesperword; +#else + cp = wc.c; +#endif + wc.word = word; +#define printable_char(c) \ + ({unsigned char __c = c; isascii(__c) && isprint(__c) ? 
__c : '.'; }) + switch (bytesperword) { + case 8: + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + addr += 4; + case 4: + *c++ = printable_char(*cp++); + *c++ = printable_char(*cp++); + addr += 2; + case 2: + *c++ = printable_char(*cp++); + addr++; + case 1: + *c++ = printable_char(*cp++); + addr++; + break; + } +#undef printable_char + } + } + kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1), + " ", cbuf); +} + +static int kdb_md(int argc, const char **argv) +{ + static unsigned long last_addr; + static int last_radix, last_bytesperword, last_repeat; + int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat; + int nosect = 0; + char fmtchar, fmtstr[64]; + unsigned long addr; + unsigned long word; + long offset = 0; + int symbolic = 0; + int valid = 0; + int phys = 0; + + kdbgetintenv("MDCOUNT", &mdcount); + kdbgetintenv("RADIX", &radix); + kdbgetintenv("BYTESPERWORD", &bytesperword); + + /* Assume 'md ' and start with environment values */ + repeat = mdcount * 16 / bytesperword; + + if (strcmp(argv[0], "mdr") == 0) { + if (argc != 2) + return KDB_ARGCOUNT; + valid = 1; + } else if (isdigit(argv[0][2])) { + bytesperword = (int)(argv[0][2] - '0'); + if (bytesperword == 0) { + bytesperword = last_bytesperword; + if (bytesperword == 0) + bytesperword = 4; + } + last_bytesperword = bytesperword; + repeat = mdcount * 16 / bytesperword; + if (!argv[0][3]) + valid = 1; + else if (argv[0][3] == 'c' && argv[0][4]) { + char *p; + repeat = simple_strtoul(argv[0] + 4, &p, 10); + mdcount = ((repeat * bytesperword) + 15) / 16; + valid = !*p; + } + last_repeat = repeat; + } else if (strcmp(argv[0], "md") == 0) + valid = 1; + else if (strcmp(argv[0], "mds") == 0) + valid = 1; + else if (strcmp(argv[0], "mdp") == 0) { + phys = valid = 1; + } + if (!valid) + return KDB_NOTFOUND; + + if (argc == 0) { + if (last_addr == 0) + return KDB_ARGCOUNT; + addr = last_addr; + radix = last_radix; + bytesperword = last_bytesperword; + repeat = last_repeat; + mdcount = ((repeat * bytesperword) + 15) / 16; + } + + if (argc) { + unsigned long val; + int diag, nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, + &offset, NULL); + if (diag) + return diag; + if (argc > nextarg+2) + return KDB_ARGCOUNT; + + if (argc >= nextarg) { + diag = kdbgetularg(argv[nextarg], &val); + if (!diag) { + mdcount = (int) val; + repeat = mdcount * 16 / bytesperword; + } + } + if (argc >= nextarg+1) { + diag = kdbgetularg(argv[nextarg+1], &val); + if (!diag) + radix = (int) val; + } + } + + if (strcmp(argv[0], "mdr") == 0) + return kdb_mdr(addr, mdcount); + + switch (radix) { + case 10: + fmtchar = 'd'; + break; + case 16: + fmtchar = 'x'; + break; + case 8: + fmtchar = 'o'; + break; + default: + return KDB_BADRADIX; + } + + last_radix = radix; + + if (bytesperword > KDB_WORD_SIZE) + return KDB_BADWIDTH; + + switch (bytesperword) { + case 8: + sprintf(fmtstr, "%%16.16l%c ", fmtchar); + break; + case 4: + sprintf(fmtstr, "%%8.8l%c ", fmtchar); + break; + case 2: + sprintf(fmtstr, "%%4.4l%c ", fmtchar); + break; + case 1: + sprintf(fmtstr, "%%2.2l%c ", fmtchar); + break; + default: + return KDB_BADWIDTH; + } + + last_repeat = repeat; + last_bytesperword = bytesperword; + + if (strcmp(argv[0], "mds") == 0) { + symbolic = 1; + /* Do not save these changes as last_*, they are temporary mds + * overrides. 
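+ * For example, after 'md1c20' a later plain 'mds' still
+ * displays machine-sized words, one per line.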
+ */ + bytesperword = KDB_WORD_SIZE; + repeat = mdcount; + kdbgetintenv("NOSECT", &nosect); + } + + /* Round address down modulo BYTESPERWORD */ + + addr &= ~(bytesperword-1); + + while (repeat > 0) { + unsigned long a; + int n, z, num = (symbolic ? 1 : (16 / bytesperword)); + + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) { + if (phys) { + if (kdb_getphysword(&word, a, bytesperword) + || word) + break; + } else if (kdb_getword(&word, a, bytesperword) || word) + break; + } + n = min(num, repeat); + kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword, + num, repeat, phys); + addr += bytesperword * n; + repeat -= n; + z = (z + num - 1) / num; + if (z > 2) { + int s = num * (z-2); + kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0 + " zero suppressed\n", + addr, addr + bytesperword * s - 1); + addr += bytesperword * s; + repeat -= s; + } + } + last_addr = addr; + + return 0; +} + +/* + * kdb_mm - This function implements the 'mm' command. + * mm address-expression new-value + * Remarks: + * mm works on machine words, mmW works on bytes. + */ +static int kdb_mm(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset = 0; + unsigned long contents; + int nextarg; + int width; + + if (argv[0][2] && !isdigit(argv[0][2])) + return KDB_NOTFOUND; + + if (argc < 2) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + + if (nextarg > argc) + return KDB_ARGCOUNT; + diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL); + if (diag) + return diag; + + if (nextarg != argc + 1) + return KDB_ARGCOUNT; + + width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE); + diag = kdb_putword(addr, contents, width); + if (diag) + return diag; + + kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents); + + return 0; +} + +/* + * kdb_go - This function implements the 'go' command. + * go [address-expression] + */ +static int kdb_go(int argc, const char **argv) +{ + unsigned long addr; + int diag; + int nextarg; + long offset; + + if (argc == 1) { + if (raw_smp_processor_id() != kdb_initial_cpu) { + kdb_printf("go
must be issued from the " + "initial cpu, do cpu %d first\n", + kdb_initial_cpu); + return KDB_ARGCOUNT; + } + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, + &addr, &offset, NULL); + if (diag) + return diag; + } else if (argc) { + return KDB_ARGCOUNT; + } + + diag = KDB_CMD_GO; + if (KDB_FLAG(CATASTROPHIC)) { + kdb_printf("Catastrophic error detected\n"); + kdb_printf("kdb_continue_catastrophic=%d, ", + kdb_continue_catastrophic); + if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) { + kdb_printf("type go a second time if you really want " + "to continue\n"); + return 0; + } + if (kdb_continue_catastrophic == 2) { + kdb_printf("forcing reboot\n"); + kdb_reboot(0, NULL); + } + kdb_printf("attempting to continue\n"); + } + return diag; +} + +/* + * kdb_rd - This function implements the 'rd' command. + */ +static int kdb_rd(int argc, const char **argv) +{ + int diag = kdb_check_regs(); + if (diag) + return diag; + + kdb_dumpregs(kdb_current_regs); + return 0; +} + +/* + * kdb_rm - This function implements the 'rm' (register modify) command. + * rm register-name new-contents + * Remarks: + * Currently doesn't allow modification of control or + * debug registers. + */ +static int kdb_rm(int argc, const char **argv) +{ + int diag; + int ind = 0; + unsigned long contents; + + if (argc != 2) + return KDB_ARGCOUNT; + /* + * Allow presence or absence of leading '%' symbol. + */ + if (argv[1][0] == '%') + ind = 1; + + diag = kdbgetularg(argv[2], &contents); + if (diag) + return diag; + + diag = kdb_check_regs(); + if (diag) + return diag; + kdb_printf("ERROR: Register set currently not implemented\n"); + return 0; +} + +#if defined(CONFIG_MAGIC_SYSRQ) +/* + * kdb_sr - This function implements the 'sr' (SYSRQ key) command + * which interfaces to the soi-disant MAGIC SYSRQ functionality. + * sr + */ +static int kdb_sr(int argc, const char **argv) +{ + if (argc != 1) + return KDB_ARGCOUNT; + sysrq_toggle_support(1); + handle_sysrq(*argv[1], NULL); + + return 0; +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +/* + * kdb_ef - This function implements the 'regs' (display exception + * frame) command. This command takes an address and expects to + * find an exception frame at that address, formats and prints + * it. + * regs address-expression + * Remarks: + * Not done yet. + */ +static int kdb_ef(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset; + int nextarg; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + show_regs((struct pt_regs *)addr); + return 0; +} + +#if defined(CONFIG_MODULES) +/* modules using other modules */ +struct module_use { + struct list_head list; + struct module *module_which_uses; +}; + +/* + * kdb_lsmod - This function implements the 'lsmod' command. Lists + * currently loaded kernel modules. + * Mostly taken from userland lsmod. 
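+ * An illustrative output line (module name, size and address
+ * are made up):
+ * usbcore 147456 0xffffffffa0058000 1 (Live) [ ehci_hcd ]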
+ */ +static int kdb_lsmod(int argc, const char **argv) +{ + struct module *mod; + + if (argc != 0) + return KDB_ARGCOUNT; + + kdb_printf("Module Size modstruct Used by\n"); + list_for_each_entry(mod, kdb_modules, list) { + + kdb_printf("%-20s%8u 0x%p ", mod->name, + mod->core_size, (void *)mod); +#ifdef CONFIG_MODULE_UNLOAD + kdb_printf("%4d ", module_refcount(mod)); +#endif + if (mod->state == MODULE_STATE_GOING) + kdb_printf(" (Unloading)"); + else if (mod->state == MODULE_STATE_COMING) + kdb_printf(" (Loading)"); + else + kdb_printf(" (Live)"); + +#ifdef CONFIG_MODULE_UNLOAD + { + struct module_use *use; + kdb_printf(" [ "); + list_for_each_entry(use, &mod->modules_which_use_me, + list) + kdb_printf("%s ", use->module_which_uses->name); + kdb_printf("]\n"); + } +#endif + } + + return 0; +} + +#endif /* CONFIG_MODULES */ + +/* + * kdb_env - This function implements the 'env' command. Display the + * current environment variables. + */ + +static int kdb_env(int argc, const char **argv) +{ + int i; + + for (i = 0; i < __nenv; i++) { + if (__env[i]) + kdb_printf("%s\n", __env[i]); + } + + if (KDB_DEBUG(MASK)) + kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + + return 0; +} + +#ifdef CONFIG_PRINTK +/* + * kdb_dmesg - This function implements the 'dmesg' command to display + * the contents of the syslog buffer. + * dmesg [lines] [adjust] + */ +static int kdb_dmesg(int argc, const char **argv) +{ + char *syslog_data[4], *start, *end, c = '\0', *p; + int diag, logging, logsize, lines = 0, adjust = 0, n; + + if (argc > 2) + return KDB_ARGCOUNT; + if (argc) { + char *cp; + lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + lines = 0; + if (argc > 1) { + adjust = simple_strtoul(argv[2], &cp, 0); + if (*cp || adjust < 0) + adjust = 0; + } + } + + /* disable LOGGING if set */ + diag = kdbgetintenv("LOGGING", &logging); + if (!diag && logging) { + const char *setargs[] = { "set", "LOGGING", "0" }; + kdb_set(2, setargs); + } + + /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] + * logical start, end+1. 
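+ * The buffer is circular; KDB_WRAP() below folds a pointer
+ * that has run past the physical end back to the physical
+ * start.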
*/ + kdb_syslog_data(syslog_data); + if (syslog_data[2] == syslog_data[3]) + return 0; + logsize = syslog_data[1] - syslog_data[0]; + start = syslog_data[2]; + end = syslog_data[3]; +#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) + for (n = 0, p = start; p < end; ++p) { + c = *KDB_WRAP(p); + if (c == '\n') + ++n; + } + if (c != '\n') + ++n; + if (lines < 0) { + if (adjust >= n) + kdb_printf("buffer only contains %d lines, nothing " + "printed\n", n); + else if (adjust - lines >= n) + kdb_printf("buffer only contains %d lines, last %d " + "lines printed\n", n, n - adjust); + if (adjust) { + for (; start < end && adjust; ++start) { + if (*KDB_WRAP(start) == '\n') + --adjust; + } + if (start < end) + ++start; + } + for (p = start; p < end && lines; ++p) { + if (*KDB_WRAP(p) == '\n') + ++lines; + } + end = p; + } else if (lines > 0) { + int skip = n - (adjust + lines); + if (adjust >= n) { + kdb_printf("buffer only contains %d lines, " + "nothing printed\n", n); + skip = n; + } else if (skip < 0) { + lines += skip; + skip = 0; + kdb_printf("buffer only contains %d lines, first " + "%d lines printed\n", n, lines); + } + for (; start < end && skip; ++start) { + if (*KDB_WRAP(start) == '\n') + --skip; + } + for (p = start; p < end && lines; ++p) { + if (*KDB_WRAP(p) == '\n') + --lines; + } + end = p; + } + /* Do a line at a time (max 200 chars) to reduce protocol overhead */ + c = '\n'; + while (start != end) { + char buf[201]; + p = buf; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + while (start < end && (c = *KDB_WRAP(start)) && + (p - buf) < sizeof(buf)-1) { + ++start; + *p++ = c; + if (c == '\n') + break; + } + *p = '\0'; + kdb_printf("%s", buf); + } + if (c != '\n') + kdb_printf("\n"); + + return 0; +} +#endif /* CONFIG_PRINTK */ +/* + * kdb_cpu - This function implements the 'cpu' command. 
+ * cpu [<cpunum>]
+ * Returns:
+ * KDB_CMD_CPU for success, a kdb diagnostic if error
+ */
+static void kdb_cpu_status(void)
+{
+ int i, start_cpu, first_print = 1;
+ char state, prev_state = '?';
+
+ kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
+ kdb_printf("Available cpus: ");
+ for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i)) {
+ state = 'F'; /* cpu is offline */
+ } else {
+ state = ' '; /* cpu is responding to kdb */
+ if (kdb_task_state_char(KDB_TSK(i)) == 'I')
+ state = 'I'; /* idle task */
+ }
+ if (state != prev_state) {
+ if (prev_state != '?') {
+ if (!first_print)
+ kdb_printf(", ");
+ first_print = 0;
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ prev_state = state;
+ start_cpu = i;
+ }
+ }
+ /* print the trailing cpus, ignoring them if they are all offline */
+ if (prev_state != 'F') {
+ if (!first_print)
+ kdb_printf(", ");
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ kdb_printf("\n");
+}
+
+static int kdb_cpu(int argc, const char **argv)
+{
+ unsigned long cpunum;
+ int diag;
+
+ if (argc == 0) {
+ kdb_cpu_status();
+ return 0;
+ }
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ diag = kdbgetularg(argv[1], &cpunum);
+ if (diag)
+ return diag;
+
+ /*
+ * Validate cpunum
+ */
+ if ((cpunum >= NR_CPUS) || !cpu_online(cpunum))
+ return KDB_BADCPUNUM;
+
+ dbg_switch_cpu = cpunum;
+
+ /*
+ * Switch to other cpu
+ */
+ return KDB_CMD_CPU;
+}
+
+/* The user may not realize that ps/bta with no parameters does not print idle
+ * or sleeping system daemon processes, so tell them how many were suppressed.
+ */
+void kdb_ps_suppressed(void)
+{
+ int idle = 0, daemon = 0;
+ unsigned long mask_I = kdb_task_state_string("I"),
+ mask_M = kdb_task_state_string("M");
+ unsigned long cpu;
+ const struct task_struct *p, *g;
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask_I))
+ ++idle;
+ }
+ kdb_do_each_thread(g, p) {
+ if (kdb_task_state(p, mask_M))
+ ++daemon;
+ } kdb_while_each_thread(g, p);
+ if (idle || daemon) {
+ if (idle)
+ kdb_printf("%d idle process%s (state I)%s\n",
+ idle, idle == 1 ? "" : "es",
+ daemon ? " and " : "");
+ if (daemon)
+ kdb_printf("%d sleeping system daemon (state M) "
+ "process%s", daemon,
+ daemon == 1 ? "" : "es");
+ kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
+ }
+}
+
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ * ps [DRSTCZEUIMA] All processes, optionally filtered by state
+ */
+void kdb_ps1(const struct task_struct *p)
+{
+ int cpu;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return;
+
+ cpu = kdb_process_cpu(p);
+ kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
+ (void *)p, p->pid, p->parent->pid,
+ kdb_task_has_cpu(p), kdb_process_cpu(p),
+ kdb_task_state_char(p),
+ (void *)(&p->thread),
+ p == kdb_curr_task(raw_smp_processor_id()) ? 
'*' : ' ',
+ p->comm);
+ if (kdb_task_has_cpu(p)) {
+ if (!KDB_TSK(cpu)) {
+ kdb_printf(" Error: no saved data for this cpu\n");
+ } else {
+ if (KDB_TSK(cpu) != p)
+ kdb_printf(" Error: does not match running "
+ "process table (0x%p)\n", KDB_TSK(cpu));
+ }
+ }
+}
+
+static int kdb_ps(int argc, const char **argv)
+{
+ struct task_struct *g, *p;
+ unsigned long mask, cpu;
+
+ if (argc == 0)
+ kdb_ps_suppressed();
+ kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
+ (int)(2*sizeof(void *))+2, "Task Addr",
+ (int)(2*sizeof(void *))+2, "Thread");
+ mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ }
+ kdb_printf("\n");
+ /* Now the real tasks */
+ kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ } kdb_while_each_thread(g, p);
+
+ return 0;
+}
+
+/*
+ * kdb_pid - This function implements the 'pid' command which switches
+ * the currently active process.
+ * pid [<pidnum> | R]
+ */
+static int kdb_pid(int argc, const char **argv)
+{
+ struct task_struct *p;
+ unsigned long val;
+ int diag;
+
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ if (strcmp(argv[1], "R") == 0) {
+ p = KDB_TSK(kdb_initial_cpu);
+ } else {
+ diag = kdbgetularg(argv[1], &val);
+ if (diag)
+ return KDB_BADINT;
+
+ p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
+ if (!p) {
+ kdb_printf("No task with pid=%d\n", (pid_t)val);
+ return 0;
+ }
+ }
+ kdb_set_current_task(p);
+ }
+ kdb_printf("KDB current process is %s(pid=%d)\n",
+ kdb_current_task->comm,
+ kdb_current_task->pid);
+
+ return 0;
+}
+
+/*
+ * kdb_ll - This function implements the 'll' command which follows a
+ * linked list and executes an arbitrary command for each
+ * element.
+ */
+static int kdb_ll(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset = 0;
+ unsigned long va;
+ unsigned long linkoffset;
+ int nextarg;
+ const char *command;
+
+ if (argc != 3)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+
+ diag = kdbgetularg(argv[2], &linkoffset);
+ if (diag)
+ return diag;
+
+ /*
+ * Using the starting address as
+ * the first element in the list, and assuming that
+ * the list ends with a null pointer.
+ */
+
+ va = addr;
+ command = kdb_strdup(argv[3], GFP_KDB);
+ if (!command) {
+ kdb_printf("%s: cannot duplicate command\n", __func__);
+ return 0;
+ }
+ /* Recursive use of kdb_parse, do not use argv after this point */
+ argv = NULL;
+
+ while (va) {
+ char buf[80];
+
+ sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
+ diag = kdb_parse(buf);
+ if (diag) {
+ kfree(command);
+ return diag;
+ }
+
+ addr = va + linkoffset;
+ if (kdb_getword(&va, addr, sizeof(va)))
+ break;
+ }
+ kfree(command);
+
+ return 0;
+}
+
+static int kdb_kgdb(int argc, const char **argv)
+{
+ return KDB_CMD_KGDB;
+}
+
+/*
+ * kdb_help - This function implements the 'help' and '?' commands. 
+ */
+static int kdb_help(int argc, const char **argv)
+{
+ kdbtab_t *kt;
+ int i;
+
+ kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
+ kdb_printf("-----------------------------"
+ "-----------------------------\n");
+ for_each_kdbcmd(kt, i) {
+ if (kt->cmd_name)
+ kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
+ kt->cmd_usage, kt->cmd_help);
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ }
+ return 0;
+}
+
+/*
+ * kdb_kill - This function implements the 'kill' command.
+ */
+static int kdb_kill(int argc, const char **argv)
+{
+ long sig, pid;
+ char *endp;
+ struct task_struct *p;
+ struct siginfo info;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ sig = simple_strtol(argv[1], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (sig >= 0) {
+ kdb_printf("Invalid signal parameter: <-signal>\n");
+ return 0;
+ }
+ sig = -sig;
+
+ pid = simple_strtol(argv[2], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (pid <= 0) {
+ kdb_printf("Process ID must be larger than 0.\n");
+ return 0;
+ }
+
+ /* Find the process. */
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (!p) {
+ kdb_printf("The specified process was not found.\n");
+ return 0;
+ }
+ p = p->group_leader;
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+ info.si_pid = pid; /* same capabilities as process being signalled */
+ info.si_uid = 0; /* kdb has root authority */
+ kdb_send_sig_info(p, &info);
+ return 0;
+}
+
+struct kdb_tm {
+ int tm_sec; /* seconds */
+ int tm_min; /* minutes */
+ int tm_hour; /* hours */
+ int tm_mday; /* day of the month */
+ int tm_mon; /* month */
+ int tm_year; /* year */
+};
+
+static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
+{
+ /* This will work from 1970-2099, 2100 is not a leap year */
+ static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
+ 31, 30, 31, 30, 31 };
+ memset(tm, 0, sizeof(*tm));
+ tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
+ tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
+ (2 * 365 + 1); /* shift base from 1970 to 1968 */
+ tm->tm_min = tm->tm_sec / 60 % 60;
+ tm->tm_hour = tm->tm_sec / 60 / 60;
+ tm->tm_sec = tm->tm_sec % 60;
+ tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
+ tm->tm_mday %= (4*365+1);
+ mon_day[1] = 29;
+ while (tm->tm_mday >= mon_day[tm->tm_mon]) {
+ tm->tm_mday -= mon_day[tm->tm_mon];
+ if (++tm->tm_mon == 12) {
+ tm->tm_mon = 0;
+ ++tm->tm_year;
+ mon_day[1] = 28;
+ }
+ }
+ ++tm->tm_mday;
+}
+
+/*
+ * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
+ * I cannot call that code directly from kdb, it has an unconditional
+ * cli()/sti() and calls routines that take locks which can stop the debugger.
+ */
+static void kdb_sysinfo(struct sysinfo *val)
+{
+ struct timespec uptime;
+ do_posix_clock_monotonic_gettime(&uptime);
+ memset(val, 0, sizeof(*val));
+ val->uptime = uptime.tv_sec;
+ val->loads[0] = avenrun[0];
+ val->loads[1] = avenrun[1];
+ val->loads[2] = avenrun[2];
+ val->procs = nr_threads-1;
+ si_meminfo(val);
+
+ return;
+}
+
+/*
+ * kdb_summary - This function implements the 'summary' command. 
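+ * The output loosely mirrors uname(1), date(1), uptime(1) and
+ * the first lines of /proc/meminfo.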
+ */ +static int kdb_summary(int argc, const char **argv) +{ + struct kdb_tm tm; + struct sysinfo val; + + if (argc) + return KDB_ARGCOUNT; + + kdb_printf("sysname %s\n", init_uts_ns.name.sysname); + kdb_printf("release %s\n", init_uts_ns.name.release); + kdb_printf("version %s\n", init_uts_ns.name.version); + kdb_printf("machine %s\n", init_uts_ns.name.machine); + kdb_printf("nodename %s\n", init_uts_ns.name.nodename); + kdb_printf("domainname %s\n", init_uts_ns.name.domainname); + kdb_printf("ccversion %s\n", __stringify(CCVERSION)); + + kdb_gmtime(&xtime, &tm); + kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " + "tz_minuteswest %d\n", + 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, + sys_tz.tz_minuteswest); + + kdb_sysinfo(&val); + kdb_printf("uptime "); + if (val.uptime > (24*60*60)) { + int days = val.uptime / (24*60*60); + val.uptime %= (24*60*60); + kdb_printf("%d day%s ", days, days == 1 ? "" : "s"); + } + kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); + + /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", + LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), + LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), + LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); +#undef LOAD_INT +#undef LOAD_FRAC + /* Display in kilobytes */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" + "Buffers: %8lu kB\n", + val.totalram, val.freeram, val.bufferram); + return 0; +} + +/* + * kdb_per_cpu - This function implements the 'per_cpu' command. + */ +static int kdb_per_cpu(int argc, const char **argv) +{ + char buf[256], fmtstr[64]; + kdb_symtab_t symtab; + cpumask_t suppress = CPU_MASK_NONE; + int cpu, diag; + unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; + + if (argc < 1 || argc > 3) + return KDB_ARGCOUNT; + + snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); + if (!kdbgetsymval(buf, &symtab)) { + kdb_printf("%s is not a per_cpu variable\n", argv[1]); + return KDB_BADADDR; + } + if (argc >= 2) { + diag = kdbgetularg(argv[2], &bytesperword); + if (diag) + return diag; + } + if (!bytesperword) + bytesperword = KDB_WORD_SIZE; + else if (bytesperword > KDB_WORD_SIZE) + return KDB_BADWIDTH; + sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword)); + if (argc >= 3) { + diag = kdbgetularg(argv[3], &whichcpu); + if (diag) + return diag; + if (!cpu_online(whichcpu)) { + kdb_printf("cpu %ld is not online\n", whichcpu); + return KDB_BADCPUNUM; + } + } + + /* Most architectures use __per_cpu_offset[cpu], some use + * __per_cpu_offset(cpu), smp has no __per_cpu_offset. 
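+ * KDB_PCU() below expands to whichever form this kernel
+ * provides, and to 0 on non-SMP builds where the symbol
+ * address is already the per-cpu datum.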
+ */ +#ifdef __per_cpu_offset +#define KDB_PCU(cpu) __per_cpu_offset(cpu) +#else +#ifdef CONFIG_SMP +#define KDB_PCU(cpu) __per_cpu_offset[cpu] +#else +#define KDB_PCU(cpu) 0 +#endif +#endif + + for_each_online_cpu(cpu) { + if (whichcpu != ~0UL && whichcpu != cpu) + continue; + addr = symtab.sym_start + KDB_PCU(cpu); + diag = kdb_getword(&val, addr, bytesperword); + if (diag) { + kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " + "read, diag=%d\n", cpu, addr, diag); + continue; + } +#ifdef CONFIG_SMP + if (!val) { + cpu_set(cpu, suppress); + continue; + } +#endif /* CONFIG_SMP */ + kdb_printf("%5d ", cpu); + kdb_md_line(fmtstr, addr, + bytesperword == KDB_WORD_SIZE, + 1, bytesperword, 1, 1, 0); + } + if (cpus_weight(suppress) == 0) + return 0; + kdb_printf("Zero suppressed cpu(s):"); + for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); + cpu = next_cpu(cpu, suppress)) { + kdb_printf(" %d", cpu); + if (cpu == num_possible_cpus() - 1 || + next_cpu(cpu, suppress) != cpu + 1) + continue; + while (cpu < num_possible_cpus() && + next_cpu(cpu, suppress) == cpu + 1) + ++cpu; + kdb_printf("-%d", cpu); + } + kdb_printf("\n"); + +#undef KDB_PCU + + return 0; +} + +/* + * display help for the use of cmd | grep pattern + */ +static int kdb_grep_help(int argc, const char **argv) +{ + kdb_printf("Usage of cmd args | grep pattern:\n"); + kdb_printf(" Any command's output may be filtered through an "); + kdb_printf("emulated 'pipe'.\n"); + kdb_printf(" 'grep' is just a key word.\n"); + kdb_printf(" The pattern may include a very limited set of " + "metacharacters:\n"); + kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n"); + kdb_printf(" And if there are spaces in the pattern, you may " + "quote it:\n"); + kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\"" + " or \"^pat tern$\"\n"); + return 0; +} + +/* + * kdb_register_repeat - This function is used to register a kernel + * debugger command. + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * repeat Does the command auto repeat on enter? + * Returns: + * zero for success, one if a duplicate command. 
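+ * An illustrative registration (command name and handler are
+ * hypothetical):
+ * kdb_register_repeat("mycmd", my_handler, "<vaddr>",
+ * "Describe mycmd", 0, KDB_REPEAT_NONE);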
+ */ +#define kdb_command_extend 50 /* arbitrary */ +int kdb_register_repeat(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen, + kdb_repeat_t repeat) +{ + int i; + kdbtab_t *kp; + + /* + * Brute force method to determine duplicates + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + kdb_printf("Duplicate kdb command registered: " + "%s, func %p help %s\n", cmd, func, help); + return 1; + } + } + + /* + * Insert command into first available location in table + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name == NULL) + break; + } + + if (i >= kdb_max_commands) { + kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX + + kdb_command_extend) * sizeof(*new), GFP_KDB); + if (!new) { + kdb_printf("Could not allocate new kdb_command " + "table\n"); + return 1; + } + if (kdb_commands) { + memcpy(new, kdb_commands, + kdb_max_commands * sizeof(*new)); + kfree(kdb_commands); + } + memset(new + kdb_max_commands, 0, + kdb_command_extend * sizeof(*new)); + kdb_commands = new; + kp = kdb_commands + kdb_max_commands; + kdb_max_commands += kdb_command_extend; + } + + kp->cmd_name = cmd; + kp->cmd_func = func; + kp->cmd_usage = usage; + kp->cmd_help = help; + kp->cmd_flags = 0; + kp->cmd_minlen = minlen; + kp->cmd_repeat = repeat; + + return 0; +} + +/* + * kdb_register - Compatibility register function for commands that do + * not need to specify a repeat state. Equivalent to + * kdb_register_repeat with KDB_REPEAT_NONE. + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * Returns: + * zero for success, one if a duplicate command. + */ +int kdb_register(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen) +{ + return kdb_register_repeat(cmd, func, usage, help, minlen, + KDB_REPEAT_NONE); +} + +/* + * kdb_unregister - This function is used to unregister a kernel + * debugger command. It is generally called when a module which + * implements kdb commands is unloaded. + * Inputs: + * cmd Command name + * Returns: + * zero for success, one command not registered. + */ +int kdb_unregister(char *cmd) +{ + int i; + kdbtab_t *kp; + + /* + * find the command. + */ + for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) { + if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + kp->cmd_name = NULL; + return 0; + } + } + + /* Couldn't find it. */ + return 1; +} + +/* Initialize the kdb command table. */ +static void __init kdb_inittab(void) +{ + int i; + kdbtab_t *kp; + + for_each_kdbcmd(kp, i) + kp->cmd_name = NULL; + + kdb_register_repeat("md", kdb_md, "", + "Display Memory Contents, also mdWcN, e.g. 
md8c1", 1, + KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mdr", kdb_md, " ", + "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mdp", kdb_md, " ", + "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mds", kdb_md, "", + "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mm", kdb_mm, " ", + "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("go", kdb_go, "[]", + "Continue Execution", 1, KDB_REPEAT_NONE); + kdb_register_repeat("rd", kdb_rd, "", + "Display Registers", 0, KDB_REPEAT_NONE); + kdb_register_repeat("rm", kdb_rm, " ", + "Modify Registers", 0, KDB_REPEAT_NONE); + kdb_register_repeat("ef", kdb_ef, "", + "Display exception frame", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bt", kdb_bt, "[]", + "Stack traceback", 1, KDB_REPEAT_NONE); + kdb_register_repeat("btp", kdb_bt, "", + "Display stack for process ", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", + "Display stack all processes", 0, KDB_REPEAT_NONE); + kdb_register_repeat("btc", kdb_bt, "", + "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); + kdb_register_repeat("btt", kdb_bt, "", + "Backtrace process given its struct task address", 0, + KDB_REPEAT_NONE); + kdb_register_repeat("ll", kdb_ll, " ", + "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); + kdb_register_repeat("env", kdb_env, "", + "Show environment variables", 0, KDB_REPEAT_NONE); + kdb_register_repeat("set", kdb_set, "", + "Set environment variables", 0, KDB_REPEAT_NONE); + kdb_register_repeat("help", kdb_help, "", + "Display Help Message", 1, KDB_REPEAT_NONE); + kdb_register_repeat("?", kdb_help, "", + "Display Help Message", 0, KDB_REPEAT_NONE); + kdb_register_repeat("cpu", kdb_cpu, "", + "Switch to new cpu", 0, KDB_REPEAT_NONE); + kdb_register_repeat("kgdb", kdb_kgdb, "", + "Enter kgdb mode", 0, KDB_REPEAT_NONE); + kdb_register_repeat("ps", kdb_ps, "[|A]", + "Display active task list", 0, KDB_REPEAT_NONE); + kdb_register_repeat("pid", kdb_pid, "", + "Switch to another task", 0, KDB_REPEAT_NONE); + kdb_register_repeat("reboot", kdb_reboot, "", + "Reboot the machine immediately", 0, KDB_REPEAT_NONE); +#if defined(CONFIG_MODULES) + kdb_register_repeat("lsmod", kdb_lsmod, "", + "List loaded kernel modules", 0, KDB_REPEAT_NONE); +#endif +#if defined(CONFIG_MAGIC_SYSRQ) + kdb_register_repeat("sr", kdb_sr, "", + "Magic SysRq key", 0, KDB_REPEAT_NONE); +#endif +#if defined(CONFIG_PRINTK) + kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", + "Display syslog buffer", 0, KDB_REPEAT_NONE); +#endif + kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", + "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); + kdb_register_repeat("kill", kdb_kill, "<-signal> ", + "Send a signal to a process", 0, KDB_REPEAT_NONE); + kdb_register_repeat("summary", kdb_summary, "", + "Summarize the system", 4, KDB_REPEAT_NONE); + kdb_register_repeat("per_cpu", kdb_per_cpu, "", + "Display per_cpu variables", 3, KDB_REPEAT_NONE); + kdb_register_repeat("grephelp", kdb_grep_help, "", + "Display help on | grep", 0, KDB_REPEAT_NONE); +} + +/* Execute any commands defined in kdb_cmds. 
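+ * kdb_cmds[] is a NULL terminated array of command strings,
+ * generated from the kdb_cmds source file; each entry is fed
+ * to kdb_parse() as if it had been typed at the prompt.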
 */
+static void __init kdb_cmd_init(void)
+{
+ int i, diag;
+ for (i = 0; kdb_cmds[i]; ++i) {
+ diag = kdb_parse(kdb_cmds[i]);
+ if (diag)
+ kdb_printf("kdb command %s failed, kdb diag %d\n",
+ kdb_cmds[i], diag);
+ }
+ if (defcmd_in_progress) {
+ kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
+ kdb_parse("endefcmd");
+ }
+}
+
+/* Initialize kdb_printf, breakpoint tables and kdb state */
+void __init kdb_init(int lvl)
+{
+ static int kdb_init_lvl = KDB_NOT_INITIALIZED;
+ int i;
+
+ if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
+ return;
+ for (i = kdb_init_lvl; i < lvl; i++) {
+ switch (i) {
+ case KDB_NOT_INITIALIZED:
+ kdb_inittab(); /* Initialize Command Table */
+ kdb_initbptab(); /* Initialize Breakpoints */
+ break;
+ case KDB_INIT_EARLY:
+ kdb_cmd_init(); /* Build kdb_cmds tables */
+ break;
+ }
+ }
+ kdb_init_lvl = lvl;
+}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000000..69ed2eff3fea
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,301 @@
+#ifndef _KDBPRIVATE_H
+#define _KDBPRIVATE_H
+
+/*
+ * Kernel Debugger Architecture Independent Private Headers
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kgdb.h>
+#include "../debug_core.h"
+
+/* Kernel Debugger Error codes. Must not overlap with command codes. */
+#define KDB_NOTFOUND (-1)
+#define KDB_ARGCOUNT (-2)
+#define KDB_BADWIDTH (-3)
+#define KDB_BADRADIX (-4)
+#define KDB_NOTENV (-5)
+#define KDB_NOENVVALUE (-6)
+#define KDB_NOTIMP (-7)
+#define KDB_ENVFULL (-8)
+#define KDB_ENVBUFFULL (-9)
+#define KDB_TOOMANYBPT (-10)
+#define KDB_TOOMANYDBREGS (-11)
+#define KDB_DUPBPT (-12)
+#define KDB_BPTNOTFOUND (-13)
+#define KDB_BADMODE (-14)
+#define KDB_BADINT (-15)
+#define KDB_INVADDRFMT (-16)
+#define KDB_BADREG (-17)
+#define KDB_BADCPUNUM (-18)
+#define KDB_BADLENGTH (-19)
+#define KDB_NOBP (-20)
+#define KDB_BADADDR (-21)
+
+/* Kernel Debugger Command codes. Must not overlap with error codes. 
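+ * Starting the command codes at -1001 keeps them clear of the
+ * small negative diagnostics above, so a single return value
+ * can carry either kind without ambiguity.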
 */
+#define KDB_CMD_GO (-1001)
+#define KDB_CMD_CPU (-1002)
+#define KDB_CMD_SS (-1003)
+#define KDB_CMD_SSB (-1004)
+#define KDB_CMD_KGDB (-1005)
+#define KDB_CMD_KGDB2 (-1006)
+
+/* Internal debug flags */
+#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
+#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
+#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
+#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
+#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
+#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
+#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
+#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
+
+#define KDB_DEBUG(flag) (kdb_flags & \
+ (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
+#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
+ kdb_print_state(text, value)
+
+#if BITS_PER_LONG == 32
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%08lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%08lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%08x"
+#define kdb_f_count_fmt "%d"
+
+#elif BITS_PER_LONG == 64
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%016lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%016lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%016x"
+#define kdb_f_count_fmt "%ld"
+
+#endif
+
+/*
+ * KDB_MAXBPT describes the total number of breakpoints
+ * supported by this architecture.
+ */
+#define KDB_MAXBPT 16
+
+/* Maximum number of arguments to a function */
+#define KDB_MAXARGS 16
+
+typedef enum {
+ KDB_REPEAT_NONE = 0, /* Do not repeat this command */
+ KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
+ KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
+} kdb_repeat_t;
+
+typedef int (*kdb_func_t)(int, const char **);
+
+/* Symbol table format returned by kallsyms. */
+typedef struct __ksymtab {
+ unsigned long value; /* Address of symbol */
+ const char *mod_name; /* Module containing symbol or
+ * "kernel" */
+ unsigned long mod_start;
+ unsigned long mod_end;
+ const char *sec_name; /* Section containing symbol */
+ unsigned long sec_start;
+ unsigned long sec_end;
+ const char *sym_name; /* Full symbol name, including
+ * any version */
+ unsigned long sym_start;
+ unsigned long sym_end;
+ } kdb_symtab_t;
+extern int kallsyms_symbol_next(char *prefix_name, int flag);
+extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
+
+/* Exported Symbols for kernel loadable modules to use. */
+extern int kdb_register(char *, kdb_func_t, char *, char *, short);
+extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
+ short, kdb_repeat_t);
+extern int kdb_unregister(char *);
+
+extern int kdb_getarea_size(void *, unsigned long, size_t);
+extern int kdb_putarea_size(unsigned long, void *, size_t);
+
+/*
+ * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
+ * names, not pointers. The underlying *_size functions take pointers. 
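+ * Illustrative use, mirroring kdb_mdr():
+ * unsigned char c;
+ * if (kdb_getarea(c, addr))
+ * return 0; /* kdb_getarea already printed the error */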
+ */ +#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x))) +#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x))) + +extern int kdb_getphysword(unsigned long *word, + unsigned long addr, size_t size); +extern int kdb_getword(unsigned long *, unsigned long, size_t); +extern int kdb_putword(unsigned long, unsigned long, size_t); + +extern int kdbgetularg(const char *, unsigned long *); +extern int kdb_set(int, const char **); +extern char *kdbgetenv(const char *); +extern int kdbgetintenv(const char *, int *); +extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, + long *, char **); +extern int kdbgetsymval(const char *, kdb_symtab_t *); +extern int kdbnearsym(unsigned long, kdb_symtab_t *); +extern void kdbnearsym_cleanup(void); +extern char *kdb_strdup(const char *str, gfp_t type); +extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int); + +/* Routine for debugging the debugger state. */ +extern void kdb_print_state(const char *, int); + +extern int kdb_state; +#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */ +#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */ +#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */ +#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under + * kdb control */ +#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ +#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ +#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, + * DOING_SS is also set */ +#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint + * after one ss, independent of + * DOING_SS */ +#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */ +#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */ +#define KDB_STATE_PAGER 0x00000400 /* pager is available */ +#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching + * back to initial cpu */ +#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ +#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ +#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ +#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been + * adjusted */ +#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */ +#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via + * keyboard on this cpu */ +#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ +#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ +#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ +#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ +#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch + * specific use */ + +#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag) +#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag)) +#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag)) + +extern int kdb_nextline; /* Current number of lines displayed */ + +typedef struct _kdb_bp { + unsigned long bp_addr; /* Address breakpoint is present at */ + unsigned int bp_free:1; /* This entry is available */ + unsigned int bp_enabled:1; /* Breakpoint is active in register */ + unsigned int bp_type:4; /* Uses hardware register */ + unsigned int bp_installed:1; /* Breakpoint is installed */ + unsigned int bp_delay:1; /* Do delayed bp handling */ + unsigned int bp_delayed:1; /* Delayed breakpoint */ + unsigned int bph_length; /* HW break length */ +} kdb_bp_t; + +#ifdef CONFIG_KGDB_KDB +extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */]; + 
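+/* Usage sketch for the state macros above, as seen in
+ * kdb_main_loop(): KDB_STATE(HOLD_CPU) tests a flag,
+ * KDB_STATE_SET(KDB) raises one and KDB_STATE_CLEAR(SUPPRESS)
+ * drops one.
+ */
+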
+/* The KDB shell command table */ +typedef struct _kdbtab { + char *cmd_name; /* Command name */ + kdb_func_t cmd_func; /* Function to execute command */ + char *cmd_usage; /* Usage String for this command */ + char *cmd_help; /* Help message for this command */ + short cmd_flags; /* Parsing flags */ + short cmd_minlen; /* Minimum legal # command + * chars required */ + kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */ +} kdbtab_t; + +extern int kdb_bt(int, const char **); /* KDB display back trace */ + +/* KDB breakpoint management functions */ +extern void kdb_initbptab(void); +extern void kdb_bp_install(struct pt_regs *); +extern void kdb_bp_remove(void); + +typedef enum { + KDB_DB_BPT, /* Breakpoint */ + KDB_DB_SS, /* Single-step trap */ + KDB_DB_SSB, /* Single step to branch */ + KDB_DB_SSBPT, /* Single step over breakpoint */ + KDB_DB_NOBPT /* Spurious breakpoint */ +} kdb_dbtrap_t; + +extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, + int, kdb_dbtrap_t, struct pt_regs *); + +/* Miscellaneous functions and data areas */ +extern int kdb_grepping_flag; +extern char kdb_grep_string[]; +extern int kdb_grep_leading; +extern int kdb_grep_trailing; +extern char *kdb_cmds[]; +extern void kdb_syslog_data(char *syslog_data[]); +extern unsigned long kdb_task_state_string(const char *); +extern char kdb_task_state_char (const struct task_struct *); +extern unsigned long kdb_task_state(const struct task_struct *p, + unsigned long mask); +extern void kdb_ps_suppressed(void); +extern void kdb_ps1(const struct task_struct *p); +extern int kdb_parse(const char *cmdstr); +extern void kdb_print_nameval(const char *name, unsigned long val); +extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); +extern void kdb_meminfo_proc_show(void); +extern const char *kdb_walk_kallsyms(loff_t *pos); +extern char *kdb_getstr(char *, size_t, char *); + +/* Defines for kdb_symbol_print */ +#define KDB_SP_SPACEB 0x0001 /* Space before string */ +#define KDB_SP_SPACEA 0x0002 /* Space after string */ +#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */ +#define KDB_SP_VALUE 0x0008 /* Print the value of the address */ +#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */ +#define KDB_SP_NEWLINE 0x0020 /* Newline after string */ +#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN) + +#define KDB_TSK(cpu) kgdb_info[cpu].task +#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo + +extern struct task_struct *kdb_curr_task(int); + +#define kdb_task_has_cpu(p) (task_curr(p)) + +/* Simplify coexistence with NPTL */ +#define kdb_do_each_thread(g, p) do_each_thread(g, p) +#define kdb_while_each_thread(g, p) while_each_thread(g, p) + +#define GFP_KDB (in_interrupt() ? 
GFP_ATOMIC : GFP_KERNEL) + +extern void *debug_kmalloc(size_t size, gfp_t flags); +extern void debug_kfree(void *); +extern void debug_kusage(void); + +extern void kdb_set_current_task(struct task_struct *); +extern struct task_struct *kdb_current_task; +#ifdef CONFIG_MODULES +extern struct list_head *kdb_modules; +#endif /* CONFIG_MODULES */ + +extern char kdb_prompt_str[]; + +#define KDB_WORD_SIZE ((int)sizeof(unsigned long)) + +#endif /* CONFIG_KGDB_KDB */ +#endif /* !_KDBPRIVATE_H */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c new file mode 100644 index 000000000000..45344d5c53dd --- /dev/null +++ b/kernel/debug/kdb/kdb_support.c @@ -0,0 +1,927 @@ +/* + * Kernel Debugger Architecture Independent Support Functions + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * 03/02/13 added new 2.5 kallsyms + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + +/* + * kdbgetsymval - Return the address of the given symbol. + * + * Parameters: + * symname Character string containing symbol name + * symtab Structure to receive results + * Returns: + * 0 Symbol not found, symtab zero filled + * 1 Symbol mapped to module/symbol/section, data in symtab + */ +int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) +{ + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname, + symtab); + memset(symtab, 0, sizeof(*symtab)); + symtab->sym_start = kallsyms_lookup_name(symname); + if (symtab->sym_start) { + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: returns 1, " + "symtab->sym_start=0x%lx\n", + symtab->sym_start); + return 1; + } + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: returns 0\n"); + return 0; +} +EXPORT_SYMBOL(kdbgetsymval); + +static char *kdb_name_table[100]; /* arbitrary size */ + +/* + * kdbnearsym - Return the name of the symbol with the nearest address + * less than 'addr'. + * + * Parameters: + * addr Address to check for symbol near + * symtab Structure to receive results + * Returns: + * 0 No sections contain this address, symtab zero filled + * 1 Address mapped to module/symbol/section, data in symtab + * Remarks: + * 2.6 kallsyms has a "feature" where it unpacks the name into a + * string. If that string is reused before the caller expects it + * then the caller sees its string change without warning. To + * avoid cluttering up the main kdb code with lots of kdb_strdup, + * tests and kfree calls, kdbnearsym maintains an LRU list of the + * last few unique strings. The list is sized large enough to + * hold active strings, no kdb caller of kdbnearsym makes more + * than ~20 later calls before using a saved value. 
+ */ +int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) +{ + int ret = 0; + unsigned long symbolsize; + unsigned long offset; +#define knt1_size 128 /* must be >= kallsyms table size */ + char *knt1 = NULL; + + if (KDB_DEBUG(AR)) + kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab); + memset(symtab, 0, sizeof(*symtab)); + + if (addr < 4096) + goto out; + knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC); + if (!knt1) { + kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n", + addr); + goto out; + } + symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset, + (char **)(&symtab->mod_name), knt1); + if (offset > 8*1024*1024) { + symtab->sym_name = NULL; + addr = offset = symbolsize = 0; + } + symtab->sym_start = addr - offset; + symtab->sym_end = symtab->sym_start + symbolsize; + ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0'; + + if (ret) { + int i; + /* Another 2.6 kallsyms "feature". Sometimes the sym_name is + * set but the buffer passed into kallsyms_lookup is not used, + * so it contains garbage. The caller has to work out which + * buffer needs to be saved. + * + * What was Rusty smoking when he wrote that code? + */ + if (symtab->sym_name != knt1) { + strncpy(knt1, symtab->sym_name, knt1_size); + knt1[knt1_size-1] = '\0'; + } + for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { + if (kdb_name_table[i] && + strcmp(kdb_name_table[i], knt1) == 0) + break; + } + if (i >= ARRAY_SIZE(kdb_name_table)) { + debug_kfree(kdb_name_table[0]); + memcpy(kdb_name_table, kdb_name_table+1, + sizeof(kdb_name_table[0]) * + (ARRAY_SIZE(kdb_name_table)-1)); + } else { + debug_kfree(knt1); + knt1 = kdb_name_table[i]; + memcpy(kdb_name_table+i, kdb_name_table+i+1, + sizeof(kdb_name_table[0]) * + (ARRAY_SIZE(kdb_name_table)-i-1)); + } + i = ARRAY_SIZE(kdb_name_table) - 1; + kdb_name_table[i] = knt1; + symtab->sym_name = kdb_name_table[i]; + knt1 = NULL; + } + + if (symtab->mod_name == NULL) + symtab->mod_name = "kernel"; + if (KDB_DEBUG(AR)) + kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " + "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret, + symtab->sym_start, symtab->mod_name, symtab->sym_name, + symtab->sym_name); + +out: + debug_kfree(knt1); + return ret; +} + +void kdbnearsym_cleanup(void) +{ + int i; + for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { + if (kdb_name_table[i]) { + debug_kfree(kdb_name_table[i]); + kdb_name_table[i] = NULL; + } + } +} + +static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1]; + +/* + * kallsyms_symbol_complete + * + * Parameters: + * prefix_name prefix of a symbol name to lookup + * max_len maximum length that can be returned + * Returns: + * Number of symbols which match the given prefix. + * Notes: + * prefix_name is changed to contain the longest unique prefix that + * starts with this prefix (tab completion). 
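+ * For example (symbol names illustrative): if only kdb_getword
+ * and kdb_getarea_size match, completing "kdb_g" returns 2 and
+ * extends prefix_name to "kdb_get".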
+ */ +int kallsyms_symbol_complete(char *prefix_name, int max_len) +{ + loff_t pos = 0; + int prefix_len = strlen(prefix_name), prev_len = 0; + int i, number = 0; + const char *name; + + while ((name = kdb_walk_kallsyms(&pos))) { + if (strncmp(name, prefix_name, prefix_len) == 0) { + strcpy(ks_namebuf, name); + /* Work out the longest name that matches the prefix */ + if (++number == 1) { + prev_len = min_t(int, max_len-1, + strlen(ks_namebuf)); + memcpy(ks_namebuf_prev, ks_namebuf, prev_len); + ks_namebuf_prev[prev_len] = '\0'; + continue; + } + for (i = 0; i < prev_len; i++) { + if (ks_namebuf[i] != ks_namebuf_prev[i]) { + prev_len = i; + ks_namebuf_prev[i] = '\0'; + break; + } + } + } + } + if (prev_len > prefix_len) + memcpy(prefix_name, ks_namebuf_prev, prev_len+1); + return number; +} + +/* + * kallsyms_symbol_next + * + * Parameters: + * prefix_name prefix of a symbol name to lookup + * flag 0 means search from the head, 1 means continue search. + * Returns: + * 1 if a symbol matches the given prefix. + * 0 if no string found + */ +int kallsyms_symbol_next(char *prefix_name, int flag) +{ + int prefix_len = strlen(prefix_name); + static loff_t pos; + const char *name; + + if (!flag) + pos = 0; + + while ((name = kdb_walk_kallsyms(&pos))) { + if (strncmp(name, prefix_name, prefix_len) == 0) { + strncpy(prefix_name, name, strlen(name)+1); + return 1; + } + } + return 0; +} + +/* + * kdb_symbol_print - Standard method for printing a symbol name and offset. + * Inputs: + * addr Address to be printed. + * symtab Address of symbol data, if NULL this routine does its + * own lookup. + * punc Punctuation for string, bit field. + * Remarks: + * The string and its punctuation is only printed if the address + * is inside the kernel, except that the value is always printed + * when requested. + */ +void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p, + unsigned int punc) +{ + kdb_symtab_t symtab, *symtab_p2; + if (symtab_p) { + symtab_p2 = (kdb_symtab_t *)symtab_p; + } else { + symtab_p2 = &symtab; + kdbnearsym(addr, symtab_p2); + } + if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE))) + return; + if (punc & KDB_SP_SPACEB) + kdb_printf(" "); + if (punc & KDB_SP_VALUE) + kdb_printf(kdb_machreg_fmt0, addr); + if (symtab_p2->sym_name) { + if (punc & KDB_SP_VALUE) + kdb_printf(" "); + if (punc & KDB_SP_PAREN) + kdb_printf("("); + if (strcmp(symtab_p2->mod_name, "kernel")) + kdb_printf("[%s]", symtab_p2->mod_name); + kdb_printf("%s", symtab_p2->sym_name); + if (addr != symtab_p2->sym_start) + kdb_printf("+0x%lx", addr - symtab_p2->sym_start); + if (punc & KDB_SP_SYMSIZE) + kdb_printf("/0x%lx", + symtab_p2->sym_end - symtab_p2->sym_start); + if (punc & KDB_SP_PAREN) + kdb_printf(")"); + } + if (punc & KDB_SP_SPACEA) + kdb_printf(" "); + if (punc & KDB_SP_NEWLINE) + kdb_printf("\n"); +} + +/* + * kdb_strdup - kdb equivalent of strdup, for disasm code. + * Inputs: + * str The string to duplicate. + * type Flags to kmalloc for the new string. + * Returns: + * Address of the new string, NULL if storage could not be allocated. + * Remarks: + * This is not in lib/string.c because it uses kmalloc which is not + * available when string.o is used in boot loaders. + */ +char *kdb_strdup(const char *str, gfp_t type) +{ + int n = strlen(str)+1; + char *s = kmalloc(n, type); + if (!s) + return NULL; + return strcpy(s, str); +} + +/* + * kdb_getarea_size - Read an area of data. The kdb equivalent of + * copy_from_user, with kdb messages for invalid addresses. 
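+ * The first failing read prints a message and sets the
+ * SUPPRESS state; later failures stay quiet until a read
+ * succeeds again.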
+ * Inputs: + * res Pointer to the area to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_getarea_size(void *res, unsigned long addr, size_t size) +{ + int ret = probe_kernel_read((char *)res, (char *)addr, size); + if (ret) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + ret = KDB_BADADDR; + } else { + KDB_STATE_CLEAR(SUPPRESS); + } + return ret; +} + +/* + * kdb_putarea_size - Write an area of data. The kdb equivalent of + * copy_to_user, with kdb messages for invalid addresses. + * Inputs: + * addr Address of the area to write to. + * res Pointer to the area holding the data. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_putarea_size(unsigned long addr, void *res, size_t size) +{ + int ret = probe_kernel_read((char *)addr, (char *)res, size); + if (ret) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + ret = KDB_BADADDR; + } else { + KDB_STATE_CLEAR(SUPPRESS); + } + return ret; +} + +/* + * kdb_getphys - Read data from a physical address. Validate the + * address is in range, use kmap_atomic() to get data + * similar to kdb_getarea() - but for phys addresses + * Inputs: + * res Pointer to the word to receive the result + * addr Physical address of the area to copy + * size Size of the area + * Returns: + * 0 for success, < 0 for error. + */ +static int kdb_getphys(void *res, unsigned long addr, size_t size) +{ + unsigned long pfn; + void *vaddr; + struct page *page; + + pfn = (addr >> PAGE_SHIFT); + if (!pfn_valid(pfn)) + return 1; + page = pfn_to_page(pfn); + vaddr = kmap_atomic(page, KM_KDB); + memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); + kunmap_atomic(vaddr, KM_KDB); + + return 0; +} + +/* + * kdb_getphysword + * Inputs: + * word Pointer to the word to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + *word = 0; /* Default value if addr or size is invalid */ + + switch (size) { + case 1: + diag = kdb_getphys(&w1, addr, sizeof(w1)); + if (!diag) + *word = w1; + break; + case 2: + diag = kdb_getphys(&w2, addr, sizeof(w2)); + if (!diag) + *word = w2; + break; + case 4: + diag = kdb_getphys(&w4, addr, sizeof(w4)); + if (!diag) + *word = w4; + break; + case 8: + if (size <= sizeof(*word)) { + diag = kdb_getphys(&w8, addr, sizeof(w8)); + if (!diag) + *word = w8; + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats + * data as numbers. + * Inputs: + * word Pointer to the word to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. 
+ */ +int kdb_getword(unsigned long *word, unsigned long addr, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + *word = 0; /* Default value if addr or size is invalid */ + switch (size) { + case 1: + diag = kdb_getarea(w1, addr); + if (!diag) + *word = w1; + break; + case 2: + diag = kdb_getarea(w2, addr); + if (!diag) + *word = w2; + break; + case 4: + diag = kdb_getarea(w4, addr); + if (!diag) + *word = w4; + break; + case 8: + if (size <= sizeof(*word)) { + diag = kdb_getarea(w8, addr); + if (!diag) + *word = w8; + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_getword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_putword - Write a binary value. Unlike kdb_putarea, this + * treats data as numbers. + * Inputs: + * addr Address of the area to write to. + * word The value to set. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_putword(unsigned long addr, unsigned long word, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + switch (size) { + case 1: + w1 = word; + diag = kdb_putarea(addr, w1); + break; + case 2: + w2 = word; + diag = kdb_putarea(addr, w2); + break; + case 4: + w4 = word; + diag = kdb_putarea(addr, w4); + break; + case 8: + if (size <= sizeof(word)) { + w8 = word; + diag = kdb_putarea(addr, w8); + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_putword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_task_state_string - Convert a string containing any of the + * letters DRSTCZEUIMA to a mask for the process state field and + * return the value. If no argument is supplied, return the mask + * that corresponds to environment variable PS, DRSTCZEU by + * default. + * Inputs: + * s String to convert + * Returns: + * Mask for process state. + * Notes: + * The mask folds data from several sources into a single long value, so + * be careful not to overlap the bits. TASK_* bits are in the LSB, + * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there + * is no overlap between TASK_* and EXIT_* but that may not always be + * true, so EXIT_* bits are shifted left 16 bits before being stored in + * the mask. + */ + +/* unrunnable is < 0 */ +#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1)) +#define RUNNING (1UL << (8*sizeof(unsigned long) - 2)) +#define IDLE (1UL << (8*sizeof(unsigned long) - 3)) +#define DAEMON (1UL << (8*sizeof(unsigned long) - 4)) + +unsigned long kdb_task_state_string(const char *s) +{ + long res = 0; + if (!s) { + s = kdbgetenv("PS"); + if (!s) + s = "DRSTCZEU"; /* default value for ps */ + } + while (*s) { + switch (*s) { + case 'D': + res |= TASK_UNINTERRUPTIBLE; + break; + case 'R': + res |= RUNNING; + break; + case 'S': + res |= TASK_INTERRUPTIBLE; + break; + case 'T': + res |= TASK_STOPPED; + break; + case 'C': + res |= TASK_TRACED; + break; + case 'Z': + res |= EXIT_ZOMBIE << 16; + break; + case 'E': + res |= EXIT_DEAD << 16; + break; + case 'U': + res |= UNRUNNABLE; + break; + case 'I': + res |= IDLE; + break; + case 'M': + res |= DAEMON; + break; + case 'A': + res = ~0UL; + break; + default: + kdb_printf("%s: unknown flag '%c' ignored\n", + __func__, *s); + break; + } + ++s; + } + return res; +} + +/* + * kdb_task_state_char - Return the character that represents the task state. + * Inputs: + * p struct task for the process + * Returns: + * One character to represent the task state.
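To make the mask encoding concrete: on a 64-bit kernel, kdb_task_state_string("RS") sets the synthetic RUNNING bit (bit 62) for 'R' and TASK_INTERRUPTIBLE (bit 0) for 'S' in one unsigned long. A sketch of how a ps-style command is expected to combine these helpers (hypothetical loop, using kdb_task_state_char() and kdb_task_state() defined below):

    /* Hypothetical ps-like filter: build the mask once, then test each
     * task's one-letter state against it. */
    static void ps_like(void)
    {
            unsigned long mask = kdb_task_state_string("RS");
            struct task_struct *p;

            for_each_process(p) {
                    if (kdb_task_state(p, mask))
                            kdb_printf("%d %c %s\n", p->pid,
                                       kdb_task_state_char(p), p->comm);
            }
    }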
+ */ +char kdb_task_state_char (const struct task_struct *p) +{ + int cpu; + char state; + unsigned long tmp; + + if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + return 'E'; + + cpu = kdb_process_cpu(p); + state = (p->state == 0) ? 'R' : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED) ? 'T' : + (p->state & TASK_TRACED) ? 'C' : + (p->exit_state & EXIT_ZOMBIE) ? 'Z' : + (p->exit_state & EXIT_DEAD) ? 'E' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; + if (p->pid == 0) { + /* Idle task. Is it really idle, apart from the kdb + * interrupt? */ + if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { + if (cpu != kdb_initial_cpu) + state = 'I'; /* idle task */ + } + } else if (!p->mm && state == 'S') { + state = 'M'; /* sleeping system daemon */ + } + return state; +} + +/* + * kdb_task_state - Return true if a process has the desired state + * given by the mask. + * Inputs: + * p struct task for the process + * mask mask from kdb_task_state_string to select processes + * Returns: + * True if the process matches at least one criteria defined by the mask. + */ +unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask) +{ + char state[] = { kdb_task_state_char(p), '\0' }; + return (mask & kdb_task_state_string(state)) != 0; +} + +/* + * kdb_print_nameval - Print a name and its value, converting the + * value to a symbol lookup if possible. + * Inputs: + * name field name to print + * val value of field + */ +void kdb_print_nameval(const char *name, unsigned long val) +{ + kdb_symtab_t symtab; + kdb_printf(" %-11.11s ", name); + if (kdbnearsym(val, &symtab)) + kdb_symbol_print(val, &symtab, + KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE); + else + kdb_printf("0x%lx\n", val); +} + +/* Last ditch allocator for debugging, so we can still debug even when + * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned + * for space usage, not for speed. One smallish memory pool, the free + * chain is always in ascending address order to allow coalescing, + * allocations are done in brute force best fit. + */ + +struct debug_alloc_header { + u32 next; /* offset of next header from start of pool */ + u32 size; + void *caller; +}; + +/* The memory returned by this allocator must be aligned, which means + * so must the header size. Do not assume that sizeof(struct + * debug_alloc_header) is a multiple of the alignment, explicitly + * calculate the overhead of this header, including the alignment. + * The rest of this code must not use sizeof() on any header or + * pointer to a header. + */ +#define dah_align 8 +#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align) + +static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */ +static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned; +static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max; + +/* Locking is awkward. The debug code is called from all contexts, + * including non maskable interrupts. A normal spinlock is not safe + * in NMI context. Try to get the debug allocator lock, if it cannot + * be obtained after a second then give up. If the lock could not be + * previously obtained on this cpu then only try once. + * + * sparse has no annotation for "this function _sometimes_ acquires a + * lock", so fudge the acquire/release notation. 
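Before the lock and allocator code below, a quick check of the header-overhead rule stated above: on an LP64 build the header is { u32; u32; void * } = 16 bytes, so dah_overhead is ALIGN(16, 8) = 16, while on 32-bit it is ALIGN(12, 8) = 16 as well, which is exactly why the overhead is computed explicitly instead of trusting sizeof() to be aligned. A standalone sketch of the arithmetic (userspace C, mirroring the definitions above):

    #include <stdio.h>
    #include <stdint.h>

    #define ALIGN(x, a)     (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

    struct debug_alloc_header {     /* mirrors the pool header above */
            uint32_t next;          /* offset of next header in pool */
            uint32_t size;
            void *caller;
    };

    #define dah_align       8
    #define dah_overhead    ALIGN(sizeof(struct debug_alloc_header), dah_align)

    int main(void)
    {
            size_t req = 5;                         /* caller asks for 5 bytes */
            size_t size = ALIGN(req, dah_align);    /* rounded up to 8 */

            printf("header %zu, overhead %zu, 5-byte request costs %zu\n",
                   sizeof(struct debug_alloc_header),
                   (size_t)dah_overhead, (size_t)dah_overhead + size);
            return 0;   /* LP64: header 16, overhead 16, costs 24 */
    }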
+ */ +static DEFINE_SPINLOCK(dap_lock); +static int get_dap_lock(void) + __acquires(dap_lock) +{ + static int dap_locked = -1; + int count; + if (dap_locked == smp_processor_id()) + count = 1; + else + count = 1000; + while (1) { + if (spin_trylock(&dap_lock)) { + dap_locked = -1; + return 1; + } + if (!count--) + break; + udelay(1000); + } + dap_locked = smp_processor_id(); + __acquire(dap_lock); + return 0; +} + +void *debug_kmalloc(size_t size, gfp_t flags) +{ + unsigned int rem, h_offset; + struct debug_alloc_header *best, *bestprev, *prev, *h; + void *p = NULL; + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return NULL; + } + h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); + if (dah_first_call) { + h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead; + dah_first_call = 0; + } + size = ALIGN(size, dah_align); + prev = best = bestprev = NULL; + while (1) { + if (h->size >= size && (!best || h->size < best->size)) { + best = h; + bestprev = prev; + if (h->size == size) + break; + } + if (!h->next) + break; + prev = h; + h = (struct debug_alloc_header *)(debug_alloc_pool + h->next); + } + if (!best) + goto out; + rem = best->size - size; + /* The pool must always contain at least one header */ + if (best->next == 0 && bestprev == NULL && rem < dah_overhead) + goto out; + if (rem >= dah_overhead) { + best->size = size; + h_offset = ((char *)best - debug_alloc_pool) + + dah_overhead + best->size; + h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset); + h->size = rem - dah_overhead; + h->next = best->next; + } else + h_offset = best->next; + best->caller = __builtin_return_address(0); + dah_used += best->size; + dah_used_max = max(dah_used, dah_used_max); + if (bestprev) + bestprev->next = h_offset; + else + dah_first = h_offset; + p = (char *)best + dah_overhead; + memset(p, POISON_INUSE, best->size - 1); + *((char *)p + best->size - 1) = POISON_END; +out: + spin_unlock(&dap_lock); + return p; +} + +void debug_kfree(void *p) +{ + struct debug_alloc_header *h; + unsigned int h_offset; + if (!p) + return; + if ((char *)p < debug_alloc_pool || + (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) { + kfree(p); + return; + } + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return; /* memory leak, cannot be helped */ + } + h = (struct debug_alloc_header *)((char *)p - dah_overhead); + memset(p, POISON_FREE, h->size - 1); + *((char *)p + h->size - 1) = POISON_END; + h->caller = NULL; + dah_used -= h->size; + h_offset = (char *)h - debug_alloc_pool; + if (h_offset < dah_first) { + h->next = dah_first; + dah_first = h_offset; + } else { + struct debug_alloc_header *prev; + unsigned int prev_offset; + prev = (struct debug_alloc_header *)(debug_alloc_pool + + dah_first); + while (1) { + if (!prev->next || prev->next > h_offset) + break; + prev = (struct debug_alloc_header *) + (debug_alloc_pool + prev->next); + } + prev_offset = (char *)prev - debug_alloc_pool; + if (prev_offset + dah_overhead + prev->size == h_offset) { + prev->size += dah_overhead + h->size; + memset(h, POISON_FREE, dah_overhead - 1); + *((char *)h + dah_overhead - 1) = POISON_END; + h = prev; + h_offset = prev_offset; + } else { + h->next = prev->next; + prev->next = h_offset; + } + } + if (h_offset + dah_overhead + h->size == h->next) { + struct debug_alloc_header *next; + next = (struct debug_alloc_header *) + (debug_alloc_pool + h->next); + h->size += dah_overhead + next->size; + h->next = next->next; + memset(next, 
POISON_FREE, dah_overhead - 1); + *((char *)next + dah_overhead - 1) = POISON_END; + } + spin_unlock(&dap_lock); +} + +void debug_kusage(void) +{ + struct debug_alloc_header *h_free, *h_used; +#ifdef CONFIG_IA64 + /* FIXME: using dah for ia64 unwind always results in a memory leak. + * Fix that memory leak first, then set debug_kusage_one_time = 1 for + * all architectures. + */ + static int debug_kusage_one_time; +#else + static int debug_kusage_one_time = 1; +#endif + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return; + } + h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); + if (dah_first == 0 && + (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead || + dah_first_call)) + goto out; + if (!debug_kusage_one_time) + goto out; + debug_kusage_one_time = 0; + kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n", + __func__, dah_first); + if (dah_first) { + h_used = (struct debug_alloc_header *)debug_alloc_pool; + kdb_printf("%s: h_used %p size %d\n", __func__, h_used, + h_used->size); + } + do { + h_used = (struct debug_alloc_header *) + ((char *)h_free + dah_overhead + h_free->size); + kdb_printf("%s: h_used %p size %d caller %p\n", + __func__, h_used, h_used->size, h_used->caller); + h_free = (struct debug_alloc_header *) + (debug_alloc_pool + h_free->next); + } while (h_free->next); + h_used = (struct debug_alloc_header *) + ((char *)h_free + dah_overhead + h_free->size); + if ((char *)h_used - debug_alloc_pool != + sizeof(debug_alloc_pool_aligned)) + kdb_printf("%s: h_used %p size %d caller %p\n", + __func__, h_used, h_used->size, h_used->caller); +out: + spin_unlock(&dap_lock); +} + +/* Maintain a small stack of kdb_flags to allow recursion without disturbing + * the global kdb state. + */ + +static int kdb_flags_stack[4], kdb_flags_index; + +void kdb_save_flags(void) +{ + BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack)); + kdb_flags_stack[kdb_flags_index++] = kdb_flags; +} + +void kdb_restore_flags(void) +{ + BUG_ON(kdb_flags_index <= 0); + kdb_flags = kdb_flags_stack[--kdb_flags_index]; +} -- cgit v1.2.3-55-g7522 From 67fc4e0cb931d6b4ccf21248e4199b154478ecea Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:21 -0500 Subject: kdb: core for kgdb back end (2 of 2) This patch contains the hooks and instrumentation into the kernel which live outside the kernel/debug directory, and which the kdb core will call to run commands like lsmod, dmesg, bt, etc.
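The shape of these hooks is easiest to see in the kallsyms walker added below: kdb passes an opaque loff_t cursor and gets back one symbol name per call, or NULL at the end. A sketch of the expected kdb-side consumption (hypothetical helper, same pattern as kallsyms_symbol_complete() earlier in kdb_support.c):

    /* Hypothetical kdb command helper: count symbols sharing a prefix
     * by driving the kdb_walk_kallsyms() hook added by this patch. */
    static int count_prefix_matches(const char *prefix)
    {
            loff_t pos = 0;
            const char *name;
            int n = 0;

            while ((name = kdb_walk_kallsyms(&pos)))
                    if (strncmp(name, prefix, strlen(prefix)) == 0)
                            n++;
            return n;
    }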
CC: linux-arch@vger.kernel.org Signed-off-by: Jason Wessel Signed-off-by: Martin Hicks --- arch/arm/include/asm/kmap_types.h | 1 + arch/powerpc/include/asm/kmap_types.h | 1 + include/asm-generic/kmap_types.h | 3 ++- init/main.c | 2 ++ kernel/kallsyms.c | 21 ++++++++++++++++++ kernel/module.c | 4 ++++ kernel/printk.c | 16 ++++++++++++++ kernel/sched.c | 7 ++++-- kernel/signal.c | 40 +++++++++++++++++++++++++++++++++++ 9 files changed, 92 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/kmap_types.h b/arch/arm/include/asm/kmap_types.h index c4b2ea3fbe42..e51b1e81df05 100644 --- a/arch/arm/include/asm/kmap_types.h +++ b/arch/arm/include/asm/kmap_types.h @@ -20,6 +20,7 @@ enum km_type { KM_SOFTIRQ1, KM_L1_CACHE, KM_L2_CACHE, + KM_KDB, KM_TYPE_NR }; diff --git a/arch/powerpc/include/asm/kmap_types.h b/arch/powerpc/include/asm/kmap_types.h index 916369575c97..bca8fdcd2542 100644 --- a/arch/powerpc/include/asm/kmap_types.h +++ b/arch/powerpc/include/asm/kmap_types.h @@ -26,6 +26,7 @@ enum km_type { KM_SOFTIRQ1, KM_PPC_SYNC_PAGE, KM_PPC_SYNC_ICACHE, + KM_KDB, KM_TYPE_NR }; diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h index e5f234a08540..97e807c8c812 100644 --- a/include/asm-generic/kmap_types.h +++ b/include/asm-generic/kmap_types.h @@ -28,7 +28,8 @@ KMAP_D(15) KM_UML_USERCOPY, KMAP_D(16) KM_IRQ_PTE, KMAP_D(17) KM_NMI, KMAP_D(18) KM_NMI_PTE, -KMAP_D(19) KM_TYPE_NR +KMAP_D(19) KM_KDB, +KMAP_D(20) KM_TYPE_NR }; #undef KMAP_D diff --git a/init/main.c b/init/main.c index 5c8540271529..372771333d98 100644 --- a/init/main.c +++ b/init/main.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -675,6 +676,7 @@ asmlinkage void __init start_kernel(void) buffer_init(); key_init(); security_init(); + kdb_init(KDB_INIT_FULL); vfs_caches_init(totalram_pages); signals_init(); /* rootfs populating might need page-writeback */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13aff293f4de..6f6d091b5757 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include /* for cond_resched */ @@ -516,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file) return ret; } +#ifdef CONFIG_KGDB_KDB +const char *kdb_walk_kallsyms(loff_t *pos) +{ + static struct kallsym_iter kdb_walk_kallsyms_iter; + if (*pos == 0) { + memset(&kdb_walk_kallsyms_iter, 0, + sizeof(kdb_walk_kallsyms_iter)); + reset_iter(&kdb_walk_kallsyms_iter, 0); + } + while (1) { + if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) + return NULL; + ++*pos; + /* Some debugging symbols have no name. Ignore them. */ + if (kdb_walk_kallsyms_iter.name[0]) + return kdb_walk_kallsyms_iter.name; + } +} +#endif /* CONFIG_KGDB_KDB */ + static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, diff --git a/kernel/module.c b/kernel/module.c index e2564580f3f1..b751f1902476 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -77,6 +77,10 @@ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); +#ifdef CONFIG_KGDB_KDB +struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ +#endif /* CONFIG_KGDB_KDB */ + /* Block module loading/unloading? 
*/ int modules_disabled = 0; diff --git a/kernel/printk.c b/kernel/printk.c index 75077ad0b537..9213b8b5bb4f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -413,6 +413,22 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } +#ifdef CONFIG_KGDB_KDB +/* kdb dmesg command needs access to the syslog buffer. do_syslog() + * uses locks so it cannot be used during debugging. Just tell kdb + * where the start and end of the physical and logical logs are. This + * is equivalent to do_syslog(3). + */ +void kdb_syslog_data(char *syslog_data[4]) +{ + syslog_data[0] = log_buf; + syslog_data[1] = log_buf + log_buf_len; + syslog_data[2] = log_buf + log_end - + (logged_chars < log_buf_len ? logged_chars : log_buf_len); + syslog_data[3] = log_buf + log_end; +} +#endif /* CONFIG_KGDB_KDB */ + /* * Call the console drivers on a range of log_buf */ diff --git a/kernel/sched.c b/kernel/sched.c index 1d93cd0ae4d3..a25c1324af54 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7758,9 +7758,9 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#ifdef CONFIG_IA64 +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) /* - * These functions are only useful for the IA64 MCA handling. + * These functions are only useful for the IA64 MCA handling, or kdb. * * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduling @@ -7780,6 +7780,9 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 /** * set_curr_task - set the current task for a given cpu. * @cpu: the processor in question. diff --git a/kernel/signal.c b/kernel/signal.c index dbd7fe073c55..825a3f24ad76 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2735,3 +2735,43 @@ void __init signals_init(void) { sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } + +#ifdef CONFIG_KGDB_KDB +#include +/* + * kdb_send_sig_info - Allows kdb to send signals without exposing + * signal internals. This function checks if the required locks are + * available before calling the main signal code, to avoid kdb + * deadlocks. + */ +void +kdb_send_sig_info(struct task_struct *t, struct siginfo *info) +{ + static struct task_struct *kdb_prev_t; + int sig, new_t; + if (!spin_trylock(&t->sighand->siglock)) { + kdb_printf("Can't do kill command now.\n" + "The sigmask lock is held somewhere else in " + "kernel, try again later\n"); + return; + } + spin_unlock(&t->sighand->siglock); + new_t = kdb_prev_t != t; + kdb_prev_t = t; + if (t->state != TASK_RUNNING && new_t) { + kdb_printf("Process is not RUNNING, sending a signal from " + "kdb risks deadlock\n" + "on the run queue locks. " + "The signal has _not_ been sent.\n" + "Reissue the kill command if you want to risk " + "the deadlock.\n"); + return; + } + sig = info->si_signo; + if (send_sig_info(sig, info, t)) + kdb_printf("Fail to deliver Signal %d to process %d.\n", + sig, t->pid); + else + kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); +} +#endif /* CONFIG_KGDB_KDB */ -- cgit v1.2.3-55-g7522 From dcc7871128e99458ca86186b7bc8bf27ff0c47b5 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:21 -0500 Subject: kgdb: core changes to support kdb These are the minimum changes to the kgdb core in order to enable an API to connect a new front end (kdb) to the debug core. 
This patch introduces the dbg_kdb_mode variable, which controls where the user level I/O is routed. It will be routed to the gdbstub (kgdb) or to the kdb front end, which is a simple shell available over the kgdboc connection. You can switch back and forth between the kdb and gdb stub modes of operation dynamically. From gdb stub mode you can blindly type "$3#33", or from the kdb mode you can enter "kgdb" to switch to the gdb stub. The logic in the debug core depends on kdb to look for the typical gdb connection sequences and return immediately with KGDB_PASS_EVENT if a gdb serial command sequence is detected. That should allow a reasonably seamless transition between kdb -> gdb without leaving the kernel exception state. The two gdb serial queries that kdb is responsible for detecting are the "?" and "qSupported" packets. CC: Ingo Molnar Signed-off-by: Jason Wessel Acked-by: Martin Hicks --- arch/arm/kernel/kgdb.c | 5 +++ arch/mips/kernel/kgdb.c | 5 +++ arch/powerpc/kernel/kgdb.c | 5 +++ arch/x86/kernel/kgdb.c | 5 +++ include/linux/kgdb.h | 11 ++++- kernel/debug/debug_core.c | 107 ++++++++++++++++++++++++++++++++++++++------- kernel/debug/debug_core.h | 24 ++++++++++ kernel/debug/gdbstub.c | 36 +++++++++++++++ lib/Kconfig.kgdb | 8 +++- 9 files changed, 186 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c index a5b846b9895d..c868a8864117 100644 --- a/arch/arm/kernel/kgdb.c +++ b/arch/arm/kernel/kgdb.c @@ -98,6 +98,11 @@ sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task) gdb_regs[_CPSR] = thread_regs->ARM_cpsr; } +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) +{ + regs->ARM_pc = pc; +} + static int compiled_break; int kgdb_arch_handle_exception(int exception_vector, int signo, diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c index 50c9bb880667..6ed4c83c869b 100644 --- a/arch/mips/kernel/kgdb.c +++ b/arch/mips/kernel/kgdb.c @@ -180,6 +180,11 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) *(ptr++) = regs->cp0_epc; } +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) +{ + regs->cp0_epc = pc; +} + /* * Calls linux_debug_hook before the kernel dies. If KGDB is enabled, * then try to fall into the debugger diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 41bada0298c8..c81e3de1306e 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -309,6 +309,11 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) (unsigned long)(((void *)gdb_regs) + NUMREGBYTES)); } +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) +{ + regs->nip = pc; +} + /* * This function does PowerPC specific processing for interfacing to gdb.
*/ diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b2258ca91003..f95a2c0b915c 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -690,6 +690,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) return instruction_pointer(regs); } +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + struct kgdb_arch arch_kgdb_ops = { /* Breakpoint instruction: */ .gdb_bpt_instr = { 0xcc }, diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 4830142ec339..5b37df00000d 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -16,10 +16,12 @@ #include #include #include - #include +#ifdef CONFIG_HAVE_ARCH_KGDB #include +#endif +#ifdef CONFIG_KGDB struct pt_regs; /** @@ -262,6 +264,7 @@ extern struct kgdb_arch arch_kgdb_ops; extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs); +extern void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc); extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); extern struct kgdb_io *dbg_io_ops; @@ -279,5 +282,9 @@ extern int kgdb_nmicallback(int cpu, void *regs); extern int kgdb_single_step; extern atomic_t kgdb_active; - +#define in_dbg_master() \ + (raw_smp_processor_id() == atomic_read(&kgdb_active)) +#else /* ! CONFIG_KGDB */ +#define in_dbg_master() (0) +#endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7e03969330bc..6e1fa829fdeb 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -77,6 +78,11 @@ static DEFINE_SPINLOCK(kgdb_registration_lock); static int kgdb_con_registered; /* determine if kgdb console output should be used */ static int kgdb_use_con; +/* Next cpu to become the master debug core */ +int dbg_switch_cpu; + +/* Use kdb or gdbserver mode */ +static int dbg_kdb_mode = 1; static int __init opt_kgdb_con(char *str) { @@ -100,6 +106,7 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { * The CPU# of the active CPU, or -1 if none: */ atomic_t kgdb_active = ATOMIC_INIT(-1); +EXPORT_SYMBOL_GPL(kgdb_active); /* * We use NR_CPUs not PERCPU, in case kgdb is used to debug early @@ -301,7 +308,7 @@ int dbg_set_sw_break(unsigned long addr) return 0; } -static int kgdb_deactivate_sw_breakpoints(void) +int dbg_deactivate_sw_breakpoints(void) { unsigned long addr; int error; @@ -395,8 +402,14 @@ static int kgdb_io_ready(int print_wait) return 1; if (atomic_read(&kgdb_setting_breakpoint)) return 1; - if (print_wait) + if (print_wait) { +#ifdef CONFIG_KGDB_KDB + if (!dbg_kdb_mode) + printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); +#else printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); +#endif + } return 1; } @@ -410,7 +423,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) /* Panic on recursive debugger calls: */ exception_level++; addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); - kgdb_deactivate_sw_breakpoints(); + dbg_deactivate_sw_breakpoints(); /* * If the break point removed ok at the place exception @@ -443,11 +456,24 @@ static int kgdb_reenter_check(struct kgdb_state *ks) return 1; } +static void dbg_cpu_switch(int cpu, int next_cpu) +{ + /* Mark the cpu we are switching away from as a slave when it + * holds the kgdb_active token. 
This must be done so that the + * that all the cpus wait in for the debug core will not enter + * again as the master. */ + if (cpu == atomic_read(&kgdb_active)) { + kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE; + kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER; + } + kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER; +} + static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs) { unsigned long flags; int sstep_tries = 100; - int error = 0; + int error; int i, cpu; int trace_on = 0; acquirelock: @@ -460,6 +486,8 @@ acquirelock: cpu = ks->cpu; kgdb_info[cpu].debuggerinfo = regs; kgdb_info[cpu].task = current; + kgdb_info[cpu].ret_state = 0; + kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; /* * Make sure the above info reaches the primary CPU before * our cpu_in_kgdb[] flag setting does: @@ -471,7 +499,11 @@ acquirelock: * master cpu and acquire the kgdb_active lock: */ while (1) { - if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { +cpu_loop: + if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) { + kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; + goto cpu_master_loop; + } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu) break; } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { @@ -513,7 +545,7 @@ return_normal: } if (!kgdb_io_ready(1)) { - error = 1; + kgdb_info[cpu].ret_state = 1; goto kgdb_restore; /* No I/O connection, resume the system */ } @@ -548,7 +580,7 @@ return_normal: * Wait for the other CPUs to be notified and be waiting for us: */ for_each_online_cpu(i) { - while (!atomic_read(&cpu_in_kgdb[i])) + while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i])) cpu_relax(); } @@ -557,7 +589,7 @@ return_normal: * in the debugger and all secondary CPUs are quiescent */ kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); - kgdb_deactivate_sw_breakpoints(); + dbg_deactivate_sw_breakpoints(); kgdb_single_step = 0; kgdb_contthread = current; exception_level = 0; @@ -565,8 +597,26 @@ return_normal: if (trace_on) tracing_off(); - /* Talk to debugger with gdbserial protocol */ - error = gdb_serial_stub(ks); + while (1) { +cpu_master_loop: + if (dbg_kdb_mode) { + kgdb_connected = 1; + error = kdb_stub(ks); + } else { + error = gdb_serial_stub(ks); + } + + if (error == DBG_PASS_EVENT) { + dbg_kdb_mode = !dbg_kdb_mode; + kgdb_connected = 0; + } else if (error == DBG_SWITCH_CPU_EVENT) { + dbg_cpu_switch(cpu, dbg_switch_cpu); + goto cpu_loop; + } else { + kgdb_info[cpu].ret_state = error; + break; + } + } /* Call the I/O driver's post_exception routine */ if (dbg_io_ops->post_exception) @@ -578,11 +628,16 @@ return_normal: for (i = NR_CPUS-1; i >= 0; i--) atomic_dec(&passive_cpu_wait[i]); /* - * Wait till all the CPUs have quit - * from the debugger. + * Wait till all the CPUs have quit from the debugger, + * but allow a CPU that hit an exception and is + * waiting to become the master to remain in the debug + * core. */ for_each_online_cpu(i) { - while (atomic_read(&cpu_in_kgdb[i])) + while (kgdb_do_roundup && + atomic_read(&cpu_in_kgdb[i]) && + !(kgdb_info[i].exception_state & + DCPU_WANT_MASTER)) cpu_relax(); } } @@ -603,7 +658,7 @@ kgdb_restore: clocksource_touch_watchdog(); local_irq_restore(flags); - return error; + return kgdb_info[cpu].ret_state; } /* @@ -632,7 +687,8 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) return 0; /* Ouch, double exception ! 
*/ kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER; ret = kgdb_cpu_enter(ks, regs); - kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER; + kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER | + DCPU_IS_SLAVE); return ret; } @@ -665,7 +721,7 @@ static void kgdb_console_write(struct console *co, const char *s, /* If we're debugging, or KGDB has not connected, don't try * and print. */ - if (!kgdb_connected || atomic_read(&kgdb_active) != -1) + if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode) return; local_irq_save(flags); @@ -687,8 +743,14 @@ static void sysrq_handle_dbg(int key, struct tty_struct *tty) printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); return; } - if (!kgdb_connected) + if (!kgdb_connected) { +#ifdef CONFIG_KGDB_KDB + if (!dbg_kdb_mode) + printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); +#else printk(KERN_CRIT "Entering KGDB\n"); +#endif + } kgdb_breakpoint(); } @@ -817,6 +879,16 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) } EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); +int dbg_io_get_char(void) +{ + int ret = dbg_io_ops->read_char(); + if (!dbg_kdb_mode) + return ret; + if (ret == 127) + return 8; + return ret; +} + /** * kgdb_breakpoint - generate breakpoint exception * @@ -839,6 +911,7 @@ static int __init opt_kgdb_wait(char *str) { kgdb_break_asap = 1; + kdb_init(KDB_INIT_EARLY); if (kgdb_io_module_registered) kgdb_initial_breakpoint(); diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index db554f9be51d..44cf3de8cf9e 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -38,6 +38,8 @@ struct debuggerinfo_struct { void *debuggerinfo; struct task_struct *task; int exception_state; + int ret_state; + int irq_depth; }; extern struct debuggerinfo_struct kgdb_info[]; @@ -47,9 +49,31 @@ extern int dbg_remove_all_break(void); extern int dbg_set_sw_break(unsigned long addr); extern int dbg_remove_sw_break(unsigned long addr); extern int dbg_activate_sw_breakpoints(void); +extern int dbg_deactivate_sw_breakpoints(void); + +/* polled character access to i/o module */ +extern int dbg_io_get_char(void); + +/* stub return value for switching between the gdbstub and kdb */ +#define DBG_PASS_EVENT -12345 +/* Switch from one cpu to another */ +#define DBG_SWITCH_CPU_EVENT -123456 +extern int dbg_switch_cpu; /* gdbstub interface functions */ extern int gdb_serial_stub(struct kgdb_state *ks); extern void gdbstub_msg_write(const char *s, int len); +/* gdbstub functions used for kdb <-> gdbstub transition */ +extern int gdbstub_state(struct kgdb_state *ks, char *cmd); + +#ifdef CONFIG_KGDB_KDB +extern int kdb_stub(struct kgdb_state *ks); +#else /* ! 
CONFIG_KGDB_KDB */ +static inline int kdb_stub(struct kgdb_state *ks) +{ + return DBG_PASS_EVENT; +} +#endif /* CONFIG_KGDB_KDB */ + #endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ccdf0929f12d..188203a19657 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -887,6 +887,13 @@ int gdb_serial_stub(struct kgdb_state *ks) case 'Z': /* Break point set */ gdb_cmd_break(ks); break; +#ifdef CONFIG_KGDB_KDB + case '3': /* Escape into back into kdb */ + if (remcom_in_buffer[1] == '\0') { + gdb_cmd_detachkill(ks); + return DBG_PASS_EVENT; + } +#endif case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) @@ -932,3 +939,32 @@ kgdb_exit: error = 1; return error; } + +int gdbstub_state(struct kgdb_state *ks, char *cmd) +{ + int error; + + switch (cmd[0]) { + case 'e': + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + return error; + case 's': + case 'c': + strcpy(remcom_in_buffer, cmd); + return 0; + case '?': + gdb_cmd_status(ks); + break; + case '\0': + strcpy(remcom_out_buffer, ""); + break; + } + dbg_io_ops->write_char('+'); + put_packet(remcom_out_buffer); + return 0; +} diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index 9b5d1d7f2ef7..78de43a5e902 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -3,7 +3,7 @@ config HAVE_ARCH_KGDB bool menuconfig KGDB - bool "KGDB: kernel debugging with remote gdb" + bool "KGDB: kernel debugger" depends on HAVE_ARCH_KGDB depends on DEBUG_KERNEL && EXPERIMENTAL help @@ -57,4 +57,10 @@ config KGDB_TESTS_BOOT_STRING information about other strings you could use beyond the default of V1F100. +config KGDB_KDB + bool "KGDB_KDB: include kdb frontend for kgdb" + default n + help + KDB frontend for kernel + endif # KGDB -- cgit v1.2.3-55-g7522 From f5316b4aea024da9266d740322a5481657f6ce59 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:22 -0500 Subject: kgdb,8250,pl011: Return immediately from console poll The design of the kdb shell requires that every device that can provide input to kdb have a polling routine that exits immediately if there is no character available. This is required in order to get the page scrolling mechanism working. Changing the kernel debugger I/O API to require all polling character routines to exit immediately if there is no data allows the kernel debugger to process multiple input channels. NO_POLL_CHAR will be the return code to the polling routine whenever there is no character available.
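The contract change is mechanical but worth spelling out: a poll routine must test the ready bit once and bail out, never spin. A sketch of the expected shape for a driver (all demo_* names are hypothetical; the real conversions for 8250 and amba-pl011 follow below):

    /* Non-blocking poll: return the character if one is ready,
     * NO_POLL_CHAR immediately if not, so the debug core can go
     * service its other input channels (e.g. the keyboard). */
    static int demo_get_poll_char(struct uart_port *port)
    {
            unsigned int status = demo_read_status(port);   /* hypothetical */

            if (!(status & DEMO_RX_READY))                  /* hypothetical */
                    return NO_POLL_CHAR;
            return demo_read_rx(port);                      /* hypothetical */
    }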
CC: linux-serial@vger.kernel.org Signed-off-by: Jason Wessel --- drivers/serial/8250.c | 4 ++-- drivers/serial/amba-pl011.c | 6 +++--- include/linux/kdb.h | 1 + include/linux/serial_core.h | 1 + kernel/debug/debug_core.c | 2 ++ kernel/debug/gdbstub.c | 37 +++++++++++++++++++++++++++++++------ kernel/debug/kdb/kdb_debugger.c | 10 ++++++++++ 7 files changed, 50 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index 2b1ea3d4c4f4..891e1dd65f24 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -1891,8 +1891,8 @@ static int serial8250_get_poll_char(struct uart_port *port) struct uart_8250_port *up = (struct uart_8250_port *)port; unsigned char lsr = serial_inp(up, UART_LSR); - while (!(lsr & UART_LSR_DR)) - lsr = serial_inp(up, UART_LSR); + if (!(lsr & UART_LSR_DR)) + return NO_POLL_CHAR; return serial_inp(up, UART_RX); } diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c index 743ebf5f16da..eb4cb480b93e 100644 --- a/drivers/serial/amba-pl011.c +++ b/drivers/serial/amba-pl011.c @@ -342,9 +342,9 @@ static int pl010_get_poll_char(struct uart_port *port) struct uart_amba_port *uap = (struct uart_amba_port *)port; unsigned int status; - do { - status = readw(uap->port.membase + UART01x_FR); - } while (status & UART01x_FR_RXFE); + status = readw(uap->port.membase + UART01x_FR); + if (status & UART01x_FR_RXFE) + return NO_POLL_CHAR; return readw(uap->port.membase + UART01x_DR); } diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 4d93790faec3..d72fa3908128 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -19,6 +19,7 @@ #include #define KDB_POLL_FUNC_MAX 5 +extern int kdb_poll_idx; /* * kdb_initial_cpu is initialized to -1, and is set to the cpu diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 78dd1e7120a9..ad839963fa68 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -246,6 +246,7 @@ struct uart_ops { #endif }; +#define NO_POLL_CHAR 0x00ff0000 #define UART_CONFIG_TYPE (1 << 0) #define UART_CONFIG_IRQ (1 << 1) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 6e1fa829fdeb..1d71df66f3fa 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -882,6 +882,8 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); int dbg_io_get_char(void) { int ret = dbg_io_ops->read_char(); + if (ret == NO_POLL_CHAR) + return -1; if (!dbg_kdb_mode) return ret; if (ret == 127) diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 188203a19657..3c000490a7dd 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -62,6 +63,30 @@ static int hex(char ch) return -1; } +#ifdef CONFIG_KGDB_KDB +static int gdbstub_read_wait(void) +{ + int ret = -1; + int i; + + /* poll any additional I/O interfaces that are defined */ + while (ret < 0) + for (i = 0; kdb_poll_funcs[i] != NULL; i++) { + ret = kdb_poll_funcs[i](); + if (ret > 0) + break; + } + return ret; +} +#else +static int gdbstub_read_wait(void) +{ + int ret = dbg_io_ops->read_char(); + while (ret == NO_POLL_CHAR) + ret = dbg_io_ops->read_char(); + return ret; +} +#endif /* scan for the sequence $# */ static void get_packet(char *buffer) { @@ -75,7 +100,7 @@ static void get_packet(char *buffer) * Spin and wait around for the start character, ignore all * other characters: */ - while ((ch = (dbg_io_ops->read_char())) != '$') + while ((ch = (gdbstub_read_wait())) != '$') /* 
nothing */; kgdb_connected = 1; @@ -88,7 +113,7 @@ static void get_packet(char *buffer) * now, read until a # or end of buffer is found: */ while (count < (BUFMAX - 1)) { - ch = dbg_io_ops->read_char(); + ch = gdbstub_read_wait(); if (ch == '#') break; checksum = checksum + ch; @@ -98,8 +123,8 @@ static void get_packet(char *buffer) buffer[count] = 0; if (ch == '#') { - xmitcsum = hex(dbg_io_ops->read_char()) << 4; - xmitcsum += hex(dbg_io_ops->read_char()); + xmitcsum = hex(gdbstub_read_wait()) << 4; + xmitcsum += hex(gdbstub_read_wait()); if (checksum != xmitcsum) /* failed checksum */ @@ -144,10 +169,10 @@ static void put_packet(char *buffer) dbg_io_ops->flush(); /* Now see what we get in reply. */ - ch = dbg_io_ops->read_char(); + ch = gdbstub_read_wait(); if (ch == 3) - ch = dbg_io_ops->read_char(); + ch = gdbstub_read_wait(); /* If we get an ACK, we are done. */ if (ch == '+') diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index f024c0c4b8c4..bf6e8270e957 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -20,7 +20,15 @@ get_char_func kdb_poll_funcs[] = { dbg_io_get_char, NULL, + NULL, + NULL, + NULL, + NULL, }; +EXPORT_SYMBOL_GPL(kdb_poll_funcs); + +int kdb_poll_idx = 1; +EXPORT_SYMBOL_GPL(kdb_poll_idx); int kdb_stub(struct kgdb_state *ks) { @@ -85,6 +93,7 @@ int kdb_stub(struct kgdb_state *ks) kdb_bp_remove(); KDB_STATE_CLEAR(DOING_SS); KDB_STATE_CLEAR(DOING_SSB); + KDB_STATE_SET(PAGER); /* zero out any offline cpu data */ for_each_present_cpu(i) { if (!cpu_online(i)) { @@ -112,6 +121,7 @@ int kdb_stub(struct kgdb_state *ks) kdb_initial_cpu = -1; kdb_current_task = NULL; kdb_current_regs = NULL; + KDB_STATE_CLEAR(PAGER); kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { -- cgit v1.2.3-55-g7522 From a0de055cf61338549b13079a5677ef2e1b6472ef Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:24 -0500 Subject: kgdb: gdb "monitor" -> kdb passthrough One of the driving forces behind integrating another front end (kdb) to the debug core is to allow front end commands to be accessible via gdb's monitor command. It is true that you could write gdb macros to get certain data, but you may want to just use gdb to access the commands that are available in the kdb front end. This patch implements the Rcmd gdb stub packet. In gdb you access this with the "monitor" command. For instance you could type "monitor help", "monitor lsmod" or "monitor ps A" etc... There is no error checking or command restrictions on what you can and cannot access at this point. Doing something like trying to set breakpoints with the monitor command is going to cause nothing but problems. Perhaps in the future only the commands that are actually known to work with the gdb monitor command will be available. 
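To see what travels on the wire for a monitor command: gdb hex-encodes the command string into a qRcmd packet, and the stub decodes it back before handing it to kdb_parse(). A standalone round-trip demo (userspace C; the in-kernel code below uses kgdb_hex2mem() rather than sscanf):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            const char *cmd = "help";       /* user types: monitor help */
            char pkt[64] = "qRcmd,", buf[32];
            size_t i, len;

            for (i = 0; cmd[i]; i++)        /* encode, as gdb does */
                    sprintf(pkt + 6 + 2 * i, "%02x", (unsigned char)cmd[i]);
            printf("wire packet: %s\n", pkt);       /* qRcmd,68656c70 */

            len = strlen(pkt + 6) / 2;      /* decode, as the stub does */
            for (i = 0; i < len; i++)
                    sscanf(pkt + 6 + 2 * i, "%2hhx", (unsigned char *)&buf[i]);
            buf[len] = '\0';
            printf("decoded command: %s\n", buf);   /* help */
            return 0;
    }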
Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 2 +- kernel/debug/debug_core.h | 2 ++ kernel/debug/gdbstub.c | 22 ++++++++++++++++++++++ kernel/debug/kdb/kdb_io.c | 13 +++++++++---- kernel/debug/kdb/kdb_private.h | 1 - 5 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1d71df66f3fa..1aed37b4c564 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -82,7 +82,7 @@ static int kgdb_use_con; int dbg_switch_cpu; /* Use kdb or gdbserver mode */ -static int dbg_kdb_mode = 1; +int dbg_kdb_mode = 1; static int __init opt_kgdb_con(char *str) { diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 44cf3de8cf9e..c5d753d80f67 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -66,9 +66,11 @@ extern void gdbstub_msg_write(const char *s, int len); /* gdbstub functions used for kdb <-> gdbstub transition */ extern int gdbstub_state(struct kgdb_state *ks, char *cmd); +extern int dbg_kdb_mode; #ifdef CONFIG_KGDB_KDB extern int kdb_stub(struct kgdb_state *ks); +extern int kdb_parse(const char *cmdstr); #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 3c000490a7dd..4b17b3269525 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -201,6 +201,9 @@ void gdbstub_msg_write(const char *s, int len) int wcount; int i; + if (len == 0) + len = strlen(s); + /* 'O'utput */ gdbmsgbuf[0] = 'O'; @@ -685,6 +688,25 @@ static void gdb_cmd_query(struct kgdb_state *ks) kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); } break; +#ifdef CONFIG_KGDB_KDB + case 'R': + if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) { + int len = strlen(remcom_in_buffer + 6); + + if ((len % 2) != 0) { + strcpy(remcom_out_buffer, "E01"); + break; + } + kgdb_hex2mem(remcom_in_buffer + 6, + remcom_out_buffer, len); + len = len / 2; + remcom_out_buffer[len++] = 0; + + kdb_parse(remcom_out_buffer); + strcpy(remcom_out_buffer, "OK"); + } + break; +#endif } } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9e3cec7a925c..8339b291e8bc 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include "kdb_private.h" @@ -669,10 +670,14 @@ kdb_printit: * Write to all consoles. 
*/ retlen = strlen(kdb_buffer); - while (c) { - c->write(c, kdb_buffer, retlen); - touch_nmi_watchdog(); - c = c->next; + if (!dbg_kdb_mode && kgdb_connected) { + gdbstub_msg_write(kdb_buffer, retlen); + } else { + while (c) { + c->write(c, kdb_buffer, retlen); + touch_nmi_watchdog(); + c = c->next; + } } if (logging) { saved_loglevel = console_loglevel; diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 69ed2eff3fea..97d3ba69775d 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -254,7 +254,6 @@ extern unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask); extern void kdb_ps_suppressed(void); extern void kdb_ps1(const struct task_struct *p); -extern int kdb_parse(const char *cmdstr); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); -- cgit v1.2.3-55-g7522 From ada64e4c98eb5f04a9ca223c5ff9e7ac22ce6404 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:24 -0500 Subject: kgdboc,keyboard: Keyboard driver for kdb with kgdb This patch adds in the kdb PS/2 keyboard driver. This was mostly a direct port from the original kdb where I cleaned up the code against checkpatch.pl and added the glue to stitch it into kgdb. This patch also enables early kdb debug via kgdbwait and the keyboard. All the access to configure kdb using either a serial console or the keyboard is done via kgdboc. If you want to use only the keyboard and want to break in early you would add to your kernel command arguments: kgdboc=kbd kgdbwait If you wanted serial and or the keyboard access you could use: kgdboc=kbd,ttyS0 You can also configure kgdboc as a kernel module or at run time with the sysfs where you can activate and deactivate kgdb. Turn it on: echo kbd,ttyS0 > /sys/module/kgdboc/parameters/kgdboc Turn it off: echo "" > /sys/module/kgdboc/parameters/kgdboc Signed-off-by: Jason Wessel Reviewed-by: Dmitry Torokhov --- Documentation/kernel-parameters.txt | 8 +- drivers/serial/kgdboc.c | 61 +++++++++-- kernel/debug/kdb/Makefile | 1 + kernel/debug/kdb/kdb_keyboard.c | 212 ++++++++++++++++++++++++++++++++++++ lib/Kconfig.kgdb | 7 ++ 5 files changed, 279 insertions(+), 10 deletions(-) create mode 100644 kernel/debug/kdb/kdb_keyboard.c (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b12bacd252fc..3845e3a84a52 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1121,9 +1121,11 @@ and is between 256 and 4096 characters. It is defined in the file zone if it does not. kgdboc= [HW] kgdb over consoles. - Requires a tty driver that supports console polling. - (only serial supported for now) - Format: [,baud] + Requires a tty driver that supports console polling, + or a supported polling keyboard driver (non-usb). + Serial only format: [,baud] + keyboard only format: kbd + keyboard and serial format: kbd,[,baud] kmac= [MIPS] korina ethernet MAC address. 
Configure the RouterBoard 532 series on-chip diff --git a/drivers/serial/kgdboc.c b/drivers/serial/kgdboc.c index eadc1ab6bbce..ecef6e1a599a 100644 --- a/drivers/serial/kgdboc.c +++ b/drivers/serial/kgdboc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #define MAX_CONFIG_LEN 40 @@ -32,6 +33,40 @@ static struct kparam_string kps = { static struct tty_driver *kgdb_tty_driver; static int kgdb_tty_line; +#ifdef CONFIG_KDB_KEYBOARD +static int kgdboc_register_kbd(char **cptr) +{ + if (strncmp(*cptr, "kbd", 3) == 0) { + if (kdb_poll_idx < KDB_POLL_FUNC_MAX) { + kdb_poll_funcs[kdb_poll_idx] = kdb_get_kbd_char; + kdb_poll_idx++; + if (cptr[0][3] == ',') + *cptr += 4; + else + return 1; + } + } + return 0; +} + +static void kgdboc_unregister_kbd(void) +{ + int i; + + for (i = 0; i < kdb_poll_idx; i++) { + if (kdb_poll_funcs[i] == kdb_get_kbd_char) { + kdb_poll_idx--; + kdb_poll_funcs[i] = kdb_poll_funcs[kdb_poll_idx]; + kdb_poll_funcs[kdb_poll_idx] = NULL; + i--; + } + } +} +#else /* ! CONFIG_KDB_KEYBOARD */ +#define kgdboc_register_kbd(x) 0 +#define kgdboc_unregister_kbd() +#endif /* ! CONFIG_KDB_KEYBOARD */ + static int kgdboc_option_setup(char *opt) { if (strlen(opt) > MAX_CONFIG_LEN) { @@ -45,25 +80,38 @@ static int kgdboc_option_setup(char *opt) __setup("kgdboc=", kgdboc_option_setup); +static void cleanup_kgdboc(void) +{ + kgdboc_unregister_kbd(); + if (configured == 1) + kgdb_unregister_io_module(&kgdboc_io_ops); +} + static int configure_kgdboc(void) { struct tty_driver *p; int tty_line = 0; int err; + char *cptr = config; err = kgdboc_option_setup(config); if (err || !strlen(config) || isspace(config[0])) goto noconfig; err = -ENODEV; + kgdb_tty_driver = NULL; + + if (kgdboc_register_kbd(&cptr)) + goto do_register; - p = tty_find_polling_driver(config, &tty_line); + p = tty_find_polling_driver(cptr, &tty_line); if (!p) goto noconfig; kgdb_tty_driver = p; kgdb_tty_line = tty_line; +do_register: err = kgdb_register_io_module(&kgdboc_io_ops); if (err) goto noconfig; @@ -75,6 +123,7 @@ static int configure_kgdboc(void) noconfig: config[0] = 0; configured = 0; + cleanup_kgdboc(); return err; } @@ -88,20 +137,18 @@ static int __init init_kgdboc(void) return configure_kgdboc(); } -static void cleanup_kgdboc(void) -{ - if (configured == 1) - kgdb_unregister_io_module(&kgdboc_io_ops); -} - static int kgdboc_get_char(void) { + if (!kgdb_tty_driver) + return -1; return kgdb_tty_driver->ops->poll_get_char(kgdb_tty_driver, kgdb_tty_line); } static void kgdboc_put_char(u8 chr) { + if (!kgdb_tty_driver) + return; kgdb_tty_driver->ops->poll_put_char(kgdb_tty_driver, kgdb_tty_line, chr); } diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile index d1e925eddbcd..d4fc58f4b88d 100644 --- a/kernel/debug/kdb/Makefile +++ b/kernel/debug/kdb/Makefile @@ -8,6 +8,7 @@ CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o +obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o clean-files := gen-kdb_cmds.c diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c new file mode 100644 index 000000000000..4bca634975c0 --- /dev/null +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -0,0 +1,212 @@ +/* + * Kernel Debugger Architecture Dependent Console I/O handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. + * + * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. 
All Rights Reserved. + */ + +#include +#include +#include +#include +#include + +/* Keyboard Controller Registers on normal PCs. */ + +#define KBD_STATUS_REG 0x64 /* Status register (R) */ +#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */ + +/* Status Register Bits */ + +#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */ +#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ + +static int kbd_exists; + +/* + * Check if the keyboard controller has a keypress for us. + * Some parts (Enter Release, LED change) are still blocking polled here, + * but hopefully they are all short. + */ +int kdb_get_kbd_char(void) +{ + int scancode, scanstatus; + static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */ + static int shift_key; /* Shift next keypress */ + static int ctrl_key; + u_short keychar; + + if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) || + (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) { + kbd_exists = 0; + return -1; + } + kbd_exists = 1; + + if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) + return -1; + + /* + * Fetch the scancode + */ + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + + /* + * Ignore mouse events. + */ + if (scanstatus & KBD_STAT_MOUSE_OBF) + return -1; + + /* + * Ignore release, trigger on make + * (except for shift keys, where we want to + * keep the shift state so long as the key is + * held down). + */ + + if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) { + /* + * Next key may use shift table + */ + if ((scancode & 0x80) == 0) + shift_key = 1; + else + shift_key = 0; + return -1; + } + + if ((scancode&0x7f) == 0x1d) { + /* + * Left ctrl key + */ + if ((scancode & 0x80) == 0) + ctrl_key = 1; + else + ctrl_key = 0; + return -1; + } + + if ((scancode & 0x80) != 0) + return -1; + + scancode &= 0x7f; + + /* + * Translate scancode + */ + + if (scancode == 0x3a) { + /* + * Toggle caps lock + */ + shift_lock ^= 1; + +#ifdef KDB_BLINK_LED + kdb_toggleled(0x4); +#endif + return -1; + } + + if (scancode == 0x0e) { + /* + * Backspace + */ + return 8; + } + + /* Special Key */ + switch (scancode) { + case 0xF: /* Tab */ + return 9; + case 0x53: /* Del */ + return 4; + case 0x47: /* Home */ + return 1; + case 0x4F: /* End */ + return 5; + case 0x4B: /* Left */ + return 2; + case 0x48: /* Up */ + return 16; + case 0x50: /* Down */ + return 14; + case 0x4D: /* Right */ + return 6; + } + + if (scancode == 0xe0) + return -1; + + /* + * For Japanese 86/106 keyboards + * See comment in drivers/char/pc_keyb.c. + * - Masahiro Adegawa + */ + if (scancode == 0x73) + scancode = 0x59; + else if (scancode == 0x7d) + scancode = 0x7c; + + if (!shift_lock && !shift_key && !ctrl_key) { + keychar = plain_map[scancode]; + } else if ((shift_lock || shift_key) && key_maps[1]) { + keychar = key_maps[1][scancode]; + } else if (ctrl_key && key_maps[4]) { + keychar = key_maps[4][scancode]; + } else { + keychar = 0x0020; + kdb_printf("Unknown state/scancode (%d)\n", scancode); + } + keychar &= 0x0fff; + if (keychar == '\t') + keychar = ' '; + switch (KTYP(keychar)) { + case KT_LETTER: + case KT_LATIN: + if (isprint(keychar)) + break; /* printable characters */ + /* drop through */ + case KT_SPEC: + if (keychar == K_ENTER) + break; + /* drop through */ + default: + return -1; /* ignore unprintables */ + } + + if ((scancode & 0x7f) == 0x1c) { + /* + * enter key. All done. Absorb the release scancode. 
+ */ + while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) + ; + + /* + * Fetch the scancode + */ + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + + while (scanstatus & KBD_STAT_MOUSE_OBF) { + scancode = inb(KBD_DATA_REG); + scanstatus = inb(KBD_STATUS_REG); + } + + if (scancode != 0x9c) { + /* + * Wasn't an enter-release, why not? + */ + kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", + scancode, scanstatus); + } + + return 13; + } + + return keychar & 0xff; +} +EXPORT_SYMBOL_GPL(kdb_get_kbd_char); diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index 78de43a5e902..ee8ae7132f20 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -63,4 +63,11 @@ config KGDB_KDB help KDB frontend for kernel +config KDB_KEYBOARD + bool "KGDB_KDB: keyboard as input device" + depends on VT && KGDB_KDB + default n + help + KDB can use a PS/2 type keyboard for an input device + endif # KGDB -- cgit v1.2.3-55-g7522 From 98ec1878cacb393975cba64f7392eece81716cb4 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 27 Apr 2009 10:58:06 -0500 Subject: kgdb: remove post_primary_code references Remove all the references to the kgdb_post_primary_code. This function serves no useful purpose because you can obtain the same information from the "struct kgdb_state *ks" from with in the debugger, if for some reason you want the data. Also remove the unintentional duplicate assignment for ks->ex_vector. Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 29 ----------------------------- include/linux/kgdb.h | 14 -------------- kernel/debug/debug_core.c | 8 -------- 3 files changed, 51 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f95a2c0b915c..acba57169938 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -47,20 +47,8 @@ #include #include #include - #include -/* - * Put the error code here just in case the user cares: - */ -static int gdb_x86errcode; - -/* - * Likewise, the vector number here (since GDB only gets the signal - * number through the usual means, and that's not very specific): - */ -static int gdb_x86vector = -1; - /** * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs * @gdb_regs: A pointer to hold the registers in the order GDB wants. @@ -399,23 +387,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) } } -/** - * kgdb_post_primary_code - Save error vector/code numbers. - * @regs: Original pt_regs. - * @e_vector: Original error vector. - * @err_code: Original error code. - * - * This is needed on architectures which support SMP and KGDB. - * This function is called after all the slave cpus have been put - * to a know spin state and the primary CPU has control over KGDB. - */ -void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - /* primary processor is completely in the debugger */ - gdb_x86vector = e_vector; - gdb_x86errcode = err_code; -} - #ifdef CONFIG_SMP /** * kgdb_roundup_cpus - Get other CPUs into a holding pattern diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 407edb1e0c4d..406f6f9286f3 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -35,20 +35,6 @@ struct pt_regs; */ extern int kgdb_skipexception(int exception, struct pt_regs *regs); -/** - * kgdb_post_primary_code - (optional) Save error vector/code numbers. - * @regs: Original pt_regs. - * @e_vector: Original error vector. - * @err_code: Original error code. - * - * This is usually needed on architectures which support SMP and - * KGDB. 
This function is called after all the secondary cpus have - been put to a know spin state and the primary CPU has control over - KGDB. - */ -extern void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, - int err_code); - /** * kgdb_disable_hw_debug - (optional) Disable hardware debugging hook * @regs: Current &struct pt_regs. diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1aed37b4c564..88a83a225374 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -203,12 +203,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) return 0; } -void __weak -kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - return; -} - /** * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. * @regs: Current &struct pt_regs. @@ -588,7 +582,6 @@ return_normal: * At this point the primary processor is completely * in the debugger and all secondary CPUs are quiescent */ - kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); dbg_deactivate_sw_breakpoints(); kgdb_single_step = 0; kgdb_contthread = current; @@ -678,7 +671,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; ks->signo = signo; - ks->ex_vector = evector; ks->err_code = ecode; ks->kgdb_usethreadid = 0; ks->linux_regs = regs; -- cgit v1.2.3-55-g7522 From f503b5ae53cb557ac351a668fcac1baab1cef0db Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:25 -0500 Subject: x86,kgdb: Add low level debug hook The only way the debugger can handle a trap inside rcu_lock, notify_die, or atomic_notifier_call_chain without a triple fault is to have a low level "first opportunity handler" in the int3 exception handler. Generally this will be something the vast majority of folks will not need, but for those who need it, it is added as a kernel .config option called KGDB_LOW_LEVEL_TRAP. CC: Ingo Molnar CC: Thomas Gleixner CC: H.
Peter Anvin CC: x86@kernel.org Signed-off-by: Jason Wessel --- arch/x86/include/asm/kgdb.h | 3 +++ arch/x86/kernel/kgdb.c | 22 +++++++++++++++++++++- arch/x86/kernel/traps.c | 6 ++++++ include/linux/kgdb.h | 1 + kernel/debug/debug_core.c | 2 +- lib/Kconfig.kgdb | 9 +++++++++ 6 files changed, 41 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index e6c6c808489f..006da3687cdc 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -76,4 +76,7 @@ static inline void arch_kgdb_breakpoint(void) #define BREAK_INSTR_SIZE 1 #define CACHE_FLUSH_IS_SAFE 1 +extern int kgdb_ll_trap(int cmd, const char *str, + struct pt_regs *regs, long err, int trap, int sig); + #endif /* _ASM_X86_KGDB_H */ diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index acba57169938..95b89d4cb8f1 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -538,7 +538,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_DONE; } - if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs)) + if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs)) return NOTIFY_DONE; /* Must touch watchdog before return to normal operation */ @@ -546,6 +546,26 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_STOP; } +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP +int kgdb_ll_trap(int cmd, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + if (!kgdb_io_module_registered) + return NOTIFY_DONE; + + return __kgdb_notify(&args, cmd); +} +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ + static int kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 02cfb9b8f5b1..7eaad4c5110a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -451,6 +452,11 @@ void restart_nmi(void) /* May run on IST stack. 
*/ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP + if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 406f6f9286f3..19d1b29a2694 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -60,6 +60,7 @@ struct uart_port; void kgdb_breakpoint(void); extern int kgdb_connected; +extern int kgdb_io_module_registered; extern atomic_t kgdb_setting_breakpoint; extern atomic_t kgdb_cpu_doing_single_step; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 88a83a225374..375e42f0baf0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -66,7 +66,7 @@ int kgdb_connected; EXPORT_SYMBOL_GPL(kgdb_connected); /* All the KGDB handlers are installed */ -static int kgdb_io_module_registered; +int kgdb_io_module_registered; /* Guard for recursive entry */ static int exception_level; diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index ee8ae7132f20..c56ccb4ad292 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -57,6 +57,15 @@ config KGDB_TESTS_BOOT_STRING information about other strings you could use beyond the default of V1F100. +config KGDB_LOW_LEVEL_TRAP + bool "KGDB: Allow debugging with traps in notifiers" + depends on X86 + default n + help + This will add an extra callback to kgdb for the breakpoint + exception handler, which will allow kgdb to step + through a notify handler. + config KGDB_KDB bool "KGDB_KDB: include kdb frontend for kgdb" default n -- cgit v1.2.3-55-g7522 From 1cee5e35f15d0893be1ba944f1aec8676e43ab76 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 3 Jun 2009 14:06:57 -0500 Subject: kgdb: Add the ability to schedule a breakpoint via a tasklet Some kgdb I/O modules require the ability to create a breakpoint tasklet, such as kgdboc and external modules such as kgdboe. The breakpoint tasklet is used as an asynchronous entry point into the debugger which will have a different function scope than the current execution path where it might not be safe to have an inline breakpoint. This is true of some of the kgdb I/O drivers which share code with kgdb and the rest of the kernel.
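A minimal sketch of how an I/O module might use the new entry point; only kgdb_schedule_breakpoint() comes from this patch, while the receive hook and the Ctrl-C convention below are hypothetical:

/* Hypothetical polled-I/O receive path (e.g. a kgdboe-style driver). */
static void example_rx_hook(const char *buf, int len)
{
        if (len == 1 && buf[0] == 0x03) {       /* treat Ctrl-C as a break request */
                /*
                 * Calling kgdb_breakpoint() directly here would trap inside
                 * the very driver kgdb needs for its own I/O; deferring the
                 * breakpoint to the tasklet moves the trap to a safe scope.
                 */
                kgdb_schedule_breakpoint();
                return;
        }
        /* ... normal character handling ... */
}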
Signed-off-by: Jason Wessel --- include/linux/kgdb.h | 1 + kernel/debug/debug_core.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 19d1b29a2694..ee007ea341b8 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -271,6 +271,7 @@ extern int kgdb_mem2hex(char *mem, char *buf, int count); extern int kgdb_hex2mem(char *buf, char *mem, int count); extern int kgdb_isremovedbreak(unsigned long addr); +extern void kgdb_schedule_breakpoint(void); extern int kgdb_handle_exception(int ex_vector, int signo, int err_code, diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 375e42f0baf0..fff59019cca0 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -114,6 +114,7 @@ EXPORT_SYMBOL_GPL(kgdb_active); */ static atomic_t passive_cpu_wait[NR_CPUS]; static atomic_t cpu_in_kgdb[NR_CPUS]; +static atomic_t kgdb_break_tasklet_var; atomic_t kgdb_setting_breakpoint; struct task_struct *kgdb_usethread; @@ -789,6 +790,31 @@ static void kgdb_unregister_callbacks(void) } } +/* + * There are times a tasklet needs to be used vs a compiled in + * break point so as to cause an exception outside a kgdb I/O module, + * such as is the case with kgdboe, where calling a breakpoint in the + * I/O driver itself would be fatal. + */ +static void kgdb_tasklet_bpt(unsigned long ing) +{ + kgdb_breakpoint(); + atomic_set(&kgdb_break_tasklet_var, 0); +} + +static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); + +void kgdb_schedule_breakpoint(void) +{ + if (atomic_read(&kgdb_break_tasklet_var) || + atomic_read(&kgdb_active) != -1 || + atomic_read(&kgdb_setting_breakpoint)) + return; + atomic_inc(&kgdb_break_tasklet_var); + tasklet_schedule(&kgdb_tasklet_breakpoint); +} +EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); + static void kgdb_initial_breakpoint(void) { kgdb_break_asap = 0; -- cgit v1.2.3-55-g7522 From efe2f29e324fd20e0449bcd6dc6dbe4734c2ba94 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:26 -0500 Subject: kgdboc,kdb: Allow kdb to work on a non open console port If kdb is open on a serial port that is not actually a console, make sure to call the poll routines to emit and receive characters.
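A condensed sketch of the two halves of the change (names are the ones used in the diff that follows; the enclosing functions are elided, and the console_drivers list walk assumes the 2.6-era console API):

/* configure_kgdboc(): p/tty_line are the tty driver and line chosen for kgdboc */
struct console *cons;

kgdboc_io_ops.is_console = 0;
for (cons = console_drivers; cons; cons = cons->next) {
        int idx;

        /* Is the chosen tty line actually bound to a registered console? */
        if (cons->device && cons->device(cons, &idx) == p && idx == tty_line) {
                kgdboc_io_ops.is_console = 1;   /* console path already emits */
                break;
        }
}

/* kdb_printf(): without this fallback, a kdb session on a spare port is silent */
if (!dbg_io_ops->is_console) {
        const char *cp = kdb_buffer;
        int len = strlen(kdb_buffer);

        while (len--)
                dbg_io_ops->write_char(*cp++);
}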
Signed-off-by: Jason Wessel Acked-by: Martin Hicks --- drivers/serial/kgdboc.c | 14 ++++++++++++++ include/linux/kgdb.h | 3 +++ kernel/debug/kdb/kdb_io.c | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/drivers/serial/kgdboc.c b/drivers/serial/kgdboc.c index ecef6e1a599a..b765ab48dfe7 100644 --- a/drivers/serial/kgdboc.c +++ b/drivers/serial/kgdboc.c @@ -16,6 +16,7 @@ #include #include #include +#include #define MAX_CONFIG_LEN 40 @@ -93,12 +94,14 @@ static int configure_kgdboc(void) int tty_line = 0; int err; char *cptr = config; + struct console *cons; err = kgdboc_option_setup(config); if (err || !strlen(config) || isspace(config[0])) goto noconfig; err = -ENODEV; + kgdboc_io_ops.is_console = 0; kgdb_tty_driver = NULL; if (kgdboc_register_kbd(&cptr)) @@ -108,6 +111,17 @@ static int configure_kgdboc(void) if (!p) goto noconfig; + cons = console_drivers; + while (cons) { + int idx; + if (cons->device && cons->device(cons, &idx) == p && + idx == tty_line) { + kgdboc_io_ops.is_console = 1; + break; + } + cons = cons->next; + } + kgdb_tty_driver = p; kgdb_tty_line = tty_line; diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index ee007ea341b8..6c784ab6856a 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -247,6 +247,8 @@ struct kgdb_arch { * the I/O driver. * @post_exception: Pointer to a function that will do any cleanup work * for the I/O driver. + * @is_console: 1 if the end device is a console, 0 if the I/O device is + * not a console */ struct kgdb_io { const char *name; @@ -256,6 +258,7 @@ struct kgdb_io { int (*init) (void); void (*pre_exception) (void); void (*post_exception) (void); + int is_console; }; extern struct kgdb_arch arch_kgdb_ops; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 8339b291e8bc..58be7e9c9e95 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -673,6 +673,14 @@ kdb_printit: if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(kdb_buffer, retlen); } else { + if (!dbg_io_ops->is_console) { + len = strlen(kdb_buffer); + cp = kdb_buffer; + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } while (c) { c->write(c, kdb_buffer, retlen); touch_nmi_watchdog(); @@ -719,6 +727,14 @@ kdb_printit: kdb_input_flush(); c = console_drivers; + if (!dbg_io_ops->is_console) { + len = strlen(moreprompt); + cp = moreprompt; + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } while (c) { c->write(c, moreprompt, strlen(moreprompt)); touch_nmi_watchdog(); -- cgit v1.2.3-55-g7522 From d37d39ae3b4a8f9a21114921fb344fe7cadb1abd Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:27 -0500 Subject: printk,kdb: capture printk() when in kdb shell Certain calls from the kdb shell will call out to printk(), and any of these calls should get vectored back to kdb_printf() so that the kdb pager and processing can be used, as well as to properly channel I/O to the polled I/O devices. CC: Randy Dunlap Signed-off-by: Jason Wessel Acked-by: Andrew Morton --- include/linux/kdb.h | 3 +++ kernel/debug/kdb/kdb_bt.c | 2 ++ kernel/debug/kdb/kdb_io.c | 24 ++++++++++++++++++++---- kernel/debug/kdb/kdb_main.c | 4 ++++ kernel/printk.c | 9 +++++++++ 5 files changed, 38 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index d72fa3908128..ccb2b3ec0fe8 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -78,6 +78,9 @@ typedef enum { KDB_REASON_SSTEP, /* Single Step trap.
- regs valid */ } kdb_reason_t; +extern int kdb_trap_printk; +extern int vkdb_printf(const char *fmt, va_list args) + __attribute__ ((format (printf, 1, 0))); extern int kdb_printf(const char *, ...) __attribute__ ((format (printf, 1, 2))); typedef int (*kdb_printf_t)(const char *, ...) diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 483fa4e7aaac..2f62fe85f16a 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -23,6 +23,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; console_loglevel = 15; + kdb_trap_printk++; kdb_set_current_task(p); if (addr) { show_stack((struct task_struct *)p, addr); @@ -36,6 +37,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) show_stack(p, NULL); } console_loglevel = old_lvl; + kdb_trap_printk--; } /* diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 58be7e9c9e95..c9b7f4f90bba 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -29,6 +29,7 @@ #define CMD_BUFLEN 256 char kdb_prompt_str[CMD_BUFLEN]; +int kdb_trap_printk; static void kgdb_transition_check(char *buffer) { @@ -533,12 +534,12 @@ static int kdb_search_string(char *searched, char *searchfor) return 0; } -int kdb_printf(const char *fmt, ...) +int vkdb_printf(const char *fmt, va_list ap) { - va_list ap; int diag; int linecount; int logging, saved_loglevel = 0; + int saved_trap_printk; int got_printf_lock = 0; int retlen = 0; int fnd, len; @@ -549,6 +550,9 @@ int kdb_printf(const char *fmt, ...) unsigned long uninitialized_var(flags); preempt_disable(); + saved_trap_printk = kdb_trap_printk; + kdb_trap_printk = 0; + /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, * even if it is interleaved with any other text. @@ -575,9 +579,7 @@ int kdb_printf(const char *fmt, ...) next_avail = kdb_buffer; size_avail = sizeof(kdb_buffer); } - va_start(ap, fmt); vsnprintf(next_avail, size_avail, fmt, ap); - va_end(ap); /* * If kdb_parse() found that the command was cmd xxx | grep yyy @@ -805,6 +807,20 @@ kdb_print_out: } else { __release(kdb_printf_lock); } + kdb_trap_printk = saved_trap_printk; preempt_enable(); return retlen; } + +int kdb_printf(const char *fmt, ...) +{ + va_list ap; + int r; + + va_start(ap, fmt); + r = vkdb_printf(fmt, ap); + va_end(ap); + + return r; +} + diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 64ef9ac14ba9..b724c791b6d4 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1056,7 +1056,9 @@ static void kdb_dumpregs(struct pt_regs *regs) { int old_lvl = console_loglevel; console_loglevel = 15; + kdb_trap_printk++; show_regs(regs); + kdb_trap_printk--; kdb_printf("\n"); console_loglevel = old_lvl; } @@ -1819,7 +1821,9 @@ static int kdb_sr(int argc, const char **argv) if (argc != 1) return KDB_ARGCOUNT; sysrq_toggle_support(1); + kdb_trap_printk++; handle_sysrq(*argv[1], NULL); + kdb_trap_printk--; return 0; } diff --git a/kernel/printk.c b/kernel/printk.c index 9213b8b5bb4f..444b770c9595 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -602,6 +603,14 @@ asmlinkage int printk(const char *fmt, ...) 
va_list args; int r; +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + va_start(args, fmt); + r = vkdb_printf(fmt, args); + va_end(args); + return r; + } +#endif va_start(args, fmt); r = vprintk(fmt, args); va_end(args); -- cgit v1.2.3-55-g7522 From 6d90634076200af035f1d9dcc8fc11acefd603e9 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:27 -0500 Subject: debug_core,kdb: Allow the debug core to process a recursive debug entry This allows kdb to debug a crash within the KMS code with a single level of recursive re-entry. Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index fff59019cca0..b38bb25dbbb3 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -445,6 +445,10 @@ static int kgdb_reenter_check(struct kgdb_state *ks) } printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); +#ifdef CONFIG_KGDB_KDB + /* Allow kdb to debug itself one level */ + return 0; +#endif dump_stack(); panic("Recursive entry to debugger"); @@ -489,6 +493,9 @@ acquirelock: */ atomic_inc(&cpu_in_kgdb[cpu]); + if (exception_level == 1) + goto cpu_master_loop; + /* * CPU will loop if it is a slave or request to become a kgdb * master cpu and acquire the kgdb_active lock: -- cgit v1.2.3-55-g7522 From 4402c153cb9c549cd21d6007ef0dfac50c8d148d Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:28 -0500 Subject: kdb,debug_core: Allow the debug core to receive a panic notification It is highly desirable to trap into kdb on panic. The debug core will attempt to register as the first in line for the panic notifier. CC: Ingo Molnar CC: Andrew Morton CC: Eric W. Biederman Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b38bb25dbbb3..64b5588c9638 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -762,11 +762,28 @@ static struct sysrq_key_op sysrq_dbg_op = { }; #endif +static int kgdb_panic_event(struct notifier_block *self, + unsigned long val, + void *data) +{ + if (dbg_kdb_mode) + kdb_printf("PANIC: %s\n", (char *)data); + kgdb_breakpoint(); + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_panic_event_nb = { + .notifier_call = kgdb_panic_event, + .priority = INT_MAX, +}; + static void kgdb_register_callbacks(void) { if (!kgdb_io_module_registered) { kgdb_io_module_registered = 1; kgdb_arch_init(); + atomic_notifier_chain_register(&panic_notifier_list, + &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ register_sysrq_key('g', &sysrq_dbg_op); #endif @@ -786,6 +803,8 @@ static void kgdb_unregister_callbacks(void) */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; + atomic_notifier_chain_unregister(&panic_notifier_list, + &kgdb_panic_event_nb); kgdb_arch_exit(); #ifdef CONFIG_MAGIC_SYSRQ unregister_sysrq_key('g', &sysrq_dbg_op); -- cgit v1.2.3-55-g7522 From 0b4b3827db386ec6034a5aba1261025b039440c2 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 May 2010 21:04:29 -0500 Subject: x86, kgdb, init: Add early and late debug states The kernel debugger can operate well before mm_init(), but the x86 hardware breakpoint code which uses the perf API requires that the kernel allocators are initialized.
This means the kernel debug core needs to provide an optional arch-specific callback to allow the initialization functions to run after the kernel has been further initialized. The kdb shell already had a similar restriction with an early initialization and late initialization. The kdb_init() was moved into the debug core's version of the late init, which is called dbg_late_init(). CC: kgdb-bugreport@lists.sourceforge.net Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 17 ++++++++++------- include/linux/kgdb.h | 14 ++++++++++++++ init/main.c | 4 ++-- kernel/debug/debug_core.c | 16 ++++++++++++++++ 4 files changed, 42 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 95b89d4cb8f1..2b71ec41869f 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -595,15 +595,16 @@ static struct notifier_block kgdb_notifier = { * specific callbacks. */ int kgdb_arch_init(void) +{ + return register_die_notifier(&kgdb_notifier); +} + +void kgdb_arch_late(void) { int i, cpu; - int ret; struct perf_event_attr attr; struct perf_event **pevent; - ret = register_die_notifier(&kgdb_notifier); - if (ret != 0) - return ret; /* * Pre-allocate the hw breakpoint structions in the non-atomic * portion of kgdb because this operation requires mutexs to @@ -615,12 +616,15 @@ int kgdb_arch_init(void) attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) + continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); if (IS_ERR(breakinfo[i].pev)) { - printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + printk(KERN_ERR "kgdb: Could not allocate hw " + "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; kgdb_arch_exit(); - return -1; + return; } for_each_online_cpu(cpu) { pevent = per_cpu_ptr(breakinfo[i].pev, cpu); @@ -631,7 +635,6 @@ int kgdb_arch_init(void) } } } - return ret; } /** diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 6c784ab6856a..9340f34d1bb5 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -207,6 +207,17 @@ extern int kgdb_validate_break_address(unsigned long addr); extern int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr); extern int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle); +/** + * kgdb_arch_late - Perform any architecture specific initialization. + * + * This function will handle the late initialization of any + * architecture specific callbacks. This is an optional function for + * handling things like late initialization of hw breakpoints. The + * default implementation does nothing. + */ +extern void kgdb_arch_late(void); + + /** * struct kgdb_arch - Describe architecture specific values. * @gdb_bpt_instr: The instruction to trigger a breakpoint. @@ -285,7 +296,10 @@ extern int kgdb_single_step; extern atomic_t kgdb_active; #define in_dbg_master() \ (raw_smp_processor_id() == atomic_read(&kgdb_active)) +extern bool dbg_is_early; +extern void __init dbg_late_init(void); #else /* ! CONFIG_KGDB */ #define in_dbg_master() (0) #define dbg_late_init() #endif /* !
CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/init/main.c b/init/main.c index 372771333d98..22881b5e95e3 100644 --- a/init/main.c +++ b/init/main.c @@ -62,7 +62,7 @@ #include #include #include -#include +#include #include #include #include @@ -676,7 +676,7 @@ asmlinkage void __init start_kernel(void) buffer_init(); key_init(); security_init(); - kdb_init(KDB_INIT_FULL); + dbg_late_init(); vfs_caches_init(totalram_pages); signals_init(); /* rootfs populating might need page-writeback */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 64b5588c9638..5cb7cd1de10c 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -78,6 +78,8 @@ static DEFINE_SPINLOCK(kgdb_registration_lock); static int kgdb_con_registered; /* determine if kgdb console output should be used */ static int kgdb_use_con; +/* Flag for alternate operations for early debugging */ +bool dbg_is_early = true; /* Next cpu to become the master debug core */ int dbg_switch_cpu; @@ -777,11 +779,25 @@ static struct notifier_block kgdb_panic_event_nb = { .priority = INT_MAX, }; +void __weak kgdb_arch_late(void) +{ +} + +void __init dbg_late_init(void) +{ + dbg_is_early = false; + if (kgdb_io_module_registered) + kgdb_arch_late(); + kdb_init(KDB_INIT_FULL); +} + static void kgdb_register_callbacks(void) { if (!kgdb_io_module_registered) { kgdb_io_module_registered = 1; kgdb_arch_init(); + if (!dbg_is_early) + kgdb_arch_late(); atomic_notifier_chain_register(&panic_notifier_list, &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ -- cgit v1.2.3-55-g7522 From db1afffab0b5d9f6d31f8f4bea44c9cb3bc59351 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 16 Mar 2010 15:14:51 +1100 Subject: kref: remove kref_set Of the three uses of kref_set in the kernel: one really should be kref_put, as the code is letting go of a reference; two really should be kref_init, because the kref is being initialised. This suggests that making kref_set available encourages bad code. So fix the three uses and remove kref_set completely. Signed-off-by: NeilBrown Acked-by: Mimi Zohar Acked-by: Serge Hallyn Signed-off-by: Greg Kroah-Hartman --- include/linux/kref.h | 1 - kernel/user_namespace.c | 4 ++-- lib/kref.c | 15 ++------------- security/integrity/ima/ima_iint.c | 4 ++-- 4 files changed, 6 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/kref.h b/include/linux/kref.h index baf4b9e4b194..6cc38fc07ab7 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -21,7 +21,6 @@ struct kref { atomic_t refcount; }; -void kref_set(struct kref *kref, int num); void kref_init(struct kref *kref); void kref_get(struct kref *kref); int kref_put(struct kref *kref, void (*release) (struct kref *kref)); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 076c7c8215b0..b2d70d38dff4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -54,8 +54,8 @@ int create_user_ns(struct cred *new) #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - /* alloc_uid() incremented the userns refcount. Just set it to 1 */ - kref_set(&ns->kref, 1); + /* root_user holds a reference to ns, our reference can be dropped */ + put_user_ns(ns); return 0; } diff --git a/lib/kref.c b/lib/kref.c index 6d19f690380b..d3d227a08a4b 100644 --- a/lib/kref.c +++ b/lib/kref.c @@ -15,24 +15,14 @@ #include #include -/** - * kref_set - initialize object and set refcount to requested number. - * @kref: object in question.
- * @num: initial reference counter - */ -void kref_set(struct kref *kref, int num) -{ - atomic_set(&kref->refcount, num); - smp_mb(); -} - /** * kref_init - initialize object. * @kref: object in question. */ void kref_init(struct kref *kref) { - kref_set(kref, 1); + atomic_set(&kref->refcount, 1); + smp_mb(); } /** @@ -72,7 +62,6 @@ int kref_put(struct kref *kref, void (*release)(struct kref *kref)) return 0; } -EXPORT_SYMBOL(kref_set); EXPORT_SYMBOL(kref_init); EXPORT_SYMBOL(kref_get); EXPORT_SYMBOL(kref_put); diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c index 2dc2d6594145..7625b85c2274 100644 --- a/security/integrity/ima/ima_iint.c +++ b/security/integrity/ima/ima_iint.c @@ -94,7 +94,7 @@ void iint_free(struct kref *kref) iint->opencount); iint->opencount = 0; } - kref_set(&iint->refcount, 1); + kref_init(&iint->refcount); kmem_cache_free(iint_cache, iint); } @@ -133,7 +133,7 @@ static void init_once(void *foo) iint->readcount = 0; iint->writecount = 0; iint->opencount = 0; - kref_set(&iint->refcount, 1); + kref_init(&iint->refcount); } static int __init ima_iintcache_init(void) -- cgit v1.2.3-55-g7522 From 1704f47b50b5d9e1b825e43e1baaf2c5897baf03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Mar 2010 01:37:42 +0100 Subject: lockdep: Add novalidate class for dev->mutex conversion The conversion of device->sem to device->mutex resulted in lockdep warnings. Create a novalidate class for now until the driver folks come up with separate classes. That way we have at least the basic mutex debugging coverage. Add a checkpatch error so the usage is reserved for device->mutex. [ tglx: checkpatch and compile fix for LOCKDEP=n ] Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 1 + include/linux/lockdep.h | 8 ++++++++ kernel/lockdep.c | 5 +++++ scripts/checkpatch.pl | 11 +++++++++++ 4 files changed, 25 insertions(+) (limited to 'kernel') diff --git a/drivers/base/core.c b/drivers/base/core.c index cf507a7d200c..4c5be85016b6 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -559,6 +559,7 @@ void device_initialize(struct device *dev) kobject_init(&dev->kobj, &device_ktype); INIT_LIST_HEAD(&dev->dma_pools); mutex_init(&dev->mutex); + lockdep_set_novalidate_class(&dev->mutex); spin_lock_init(&dev->devres_lock); INIT_LIST_HEAD(&dev->devres_head); device_pm_init(dev); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a03977a96d7e..06aed8305bf3 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -44,6 +44,8 @@ struct lock_class_key { struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; }; +extern struct lock_class_key __lockdep_no_validate__; + #define LOCKSTAT_POINTS 4 /* @@ -270,6 +272,9 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, #define lockdep_set_subclass(lock, sub) \ lockdep_init_map(&(lock)->dep_map, #lock, \ (lock)->dep_map.key, sub) + +#define lockdep_set_novalidate_class(lock) \ + lockdep_set_class(lock, &__lockdep_no_validate__) /* * Compare locking classes */ @@ -354,6 +359,9 @@ static inline void lockdep_on(void) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) + +#define lockdep_set_novalidate_class(lock) do { } while (0) + /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather diff --git 
a/kernel/lockdep.c b/kernel/lockdep.c index ec21304856d1..54286798c37b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2711,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lockdep_init_map); +struct lock_class_key __lockdep_no_validate__; + /* * This gets called for every mutex_lock*()/spin_lock*() operation. * We maintain the dependency maps and validate the locking attempt: @@ -2745,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; } + if (lock->key == &__lockdep_no_validate__) + check = 1; + if (!subclass) class = lock->class_cache; /* diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index a4d74344d805..f2bbea900700 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2656,6 +2656,7 @@ sub process { # check for semaphores used as mutexes if ($line =~ /^.\s*init_MUTEX_LOCKED\s*\(/) { WARN("consider using a completion\n" . $herecurr); + } # recommend strict_strto* over simple_strto* if ($line =~ /\bsimple_(strto.*?)\s*\(/) { @@ -2740,6 +2741,16 @@ sub process { WARN("use of in_atomic() is incorrect outside core kernel code\n" . $herecurr); } } + +# check for lockdep_set_novalidate_class + if ($line =~ /^.\s*lockdep_set_novalidate_class\s*\(/ || + $line =~ /__lockdep_no_validate__\s*\)/ ) { + if ($realfile !~ m@^kernel/lockdep@ && + $realfile !~ m@^include/linux/lockdep@ && + $realfile !~ m@^drivers/base/core@) { + ERROR("lockdep_no_validate class is reserved for device->mutex.\n" . $herecurr); + } + } } # If we have no input at all, then there is nothing to report on -- cgit v1.2.3-55-g7522 From 2c3c8bea608866d8bd9dcf92657d57fdcac011c5 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 12 May 2010 18:28:57 -0700 Subject: sysfs: add struct file* to bin_attr callbacks This allows bin_attr->read,write,mmap callbacks to check file specific data (such as inode owner) as part of any privilege validation. 
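A hypothetical consumer of the new argument, not taken from the patch; the field names assume the 2.6.34-era struct file (f_cred was added around 2.6.29) and plain uid_t comparisons:

/* Hypothetical ->read() callback gating access on the inode owner. */
static ssize_t example_bin_read(struct file *filp, struct kobject *kobj,
                                struct bin_attribute *attr,
                                char *buf, loff_t off, size_t count)
{
        struct inode *inode = filp->f_path.dentry->d_inode;

        /* Refuse readers whose fsuid does not match the attribute's owner. */
        if (filp->f_cred->fsuid != inode->i_uid)
                return -EPERM;

        /* ... copy the attribute's data into buf as before ... */
        return 0;
}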
Signed-off-by: Chris Wright Signed-off-by: Greg Kroah-Hartman --- arch/alpha/kernel/pci-sysfs.c | 8 ++++--- arch/mips/txx9/generic/setup.c | 4 ++-- arch/powerpc/sysdev/mv64x60_pci.c | 4 ++-- arch/s390/kernel/ipl.c | 14 ++++++------ drivers/acpi/system.c | 2 +- drivers/base/firmware_class.c | 11 ++++++---- drivers/firmware/dcdbas.c | 4 ++-- drivers/firmware/dell_rbu.c | 10 ++++----- drivers/firmware/efivars.c | 4 ++-- drivers/gpu/drm/drm_sysfs.c | 5 +++-- drivers/misc/c2port/core.c | 4 ++-- drivers/misc/ds1682.c | 6 ++++-- drivers/misc/eeprom/at24.c | 6 ++++-- drivers/misc/eeprom/at25.c | 6 ++++-- drivers/misc/eeprom/eeprom.c | 3 ++- drivers/misc/eeprom/max6875.c | 2 +- drivers/net/netxen/netxen_nic_main.c | 11 ++++++---- drivers/net/qlcnic/qlcnic_main.c | 12 +++++++---- drivers/pci/hotplug/acpiphp_ibm.c | 5 +++-- drivers/pci/pci-sysfs.c | 42 +++++++++++++++++++++++++----------- drivers/pcmcia/cistpl.c | 4 ++-- drivers/power/olpc_battery.c | 2 +- drivers/rapidio/rio-sysfs.c | 6 ++++-- drivers/rtc/rtc-cmos.c | 6 ++++-- drivers/rtc/rtc-ds1305.c | 6 ++++-- drivers/rtc/rtc-ds1307.c | 6 ++++-- drivers/rtc/rtc-ds1511.c | 10 +++++---- drivers/rtc/rtc-ds1553.c | 4 ++-- drivers/rtc/rtc-ds1742.c | 4 ++-- drivers/rtc/rtc-m48t59.c | 4 ++-- drivers/rtc/rtc-stk17ta8.c | 4 ++-- drivers/rtc/rtc-tx4939.c | 4 ++-- drivers/s390/cio/chp.c | 5 +++-- drivers/scsi/3w-sas.c | 4 ++-- drivers/scsi/arcmsr/arcmsr_attr.c | 9 +++++--- drivers/scsi/ibmvscsi/ibmvfc.c | 3 ++- drivers/scsi/ipr.c | 9 +++++--- drivers/scsi/lpfc/lpfc_attr.c | 20 ++++++++++++----- drivers/scsi/qla2xxx/qla_attr.c | 32 +++++++++++++-------------- drivers/staging/udlfb/udlfb.c | 3 ++- drivers/usb/core/sysfs.c | 3 ++- drivers/video/aty/radeon_base.c | 4 ++-- drivers/w1/slaves/w1_ds2431.c | 4 ++-- drivers/w1/slaves/w1_ds2433.c | 4 ++-- drivers/w1/slaves/w1_ds2760.c | 2 +- drivers/w1/w1.c | 4 ++-- drivers/zorro/zorro-sysfs.c | 2 +- fs/sysfs/bin.c | 24 ++++++++++----------- include/linux/sysfs.h | 7 +++--- kernel/ksysfs.c | 3 ++- kernel/module.c | 2 +- net/bridge/br_sysfs_br.c | 2 +- 52 files changed, 220 insertions(+), 149 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index d979e7c7bc4b..a5fffc882c72 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -53,6 +53,7 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num, /** * pci_mmap_resource - map a PCI resource into user memory space + * @filp: open sysfs file * @kobj: kobject for mapping * @attr: struct bin_attribute for the file being mapped * @vma: struct vm_area_struct passed into the mmap @@ -60,7 +61,8 @@ static int __pci_mmap_fits(struct pci_dev *pdev, int num, * * Use the bus mapping routines to map a PCI resource into userspace. 
*/ -static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, +static int pci_mmap_resource(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma, int sparse) { struct pci_dev *pdev = to_pci_dev(container_of(kobj, @@ -89,14 +91,14 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, return hose_mmap_page_range(pdev->sysdata, vma, mmap_type, sparse); } -static int pci_mmap_resource_sparse(struct kobject *kobj, +static int pci_mmap_resource_sparse(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, struct vm_area_struct *vma) { return pci_mmap_resource(kobj, attr, vma, 1); } -static int pci_mmap_resource_dense(struct kobject *kobj, +static int pci_mmap_resource_dense(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, struct vm_area_struct *vma) { diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c index adc69291f9e2..575d219b8001 100644 --- a/arch/mips/txx9/generic/setup.c +++ b/arch/mips/txx9/generic/setup.c @@ -905,7 +905,7 @@ struct txx9_sramc_sysdev { void __iomem *base; }; -static ssize_t txx9_sram_read(struct kobject *kobj, +static ssize_t txx9_sram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { @@ -920,7 +920,7 @@ static ssize_t txx9_sram_read(struct kobject *kobj, return size; } -static ssize_t txx9_sram_write(struct kobject *kobj, +static ssize_t txx9_sram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { diff --git a/arch/powerpc/sysdev/mv64x60_pci.c b/arch/powerpc/sysdev/mv64x60_pci.c index 1456015a22d8..198f288570cc 100644 --- a/arch/powerpc/sysdev/mv64x60_pci.c +++ b/arch/powerpc/sysdev/mv64x60_pci.c @@ -24,7 +24,7 @@ #define MV64X60_VAL_LEN_MAX 11 #define MV64X60_PCICFG_CPCI_HOTSWAP 0x68 -static ssize_t mv64x60_hs_reg_read(struct kobject *kobj, +static ssize_t mv64x60_hs_reg_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { @@ -45,7 +45,7 @@ static ssize_t mv64x60_hs_reg_read(struct kobject *kobj, return sprintf(buf, "0x%08x\n", v); } -static ssize_t mv64x60_hs_reg_write(struct kobject *kobj, +static ssize_t mv64x60_hs_reg_write(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 72c8b0d070c8..a689070be287 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -403,8 +403,9 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, static struct kobj_attribute sys_ipl_device_attr = __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL); -static ssize_t ipl_parameter_read(struct kobject *kobj, struct bin_attribute *attr, - char *buf, loff_t off, size_t count) +static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) { return memory_read_from_buffer(buf, count, &off, IPL_PARMBLOCK_START, IPL_PARMBLOCK_SIZE); @@ -419,8 +420,9 @@ static struct bin_attribute ipl_parameter_attr = { .read = &ipl_parameter_read, }; -static ssize_t ipl_scp_data_read(struct kobject *kobj, struct bin_attribute *attr, - char *buf, loff_t off, size_t count) +static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) { unsigned int size = 
IPL_PARMBLOCK_START->ipl_info.fcp.scp_data_len; void *scp_data = &IPL_PARMBLOCK_START->ipl_info.fcp.scp_data; @@ -694,7 +696,7 @@ static struct kobj_attribute sys_reipl_ccw_vmparm_attr = /* FCP reipl device attributes */ -static ssize_t reipl_fcp_scpdata_read(struct kobject *kobj, +static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { @@ -704,7 +706,7 @@ static ssize_t reipl_fcp_scpdata_read(struct kobject *kobj, return memory_read_from_buffer(buf, count, &off, scp_data, size); } -static ssize_t reipl_fcp_scpdata_write(struct kobject *kobj, +static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index e35525b39f6b..c79e789ed03a 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -71,7 +71,7 @@ struct acpi_table_attr { struct list_head node; }; -static ssize_t acpi_table_show(struct kobject *kobj, +static ssize_t acpi_table_show(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t offset, size_t count) { diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index d98e424675cf..3f093b0dd217 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -278,8 +278,9 @@ static ssize_t firmware_loading_store(struct device *dev, static DEVICE_ATTR(loading, 0644, firmware_loading_show, firmware_loading_store); static ssize_t -firmware_data_read(struct kobject *kobj, struct bin_attribute *bin_attr, - char *buffer, loff_t offset, size_t count) +firmware_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buffer, loff_t offset, + size_t count) { struct device *dev = to_dev(kobj); struct firmware_priv *fw_priv = dev_get_drvdata(dev); @@ -362,6 +363,7 @@ fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size) /** * firmware_data_write - write method for firmware + * @filp: open sysfs file * @kobj: kobject for the device * @bin_attr: bin_attr structure * @buffer: buffer being written @@ -372,8 +374,9 @@ fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size) * the driver as a firmware image. 
**/ static ssize_t -firmware_data_write(struct kobject *kobj, struct bin_attribute *bin_attr, - char *buffer, loff_t offset, size_t count) +firmware_data_write(struct file* filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buffer, + loff_t offset, size_t count) { struct device *dev = to_dev(kobj); struct firmware_priv *fw_priv = dev_get_drvdata(dev); diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c index fb09bb3c0ad6..aa9bc9e980e1 100644 --- a/drivers/firmware/dcdbas.c +++ b/drivers/firmware/dcdbas.c @@ -149,7 +149,7 @@ static ssize_t smi_data_buf_size_store(struct device *dev, return count; } -static ssize_t smi_data_read(struct kobject *kobj, +static ssize_t smi_data_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { @@ -162,7 +162,7 @@ static ssize_t smi_data_read(struct kobject *kobj, return ret; } -static ssize_t smi_data_write(struct kobject *kobj, +static ssize_t smi_data_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { diff --git a/drivers/firmware/dell_rbu.c b/drivers/firmware/dell_rbu.c index 3a4460265b10..2f452f1f7c8a 100644 --- a/drivers/firmware/dell_rbu.c +++ b/drivers/firmware/dell_rbu.c @@ -522,7 +522,7 @@ static ssize_t read_rbu_mono_data(char *buffer, loff_t pos, size_t count) rbu_data.image_update_buffer, rbu_data.bios_image_size); } -static ssize_t read_rbu_data(struct kobject *kobj, +static ssize_t read_rbu_data(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { @@ -576,7 +576,7 @@ static void callbackfn_rbu(const struct firmware *fw, void *context) release_firmware(fw); } -static ssize_t read_rbu_image_type(struct kobject *kobj, +static ssize_t read_rbu_image_type(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { @@ -586,7 +586,7 @@ static ssize_t read_rbu_image_type(struct kobject *kobj, return size; } -static ssize_t write_rbu_image_type(struct kobject *kobj, +static ssize_t write_rbu_image_type(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { @@ -647,7 +647,7 @@ static ssize_t write_rbu_image_type(struct kobject *kobj, return rc; } -static ssize_t read_rbu_packet_size(struct kobject *kobj, +static ssize_t read_rbu_packet_size(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { @@ -660,7 +660,7 @@ static ssize_t read_rbu_packet_size(struct kobject *kobj, return size; } -static ssize_t write_rbu_packet_size(struct kobject *kobj, +static ssize_t write_rbu_packet_size(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index 81b70bd07586..2a62ec6390e0 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -402,7 +402,7 @@ efivar_unregister(struct efivar_entry *var) } -static ssize_t efivar_create(struct kobject *kobj, +static ssize_t efivar_create(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { @@ -461,7 +461,7 @@ static ssize_t efivar_create(struct kobject *kobj, return count; } -static ssize_t efivar_delete(struct kobject *kobj, +static ssize_t efivar_delete(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, 
char *buf, loff_t pos, size_t count) { diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c index 25bbd30ed7af..387166d5a109 100644 --- a/drivers/gpu/drm/drm_sysfs.c +++ b/drivers/gpu/drm/drm_sysfs.c @@ -193,8 +193,9 @@ static ssize_t enabled_show(struct device *device, "disabled"); } -static ssize_t edid_show(struct kobject *kobj, struct bin_attribute *attr, - char *buf, loff_t off, size_t count) +static ssize_t edid_show(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, + size_t count) { struct device *connector_dev = container_of(kobj, struct device, kobj); struct drm_connector *connector = to_drm_connector(connector_dev); diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c index ed090e77c9cd..19fc7c1cb428 100644 --- a/drivers/misc/c2port/core.c +++ b/drivers/misc/c2port/core.c @@ -707,7 +707,7 @@ static ssize_t __c2port_read_flash_data(struct c2port_device *dev, return nread; } -static ssize_t c2port_read_flash_data(struct kobject *kobj, +static ssize_t c2port_read_flash_data(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buffer, loff_t offset, size_t count) { @@ -824,7 +824,7 @@ static ssize_t __c2port_write_flash_data(struct c2port_device *dev, return nwrite; } -static ssize_t c2port_write_flash_data(struct kobject *kobj, +static ssize_t c2port_write_flash_data(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buffer, loff_t offset, size_t count) { diff --git a/drivers/misc/ds1682.c b/drivers/misc/ds1682.c index 9197cfc55015..a513f0aa6432 100644 --- a/drivers/misc/ds1682.c +++ b/drivers/misc/ds1682.c @@ -140,7 +140,8 @@ static const struct attribute_group ds1682_group = { /* * User data attribute */ -static ssize_t ds1682_eeprom_read(struct kobject *kobj, struct bin_attribute *attr, +static ssize_t ds1682_eeprom_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct i2c_client *client = kobj_to_i2c_client(kobj); @@ -163,7 +164,8 @@ static ssize_t ds1682_eeprom_read(struct kobject *kobj, struct bin_attribute *at return count; } -static ssize_t ds1682_eeprom_write(struct kobject *kobj, struct bin_attribute *attr, +static ssize_t ds1682_eeprom_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct i2c_client *client = kobj_to_i2c_client(kobj); diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index db7d0f21b65d..a79a62f75481 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -274,7 +274,8 @@ static ssize_t at24_read(struct at24_data *at24, return retval; } -static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr, +static ssize_t at24_bin_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct at24_data *at24; @@ -395,7 +396,8 @@ static ssize_t at24_write(struct at24_data *at24, const char *buf, loff_t off, return retval; } -static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr, +static ssize_t at24_bin_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct at24_data *at24; diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c index d194212a41f6..c627e4174ccd 100644 --- a/drivers/misc/eeprom/at25.c +++ b/drivers/misc/eeprom/at25.c @@ -126,7 +126,8 @@ at25_ee_read( } static ssize_t 
-at25_bin_read(struct kobject *kobj, struct bin_attribute *bin_attr, +at25_bin_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev; @@ -253,7 +254,8 @@ at25_ee_write(struct at25_data *at25, const char *buf, loff_t off, } static ssize_t -at25_bin_write(struct kobject *kobj, struct bin_attribute *bin_attr, +at25_bin_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev; diff --git a/drivers/misc/eeprom/eeprom.c b/drivers/misc/eeprom/eeprom.c index e306a8cd2f96..45060ddc4e59 100644 --- a/drivers/misc/eeprom/eeprom.c +++ b/drivers/misc/eeprom/eeprom.c @@ -81,7 +81,8 @@ exit: mutex_unlock(&data->update_lock); } -static ssize_t eeprom_read(struct kobject *kobj, struct bin_attribute *bin_attr, +static ssize_t eeprom_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct i2c_client *client = to_i2c_client(container_of(kobj, struct device, kobj)); diff --git a/drivers/misc/eeprom/max6875.c b/drivers/misc/eeprom/max6875.c index fe2909278507..5653a3ce0517 100644 --- a/drivers/misc/eeprom/max6875.c +++ b/drivers/misc/eeprom/max6875.c @@ -107,7 +107,7 @@ exit_up: mutex_unlock(&data->update_lock); } -static ssize_t max6875_read(struct kobject *kobj, +static ssize_t max6875_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c index c61a61f177b7..6ce6ce1df6d2 100644 --- a/drivers/net/netxen/netxen_nic_main.c +++ b/drivers/net/netxen/netxen_nic_main.c @@ -2560,7 +2560,8 @@ netxen_sysfs_validate_crb(struct netxen_adapter *adapter, } static ssize_t -netxen_sysfs_read_crb(struct kobject *kobj, struct bin_attribute *attr, +netxen_sysfs_read_crb(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = container_of(kobj, struct device, kobj); @@ -2587,7 +2588,8 @@ netxen_sysfs_read_crb(struct kobject *kobj, struct bin_attribute *attr, } static ssize_t -netxen_sysfs_write_crb(struct kobject *kobj, struct bin_attribute *attr, +netxen_sysfs_write_crb(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = container_of(kobj, struct device, kobj); @@ -2627,7 +2629,8 @@ netxen_sysfs_validate_mem(struct netxen_adapter *adapter, } static ssize_t -netxen_sysfs_read_mem(struct kobject *kobj, struct bin_attribute *attr, +netxen_sysfs_read_mem(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = container_of(kobj, struct device, kobj); @@ -2647,7 +2650,7 @@ netxen_sysfs_read_mem(struct kobject *kobj, struct bin_attribute *attr, return size; } -static ssize_t netxen_sysfs_write_mem(struct kobject *kobj, +static ssize_t netxen_sysfs_write_mem(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c index 1003eb76fda3..23ea9caa5261 100644 --- a/drivers/net/qlcnic/qlcnic_main.c +++ b/drivers/net/qlcnic/qlcnic_main.c @@ -2464,7 +2464,8 @@ qlcnic_sysfs_validate_crb(struct qlcnic_adapter *adapter, } static ssize_t -qlcnic_sysfs_read_crb(struct kobject *kobj, struct bin_attribute 
*attr, +qlcnic_sysfs_read_crb(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = container_of(kobj, struct device, kobj); @@ -2488,7 +2489,8 @@ qlcnic_sysfs_read_crb(struct kobject *kobj, struct bin_attribute *attr, } static ssize_t -qlcnic_sysfs_write_crb(struct kobject *kobj, struct bin_attribute *attr, +qlcnic_sysfs_write_crb(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = container_of(kobj, struct device, kobj); @@ -2525,7 +2527,8 @@ qlcnic_sysfs_validate_mem(struct qlcnic_adapter *adapter, } static ssize_t -qlcnic_sysfs_read_mem(struct kobject *kobj, struct bin_attribute *attr, +qlcnic_sysfs_read_mem(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = container_of(kobj, struct device, kobj); @@ -2546,7 +2549,8 @@ qlcnic_sysfs_read_mem(struct kobject *kobj, struct bin_attribute *attr, } static ssize_t -qlcnic_sysfs_write_mem(struct kobject *kobj, struct bin_attribute *attr, +qlcnic_sysfs_write_mem(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t offset, size_t size) { struct device *dev = container_of(kobj, struct device, kobj); diff --git a/drivers/pci/hotplug/acpiphp_ibm.c b/drivers/pci/hotplug/acpiphp_ibm.c index 6ecbfb27db9d..e525263210ee 100644 --- a/drivers/pci/hotplug/acpiphp_ibm.c +++ b/drivers/pci/hotplug/acpiphp_ibm.c @@ -108,7 +108,7 @@ static int ibm_set_attention_status(struct hotplug_slot *slot, u8 status); static int ibm_get_attention_status(struct hotplug_slot *slot, u8 *status); static void ibm_handle_events(acpi_handle handle, u32 event, void *context); static int ibm_get_table_from_acpi(char **bufp); -static ssize_t ibm_read_apci_table(struct kobject *kobj, +static ssize_t ibm_read_apci_table(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t size); static acpi_status __init ibm_find_acpi_device(acpi_handle handle, @@ -351,6 +351,7 @@ read_table_done: /** * ibm_read_apci_table - callback for the sysfs apci_table file + * @filp: the open sysfs file * @kobj: the kobject this binary attribute is a part of * @bin_attr: struct bin_attribute for this file * @buffer: the kernel space buffer to fill @@ -364,7 +365,7 @@ read_table_done: * things get really tricky here... * our solution is to only allow reading the table in all at once. 
*/ -static ssize_t ibm_read_apci_table(struct kobject *kobj, +static ssize_t ibm_read_apci_table(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t size) { diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index fad93983bfed..ad44557e65c4 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -357,7 +357,8 @@ boot_vga_show(struct device *dev, struct device_attribute *attr, char *buf) struct device_attribute vga_attr = __ATTR_RO(boot_vga); static ssize_t -pci_read_config(struct kobject *kobj, struct bin_attribute *bin_attr, +pci_read_config(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct pci_dev *dev = to_pci_dev(container_of(kobj,struct device,kobj)); @@ -430,7 +431,8 @@ pci_read_config(struct kobject *kobj, struct bin_attribute *bin_attr, } static ssize_t -pci_write_config(struct kobject *kobj, struct bin_attribute *bin_attr, +pci_write_config(struct file* filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct pci_dev *dev = to_pci_dev(container_of(kobj,struct device,kobj)); @@ -487,7 +489,8 @@ pci_write_config(struct kobject *kobj, struct bin_attribute *bin_attr, } static ssize_t -read_vpd_attr(struct kobject *kobj, struct bin_attribute *bin_attr, +read_vpd_attr(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct pci_dev *dev = @@ -502,7 +505,8 @@ read_vpd_attr(struct kobject *kobj, struct bin_attribute *bin_attr, } static ssize_t -write_vpd_attr(struct kobject *kobj, struct bin_attribute *bin_attr, +write_vpd_attr(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct pci_dev *dev = @@ -519,6 +523,7 @@ write_vpd_attr(struct kobject *kobj, struct bin_attribute *bin_attr, #ifdef HAVE_PCI_LEGACY /** * pci_read_legacy_io - read byte(s) from legacy I/O port space + * @filp: open sysfs file * @kobj: kobject corresponding to file to read from * @bin_attr: struct bin_attribute for this file * @buf: buffer to store results @@ -529,7 +534,8 @@ write_vpd_attr(struct kobject *kobj, struct bin_attribute *bin_attr, * callback routine (pci_legacy_read). */ static ssize_t -pci_read_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, +pci_read_legacy_io(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct pci_bus *bus = to_pci_bus(container_of(kobj, @@ -545,6 +551,7 @@ pci_read_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, /** * pci_write_legacy_io - write byte(s) to legacy I/O port space + * @filp: open sysfs file * @kobj: kobject corresponding to file to read from * @bin_attr: struct bin_attribute for this file * @buf: buffer containing value to be written @@ -555,7 +562,8 @@ pci_read_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, * callback routine (pci_legacy_write). 
*/ static ssize_t -pci_write_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, +pci_write_legacy_io(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct pci_bus *bus = to_pci_bus(container_of(kobj, @@ -570,6 +578,7 @@ pci_write_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, /** * pci_mmap_legacy_mem - map legacy PCI memory into user memory space + * @filp: open sysfs file * @kobj: kobject corresponding to device to be mapped * @attr: struct bin_attribute for this file * @vma: struct vm_area_struct passed to mmap @@ -579,7 +588,8 @@ pci_write_legacy_io(struct kobject *kobj, struct bin_attribute *bin_attr, * memory space. */ static int -pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr, +pci_mmap_legacy_mem(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) { struct pci_bus *bus = to_pci_bus(container_of(kobj, @@ -591,6 +601,7 @@ pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr, /** * pci_mmap_legacy_io - map legacy PCI IO into user memory space + * @filp: open sysfs file * @kobj: kobject corresponding to device to be mapped * @attr: struct bin_attribute for this file * @vma: struct vm_area_struct passed to mmap @@ -600,7 +611,8 @@ pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr, * memory space. Returns -ENOSYS if the operation isn't supported */ static int -pci_mmap_legacy_io(struct kobject *kobj, struct bin_attribute *attr, +pci_mmap_legacy_io(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) { struct pci_bus *bus = to_pci_bus(container_of(kobj, @@ -750,14 +762,16 @@ pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, } static int -pci_mmap_resource_uc(struct kobject *kobj, struct bin_attribute *attr, +pci_mmap_resource_uc(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) { return pci_mmap_resource(kobj, attr, vma, 0); } static int -pci_mmap_resource_wc(struct kobject *kobj, struct bin_attribute *attr, +pci_mmap_resource_wc(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) { return pci_mmap_resource(kobj, attr, vma, 1); @@ -861,6 +875,7 @@ void __weak pci_remove_resource_files(struct pci_dev *dev) { return; } /** * pci_write_rom - used to enable access to the PCI ROM display + * @filp: sysfs file * @kobj: kernel object handle * @bin_attr: struct bin_attribute for this file * @buf: user input @@ -870,7 +885,8 @@ void __weak pci_remove_resource_files(struct pci_dev *dev) { return; } * writing anything except 0 enables it */ static ssize_t -pci_write_rom(struct kobject *kobj, struct bin_attribute *bin_attr, +pci_write_rom(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct pci_dev *pdev = to_pci_dev(container_of(kobj, struct device, kobj)); @@ -885,6 +901,7 @@ pci_write_rom(struct kobject *kobj, struct bin_attribute *bin_attr, /** * pci_read_rom - read a PCI ROM + * @filp: sysfs file * @kobj: kernel object handle * @bin_attr: struct bin_attribute for this file * @buf: where to put the data we read from the ROM @@ -895,7 +912,8 @@ pci_write_rom(struct kobject *kobj, struct bin_attribute *bin_attr, * device corresponding to @kobj. 
*/ static ssize_t -pci_read_rom(struct kobject *kobj, struct bin_attribute *bin_attr, +pci_read_rom(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct pci_dev *pdev = to_pci_dev(container_of(kobj, struct device, kobj)); diff --git a/drivers/pcmcia/cistpl.c b/drivers/pcmcia/cistpl.c index 60d428be0b07..8844bc3e3118 100644 --- a/drivers/pcmcia/cistpl.c +++ b/drivers/pcmcia/cistpl.c @@ -1531,7 +1531,7 @@ static ssize_t pccard_extract_cis(struct pcmcia_socket *s, char *buf, } -static ssize_t pccard_show_cis(struct kobject *kobj, +static ssize_t pccard_show_cis(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -1562,7 +1562,7 @@ static ssize_t pccard_show_cis(struct kobject *kobj, } -static ssize_t pccard_store_cis(struct kobject *kobj, +static ssize_t pccard_store_cis(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c index 8fefe5a73558..baefcf1cffc9 100644 --- a/drivers/power/olpc_battery.c +++ b/drivers/power/olpc_battery.c @@ -354,7 +354,7 @@ static enum power_supply_property olpc_bat_props[] = { #define EEPROM_END 0x80 #define EEPROM_SIZE (EEPROM_END - EEPROM_START) -static ssize_t olpc_bat_eeprom_read(struct kobject *kobj, +static ssize_t olpc_bat_eeprom_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { uint8_t ec_byte; diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c index ba742e82c57d..00b475658356 100644 --- a/drivers/rapidio/rio-sysfs.c +++ b/drivers/rapidio/rio-sysfs.c @@ -68,7 +68,8 @@ struct device_attribute rio_dev_attrs[] = { }; static ssize_t -rio_read_config(struct kobject *kobj, struct bin_attribute *bin_attr, +rio_read_config(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct rio_dev *dev = @@ -139,7 +140,8 @@ rio_read_config(struct kobject *kobj, struct bin_attribute *bin_attr, } static ssize_t -rio_write_config(struct kobject *kobj, struct bin_attribute *bin_attr, +rio_write_config(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct rio_dev *dev = diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index e9aa814ddd23..ece4dbddc0ea 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -519,7 +519,8 @@ static const struct rtc_class_ops cmos_rtc_ops = { #define NVRAM_OFFSET (RTC_REG_D + 1) static ssize_t -cmos_nvram_read(struct kobject *kobj, struct bin_attribute *attr, +cmos_nvram_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { int retval; @@ -547,7 +548,8 @@ cmos_nvram_read(struct kobject *kobj, struct bin_attribute *attr, } static ssize_t -cmos_nvram_write(struct kobject *kobj, struct bin_attribute *attr, +cmos_nvram_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct cmos_rtc *cmos; diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index 7836c9cec557..48da85e97ca4 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -542,7 +542,8 @@ static void msg_init(struct spi_message *m, struct spi_transfer *x, } static ssize_t -ds1305_nvram_read(struct kobject *kobj, struct bin_attribute *attr, 
+ds1305_nvram_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct spi_device *spi; @@ -572,7 +573,8 @@ ds1305_nvram_read(struct kobject *kobj, struct bin_attribute *attr, } static ssize_t -ds1305_nvram_write(struct kobject *kobj, struct bin_attribute *attr, +ds1305_nvram_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct spi_device *spi; diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index c4ec5c158aa1..de033b7ac21f 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -556,7 +556,8 @@ static const struct rtc_class_ops ds13xx_rtc_ops = { #define NVRAM_SIZE 56 static ssize_t -ds1307_nvram_read(struct kobject *kobj, struct bin_attribute *attr, +ds1307_nvram_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct i2c_client *client; @@ -580,7 +581,8 @@ ds1307_nvram_read(struct kobject *kobj, struct bin_attribute *attr, } static ssize_t -ds1307_nvram_write(struct kobject *kobj, struct bin_attribute *attr, +ds1307_nvram_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct i2c_client *client; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 06b8566c4532..37268e97de49 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -423,8 +423,9 @@ static const struct rtc_class_ops ds1511_rtc_ops = { }; static ssize_t -ds1511_nvram_read(struct kobject *kobj, struct bin_attribute *ba, - char *buf, loff_t pos, size_t size) +ds1511_nvram_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *ba, + char *buf, loff_t pos, size_t size) { ssize_t count; @@ -452,8 +453,9 @@ ds1511_nvram_read(struct kobject *kobj, struct bin_attribute *ba, } static ssize_t -ds1511_nvram_write(struct kobject *kobj, struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t size) +ds1511_nvram_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t size) { ssize_t count; diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 244f9994bcbb..ff432e2ca275 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -252,7 +252,7 @@ static const struct rtc_class_ops ds1553_rtc_ops = { .update_irq_enable = ds1553_rtc_update_irq_enable, }; -static ssize_t ds1553_nvram_read(struct kobject *kobj, +static ssize_t ds1553_nvram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { @@ -267,7 +267,7 @@ static ssize_t ds1553_nvram_read(struct kobject *kobj, return count; } -static ssize_t ds1553_nvram_write(struct kobject *kobj, +static ssize_t ds1553_nvram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 2b4b0bc42d6f..042630c90dd3 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -128,7 +128,7 @@ static const struct rtc_class_ops ds1742_rtc_ops = { .set_time = ds1742_rtc_set_time, }; -static ssize_t ds1742_nvram_read(struct kobject *kobj, +static ssize_t ds1742_nvram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { @@ -143,7 +143,7 @@ static ssize_t ds1742_nvram_read(struct kobject *kobj, return count; } -static 
ssize_t ds1742_nvram_write(struct kobject *kobj, +static ssize_t ds1742_nvram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 365ff3ac2348..be8359fdb65a 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -343,7 +343,7 @@ static const struct rtc_class_ops m48t02_rtc_ops = { .set_time = m48t59_rtc_set_time, }; -static ssize_t m48t59_nvram_read(struct kobject *kobj, +static ssize_t m48t59_nvram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { @@ -363,7 +363,7 @@ static ssize_t m48t59_nvram_read(struct kobject *kobj, return cnt; } -static ssize_t m48t59_nvram_write(struct kobject *kobj, +static ssize_t m48t59_nvram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index b53a00198dbe..3b943673cd3e 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -244,7 +244,7 @@ static const struct rtc_class_ops stk17ta8_rtc_ops = { .alarm_irq_enable = stk17ta8_rtc_alarm_irq_enable, }; -static ssize_t stk17ta8_nvram_read(struct kobject *kobj, +static ssize_t stk17ta8_nvram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t pos, size_t size) { @@ -259,7 +259,7 @@ static ssize_t stk17ta8_nvram_read(struct kobject *kobj, return count; } -static ssize_t stk17ta8_nvram_write(struct kobject *kobj, +static ssize_t stk17ta8_nvram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t pos, size_t size) { diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index 20bfc64a15c8..ec6313d15359 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -188,7 +188,7 @@ static const struct rtc_class_ops tx4939_rtc_ops = { .alarm_irq_enable = tx4939_rtc_alarm_irq_enable, }; -static ssize_t tx4939_rtc_nvram_read(struct kobject *kobj, +static ssize_t tx4939_rtc_nvram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { @@ -207,7 +207,7 @@ static ssize_t tx4939_rtc_nvram_read(struct kobject *kobj, return count; } -static ssize_t tx4939_rtc_nvram_write(struct kobject *kobj, +static ssize_t tx4939_rtc_nvram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 1d16189f2f2d..6c9fa15aac7b 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -135,7 +135,8 @@ static int s390_vary_chpid(struct chp_id chpid, int on) /* * Channel measurement related functions */ -static ssize_t chp_measurement_chars_read(struct kobject *kobj, +static ssize_t chp_measurement_chars_read(struct file *filp, + struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -182,7 +183,7 @@ static void chp_measurement_copy_block(struct cmg_entry *buf, } while (reference_buf.values[0] != buf->values[0]); } -static ssize_t chp_measurement_read(struct kobject *kobj, +static ssize_t chp_measurement_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 54c5ffb1eaa1..d38000db9237 100644 --- a/drivers/scsi/3w-sas.c +++ 
b/drivers/scsi/3w-sas.c @@ -98,7 +98,7 @@ static int twl_reset_device_extension(TW_Device_Extension *tw_dev, int ioctl_res /* Functions */ /* This function returns AENs through sysfs */ -static ssize_t twl_sysfs_aen_read(struct kobject *kobj, +static ssize_t twl_sysfs_aen_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *outbuf, loff_t offset, size_t count) { @@ -129,7 +129,7 @@ static struct bin_attribute twl_sysfs_aen_read_attr = { }; /* This function returns driver compatibility info through sysfs */ -static ssize_t twl_sysfs_compat_info(struct kobject *kobj, +static ssize_t twl_sysfs_compat_info(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *outbuf, loff_t offset, size_t count) { diff --git a/drivers/scsi/arcmsr/arcmsr_attr.c b/drivers/scsi/arcmsr/arcmsr_attr.c index 5877f29a6005..a4e04c50c436 100644 --- a/drivers/scsi/arcmsr/arcmsr_attr.c +++ b/drivers/scsi/arcmsr/arcmsr_attr.c @@ -59,7 +59,8 @@ struct device_attribute *arcmsr_host_attrs[]; -static ssize_t arcmsr_sysfs_iop_message_read(struct kobject *kobj, +static ssize_t arcmsr_sysfs_iop_message_read(struct file *filp, + struct kobject *kobj, struct bin_attribute *bin, char *buf, loff_t off, size_t count) @@ -105,7 +106,8 @@ static ssize_t arcmsr_sysfs_iop_message_read(struct kobject *kobj, return (allxfer_len); } -static ssize_t arcmsr_sysfs_iop_message_write(struct kobject *kobj, +static ssize_t arcmsr_sysfs_iop_message_write(struct file *filp, + struct kobject *kobj, struct bin_attribute *bin, char *buf, loff_t off, size_t count) @@ -153,7 +155,8 @@ static ssize_t arcmsr_sysfs_iop_message_write(struct kobject *kobj, } } -static ssize_t arcmsr_sysfs_iop_message_clear(struct kobject *kobj, +static ssize_t arcmsr_sysfs_iop_message_clear(struct file *filp, + struct kobject *kobj, struct bin_attribute *bin, char *buf, loff_t off, size_t count) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index d18f45c95639..3eb2b7b3d8b0 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -2919,6 +2919,7 @@ static DEVICE_ATTR(log_level, S_IRUGO | S_IWUSR, #ifdef CONFIG_SCSI_IBMVFC_TRACE /** * ibmvfc_read_trace - Dump the adapter trace + * @filp: open sysfs file * @kobj: kobject struct * @bin_attr: bin_attribute struct * @buf: buffer @@ -2928,7 +2929,7 @@ static DEVICE_ATTR(log_level, S_IRUGO | S_IWUSR, * Return value: * number of bytes printed to buffer **/ -static ssize_t ibmvfc_read_trace(struct kobject *kobj, +static ssize_t ibmvfc_read_trace(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index b90c118119d7..6a6661c35b2f 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -3120,6 +3120,7 @@ restart: #ifdef CONFIG_SCSI_IPR_TRACE /** * ipr_read_trace - Dump the adapter trace + * @filp: open sysfs file * @kobj: kobject struct * @bin_attr: bin_attribute struct * @buf: buffer @@ -3129,7 +3130,7 @@ restart: * Return value: * number of bytes printed to buffer **/ -static ssize_t ipr_read_trace(struct kobject *kobj, +static ssize_t ipr_read_trace(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -3764,6 +3765,7 @@ static struct device_attribute *ipr_ioa_attrs[] = { #ifdef CONFIG_SCSI_IPR_DUMP /** * ipr_read_dump - Dump the adapter + * @filp: open sysfs file * @kobj: kobject struct * @bin_attr: bin_attribute struct * @buf: buffer @@ -3773,7 
+3775,7 @@ static struct device_attribute *ipr_ioa_attrs[] = { * Return value: * number of bytes printed to buffer **/ -static ssize_t ipr_read_dump(struct kobject *kobj, +static ssize_t ipr_read_dump(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -3927,6 +3929,7 @@ static int ipr_free_dump(struct ipr_ioa_cfg *ioa_cfg) /** * ipr_write_dump - Setup dump state of adapter + * @filp: open sysfs file * @kobj: kobject struct * @bin_attr: bin_attribute struct * @buf: buffer @@ -3936,7 +3939,7 @@ static int ipr_free_dump(struct ipr_ioa_cfg *ioa_cfg) * Return value: * number of bytes printed to buffer **/ -static ssize_t ipr_write_dump(struct kobject *kobj, +static ssize_t ipr_write_dump(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 2e5f376d9ccc..bf33b315f93e 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -2643,6 +2643,7 @@ static DEVICE_ATTR(lpfc_stat_data_ctrl, S_IRUGO | S_IWUSR, /** * sysfs_drvr_stat_data_read - Read function for lpfc_drvr_stat_data attribute + * @filp: sysfs file * @kobj: Pointer to the kernel object * @bin_attr: Attribute object * @buff: Buffer pointer @@ -2654,7 +2655,8 @@ static DEVICE_ATTR(lpfc_stat_data_ctrl, S_IRUGO | S_IWUSR, * applications. **/ static ssize_t -sysfs_drvr_stat_data_read(struct kobject *kobj, struct bin_attribute *bin_attr, +sysfs_drvr_stat_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = container_of(kobj, struct device, @@ -3362,6 +3364,7 @@ struct device_attribute *lpfc_vport_attrs[] = { /** * sysfs_ctlreg_write - Write method for writing to ctlreg + * @filp: open sysfs file * @kobj: kernel kobject that contains the kernel class device. * @bin_attr: kernel attributes passed to us. * @buf: contains the data to be written to the adapter IOREG space. @@ -3379,7 +3382,8 @@ struct device_attribute *lpfc_vport_attrs[] = { * value of count, buf contents written **/ static ssize_t -sysfs_ctlreg_write(struct kobject *kobj, struct bin_attribute *bin_attr, +sysfs_ctlreg_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { size_t buf_off; @@ -3415,6 +3419,7 @@ sysfs_ctlreg_write(struct kobject *kobj, struct bin_attribute *bin_attr, /** * sysfs_ctlreg_read - Read method for reading from ctlreg + * @filp: open sysfs file * @kobj: kernel kobject that contains the kernel class device. * @bin_attr: kernel attributes passed to us. * @buf: if successful contains the data from the adapter IOREG space. @@ -3431,7 +3436,8 @@ sysfs_ctlreg_write(struct kobject *kobj, struct bin_attribute *bin_attr, * value of count, buf contents read **/ static ssize_t -sysfs_ctlreg_read(struct kobject *kobj, struct bin_attribute *bin_attr, +sysfs_ctlreg_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { size_t buf_off; @@ -3496,6 +3502,7 @@ sysfs_mbox_idle(struct lpfc_hba *phba) /** * sysfs_mbox_write - Write method for writing information via mbox + * @filp: open sysfs file * @kobj: kernel kobject that contains the kernel class device. * @bin_attr: kernel attributes passed to us. * @buf: contains the data to be written to sysfs mbox. 
@@ -3516,7 +3523,8 @@ sysfs_mbox_idle(struct lpfc_hba *phba) * count number of bytes transferred **/ static ssize_t -sysfs_mbox_write(struct kobject *kobj, struct bin_attribute *bin_attr, +sysfs_mbox_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = container_of(kobj, struct device, kobj); @@ -3571,6 +3579,7 @@ sysfs_mbox_write(struct kobject *kobj, struct bin_attribute *bin_attr, /** * sysfs_mbox_read - Read method for reading information via mbox + * @filp: open sysfs file * @kobj: kernel kobject that contains the kernel class device. * @bin_attr: kernel attributes passed to us. * @buf: contains the data to be read from sysfs mbox. @@ -3593,7 +3602,8 @@ sysfs_mbox_write(struct kobject *kobj, struct bin_attribute *bin_attr, * count number of bytes transferred **/ static ssize_t -sysfs_mbox_read(struct kobject *kobj, struct bin_attribute *bin_attr, +sysfs_mbox_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = container_of(kobj, struct device, kobj); diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 3b708606b932..1e4cafabba15 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -16,7 +16,7 @@ static int qla24xx_vport_disable(struct fc_vport *, bool); /* SYSFS attributes --------------------------------------------------------- */ static ssize_t -qla2x00_sysfs_read_fw_dump(struct kobject *kobj, +qla2x00_sysfs_read_fw_dump(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -32,7 +32,7 @@ qla2x00_sysfs_read_fw_dump(struct kobject *kobj, } static ssize_t -qla2x00_sysfs_write_fw_dump(struct kobject *kobj, +qla2x00_sysfs_write_fw_dump(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -92,7 +92,7 @@ static struct bin_attribute sysfs_fw_dump_attr = { }; static ssize_t -qla2x00_sysfs_read_nvram(struct kobject *kobj, +qla2x00_sysfs_read_nvram(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -111,7 +111,7 @@ qla2x00_sysfs_read_nvram(struct kobject *kobj, } static ssize_t -qla2x00_sysfs_write_nvram(struct kobject *kobj, +qla2x00_sysfs_write_nvram(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -177,7 +177,7 @@ static struct bin_attribute sysfs_nvram_attr = { }; static ssize_t -qla2x00_sysfs_read_optrom(struct kobject *kobj, +qla2x00_sysfs_read_optrom(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -193,7 +193,7 @@ qla2x00_sysfs_read_optrom(struct kobject *kobj, } static ssize_t -qla2x00_sysfs_write_optrom(struct kobject *kobj, +qla2x00_sysfs_write_optrom(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -224,7 +224,7 @@ static struct bin_attribute sysfs_optrom_attr = { }; static ssize_t -qla2x00_sysfs_write_optrom_ctl(struct kobject *kobj, +qla2x00_sysfs_write_optrom_ctl(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -387,7 +387,7 @@ static struct bin_attribute sysfs_optrom_ctl_attr = { }; static ssize_t -qla2x00_sysfs_read_vpd(struct kobject *kobj, +qla2x00_sysfs_read_vpd(struct file *filp, struct 
kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -408,7 +408,7 @@ qla2x00_sysfs_read_vpd(struct kobject *kobj, } static ssize_t -qla2x00_sysfs_write_vpd(struct kobject *kobj, +qla2x00_sysfs_write_vpd(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -461,7 +461,7 @@ static struct bin_attribute sysfs_vpd_attr = { }; static ssize_t -qla2x00_sysfs_read_sfp(struct kobject *kobj, +qla2x00_sysfs_read_sfp(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -522,7 +522,7 @@ static struct bin_attribute sysfs_sfp_attr = { }; static ssize_t -qla2x00_sysfs_write_reset(struct kobject *kobj, +qla2x00_sysfs_write_reset(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -592,7 +592,7 @@ static struct bin_attribute sysfs_reset_attr = { }; static ssize_t -qla2x00_sysfs_write_edc(struct kobject *kobj, +qla2x00_sysfs_write_edc(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -650,7 +650,7 @@ static struct bin_attribute sysfs_edc_attr = { }; static ssize_t -qla2x00_sysfs_write_edc_status(struct kobject *kobj, +qla2x00_sysfs_write_edc_status(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -700,7 +700,7 @@ qla2x00_sysfs_write_edc_status(struct kobject *kobj, } static ssize_t -qla2x00_sysfs_read_edc_status(struct kobject *kobj, +qla2x00_sysfs_read_edc_status(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -730,7 +730,7 @@ static struct bin_attribute sysfs_edc_status_attr = { }; static ssize_t -qla2x00_sysfs_read_xgmac_stats(struct kobject *kobj, +qla2x00_sysfs_read_xgmac_stats(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -782,7 +782,7 @@ static struct bin_attribute sysfs_xgmac_stats_attr = { }; static ssize_t -qla2x00_sysfs_read_dcbx_tlv(struct kobject *kobj, +qla2x00_sysfs_read_dcbx_tlv(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/staging/udlfb/udlfb.c b/drivers/staging/udlfb/udlfb.c index aa8195199a2c..577f2bf6eb23 100644 --- a/drivers/staging/udlfb/udlfb.c +++ b/drivers/staging/udlfb/udlfb.c @@ -1063,7 +1063,8 @@ static ssize_t metrics_misc_show(struct device *fbdev, atomic_read(&dev->lost_pixels) ? 
"yes" : "no"); } -static ssize_t edid_show(struct kobject *kobj, struct bin_attribute *a, +static ssize_t edid_show(struct file *filp, struct kobject *kobj, + struct bin_attribute *a, char *buf, loff_t off, size_t count) { struct device *fbdev = container_of(kobj, struct device, kobj); struct fb_info *fb_info = dev_get_drvdata(fbdev); diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c index 06863befaf3a..448f5b47fc48 100644 --- a/drivers/usb/core/sysfs.c +++ b/drivers/usb/core/sysfs.c @@ -658,7 +658,8 @@ const struct attribute_group *usb_device_groups[] = { /* Binary descriptors */ static ssize_t -read_descriptors(struct kobject *kobj, struct bin_attribute *attr, +read_descriptors(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = container_of(kobj, struct device, kobj); diff --git a/drivers/video/aty/radeon_base.c b/drivers/video/aty/radeon_base.c index 6c37e8ee5efe..3c1e13ed1cba 100644 --- a/drivers/video/aty/radeon_base.c +++ b/drivers/video/aty/radeon_base.c @@ -2099,7 +2099,7 @@ static ssize_t radeon_show_one_edid(char *buf, loff_t off, size_t count, const u } -static ssize_t radeon_show_edid1(struct kobject *kobj, +static ssize_t radeon_show_edid1(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -2112,7 +2112,7 @@ static ssize_t radeon_show_edid1(struct kobject *kobj, } -static ssize_t radeon_show_edid2(struct kobject *kobj, +static ssize_t radeon_show_edid2(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/w1/slaves/w1_ds2431.c b/drivers/w1/slaves/w1_ds2431.c index 2c6c0cf6a20f..84e2410aec1d 100644 --- a/drivers/w1/slaves/w1_ds2431.c +++ b/drivers/w1/slaves/w1_ds2431.c @@ -96,7 +96,7 @@ static int w1_f2d_readblock(struct w1_slave *sl, int off, int count, char *buf) return -1; } -static ssize_t w1_f2d_read_bin(struct kobject *kobj, +static ssize_t w1_f2d_read_bin(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -202,7 +202,7 @@ retry: return 0; } -static ssize_t w1_f2d_write_bin(struct kobject *kobj, +static ssize_t w1_f2d_write_bin(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/w1/slaves/w1_ds2433.c b/drivers/w1/slaves/w1_ds2433.c index d2bf32118a98..0f7b8f9c509a 100644 --- a/drivers/w1/slaves/w1_ds2433.c +++ b/drivers/w1/slaves/w1_ds2433.c @@ -92,7 +92,7 @@ static int w1_f23_refresh_block(struct w1_slave *sl, struct w1_f23_data *data, } #endif /* CONFIG_W1_SLAVE_DS2433_CRC */ -static ssize_t w1_f23_read_bin(struct kobject *kobj, +static ssize_t w1_f23_read_bin(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -206,7 +206,7 @@ static int w1_f23_write(struct w1_slave *sl, int addr, int len, const u8 *data) return 0; } -static ssize_t w1_f23_write_bin(struct kobject *kobj, +static ssize_t w1_f23_write_bin(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/w1/slaves/w1_ds2760.c b/drivers/w1/slaves/w1_ds2760.c index 6e153343e117..483d45180911 100644 --- a/drivers/w1/slaves/w1_ds2760.c +++ b/drivers/w1/slaves/w1_ds2760.c @@ -97,7 +97,7 @@ int w1_ds2760_recall_eeprom(struct device *dev, int addr) return w1_ds2760_eeprom_cmd(dev, addr, 
W1_DS2760_RECALL_DATA); } -static ssize_t w1_ds2760_read_bin(struct kobject *kobj, +static ssize_t w1_ds2760_read_bin(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c index ad5897dc4495..2839e281cd65 100644 --- a/drivers/w1/w1.c +++ b/drivers/w1/w1.c @@ -120,7 +120,7 @@ static struct device_attribute w1_slave_attr_id = /* Default family */ -static ssize_t w1_default_write(struct kobject *kobj, +static ssize_t w1_default_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -139,7 +139,7 @@ out_up: return count; } -static ssize_t w1_default_read(struct kobject *kobj, +static ssize_t w1_default_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/drivers/zorro/zorro-sysfs.c b/drivers/zorro/zorro-sysfs.c index eb924e0a64ce..26f7184ef9e1 100644 --- a/drivers/zorro/zorro-sysfs.c +++ b/drivers/zorro/zorro-sysfs.c @@ -49,7 +49,7 @@ static ssize_t zorro_show_resource(struct device *dev, struct device_attribute * static DEVICE_ATTR(resource, S_IRUGO, zorro_show_resource, NULL); -static ssize_t zorro_read_config(struct kobject *kobj, +static ssize_t zorro_read_config(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 806b277453f9..4e321f7353fa 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -46,9 +46,9 @@ struct bin_buffer { }; static int -fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) +fill_read(struct file *file, char *buffer, loff_t off, size_t count) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; int rc; @@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) rc = -EIO; if (attr->read) - rc = attr->read(kobj, attr, buffer, off, count); + rc = attr->read(file, kobj, attr, buffer, off, count); sysfs_put_active(attr_sd); @@ -70,8 +70,7 @@ static ssize_t read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) { struct bin_buffer *bb = file->private_data; - struct dentry *dentry = file->f_path.dentry; - int size = dentry->d_inode->i_size; + int size = file->f_path.dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; @@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) mutex_lock(&bb->mutex); - count = fill_read(dentry, bb->buffer, offs, count); + count = fill_read(file, bb->buffer, offs, count); if (count < 0) { mutex_unlock(&bb->mutex); goto out_free; @@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) } static int -flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) +flush_write(struct file *file, char *buffer, loff_t offset, size_t count) { - struct sysfs_dirent *attr_sd = dentry->d_fsdata; + struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; int rc; @@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) rc = -EIO; if (attr->write) - rc = attr->write(kobj, attr, 
buffer, offset, count); + rc = attr->write(file, kobj, attr, buffer, offset, count); sysfs_put_active(attr_sd); @@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, size_t bytes, loff_t *off) { struct bin_buffer *bb = file->private_data; - struct dentry *dentry = file->f_path.dentry; - int size = dentry->d_inode->i_size; + int size = file->f_path.dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); char *temp; @@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, memcpy(bb->buffer, temp, count); - count = flush_write(dentry, bb->buffer, offs, count); + count = flush_write(file, bb->buffer, offs, count); mutex_unlock(&bb->mutex); if (count > 0) @@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma) if (!attr->mmap) goto out_put; - rc = attr->mmap(kobj, attr, vma); + rc = attr->mmap(file, kobj, attr, vma); if (rc) goto out_put; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 6903e9204032..f2694eb4dd3d 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -87,17 +87,18 @@ struct attribute_group { #define attr_name(_attr) (_attr).attr.name +struct file; struct vm_area_struct; struct bin_attribute { struct attribute attr; size_t size; void *private; - ssize_t (*read)(struct kobject *, struct bin_attribute *, + ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); - ssize_t (*write)(struct kobject *, struct bin_attribute *, + ssize_t (*write)(struct file *,struct kobject *, struct bin_attribute *, char *, loff_t, size_t); - int (*mmap)(struct kobject *, struct bin_attribute *attr, + int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr, struct vm_area_struct *vma); }; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 21fe3c426948..0b624e791805 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak)); extern const void __stop_notes __attribute__((weak)); #define notes_size (&__stop_notes - &__start_notes) -static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, +static ssize_t notes_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { memcpy(buf, &__start_notes + off, count); diff --git a/kernel/module.c b/kernel/module.c index e2564580f3f1..5e14483768bb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1182,7 +1182,7 @@ struct module_notes_attrs { struct bin_attribute attrs[0]; }; -static ssize_t module_notes_read(struct kobject *kobj, +static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index dd321e39e621..486b8f3861d2 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -659,7 +659,7 @@ static struct attribute_group bridge_group = { * * Returns the number of bytes read. 
*/ -static ssize_t brforward_read(struct kobject *kobj, +static ssize_t brforward_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { -- cgit v1.2.3-55-g7522 From 35f3d14dbbc58447c61e38a162ea10add6b31dc7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 May 2010 10:43:18 +0200 Subject: pipe: add support for shrinking and growing pipes This patch adds F_GETPIPE_SZ and F_SETPIPE_SZ fcntl() actions for growing and shrinking the size of a pipe and adjusts pipe.c and splice.c (and relay and network splice) usage to work with these larger (or smaller) pipes. Signed-off-by: Jens Axboe --- fs/fcntl.c | 5 ++ fs/pipe.c | 107 ++++++++++++++++++++++++++++---- fs/splice.c | 151 ++++++++++++++++++++++++++++++++-------------- include/linux/fcntl.h | 6 ++ include/linux/pipe_fs_i.h | 11 ++-- include/linux/splice.h | 7 +++ kernel/relay.c | 15 +++-- kernel/trace/trace.c | 60 ++++++++++-------- net/core/skbuff.c | 38 ++++++------ 9 files changed, 292 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/fs/fcntl.c b/fs/fcntl.c index 452d02f9075e..bcba960328fa 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/pipe_fs_i.h> #include #include #include @@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_NOTIFY: err = fcntl_dirnotify(fd, filp, arg); break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/fs/pipe.c b/fs/pipe.c index 37ba29ff3158..054b8a6a2c7a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/log2.h> #include #include #include @@ -390,7 +391,7 @@ redo: if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); + curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; do_wakeup = 1; @@ -472,7 +473,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & - (PIPE_BUFFERS-1); + (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; @@ -518,8 +519,8 @@ redo1: break; } bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; char *src; @@ -580,7 +581,7 @@ redo2: if (!total_len) break; } - if (bufs < PIPE_BUFFERS) + if (bufs < pipe->buffers) continue; if (filp->f_flags & O_NONBLOCK) { if (!ret) @@ -640,7 +641,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { count += pipe->bufs[buf].len; - buf = (buf+1) & (pipe->buffers - 1); } mutex_unlock(&inode->i_mutex); @@ -671,7 +672,7 @@ pipe_poll(struct file *filp, poll_table *wait) } if (filp->f_mode & FMODE_WRITE) { - mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; /* * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll().
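Before the remaining fs/pipe.c hunks, a rough sketch of driving the new fcntl() actions from userspace may help (illustrative only, not part of the patch). At this stage of the interface the argument and return value are counts of pipe buffers (pages), and pipe_set_size() insists on a power of two; the fallback defines below assume F_LINUX_SPECIFIC_BASE is 1024, matching the values added to include/linux/fcntl.h later in this patch.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SETPIPE_SZ
#define F_SETPIPE_SZ	(1024 + 7)	/* F_LINUX_SPECIFIC_BASE + 7 */
#define F_GETPIPE_SZ	(1024 + 8)	/* F_LINUX_SPECIFIC_BASE + 8 */
#endif

int main(void)
{
	int fds[2], sz;

	if (pipe(fds))
		return 1;

	sz = fcntl(fds[0], F_GETPIPE_SZ);	/* 16 buffers by default */
	printf("pipe has %d buffers\n", sz);

	sz = fcntl(fds[0], F_SETPIPE_SZ, 32);	/* grow; must be a power of two */
	if (sz < 0)
		perror("F_SETPIPE_SZ");

	close(fds[0]);
	close(fds[1]);
	return 0;
}

Shrinking uses the same action; a request smaller than the number of buffers currently queued fails with EBUSY rather than discarding data.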
@@ -877,25 +878,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + pipe->buffers = PIPE_DEF_BUFFERS; + return pipe; + } + kfree(pipe); } - return pipe; + return NULL; } void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) buf->ops->release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); + kfree(pipe->bufs); kfree(pipe); } @@ -1093,6 +1101,81 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return sys_pipe2(fildes, 0); } +/* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + struct pipe_buffer *bufs; + + /* + * Must be a power-of-2 currently + */ + if (!is_power_of_2(arg)) + return -EINVAL; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. + */ + if (arg < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); + const unsigned int head = pipe->nrbufs - tail; + + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = arg; + return arg; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = file->f_path.dentry->d_inode->i_pipe; + if (!pipe) + return -EBADF; + + mutex_lock(&pipe->inode->i_mutex); + + switch (cmd) { + case F_SETPIPE_SZ: + ret = pipe_set_size(pipe, arg); + break; + case F_GETPIPE_SZ: + ret = pipe->buffers; + break; + default: + ret = -EINVAL; + break; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + /* * pipefs should _never_ be mounted by userland - too much of security hassle, no real gain from having the whole whorehouse mounted.
So we don't need diff --git a/fs/splice.c b/fs/splice.c index 9313b6124a2e..ac22b00d86c3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, break; } - if (pipe->nrbufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; buf->page = spd->pages[page_nr]; @@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, if (!--spd->nr_pages) break; - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) continue; break; @@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) page_cache_release(spd->pages[i]); } +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, { struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; @@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); + nr_pages = min(req_pages, pipe->buffers); /* * Lookup the (hopefully) full range of pages we need. 
*/ - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; /* @@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unlock_page(page); } - pages[spd.nr_pages++] = page; + spd.pages[spd.nr_pages++] = page; index++; } @@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * this_len is the max we'll use from this page */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); - page = pages[page_nr]; + page = spd.pages[page_nr]; if (PageReadahead(page)) page_cache_async_readahead(mapping, &in->f_ra, in, @@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = -ENOMEM; break; } - page_cache_release(pages[page_nr]); - pages[page_nr] = page; + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -451,8 +484,8 @@ fill_it: len = this_len; } - partial[page_nr].offset = loff; - partial[page_nr].len = this_len; + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; spd.nr_pages++; @@ -464,12 +497,13 @@ fill_it: * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) - page_cache_release(pages[page_nr++]); + page_cache_release(spd.pages[page_nr++]); in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) - return splice_to_pipe(pipe, &spd); + error = splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); return error; } @@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, unsigned int nr_pages; unsigned int nr_freed; size_t offset; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; pgoff_t index; ssize_t res; size_t this_len; @@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (pipe->buffers > PIPE_DEF_BUFFERS) { + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } + index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { struct page *page; page = alloc_page(GFP_USER); @@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; - pages[i] = page; + spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; @@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { this_len = min_t(size_t, vec[i].iov_len, res); - partial[i].offset = 0; - partial[i].len = this_len; + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; if (!this_len) { - __free_page(pages[i]); - pages[i] = NULL; + __free_page(spd.pages[i]); + spd.pages[i] = NULL; nr_freed++; } res -= this_len; @@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, if 
(res > 0) *ppos += res; +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(pipe, &spd); return res; err: for (i = 0; i < spd.nr_pages; i++) - __free_page(pages[i]); + __free_page(spd.pages[i]); - return error; + res = error; + goto shrink_ret; } EXPORT_SYMBOL(default_file_splice_read); @@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->inode) sd->need_wakeup = true; @@ -1211,7 +1261,7 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) { @@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, int aligned) + struct partial_page *partial, int aligned, + unsigned int pipe_buffers) { int buffers = 0, error = 0; @@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, break; npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > PIPE_BUFFERS - buffers) - npages = PIPE_BUFFERS - buffers; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; error = get_user_pages_fast((unsigned long)base, npages, 0, &pages[buffers]); @@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, * or if we mapped the max number of pages that we have * room for. */ - if (error < npages || buffers == PIPE_BUFFERS) + if (error < npages || buffers == pipe_buffers) break; nr_vecs--; @@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, }; + long ret; pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, - flags & SPLICE_F_GIFT); + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, + pipe->buffers); if (spd.nr_pages <= 0) - return spd.nr_pages; + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); - return splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); + return ret; } /* @@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check ->nrbufs without the inode lock first. This function * is speculative anyways, so missing one is ok. 
*/ - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) return 0; ret = 0; pipe_lock(pipe); - while (pipe->nrbufs >= PIPE_BUFFERS) { + while (pipe->nrbufs >= pipe->buffers) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1810,7 +1869,7 @@ retry: * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { /* Already processed some buffers, break */ if (ret) break; @@ -1831,7 +1890,7 @@ retry: } ibuf = ipipe->bufs + ipipe->curbuf; - nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); obuf = opipe->bufs + nbuf; if (len >= ibuf->len) { @@ -1841,7 +1900,7 @@ retry: *obuf = *ibuf; ibuf->ops = NULL; opipe->nrbufs++; - ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); ipipe->nrbufs--; input_wakeup = true; } else { @@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * If we have iterated all input buffers or ran out of * output room, break. */ - if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) break; - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); /* * Get a reference to this pipe buffer, diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 86037400a6e3..afc00af3229b 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -21,6 +21,12 @@ */ #define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) +/* + * Set and get of pipe page size array + */ +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) + /* * Types of directory notifications that may be requested. 
*/ diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b43a9e039059..65f4282fcbaf 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -3,7 +3,7 @@ #define PIPEFS_MAGIC 0x50495045 -#define PIPE_BUFFERS (16) +#define PIPE_DEF_BUFFERS 16 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ @@ -44,17 +44,17 @@ struct pipe_buffer { **/ struct pipe_inode_info { wait_queue_head_t wait; - unsigned int nrbufs, curbuf; - struct page *tmp_page; + unsigned int nrbufs, curbuf, buffers; unsigned int readers; unsigned int writers; unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; + struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct inode *inode; - struct pipe_buffer bufs[PIPE_BUFFERS]; + struct pipe_buffer *bufs; }; /* @@ -154,4 +154,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); +/* for F_SETPIPE_SZ and F_GETPIPE_SZ */ +long pipe_fcntl(struct file *, unsigned int, unsigned long arg); + #endif diff --git a/include/linux/splice.h b/include/linux/splice.h index 18e7c7c0cae6..997c3b4c212b 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *, extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +/* + * for dynamic pipe sizing + */ +extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); +extern void splice_shrink_spd(struct pipe_inode_info *, + struct splice_pipe_desc *); + #endif diff --git a/kernel/relay.c b/kernel/relay.c index 3d97f2821611..4268287148c1 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in, size_t read_subbuf = read_start / subbuf_size; size_t padding = rbuf->padding[read_subbuf]; size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, @@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in, if (rbuf->subbufs_produced == rbuf->subbufs_consumed) return 0; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * Adjust read len, if longer than what is available @@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); + nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; @@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in, } } + ret = 0; if (!spd.nr_pages) - return 0; + goto out; ret = *nonpad_ret = splice_to_pipe(pipe, &spd); if (ret < 0 || ret < total_len) - return ret; + goto out; if (read_start + ret == nonpad_end) ret += padding; +out: + splice_shrink_spd(pipe, &spd); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 44f916a04065..7b155a0e6f31 100644 --- 
a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3269,12 +3269,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t len, unsigned int flags) { - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .flags = flags, .ops = &tracing_pipe_buf_ops, @@ -3285,6 +3285,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, size_t rem; unsigned int i; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); if (unlikely(old_tracer != current_trace && current_trace)) { @@ -3315,23 +3318,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { - pages[i] = alloc_page(GFP_KERNEL); - if (!pages[i]) + for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, - page_address(pages[i]), + page_address(spd.pages[i]), iter->seq.len); if (ret < 0) { - __free_page(pages[i]); + __free_page(spd.pages[i]); break; } - partial[i].offset = 0; - partial[i].len = iter->seq.len; + spd.partial[i].offset = 0; + spd.partial[i].len = iter->seq.len; trace_seq_init(&iter->seq); } @@ -3342,12 +3345,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - return splice_to_pipe(pipe, &spd); + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(pipe, &spd); + return ret; out_err: mutex_unlock(&iter->mutex); - - return ret; + goto out; } static ssize_t @@ -3746,11 +3751,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -3759,22 +3764,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, size, i; size_t ret; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + if (*ppos & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: previous read must page-align\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (len & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); - if (len < PAGE_SIZE) - return -EINVAL; + if (len < PAGE_SIZE) { + ret = -EINVAL; + goto out; + } len &= PAGE_MASK; } trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); - for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3829,11 +3840,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, else ret = 0; /* TODO: block */ - return ret; + goto out; } ret = 
splice_to_pipe(pipe, &spd); - + splice_shrink_spd(pipe, &spd); +out: return ret; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 93c4e060c91e..931981774b1a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1417,12 +1417,13 @@ new_page: /* * Fill page/offset/length into spd, if it can hold more pages. */ -static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, +static inline int spd_fill_page(struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, struct page *page, unsigned int *len, unsigned int offset, struct sk_buff *skb, int linear, struct sock *sk) { - if (unlikely(spd->nr_pages == PIPE_BUFFERS)) + if (unlikely(spd->nr_pages == pipe->buffers)) return 1; if (linear) { @@ -1458,7 +1459,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct sk_buff *skb, struct splice_pipe_desc *spd, int linear, - struct sock *sk) + struct sock *sk, + struct pipe_inode_info *pipe) { if (!*len) return 1; @@ -1481,7 +1483,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) + if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) return 1; __segment_seek(&page, &poff, &plen, flen); @@ -1496,9 +1498,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff, * Map linear and fragment data from the skb to spd. It reports failure if the * pipe is full or if we already spliced the requested length. */ -static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, - unsigned int *len, struct splice_pipe_desc *spd, - struct sock *sk) +static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, + unsigned int *offset, unsigned int *len, + struct splice_pipe_desc *spd, struct sock *sk) { int seg; @@ -1508,7 +1510,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), - offset, len, skb, spd, 1, sk)) + offset, len, skb, spd, 1, sk, pipe)) return 1; /* @@ -1518,7 +1520,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(f->page, f->page_offset, f->size, - offset, len, skb, spd, 0, sk)) + offset, len, skb, spd, 0, sk, pipe)) return 1; } @@ -1535,8 +1537,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -1546,12 +1548,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, }; struct sk_buff *frag_iter; struct sock *sk = skb->sk; + int ret = 0; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * __skb_splice_bits() only fails if the output has no room left, * so no point in going over the frag_list for the error case. 
*/ - if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) goto done; else if (!tlen) goto done; @@ -1562,14 +1568,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, skb_walk_frags(skb, frag_iter) { if (!tlen) break; - if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk)) + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) break; } done: if (spd.nr_pages) { - int ret; - /* * Drop the socket lock, otherwise we have reverse * locking dependencies between sk_lock and i_mutex @@ -1582,10 +1586,10 @@ done: release_sock(sk); ret = splice_to_pipe(pipe, &spd); lock_sock(sk); - return ret; } - return 0; + splice_shrink_spd(pipe, &spd); + return ret; } /** -- cgit v1.2.3-55-g7522 From b492e95be0ae672922f4734acf3f5d35c30be948 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 May 2010 21:03:16 +0200 Subject: pipe: set lower and upper limit on max pages in the pipe page array We need at least two to guarantee proper POSIX behaviour, so never allow a smaller limit than that. Also expose a /proc/sys/fs/pipe-max-pages sysctl file that allows root to define a sane upper limit. Make it default to 16 times the default size, which is 16 pages. Signed-off-by: Jens Axboe --- fs/pipe.c | 15 +++++++++++++++ include/linux/pipe_fs_i.h | 2 ++ kernel/sysctl.c | 9 +++++++++ 3 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/fs/pipe.c b/fs/pipe.c index 054b8a6a2c7a..d79872eba09a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -19,10 +19,17 @@ #include #include #include +#include #include #include +/* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-pages + */ +unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -1162,6 +1169,14 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: + if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) + return -EINVAL; + /* + * The pipe needs to be at least 2 pages large to + * guarantee POSIX behaviour. 
+ */ + if (arg < 2) + return -EINVAL; ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 65f4282fcbaf..16de3933c45e 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -139,6 +139,8 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); +extern unsigned int pipe_max_pages; + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8686b0f5fc12..c649d1c5fe09 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -1423,6 +1424,14 @@ static struct ctl_table fs_table[] = { .child = binfmt_misc_table, }, #endif + { + .procname = "pipe-max-pages", + .data = &pipe_max_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &two, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3-55-g7522 From 0fc377bd648d1935ea34665239e3f0a274b71698 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 21 May 2010 11:29:53 -0700 Subject: sysctl: fix kernel-doc notation and typos Fix kernel-doc warnings, kernel-doc special characters, and typos in recent kernel/sysctl.c additions. Signed-off-by: Randy Dunlap Cc: Amerigo Wang Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b12583047757..30acc6c87b1b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2083,20 +2083,20 @@ static void proc_skip_char(char **buf, size_t *size, const char v) #define TMPBUFLEN 22 /** - * proc_get_long - reads an ASCII formated integer from a user buffer + * proc_get_long - reads an ASCII formatted integer from a user buffer * - * @buf - a kernel buffer - * @size - size of the kernel buffer - * @val - this is where the number will be stored - * @neg - set to %TRUE if number is negative - * @perm_tr - a vector which contains the allowed trailers - * @perm_tr_len - size of the perm_tr vector - * @tr - pointer to store the trailer character + * @buf: a kernel buffer + * @size: size of the kernel buffer + * @val: this is where the number will be stored + * @neg: set to %TRUE if number is negative + * @perm_tr: a vector which contains the allowed trailers + * @perm_tr_len: size of the perm_tr vector + * @tr: pointer to store the trailer character * - * In case of success 0 is returned and buf and size are updated with - * the amount of bytes read. If tr is non NULL and a trailing - * character exist (size is non zero after returning from this - * function) tr is updated with the trailing character. + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes read. If @tr is non-NULL and a trailing + * character exists (size is non-zero after returning from this + * function), @tr is updated with the trailing character. 
*/ static int proc_get_long(char **buf, size_t *size, unsigned long *val, bool *neg, const char *perm_tr, unsigned perm_tr_len, char *tr) @@ -2147,15 +2147,15 @@ static int proc_get_long(char **buf, size_t *size, } /** - * proc_put_long - coverts an integer to a decimal ASCII formated string + * proc_put_long - converts an integer to a decimal ASCII formatted string * - * @buf - the user buffer - * @size - the size of the user buffer - * @val - the integer to be converted - * @neg - sign of the number, %TRUE for negative + * @buf: the user buffer + * @size: the size of the user buffer + * @val: the integer to be converted + * @neg: sign of the number, %TRUE for negative * - * In case of success 0 is returned and buf and size are updated with - * the amount of bytes read. + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes written. */ static int proc_put_long(void __user **buf, size_t *size, unsigned long val, bool neg) -- cgit v1.2.3-55-g7522 From bd45b7a385c5ffd82c11a1d51880be18559e5ad9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 May 2010 08:14:45 +0200 Subject: timekeeping: Fix timezone update commit 64ce4c2f (time: Clean up warp_clock()) breaks the timezone update in a very subtle way. To avoid the direct access to timekeeping internals it adds the timezone delta to the current time with timespec_add_safe(). This works nicely when the timezone delta is > 0. If the timezone delta is < 0, then the wrap check in timespec_add_safe() triggers, timespec_add_safe() returns TIME_MAX, and timekeeping is completely broken. The comment above timespec_add_safe() says: It's assumed that both values are valid (>= 0) Add the timezone seconds adjustment directly. Reported-by: Rafael J. Wysocki Tested-by: Rafael J. Wysocki Acked-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 50612faa9baf..848b1c2ab09a 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -132,10 +132,10 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, */ static inline void warp_clock(void) { - struct timespec delta, adjust; - delta.tv_sec = sys_tz.tz_minuteswest * 60; - delta.tv_nsec = 0; - adjust = timespec_add_safe(current_kernel_time(), delta); + struct timespec adjust; + + adjust = current_kernel_time(); + adjust.tv_sec += sys_tz.tz_minuteswest * 60; do_settimeofday(&adjust); } -- cgit v1.2.3-55-g7522 From f00e047efdf9d31c8a7dd7875b411f97cfa7d8e5 Mon Sep 17 00:00:00 2001 From: Jeff Chua Date: Mon, 24 May 2010 07:16:24 +0800 Subject: timers: Fix slack calculation for expired timers commit 3bbb9ec946 (timers: Introduce the concept of timer slack for legacy timers) does not take into account the case where the timer is already expired. This broke wireless drivers. The solution is not to apply slack to already expired timers.
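In sketch form (a simplified excerpt of the logic in the diff below, not a complete function), the fixed heuristic in apply_slack() becomes:

	/*
	 * Simplified sketch of the fixed slack logic: caller-supplied
	 * slack is still honoured, but the automatic 0.4% slack is only
	 * applied while the timer has not yet expired.
	 */
	unsigned long expires_limit = expires;

	if (timer->slack > -1)				/* explicit slack */
		expires_limit = expires + timer->slack;
	else if (time_after(expires, jiffies))		/* auto slack: use 0.4% */
		expires_limit = expires + (expires - jiffies)/256;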
Signed-off-by: Thomas Gleixner Cc: Arjan van de Ven --- kernel/timer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 9199f3c52215..be394af5bc22 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -750,13 +750,14 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) unsigned long expires_limit, mask; int bit; - expires_limit = expires + timer->slack; + expires_limit = expires; - if (timer->slack < 0) /* auto slack: use 0.4% */ + if (timer->slack > -1) + expires_limit = expires + timer->slack; + else if (time_after(expires, jiffies)) /* auto slack: use 0.4% */ expires_limit = expires + (expires - jiffies)/256; mask = expires ^ expires_limit; - if (mask == 0) return expires; -- cgit v1.2.3-55-g7522 From 708c1bbc9d0c3e57f40501794d9b0eed29d10fce Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 24 May 2010 14:32:07 -0700 Subject: mempolicy: restructure rebinding-mempolicy functions Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on kernels that do not do atomic nodemask_t stores (MAX_NUMNODES > BITS_PER_LONG). But I found that there is also a problem on kernels that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to allocate a page from while cpuset's mems are being changed, even though there is plenty of free memory. The reason is as follows:

(mpol: mempolicy)
task1                     task1's mpol    task2
alloc page                1
  alloc on node0? NO      1
                          1               change mems from 1 to 0
                          1               rebind task1's mpol
                          0-1               set new bits
                          0                 clear disallowed bits
  alloc on node1? NO      0
  ...
can't alloc page
  goto oom

The attached program can reproduce it with the following steps:

# mkdir /dev/cpuset
# mount -t cpuset cpuset /dev/cpuset
# mkdir /dev/cpuset/1
# echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus
# echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems
# echo $$ > /dev/cpuset/1/tasks
# numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> & (<nr_tasks> = max(nr_cpus - 1, 1))
# killall -s SIGUSR1 cpuset_mem_hog
# ./change_mems.sh

Several hours later, an OOM will happen even though there is plenty of free memory. This patchset fixes the problem by expanding the nodes range first (set the newly allowed bits) and shrinking it lazily (clear the newly disallowed bits). So we use a variable to tell the write-side task that a read-side task is reading the nodemask, and the write-side task clears the newly disallowed nodes after the read-side task ends its current memory allocation. This patch: In order to fix the case of finding no node to allocate memory from, when we want to update mempolicy and mems_allowed we expand the set of nodes first (set all the new nodes) and shrink the set of nodes lazily (clear the disallowed nodes). But the mempolicy's rebind functions may break the expanding. So we restructure the mempolicy's rebind functions and split the rebind work into two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy on the read side. Otherwise we can do the rebind work at once. In order to implement it, we define

enum mpol_rebind_step {
	MPOL_REBIND_ONCE,
	MPOL_REBIND_STEP1,
	MPOL_REBIND_STEP2,
	MPOL_REBIND_NSTEP,
};

If the mempolicy doesn't need to be updated in two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step.
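As an illustration (a hypothetical write-side caller, not code added by this patch), a rebind without a read-side lock then proceeds roughly as:

	/*
	 * Hypothetical usage sketch: grow the nodemask first, and
	 * shrink it only after concurrent readers have finished.
	 */
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);	/* set new nodes */
	/* ... wait for read-side tasks to leave the allocation path ... */
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);	/* clear old nodes */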
Besides that, a long time may pass between these two steps, and we have to release the lock that protects mempolicy and mems_allowed in the meantime. When we take the lock again, we must check whether the current mempolicy is in the middle of a rebind (the first step has been done), because the task may allocate a new mempolicy while we do not hold the lock. So we define the following flag to identify it:

#define MPOL_F_REBINDING (1 << 2)

The new functions will be used in the next patch. Signed-off-by: Miao Xie Cc: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 15 ++++-- kernel/cpuset.c | 4 +- mm/mempolicy.c | 124 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 119 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 1cc966cd3e5f..7b9ef6bf45aa 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -23,6 +23,13 @@ enum { MPOL_MAX, /* always last member of enum */ }; +enum mpol_rebind_step { + MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ + MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ + MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ + MPOL_REBIND_NSTEP, +}; + /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) @@ -51,6 +58,7 @@ enum { */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ +#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #ifdef __KERNEL__ @@ -193,8 +201,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new); +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern void mpol_fix_fork_child_flag(struct task_struct *p); @@ -308,7 +316,8 @@ static inline void numa_default_policy(void) } static inline void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new) + const nodemask_t *new, + enum mpol_rebind_step step) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9a50c5f6e727..db0990ac3fac 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -953,8 +953,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed); - mpol_rebind_task(tsk, newmems); + mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE); tsk->mems_allowed = *newmems; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0c73c8b814cd..8a993db88029 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -119,7 +119,22 @@ struct mempolicy default_policy = { static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); + /* + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step.
The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step); } mpol_ops[MPOL_MAX]; /* Check that the nodemask contains at least one populated zone */ @@ -274,12 +289,19 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { } -static void mpol_rebind_nodemask(struct mempolicy *pol, - const nodemask_t *nodes) +/* + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -288,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; + /* + * if step == 1, we use ->w.cpuset_mems_allowed to cache the + * result + */ + if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *nodes); + pol->w.cpuset_mems_allowed = step ? tmp : *nodes; + } else if (step == MPOL_REBIND_STEP2) { + tmp = pol->w.cpuset_mems_allowed; + pol->w.cpuset_mems_allowed = *nodes; + } else + BUG(); } - pol->v.nodes = tmp; + if (nodes_empty(tmp)) + tmp = *nodes; + + if (step == MPOL_REBIND_STEP1) + nodes_or(pol->v.nodes, pol->v.nodes, tmp); + else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) + pol->v.nodes = tmp; + else + BUG(); + if (!node_isset(current->il_next, tmp)) { current->il_next = next_node(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) @@ -304,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes) + const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -327,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol, } } -/* Migrate a policy to a different set of nodes */ -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask) +/* + * mpol_rebind_policy - Migrate a policy to a different set of nodes + * + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. 
+ * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, + enum mpol_rebind_step step) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && step == 0 && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - mpol_ops[pol->mode].rebind(pol, newmask); + + if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) + return; + + if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) + BUG(); + + if (step == MPOL_REBIND_STEP1) + pol->flags |= MPOL_F_REBINDING; + else if (step == MPOL_REBIND_STEP2) + pol->flags &= ~MPOL_F_REBINDING; + else if (step >= MPOL_REBIND_NSTEP) + BUG(); + + mpol_ops[pol->mode].rebind(pol, newmask, step); } /* @@ -346,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step) { - mpol_rebind_policy(tsk->mempolicy, new); + mpol_rebind_policy(tsk->mempolicy, new, step); } /* @@ -363,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new); + mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); up_write(&mm->mmap_sem); } @@ -1745,6 +1817,9 @@ EXPORT_SYMBOL(alloc_pages_current); * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). + * + * current's mempolicy may be rebinded by the other task(the task that changes + * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ @@ -1754,13 +1829,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + + /* task's mempolicy is protected by alloc_lock */ + if (old == current->mempolicy) { + task_lock(current); + *new = *old; + task_unlock(current); + } else + *new = *old; + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - mpol_rebind_policy(old, &mems); + if (new->flags & MPOL_F_REBINDING) + mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); + else + mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } rcu_read_unlock(); - *new = *old; atomic_set(&new->refcnt, 1); return new; } -- cgit v1.2.3-55-g7522 From c0ff7453bb5c7c98e0885fb94279f2571946f280 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 24 May 2010 14:32:08 -0700 Subject: cpuset,mm: fix no node to alloc memory when changing cpuset's mems Before applying this patch, cpuset updates task->mems_allowed and mempolicy by setting all new bits in the nodemask first, and clearing all of the old, now-disallowed bits later. But along the way, the allocator may find that there is no node to allocate memory from. The reason is that when cpuset rebinds the task's mempolicy, it clears the nodes that the allocator can allocate pages on; for example:

(mpol: mempolicy)
task1                     task1's mpol    task2
alloc page                1
  alloc on node0? NO      1
                          1               change mems from 1 to 0
                          1               rebind task1's mpol
                          0-1               set new bits
                          0                 clear disallowed bits
  alloc on node1? NO      0
  ...
can't alloc page
  goto oom

This patch fixes the problem by expanding the nodes range first (set the newly allowed bits) and shrinking it lazily (clear the newly disallowed bits). So we use a variable to tell the write-side task that a read-side task is reading the nodemask, and the write-side task clears the newly disallowed nodes after the read-side task ends its current memory allocation. [akpm@linux-foundation.org: fix spello] Signed-off-by: Miao Xie Cc: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 43 +++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 1 + kernel/cpuset.c | 58 +++++++++++++++++++++++++++++++++++++++++++------- kernel/exit.c | 2 ++ mm/filemap.c | 10 +++++++-- mm/hugetlb.c | 12 +++++++---- mm/mempolicy.c | 24 +++++++++++++++++---- mm/page_alloc.c | 6 +++++- mm/slab.c | 4 ++++ mm/slub.c | 6 +++++- mm/vmscan.c | 2 ++ 11 files changed, 148 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a73454aec333..20b51cab6593 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -86,9 +86,44 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); +/* + * reading current mems_allowed and mempolicy in the fastpath must protected + * by get_mems_allowed() + */ +static inline void get_mems_allowed(void) +{ + current->mems_allowed_change_disable++; + + /* + * ensure that reading mems_allowed and mempolicy happens after the + * update of ->mems_allowed_change_disable. + * + * the write-side task finds ->mems_allowed_change_disable is not 0, + * and knows the read-side task is reading mems_allowed or mempolicy, + * so it will clear old bits lazily. + */ + smp_mb(); +} + +static inline void put_mems_allowed(void) +{ + /* + * ensure that reading mems_allowed and mempolicy before reducing + * mems_allowed_change_disable. + * + * the write-side task will know that the read-side task is still + * reading mems_allowed or mempolicy, don't clears old bits in the + * nodemask. + */ + smp_mb(); + --ACCESS_ONCE(current->mems_allowed_change_disable); +} + static inline void set_mems_allowed(nodemask_t nodemask) { + task_lock(current); current->mems_allowed = nodemask; + task_unlock(current); } #else /* !CONFIG_CPUSETS */ @@ -187,6 +222,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } +static inline void get_mems_allowed(void) +{ +} + +static inline void put_mems_allowed(void) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b55e988988b5..415b8f8a3f45 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1421,6 +1421,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ + int mems_allowed_change_disable; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS diff --git a/kernel/cpuset.c b/kernel/cpuset.c index db0990ac3fac..61d6af7fa676 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * In order to avoid seeing no nodes if the old and new nodes are disjoint, * we structure updates as setting all new allowed nodes, then clearing newly * disallowed ones.
- * - * Called with task's alloc_lock held */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { +repeat: + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return; + + task_lock(tsk); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + + /* + * ensure checking ->mems_allowed_change_disable after setting all new + * allowed nodes. + * + * the read-side task can see an nodemask with new allowed nodes and + * old allowed nodes. and if it allocates page when cpuset clears newly + * disallowed ones continuous, it can see the new allowed bits. + * + * And if setting all new allowed nodes is after the checking, setting + * all new allowed nodes and clearing newly disallowed ones will be done + * continuous, and the read-side task may find no node to alloc page. + */ + smp_mb(); + + /* + * Allocation of memory is very fast, we needn't sleep when waiting + * for the read-side. + */ + while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + task_unlock(tsk); + if (!task_curr(tsk)) + yield(); + goto repeat; + } + + /* + * ensure checking ->mems_allowed_change_disable before clearing all new + * disallowed nodes. + * + * if clearing newly disallowed bits before the checking, the read-side + * task may find no node to alloc page. + */ + smp_mb(); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + task_unlock(tsk); } /* @@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p, cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, newmems); - task_lock(p); cpuset_change_task_nodemask(p, newmems); - task_unlock(p); NODEMASK_FREE(newmems); @@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - task_lock(tsk); cpuset_change_task_nodemask(tsk, to); - task_unlock(tsk); cpuset_update_task_spread_flag(cs, tsk); } diff --git a/kernel/exit.c b/kernel/exit.c index eabca5a73a85..019a2843bf95 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1002,8 +1002,10 @@ NORET_TYPE void do_exit(long code) exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA + task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; + task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) diff --git a/mm/filemap.c b/mm/filemap.c index d6f4f073836e..88d719665a28 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -461,9 +461,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_exact_node(n, gfp, 0); + get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + put_mems_allowed(); + return page; } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c9e6bbf3772..54d42b009dbe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -465,11 +465,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; - struct zonelist *zonelist = 
huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are @@ -477,11 +479,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, */ if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err;; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { @@ -500,7 +502,9 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, break; } } +err: mpol_cond_put(mpol); + put_mems_allowed(); return page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8a993db88029..721b2b338032 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1639,6 +1639,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. + * + * Must be protected by get_mems_allowed() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1684,6 +1686,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) if (!(mask && current->mempolicy)) return false; + task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: @@ -1703,6 +1706,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) default: BUG(); } + task_unlock(current); return true; } @@ -1750,13 +1754,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; + struct page *page; + get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - return alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, 0, nid); + put_mems_allowed(); + return page; } zl = policy_zonelist(gfp, pol); if (unlikely(mpol_needs_cond_ref(pol))) { @@ -1766,12 +1774,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); + put_mems_allowed(); return page; } /* * fast path: default or task policy */ - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } /** @@ -1796,18 +1807,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; + struct page *page; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; + get_mems_allowed(); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) - return alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages_nodemask(gfp, order, + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else + page = 
__alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 596180fedd3a..f7da2a2934b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1990,10 +1990,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + get_mems_allowed(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); - if (!preferred_zone) + if (!preferred_zone) { + put_mems_allowed(); return NULL; + } /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2003,6 +2006,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; diff --git a/mm/slab.c b/mm/slab.c index 50a73fca19c4..02786e1a32d2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3217,10 +3217,12 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_node_id(); + get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_mem_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); + put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3247,6 +3249,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3302,6 +3305,7 @@ retry: } } } + put_mems_allowed(); return obj; } diff --git a/mm/slub.c b/mm/slub.c index e46e3129697d..26f0cb9cc584 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1360,6 +1360,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1369,10 +1370,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { page = get_partial_node(n); - if (page) + if (page) { + put_mems_allowed(); return page; + } } } + put_mems_allowed(); #endif return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ff3311447f5..f2c367c9ec12 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1774,6 +1774,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long writeback_threshold; + get_mems_allowed(); delayacct_freepages_start(); if (scanning_global_lru(sc)) @@ -1857,6 +1858,7 @@ out: mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); delayacct_freepages_end(); + put_mems_allowed(); return ret; } -- cgit v1.2.3-55-g7522 From 76ab0f530e4a01d4dc20cdc1d5e87753c579dc18 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:28 -0700 Subject: mm: compaction: add /proc trigger for memory compaction Add a proc file 
/proc/sys/vm/compact_memory. When an arbitrary value is written to the file, all zones are compacted. The expected user of such a trigger is a job scheduler that prepares the system before the target application runs. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 10 ++++++++ include/linux/compaction.h | 6 +++++ kernel/sysctl.c | 10 ++++++++ mm/compaction.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6c7d18c53f84..56dd29b97a91 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -19,6 +19,7 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: - block_dump +- compact_memory - dirty_background_bytes - dirty_background_ratio - dirty_bytes @@ -64,6 +65,15 @@ information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. ============================================================== +compact_memory + +Available only when CONFIG_COMPACTION is set. When 1 is written to the file, +all zones are compacted such that free memory is available in contiguous +blocks where possible. This can be important for example in the allocation of +huge pages although processes will also directly compact memory as required. + +============================================================== + dirty_background_bytes Contains the amount of dirty memory at which the pdflush background writeback diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 465ca88615ed..572388880ba8 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -6,4 +6,10 @@ #define COMPACT_PARTIAL 1 #define COMPACT_COMPLETE 2 +#ifdef CONFIG_COMPACTION +extern int sysctl_compact_memory; +extern int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos); +#endif /* CONFIG_COMPACTION */ + #endif /* _LINUX_COMPACTION_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4c93486b45d1..284f330d6a01 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1121,6 +1122,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, +#endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", .data = &min_free_kbytes, diff --git a/mm/compaction.c b/mm/compaction.c index be1ff3f7552b..77854fbc0f56 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internal.h" /* @@ -391,3 +392,64 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } + +/* Compact all zones within a node */ +static int compact_node(int nid) +{ + int zoneid; + pg_data_t *pgdat; + struct zone *zone; + + if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) + return -EINVAL; + pgdat = NODE_DATA(nid); + + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct compact_control cc = { + .nr_freepages = 0, 
+ .nr_migratepages = 0, + }; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + return 0; +} + +/* Compact all nodes in the system */ +static int compact_nodes(void) +{ + int nid; + + for_each_online_node(nid) + compact_node(nid); + + return COMPACT_COMPLETE; +} + +/* The written value is actually unused, all memory is compacted */ +int sysctl_compact_memory; + +/* This is the entry point for compacting all nodes via /proc/sys/vm */ +int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + if (write) + return compact_nodes(); + + return 0; +} -- cgit v1.2.3-55-g7522 From 5e7719058079a1423ccce56148b0aaa56b2df821 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 May 2010 14:32:31 -0700 Subject: mm: compaction: add a tunable that decides when memory should be compacted and when it should be reclaimed The kernel applies some heuristics when deciding if memory should be compacted or reclaimed to satisfy a high-order allocation. One of these is based on the fragmentation. If the index is below 500, memory will not be compacted. This choice is arbitrary and not based on data. To help optimise the system and set a sensible default for this value, this patch adds a sysctl extfrag_threshold. The kernel will only compact memory if the fragmentation index is above the extfrag_threshold. [randy.dunlap@oracle.com: Fix build errors when proc fs is not configured] Signed-off-by: Mel Gorman Signed-off-by: Randy Dunlap Cc: Rik van Riel Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 15 +++++++++++++++ include/linux/compaction.h | 3 +++ kernel/sysctl.c | 15 +++++++++++++++ mm/compaction.c | 12 +++++++++++- 4 files changed, 44 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 56dd29b97a91..5fdbb612aeb8 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -27,6 +27,7 @@ Currently, these files are in /proc/sys/vm: - dirty_ratio - dirty_writeback_centisecs - drop_caches +- extfrag_threshold - hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode @@ -149,6 +150,20 @@ user should run `sync' first. ============================================================== +extfrag_threshold + +This parameter affects whether the kernel will compact memory or direct +reclaim to satisfy a high-order allocation. /proc/extfrag_index shows what +the fragmentation index for each order is in each zone in the system. Values +tending towards 0 imply allocations would fail due to lack of memory, +values towards 1000 imply failures are due to fragmentation and -1 implies +that the allocation will succeed as long as watermarks are met. + +The kernel will not compact memory in a zone if the +fragmentation index is <= extfrag_threshold. The default value is 500. 
+ +============================================================== + hugepages_treat_as_movable This parameter is only useful when kernelcore= is specified at boot time to diff --git a/include/linux/compaction.h b/include/linux/compaction.h index eed40ec4280b..3719325c6091 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -15,6 +15,9 @@ extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); +extern int sysctl_extfrag_threshold; +extern int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos); extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 284f330d6a01..84ff5e75c084 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -263,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */ static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif +#ifdef CONFIG_COMPACTION +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; +#endif + static struct ctl_table kern_table[] = { { .procname = "sched_child_runs_first", @@ -1130,6 +1135,16 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = sysctl_compaction_handler, }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_extfrag_handler, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + #endif /* CONFIG_COMPACTION */ { .procname = "min_free_kbytes", diff --git a/mm/compaction.c b/mm/compaction.c index 9583e193dc47..94cce51b0b35 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -433,6 +433,8 @@ static unsigned long compact_zone_order(struct zone *zone, return compact_zone(zone, &cc); } +int sysctl_extfrag_threshold = 500; + /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation * @zonelist: The zonelist used for the current allocation @@ -491,7 +493,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * Only compact if a failure would be due to fragmentation. */ fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= 500) + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) continue; if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { @@ -572,6 +574,14 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, return 0; } +int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, buffer, length, ppos); + + return 0; +} + #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) ssize_t sysfs_compact_node(struct sys_device *dev, struct sysdev_attribute *attr, -- cgit v1.2.3-55-g7522 From cf23422b9d76215316855253da491d4c9f294372 Mon Sep 17 00:00:00 2001 From: minskey guo Date: Mon, 24 May 2010 14:32:41 -0700 Subject: cpu/mem hotplug: enable CPUs online before local memory online Enable users to online CPUs even if the CPUs belongs to a numa node which doesn't have onlined local memory. The zonlists(pg_data_t.node_zonelists[]) of a numa node are created either in system boot/init period, or at the time of local memory online. For a numa node without onlined local memory, its zonelists are not initialized at present. 
As a result, any memory allocation operations executed by CPUs within this node will fail. In fact, an out-of-memory error is triggered when attempting to online CPUs before their memory comes online. This patch tries to create zonelists for such NUMA nodes, so that memory allocation for this node can fall back to other nodes. [akpm@linux-foundation.org: remove unneeded export] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: minskey guo Cc: Minchan Kim Cc: Yasunori Goto Cc: Andi Kleen Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + kernel/cpu.c | 25 +++++++++++++++++++++++++ mm/memory_hotplug.c | 23 +++++++++++++++++++++++ 3 files changed, 49 insertions(+) (limited to 'kernel') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 35b07b773e6c..864035fb8f8a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -202,6 +202,7 @@ static inline int is_mem_section_removable(unsigned long pfn, } #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern int mem_online_node(int nid); extern int add_memory(int nid, u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size); extern int remove_memory(u64 start, u64 size); diff --git a/kernel/cpu.c b/kernel/cpu.c index 545777574779..a3fbcc0a0abc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -326,6 +326,12 @@ out_notify: int __cpuinit cpu_up(unsigned int cpu) { int err = 0; + +#ifdef CONFIG_MEMORY_HOTPLUG + int nid; + pg_data_t *pgdat; +#endif + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -336,6 +342,25 @@ int __cpuinit cpu_up(unsigned int cpu) return -EINVAL; } +#ifdef CONFIG_MEMORY_HOTPLUG + nid = cpu_to_node(cpu); + if (!node_online(nid)) { + err = mem_online_node(nid); + if (err) + return err; + } + + pgdat = NODE_DATA(nid); + if (!pgdat) { + printk(KERN_ERR + "Can't online cpu %d due to NULL pgdat\n", cpu); + return -ENOMEM; + } + + if (pgdat->node_zonelists->_zonerefs->zone == NULL) + build_all_zonelists(); +#endif + cpu_maps_update_begin(); if (cpu_hotplug_disabled) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index be211a582930..85eb4d342ac5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -482,6 +482,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } +/* + * called by cpu_up() to online a node without onlined memory. + */ +int mem_online_node(int nid) +{ + pg_data_t *pgdat; + int ret; + + lock_system_sleep(); + pgdat = hotadd_new_pgdat(nid, 0); + if (!pgdat) { + ret = -ENOMEM; + goto out; + } + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + +out: + unlock_system_sleep(); + return ret; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { -- cgit v1.2.3-55-g7522 From 1f522509c77a5dea8dc384b735314f03908a6415 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Mon, 24 May 2010 14:32:51 -0700 Subject: mem-hotplug: avoid multiple zones sharing same boot strapping boot_pageset For each newly populated zone of a hotadded node, we need to update its pagesets with dynamically allocated per_cpu_pageset structs for all possible CPUs: 1) Detach zone->pageset from the shared boot_pageset at the end of __build_all_zonelists().
2) Use a mutex to protect zone->pageset while it is still shared in onlined_pages(). Otherwise, multiple zones of different nodes would share the same bootstrapping boot_pageset for the same CPU, which will eventually cause the kernel panic below:

------------[ cut here ]------------
kernel BUG at mm/page_alloc.c:1239!
invalid opcode: 0000 [#1] SMP
...
Call Trace:
 [] __alloc_pages_nodemask+0x131/0x7b0
 [] alloc_pages_current+0x87/0xd0
 [] __page_cache_alloc+0x67/0x70
 [] __do_page_cache_readahead+0x120/0x260
 [] ra_submit+0x21/0x30
 [] ondemand_readahead+0x166/0x2c0
 [] page_cache_async_readahead+0x80/0xa0
 [] generic_file_aio_read+0x364/0x670
 [] nfs_file_read+0xca/0x130
 [] do_sync_read+0xfa/0x140
 [] vfs_read+0xb5/0x1a0
 [] sys_read+0x51/0x80
 [] system_call_fastpath+0x16/0x1b
RIP [] get_page_from_freelist+0x883/0x900
RSP
---[ end trace 4bda28328b9990db ]

[akpm@linux-foundation.org: merge fix] Signed-off-by: Haicheng Li Signed-off-by: Wu Fengguang Reviewed-by: Andi Kleen Reviewed-by: Christoph Lameter Cc: Mel Gorman Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- init/main.c | 2 +- kernel/cpu.c | 2 +- mm/memory_hotplug.c | 18 +++++++++++++----- mm/page_alloc.c | 17 +++++++++++++---- 5 files changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f6f2c505fa7e..a367ed5bb3fe 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -652,7 +652,7 @@ typedef struct pglist_data { void get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free); -void build_all_zonelists(void); +void build_all_zonelists(void *data); void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); diff --git a/init/main.c b/init/main.c index 22881b5e95e3..3bdb152f412f 100644 --- a/init/main.c +++ b/init/main.c @@ -567,7 +567,7 @@ asmlinkage void __init start_kernel(void) setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ - build_all_zonelists(); + build_all_zonelists(NULL); page_alloc_init(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); diff --git a/kernel/cpu.c b/kernel/cpu.c index a3fbcc0a0abc..3e8b3ba27175 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -358,7 +358,7 @@ int __cpuinit cpu_up(unsigned int cpu) } if (pgdat->node_zonelists->_zonerefs->zone == NULL) - build_all_zonelists(); + build_all_zonelists(NULL); #endif cpu_maps_update_begin(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 85eb4d342ac5..089cc97aed3c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -389,6 +389,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) int nid; int ret; struct memory_notify arg; + /* + * mutex to protect zone->pageset when it's still shared + * in onlined_pages() + */ + static DEFINE_MUTEX(zone_pageset_mutex); arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -415,12 +420,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) * This means the page allocator ignores this zone. * So, zonelist must be updated after online.
*/ + mutex_lock(&zone_pageset_mutex); if (!populated_zone(zone)) need_zonelists_rebuild = 1; ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + mutex_unlock(&zone_pageset_mutex); printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); @@ -429,8 +436,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; + if (need_zonelists_rebuild) + build_all_zonelists(zone); + else + zone_pcp_update(zone); - zone_pcp_update(zone); + mutex_unlock(&zone_pageset_mutex); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { @@ -438,10 +449,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); } - if (need_zonelists_rebuild) - build_all_zonelists(); - else - vm_total_pages = nr_free_pagecache_pages(); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 595d0ac211e2..21c52d2d8624 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2572,7 +2572,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) - build_all_zonelists(); + build_all_zonelists(NULL); } out: mutex_unlock(&zl_order_mutex); @@ -2922,9 +2922,10 @@ static void build_zonelist_cache(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static void setup_zone_pageset(struct zone *zone); /* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *dummy) +static __init_refok int __build_all_zonelists(void *data) { int nid; int cpu; @@ -2939,6 +2940,14 @@ static int __build_all_zonelists(void *dummy) build_zonelist_cache(pgdat); } +#ifdef CONFIG_MEMORY_HOTPLUG + /* Setup real pagesets for the new zone */ + if (data) { + struct zone *zone = data; + setup_zone_pageset(zone); + } +#endif + /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. The real pagesets for @@ -2958,7 +2967,7 @@ static int __build_all_zonelists(void *dummy) return 0; } -void build_all_zonelists(void) +void build_all_zonelists(void *data) { set_zonelist_order(); @@ -2969,7 +2978,7 @@ void build_all_zonelists(void) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, NULL, NULL); + stop_machine(__build_all_zonelists, data, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.3-55-g7522 From 4eaf3f64397c3db3c5785eee508270d62a9fabd9 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Mon, 24 May 2010 14:32:52 -0700 Subject: mem-hotplug: fix potential race while building zonelist for new populated zone Add global mutex zonelists_mutex to fix the possible race: CPU0 CPU1 CPU2 (1) zone->present_pages += online_pages; (2) build_all_zonelists(); (3) alloc_page(); (4) free_page(); (5) build_all_zonelists(); (6) __build_all_zonelists(); (7) zone->pageset = alloc_percpu(); In step (3,4), zone->pageset still points to boot_pageset, so bad things may happen if 2+ nodes are in this state. Even if only 1 node is accessing the boot_pageset, (3) may still consume too much memory to fail the memory allocations in step (7). 
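For illustration, the fix serializes these paths on the new global mutex; a minimal sketch of the pattern (mirroring the cpu_up() and online_pages() hunks below, not additional code):

	/* Sketch: zonelist rebuild and pageset setup now run under the mutex */
	mutex_lock(&zonelists_mutex);
	if (need_zonelists_rebuild)
		build_all_zonelists(zone);	/* also sets up the new zone's pagesets */
	else
		zone_pcp_update(zone);
	mutex_unlock(&zonelists_mutex);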
From 4eaf3f64397c3db3c5785eee508270d62a9fabd9 Mon Sep 17 00:00:00 2001
From: Haicheng Li
Date: Mon, 24 May 2010 14:32:52 -0700
Subject: mem-hotplug: fix potential race while building zonelist for new populated zone

Add a global mutex, zonelists_mutex, to fix the possible race:

     CPU0                       CPU1                    CPU2
(1)  zone->present_pages += online_pages;
(2)                             build_all_zonelists();
(3)                                                     alloc_page();
(4)                                                     free_page();
(5)  build_all_zonelists();
(6)    __build_all_zonelists();
(7)      zone->pageset = alloc_percpu();

In steps (3) and (4), zone->pageset still points to boot_pageset, so
bad things may happen if two or more nodes are in this state. Even if
only one node is accessing the boot_pageset, (3) may still consume so
much memory that the allocations in step (7) fail.

Besides, making the rebuild atomic ensures that alloc_percpu() in
step (7) never fails, since a fresh memory block has just been added
in step (6).

[haicheng.li@linux.intel.com: hold zonelists_mutex when calling build_all_zonelists()]
Signed-off-by: Haicheng Li
Signed-off-by: Wu Fengguang
Reviewed-by: Andi Kleen
Cc: Christoph Lameter
Cc: Mel Gorman
Cc: Tejun Heo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h |  1 +
 kernel/cpu.c           |  5 ++++-
 mm/memory_hotplug.c    | 11 +++--------
 mm/page_alloc.c        | 15 ++++++++++++++-
 4 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a367ed5bb3fe..0fa491326c4a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -650,6 +650,7 @@ typedef struct pglist_data {
 #include
+extern struct mutex zonelists_mutex;
 void get_zone_counts(unsigned long *active, unsigned long *inactive,
             unsigned long *free);
 void build_all_zonelists(void *data);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3e8b3ba27175..124ad9d6be16 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -357,8 +357,11 @@ int __cpuinit cpu_up(unsigned int cpu)
         return -ENOMEM;
     }
-    if (pgdat->node_zonelists->_zonerefs->zone == NULL)
+    if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+        mutex_lock(&zonelists_mutex);
         build_all_zonelists(NULL);
+        mutex_unlock(&zonelists_mutex);
+    }
 #endif
     cpu_maps_update_begin();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 089cc97aed3c..a4cfcdc00455 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -389,11 +389,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
     int nid;
     int ret;
     struct memory_notify arg;
-    /*
-     * mutex to protect zone->pageset when it's still shared
-     * in onlined_pages()
-     */
-    static DEFINE_MUTEX(zone_pageset_mutex);
     arg.start_pfn = pfn;
     arg.nr_pages = nr_pages;
@@ -420,14 +415,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
      * This means the page allocator ignores this zone.
      * So, zonelist must be updated after online.
      */
-    mutex_lock(&zone_pageset_mutex);
+    mutex_lock(&zonelists_mutex);
     if (!populated_zone(zone))
         need_zonelists_rebuild = 1;
     ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
         online_pages_range);
     if (ret) {
-        mutex_unlock(&zone_pageset_mutex);
+        mutex_unlock(&zonelists_mutex);
         printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
             nr_pages, pfn);
         memory_notify(MEM_CANCEL_ONLINE, &arg);
@@ -441,7 +436,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
     else
         zone_pcp_update(zone);
-    mutex_unlock(&zone_pageset_mutex);
+    mutex_unlock(&zonelists_mutex);
     setup_per_zone_wmarks();
     calculate_zone_inactive_ratio(zone);
     if (onlined_pages) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 21c52d2d8624..08b349931ebc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2571,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
         strncpy((char*)table->data, saved_string,
             NUMA_ZONELIST_ORDER_LEN);
         user_zonelist_order = oldval;
-    } else if (oldval != user_zonelist_order)
+    } else if (oldval != user_zonelist_order) {
+        mutex_lock(&zonelists_mutex);
         build_all_zonelists(NULL);
+        mutex_unlock(&zonelists_mutex);
+    }
 }
 out:
     mutex_unlock(&zl_order_mutex);
@@ -2924,6 +2927,12 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static void setup_zone_pageset(struct zone *zone);

+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
+
 /* return values int ....just for stop_machine() */
 static __init_refok int __build_all_zonelists(void *data)
 {
@@ -2967,6 +2976,10 @@ static __init_refok int __build_all_zonelists(void *data)
     return 0;
 }

+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
 void build_all_zonelists(void *data)
 {
     set_zonelist_order();
--
cgit v1.2.3-55-g7522
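The locking contract this commit establishes is that every post-boot
caller, whether cpu_up(), online_pages(), or the numa_zonelist_order
sysctl handler, wraps build_all_zonelists() in zonelists_mutex. A
userspace model of that contract, with pthreads standing in for the
kernel mutex and a dummy counter standing in for the zonelist state
(both assumptions of the sketch, not kernel code):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t zonelists_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int rebuilds;    /* stands in for the zonelist state */

    static void build_all_zonelists(void *data)
    {
            (void)data;
            rebuilds++;     /* not atomic: relies on the caller's lock */
    }

    /* models cpu_up() / online_pages() / the sysctl handler */
    static void *online_something(void *arg)
    {
            pthread_mutex_lock(&zonelists_mutex);
            build_all_zonelists(NULL);
            pthread_mutex_unlock(&zonelists_mutex);
            return arg;
    }

    int main(void)
    {
            pthread_t t1, t2;

            pthread_create(&t1, NULL, online_something, NULL);
            pthread_create(&t2, NULL, online_something, NULL);
            pthread_join(t1, NULL);
            pthread_join(t2, NULL);
            printf("rebuilds: %d\n", rebuilds);   /* always 2 */
            return 0;
    }

Without the lock, the two rebuilds could interleave exactly like steps
(2) and (5)-(7) in the race diagram above.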
*/ - mutex_lock(&zone_pageset_mutex); + mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) need_zonelists_rebuild = 1; ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { - mutex_unlock(&zone_pageset_mutex); + mutex_unlock(&zonelists_mutex); printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); @@ -441,7 +436,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) else zone_pcp_update(zone); - mutex_unlock(&zone_pageset_mutex); + mutex_unlock(&zonelists_mutex); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21c52d2d8624..08b349931ebc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2571,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write, strncpy((char*)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) + } else if (oldval != user_zonelist_order) { + mutex_lock(&zonelists_mutex); build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } } out: mutex_unlock(&zl_order_mutex); @@ -2924,6 +2927,12 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static void setup_zone_pageset(struct zone *zone); +/* + * Global mutex to protect against size modification of zonelists + * as well as to serialize pageset setup for the new populated zone. + */ +DEFINE_MUTEX(zonelists_mutex); + /* return values int ....just for stop_machine() */ static __init_refok int __build_all_zonelists(void *data) { @@ -2967,6 +2976,10 @@ static __init_refok int __build_all_zonelists(void *data) return 0; } +/* + * Called with zonelists_mutex held always + * unless system_state == SYSTEM_BOOTING. + */ void build_all_zonelists(void *data) { set_zonelist_order(); -- cgit v1.2.3-55-g7522 From 7d52669b14e36f8365070324be009486d387ad00 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Mon, 24 May 2010 14:33:12 -0700 Subject: module: remove duplicate declaration of __ksymtab_gpl_future Minor cleanup on duplicate __{start/stop}__ksymtab_gpl_future. Signed-off-by: Wenji Huang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a8014bfb5a4e..625985e70e9d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -180,8 +180,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; -extern const struct kernel_symbol __start___ksymtab_gpl_future[]; -extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; -- cgit v1.2.3-55-g7522 From 69e4469a39b67e9923731d5d77d45c04837d5def Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 May 2010 14:33:26 -0700 Subject: sysctl: don't use own implementation of hex_to_bin() Remove own implementation of hex_to_bin(). Signed-off-by: Andy Shevchenko Cc: Eric W. 
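For readers unfamiliar with the helper: hex_to_bin() is declared in
<linux/kernel.h> and returns the value of one hex digit, or -1 for a
non-hex character, which is why two calls and a shift assemble one UUID
byte in the hunk above. The standalone re-implementation below is for
illustration only; its main() and sample input are invented:

    #include <ctype.h>
    #include <stdio.h>

    static int hex_to_bin(char ch)
    {
            if (isdigit((unsigned char)ch))
                    return ch - '0';
            ch = tolower((unsigned char)ch);
            if (ch >= 'a' && ch <= 'f')
                    return ch - 'a' + 10;
            return -1;      /* not a hex digit */
    }

    int main(void)
    {
            const char *str = "7d";   /* two hex digits of a UUID */
            unsigned char byte;

            if (hex_to_bin(str[0]) < 0 || hex_to_bin(str[1]) < 0)
                    return 1;
            byte = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
            printf("0x%02x\n", byte); /* prints 0x7d */
            return 0;
    }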
From 563b04671017ea00ba563ebeebdc36bce79b1b60 Mon Sep 17 00:00:00 2001
From: J. R. Okajima
Date: Tue, 25 May 2010 16:10:14 -0700
Subject: proc_dointvec: write a single value

Commit 00b7c3395aec3df43de5bd02a3c5a099ca51169f ("sysctl: refactor
integer handling proc code") modified the behaviour of writing to
/proc. Before that commit, write("1\n") to /proc/sys/kernel/printk
succeeded; now it returns -EINVAL. Restore support for writing a
single value to a multi-valued entry.

Signed-off-by: J. R. Okajima
Reviewed-and-tested-by: WANG Cong
Signed-off-by: David S. Miller
---
 kernel/sysctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b12583047757..f948f20f09cb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2253,6 +2253,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
     if (write) {
         left -= proc_skip_spaces(&kbuf);

+        if (!left)
+            break;
         err = proc_get_long(&kbuf, &left, &lval, &neg,
                     proc_wspace_sep,
                     sizeof(proc_wspace_sep), NULL);
@@ -2279,7 +2281,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
     if (!write && !first && left && !err)
         err = proc_put_char(&buffer, &left, '\n');
-    if (write && !err)
+    if (write && !err && left)
         left -= proc_skip_spaces(&kbuf);
 free:
     if (write) {
--
cgit v1.2.3-55-g7522
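To see the restored behaviour from userspace, a minimal reproducer
sketch (assumes root and that /proc/sys/kernel/printk holds four
values, as on mainline kernels; not part of the patch):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/sys/kernel/printk", O_WRONLY);

            if (fd < 0) {
                    perror("open");   /* needs root */
                    return 1;
            }
            /* before the fix, this short write failed with EINVAL;
             * with it, only the first value (console_loglevel) is
             * updated and the remaining three are left untouched */
            if (write(fd, "1\n", 2) != 2)
                    perror("write");
            close(fd);
            return 0;
    }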
From 218ce7351413b8287a80fab1d7b94906a5559f01 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Tue, 25 May 2010 16:48:30 -0700
Subject: Revert "module: drop the lock while waiting for module to complete initialization."

This reverts commit 480b02df3aa9f07d1c7df0cd8be7a5ca73893455, since
Rafael reports that it causes occasional kernel paging request faults
in load_module().

Dropping the module lock and re-taking it deep in the call-chain is
definitely not the right thing to do. That just turns the mutex from
a lock into a "random non-locking data structure" that doesn't
actually protect what it's supposed to protect.

Requested-and-tested-by: Rafael J. Wysocki
Cc: Rusty Russell
Cc: Brandon Philips
Cc: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/module.c | 59 +++++++++++++++++++++------------------------------------
 1 file changed, 22 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 625985e70e9d..333fbcc96978 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -563,26 +563,33 @@ int use_module(struct module *a, struct module *b)
     struct module_use *use;
     int no_warn, err;
-    if (b == NULL || already_uses(a, b))
-        return 0;
+    if (b == NULL || already_uses(a, b)) return 1;
     /* If we're interrupted or time out, we fail. */
-    err = strong_try_module_get(b);
+    if (wait_event_interruptible_timeout(
+            module_wq, (err = strong_try_module_get(b)) != -EBUSY,
+            30 * HZ) <= 0) {
+        printk("%s: gave up waiting for init of module %s.\n",
+               a->name, b->name);
+        return 0;
+    }
+
+    /* If strong_try_module_get() returned a different error, we fail. */
     if (err)
-        return err;
+        return 0;
     DEBUGP("Allocating new usage for %s.\n", a->name);
     use = kmalloc(sizeof(*use), GFP_ATOMIC);
     if (!use) {
         printk("%s: out of memory loading\n", a->name);
         module_put(b);
-        return -ENOMEM;
+        return 0;
     }
     use->module_which_uses = a;
     list_add(&use->list, &b->modules_which_use_me);
     no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
-    return 0;
+    return 1;
 }
 EXPORT_SYMBOL_GPL(use_module);
@@ -875,7 +882,7 @@ static inline void module_unload_free(struct module *mod)
 int use_module(struct module *a, struct module *b)
 {
-    return strong_try_module_get(b);
+    return strong_try_module_get(b) == 0;
 }
 EXPORT_SYMBOL_GPL(use_module);
@@ -1046,39 +1053,17 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
     struct module *owner;
     const struct kernel_symbol *sym;
     const unsigned long *crc;
-    DEFINE_WAIT(wait);
-    int err;
-    long timeleft = 30 * HZ;
-again:
     sym = find_symbol(name, &owner, &crc,
             !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
-    if (!sym)
-        return NULL;
-
-    if (!check_version(sechdrs, versindex, name, mod, crc, owner))
-        return NULL;
-
-    prepare_to_wait(&module_wq, &wait, TASK_INTERRUPTIBLE);
-    err = use_module(mod, owner);
-    if (likely(!err) || err != -EBUSY || signal_pending(current)) {
-        finish_wait(&module_wq, &wait);
-        return err ? NULL : sym;
-    }
-
-    /* Module is still loading.  Drop lock and wait. */
-    mutex_unlock(&module_mutex);
-    timeleft = schedule_timeout(timeleft);
-    mutex_lock(&module_mutex);
-    finish_wait(&module_wq, &wait);
-
-    /* Module might be gone entirely, or replaced.  Re-lookup. */
-    if (timeleft)
-        goto again;
-
-    printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
-           mod->name, owner->name);
-    return NULL;
+    /* use_module can fail due to OOM,
+       or module initialization or unloading */
+    if (sym) {
+        if (!check_version(sechdrs, versindex, name, mod, crc, owner)
+            || !use_module(mod, owner))
+            sym = NULL;
+    }
+    return sym;
 }
--
cgit v1.2.3-55-g7522
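The locking point in the revert message generalizes beyond modules:
anything validated while holding a mutex stops being valid the moment
the mutex is dropped, even if it is re-taken later. A deliberately
simplified userspace caricature of the reverted pattern (pthreads and
the toy pointer are assumptions of the sketch, not kernel code):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int value = 42;
    static int *shared_ptr = &value;    /* protected by lock */

    static int broken_wait(void)
    {
            int *p, ret;

            pthread_mutex_lock(&lock);
            p = shared_ptr;             /* validated under the lock */
            pthread_mutex_unlock(&lock);/* BAD: invariant released */

            /* ... sleep and wait here; another thread may free or
             * replace what p points to during this window ... */

            pthread_mutex_lock(&lock);
            ret = p ? *p : -1;          /* p may be stale by now */
            pthread_mutex_unlock(&lock);
            return ret;
    }

    int main(void)
    {
            printf("%d\n", broken_wait());
            return 0;
    }

This is the "random non-locking data structure" failure mode: the
second lock acquisition protects nothing that was established before
the gap, which is why load_module() could fault on a stale module.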