From a953e4597abd51b74c99e0e3b7074532a60fd031 Mon Sep 17 00:00:00 2001
From: Mike Travis
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: sched: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c

  * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c,
    where appropriate.  This saves some allocated space as well as many
    wasted cycles going through node entries that are non-existent.

For inclusion into sched-devel/latest tree.

Based on:
	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    +   sched-devel/latest  .../mingo/linux-2.6-sched-devel.git

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..1ed8011db826 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6879,9 +6879,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
 
 	min_val = INT_MAX;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
+		n = (node + i) % nr_node_ids;
 
 		if (!nr_cpus_node(n))
 			continue;
@@ -7075,7 +7075,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 		if (!sched_group_nodes)
 			continue;
 
-		for (i = 0; i < MAX_NUMNODES; i++) {
+		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
 			*nodemask = node_to_cpumask(i);
@@ -7263,7 +7263,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
 				    GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7407,7 +7407,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7431,7 +7431,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 					send_covered, tmpmask);
 	}
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Set up node groups */
 		struct sched_group *sg, *prev;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7470,9 +7470,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpus_or(*covered, *covered, *nodemask);
 		prev = sg;
 
-		for (j = 0; j < MAX_NUMNODES; j++) {
+		for (j = 0; j < nr_node_ids; j++) {
 			SCHED_CPUMASK_VAR(notcovered, allmasks);
-			int n = (i + j) % MAX_NUMNODES;
+			int n = (i + j) % nr_node_ids;
 			node_to_cpumask_ptr(pnodemask, n);
 
 			cpus_complement(*notcovered, *covered);
@@ -7525,7 +7525,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++)
+	for (i = 0; i < nr_node_ids; i++)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
 	if (sd_allnodes) {
-- 
cgit v1.2.3-55-g7522


From 363ab6f1424cdea63e5d182312d60e19077b892a Mon Sep 17 00:00:00 2001
From: Mike Travis
Date: Mon, 12 May 2008 21:21:13 +0200
Subject: core: use performance variant for_each_cpu_mask_nr

Change references from for_each_cpu_mask to for_each_cpu_mask_nr
where appropriate

Reviewed-by: Paul Jackson <pj@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 kernel/cpu.c        |  2 +-
 kernel/rcuclassic.c |  2 +-
 kernel/rcupreempt.c | 10 +++++-----
 kernel/sched.c      | 36 ++++++++++++++++++------------------
 kernel/sched_fair.c |  2 +-
 kernel/sched_rt.c   |  6 +++---
 kernel/taskstats.c  |  4 ++--
 kernel/workqueue.c  |  6 +++---
 8 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..50ae922c6022 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -390,7 +390,7 @@ void __ref enable_nonboot_cpus(void)
 		goto out;
 
 	printk("Enabling non-boot CPUs ...\n");
-	for_each_cpu_mask(cpu, frozen_cpus) {
+	for_each_cpu_mask_nr(cpu, frozen_cpus) {
 		error = _cpu_up(cpu, 1);
 		if (!error) {
 			printk("CPU%d is up\n", cpu);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..251358de70b0 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -92,7 +92,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		 */
 		cpumask = rcp->cpumask;
 		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask(cpu, cpumask)
+		for_each_cpu_mask_nr(cpu, cpumask)
 			smp_send_reschedule(cpu);
 	}
 }
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..18af270125cf 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -657,7 +657,7 @@ rcu_try_flip_idle(void)
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -675,7 +675,7 @@ rcu_try_flip_waitack(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		if (rcu_try_flip_waitack_needed(cpu) &&
 		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -707,7 +707,7 @@ rcu_try_flip_waitzero(void)
 	/* Check to see if the sum of the "last" counters is zero. */
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 	if (sum != 0) {
 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -722,7 +722,7 @@ rcu_try_flip_waitzero(void)
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -742,7 +742,7 @@ rcu_try_flip_waitmb(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		if (rcu_try_flip_waitmb_needed(cpu) &&
 		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ed8011db826..814d6e17f1e1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2271,7 +2271,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
 
-		for_each_cpu_mask(i, group->cpumask) {
+		for_each_cpu_mask_nr(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
@@ -2313,7 +2313,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
 	/* Traverse only the allowed CPUs */
 	cpus_and(*tmp, group->cpumask, p->cpus_allowed);
 
-	for_each_cpu_mask(i, *tmp) {
+	for_each_cpu_mask_nr(i, *tmp) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -3296,7 +3296,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
-		for_each_cpu_mask(i, group->cpumask) {
+		for_each_cpu_mask_nr(i, group->cpumask) {
 			struct rq *rq;
 
 			if (!cpu_isset(i, *cpus))
@@ -3560,7 +3560,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	unsigned long max_load = 0;
 	int i;
 
-	for_each_cpu_mask(i, group->cpumask) {
+	for_each_cpu_mask_nr(i, group->cpumask) {
 		unsigned long wl;
 
 		if (!cpu_isset(i, *cpus))
@@ -4100,7 +4100,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		int balance_cpu;
 
 		cpu_clear(this_cpu, cpus);
-		for_each_cpu_mask(balance_cpu, cpus) {
+		for_each_cpu_mask_nr(balance_cpu, cpus) {
 			/*
 			 * If this cpu gets work to do, stop the load balancing
 			 * work being done for other cpus. Next load
@@ -6832,7 +6832,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 
 	cpus_clear(*covered);
 
-	for_each_cpu_mask(i, *span) {
+	for_each_cpu_mask_nr(i, *span) {
 		struct sched_group *sg;
 		int group = group_fn(i, cpu_map, &sg, tmpmask);
 		int j;
@@ -6843,7 +6843,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 		cpus_clear(sg->cpumask);
 		sg->__cpu_power = 0;
 
-		for_each_cpu_mask(j, *span) {
+		for_each_cpu_mask_nr(j, *span) {
 			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
 				continue;
 
@@ -7043,7 +7043,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 	if (!sg)
 		return;
 	do {
-		for_each_cpu_mask(j, sg->cpumask) {
+		for_each_cpu_mask_nr(j, sg->cpumask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j);
@@ -7068,7 +7068,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
 	int cpu, i;
 
-	for_each_cpu_mask(cpu, *cpu_map) {
+	for_each_cpu_mask_nr(cpu, *cpu_map) {
 		struct sched_group **sched_group_nodes
 			= sched_group_nodes_bycpu[cpu];
 
@@ -7302,7 +7302,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = NULL, *p;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 
@@ -7374,7 +7374,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7391,7 +7391,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		SCHED_CPUMASK_VAR(this_core_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7458,7 +7458,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			goto error;
 		}
 		sched_group_nodes[i] = sg;
-		for_each_cpu_mask(j, *nodemask) {
+		for_each_cpu_mask_nr(j, *nodemask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(node_domains, j);
@@ -7504,21 +7504,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(cpu_domains, i);
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(core_domains, i);
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(phys_domains, i);
 
 		init_sched_groups_power(i, sd);
@@ -7538,7 +7538,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Attach the domains */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
@@ -7621,7 +7621,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 
 	unregister_sched_domain_sysctl();
 
-	for_each_cpu_mask(i, *cpu_map)
+	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
 	arch_destroy_sched_domains(cpu_map, &tmpmask);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..e398318f1014 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1022,7 +1022,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
-			for_each_cpu_mask(i, tmp) {
+			for_each_cpu_mask_nr(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
 						schedstat_inc(p,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1c..d73386c6e361 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -231,7 +231,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		return 1;
 
 	span = sched_rt_period_mask();
-	for_each_cpu_mask(i, span) {
+	for_each_cpu_mask_nr(i, span) {
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 		struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -272,7 +272,7 @@ static int balance_runtime(struct rt_rq *rt_rq)
 
 	spin_lock(&rt_b->rt_runtime_lock);
 	rt_period = ktime_to_ns(rt_b->rt_period);
-	for_each_cpu_mask(i, rd->span) {
+	for_each_cpu_mask_nr(i, rd->span) {
 		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 		s64 diff;
 
@@ -1000,7 +1000,7 @@ static int pull_rt_task(struct rq *this_rq)
 
 	next = pick_next_task_rt(this_rq);
 
-	for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
+	for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a23517169a6..06b17547f4e7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 		return -EINVAL;
 
 	if (isadd == REGISTER) {
-		for_each_cpu_mask(cpu, mask) {
+		for_each_cpu_mask_nr(cpu, mask) {
 			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
 					 cpu_to_node(cpu));
 			if (!s)
@@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 
 	/* Deregister or cleanup */
 cleanup:
-	for_each_cpu_mask(cpu, mask) {
+	for_each_cpu_mask_nr(cpu, mask) {
 		listeners = &per_cpu(listener_array, cpu);
 		down_write(&listeners->sem);
 		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29fc39f1029c..28c2b2c96ac7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -397,7 +397,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 	might_sleep();
 	lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
 	lock_release(&wq->lockdep_map, 1, _THIS_IP_);
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -477,7 +477,7 @@ static void wait_on_work(struct work_struct *work)
 	wq = cwq->wq;
 	cpu_map = wq_cpu_map(wq);
 
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
@@ -813,7 +813,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
 	put_online_cpus();
 
-- 
cgit v1.2.3-55-g7522


From bd3bff9e20f454b242d979ec2f9a4dca0d5fa06f Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: sched: add latency tracer callbacks to the scheduler

add 3 lightweight callbacks to the tracer backend.

zero impact if tracing is turned off.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h | 26 ++++++++++++++++++++++++++
 kernel/sched.c        |  3 +++
 2 files changed, 29 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5395a6176f4b..717cab8a0c83 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2117,6 +2117,32 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif
 
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern void
+ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next);
+#else
+static inline void
+ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+{
+}
+#endif
+
+#ifdef CONFIG_SCHED_TRACER
+extern void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr);
+extern void
+ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr);
+#else
+static inline void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+{
+}
+static inline void
+ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
+{
+}
+#endif
+
 extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..463dcdb36ef8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2467,6 +2467,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
+	ftrace_wake_up_task(p, rq->curr);
 	schedstat_inc(p, se.nr_wakeups);
 	if (sync)
 		schedstat_inc(p, se.nr_wakeups_sync);
@@ -2611,6 +2612,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
+	ftrace_wake_up_new_task(p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2783,6 +2785,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
+	ftrace_ctx_switch(prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
-- 
cgit v1.2.3-55-g7522


From 7c731e0a495e25e79dc1e9e68772a67a55721a65 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: ftrace: make the task state char-string visible to all

The tracer wants to be able to convert the state number
into a user visible character. This patch pulls that conversion
string out the scheduler into the header. This way if it were to
ever change, other parts of the kernel will know.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h | 2 ++
 kernel/sched.c        | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 717cab8a0c83..6e26f1fdbfe2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2237,6 +2237,8 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 }
 #endif /* CONFIG_MM_OWNER */
 
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 463dcdb36ef8..73e600852365 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5729,7 +5729,7 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 
 void sched_show_task(struct task_struct *p)
 {
-- 
cgit v1.2.3-55-g7522


From 6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: trace preempt off critical timings

Add preempt off timings. A lot of kernel core code is taken from the RT patch
latency trace that was written by Ingo Molnar.

This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers

Now instead of just tracing irqs off, preemption off can be selected
to be recorded.

When this is selected, it shares the same files as irqs off timings.
One can either trace preemption off, irqs off, or one or the other off.

By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording
of preempt off only is performed. "irqsoff" will only record the time
irqs are disabled, but "preemptirqsoff" will take the total time irqs
or preemption are disabled. Runtime switching of these options is now
supported by simpling echoing in the appropriate trace name into
/debugfs/tracing/current_tracer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process_32.c |   3 +
 include/linux/ftrace.h       |   8 ++
 include/linux/irqflags.h     |   3 +-
 include/linux/preempt.h      |   2 +-
 kernel/sched.c               |  24 +++++-
 kernel/trace/Kconfig         |  25 ++++++
 kernel/trace/Makefile        |   1 +
 kernel/trace/trace_irqsoff.c | 184 +++++++++++++++++++++++++++++++------------
 8 files changed, 197 insertions(+), 53 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f8476dfbb60d..a30aa1f2607a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -185,7 +185,10 @@ void cpu_idle(void)
 
 			local_irq_disable();
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
 			idle();
+			start_critical_timings();
 		}
 		tick_nohz_restart_sched_tick();
 		preempt_enable_no_resched();
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0a20445dcbcc..740c97dcf9cb 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -58,4 +58,12 @@ extern void mcount(void);
 # define time_hardirqs_off(a0, a1)		do { } while (0)
 #endif
 
+#ifdef CONFIG_PREEMPT_TRACER
+  extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1);
+  extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1);
+#else
+# define trace_preempt_on(a0, a1)		do { } while (0)
+# define trace_preempt_off(a0, a1)		do { } while (0)
+#endif
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 5b711d4e9fd9..2b1c2e58566e 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -41,7 +41,8 @@
 # define INIT_TRACE_IRQFLAGS
 #endif
 
-#ifdef CONFIG_IRQSOFF_TRACER
+#if defined(CONFIG_IRQSOFF_TRACER) || \
+	defined(CONFIG_PREEMPT_TRACER)
  extern void stop_critical_timings(void);
  extern void start_critical_timings(void);
 #else
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 36b03d50bf40..72b1a10a59b6 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -10,7 +10,7 @@
 #include <linux/linkage.h>
 #include <linux/list.h>
 
-#ifdef CONFIG_DEBUG_PREEMPT
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
   extern void add_preempt_count(int val);
   extern void sub_preempt_count(int val);
 #else
diff --git a/kernel/sched.c b/kernel/sched.c
index 73e600852365..328494e28df2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,6 +70,7 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
+#include <linux/ftrace.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -4365,26 +4366,44 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
 
 void __kprobes add_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
+#endif
 	preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
 
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
@@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val)
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
+#endif
 
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 896df1cf6adc..6430016b98e8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,6 +44,31 @@ config IRQSOFF_TRACER
 
 	      echo 0 > /debugfs/tracing/tracing_max_latency
 
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config PREEMPT_TRACER
+	bool "Preemption-off Latency Tracer"
+	default n
+	depends on GENERIC_TIME
+	depends on PREEMPT
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in preemption off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
 	depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 46be8647fb65..3fec653d6533 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a9131b0cf1a5..8b1231633dc5 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -21,6 +21,36 @@
 static struct trace_array		*irqsoff_trace __read_mostly;
 static int				tracer_enabled __read_mostly;
 
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+enum {
+	TRACER_IRQS_OFF		= (1 << 1),
+	TRACER_PREEMPT_OFF	= (1 << 2),
+};
+
+static int trace_type __read_mostly;
+
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int notrace
+preempt_trace(void)
+{
+	return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int notrace
+irq_trace(void)
+{
+	return ((trace_type & TRACER_IRQS_OFF) &&
+		irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+
 /*
  * Sequence count - we record it when starting a measurement and
  * skip the latency if the sequence has changed - some other section
@@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	long disabled;
 	int cpu;
 
-	if (likely(!tracer_enabled))
+	if (likely(!__get_cpu_var(tracing_cpu)))
 		return;
 
 	local_save_flags(flags);
 
-	if (!irqs_disabled_flags(flags))
-		return;
-
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
@@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	if (likely(!tracer_enabled))
 		return;
 
+	if (__get_cpu_var(tracing_cpu))
+		return;
+
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
 	if (unlikely(!data) || unlikely(!data->trace) ||
-	    data->critical_start || atomic_read(&data->disabled))
+	    atomic_read(&data->disabled))
 		return;
 
 	atomic_inc(&data->disabled);
 
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = now(cpu);
-	data->critical_start = parent_ip;
+	data->critical_start = parent_ip ? : ip;
 	tracing_reset(data);
 
 	local_save_flags(flags);
+
 	ftrace(tr, data, ip, parent_ip, flags);
 
+	__get_cpu_var(tracing_cpu) = 1;
+
 	atomic_dec(&data->disabled);
 }
 
@@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	struct trace_array_cpu *data;
 	unsigned long flags;
 
-	if (likely(!tracer_enabled))
+	/* Always clear the tracing cpu on stopping the trace */
+	if (unlikely(__get_cpu_var(tracing_cpu)))
+		__get_cpu_var(tracing_cpu) = 0;
+	else
+		return;
+
+	if (!tracer_enabled)
 		return;
 
 	cpu = raw_smp_processor_id();
@@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	atomic_inc(&data->disabled);
 	local_save_flags(flags);
 	ftrace(tr, data, ip, parent_ip, flags);
-	check_critical_timing(tr, data, parent_ip, cpu);
+	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
 }
 
+/* start and stop critical timings used to for stoppage (in idle) */
 void notrace start_critical_timings(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (preempt_trace() || irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 
 void notrace stop_critical_timings(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (preempt_trace() || irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 
+#ifdef CONFIG_IRQSOFF_TRACER
 #ifdef CONFIG_PROVE_LOCKING
 void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
 void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(a0, a1);
 }
 
@@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr)
  */
 void notrace trace_hardirqs_on(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 void notrace trace_hardirqs_off(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 void notrace trace_hardirqs_on_caller(unsigned long caller_addr)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
 void notrace trace_hardirqs_off_caller(unsigned long caller_addr)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
 #endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void notrace trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	stop_critical_timing(a0, a1);
+}
+
+void notrace trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
 
 static void start_irqsoff_tracer(struct trace_array *tr)
 {
@@ -345,7 +367,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 	tracer_enabled = 0;
 }
 
-static void irqsoff_tracer_init(struct trace_array *tr)
+static void __irqsoff_tracer_init(struct trace_array *tr)
 {
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visibel */
@@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter)
 		start_irqsoff_tracer(iter->tr);
 }
 
+#ifdef CONFIG_IRQSOFF_TRACER
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
 static struct tracer irqsoff_tracer __read_mostly =
 {
 	.name		= "irqsoff",
@@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
 };
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+	.name		= "preemptoff",
+	.init		= preemptoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+	defined(CONFIG_PREEMPT_TRACER)
+
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+	.name		= "preemptirqsoff",
+	.init		= preemptirqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
 
 __init static int init_irqsoff_tracer(void)
 {
-	register_tracer(&irqsoff_tracer);
+	register_irqsoff(irqsoff_tracer);
+	register_preemptoff(preemptoff_tracer);
+	register_preemptirqsoff(preemptirqsoff_tracer);
 
 	return 0;
 }
-- 
cgit v1.2.3-55-g7522


From 8ac0fca4ccb355ce50471d7aa3f10f5900b28b95 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 12 May 2008 21:20:51 +0200
Subject: ftrace: sched tracer fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h             |  6 ------
 kernel/sched.c                    |  2 +-
 kernel/trace/trace_sched_wakeup.c | 13 +++----------
 3 files changed, 4 insertions(+), 17 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e26f1fdbfe2..05744f9cb096 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2130,17 +2130,11 @@ ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
 #ifdef CONFIG_SCHED_TRACER
 extern void
 ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr);
-extern void
-ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr);
 #else
 static inline void
 ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
 {
 }
-static inline void
-ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
-{
-}
 #endif
 
 extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
diff --git a/kernel/sched.c b/kernel/sched.c
index 328494e28df2..53ab1174664f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2613,7 +2613,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	ftrace_wake_up_new_task(p, rq->curr);
+	ftrace_wake_up_task(p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 87fa7b253b57..2a012423f9d0 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -201,20 +201,13 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
-void
-wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
+void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
 {
 	if (likely(!tracer_enabled))
 		return;
 
-	wakeup_check_start(wakeup_trace, wakee, curr);
-}
-
-void
-ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
-{
-	if (likely(!tracer_enabled))
-		return;
+	tracing_record_cmdline(curr);
+	tracing_record_cmdline(wakee);
 
 	wakeup_check_start(wakeup_trace, wakee, curr);
 }
-- 
cgit v1.2.3-55-g7522


From 4e65551905fb0300ae7e667cbaa41ee2e3f29a13 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: sched tracer, trace full rbtree

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h             | 32 ++++++++++++++++-------
 kernel/sched.c                    | 35 ++++++++++++++++++++++---
 kernel/trace/trace.c              | 55 ++++++++++++++++-----------------------
 kernel/trace/trace.h              | 14 ++++++++++
 kernel/trace/trace_sched_switch.c | 24 +++++++++++------
 5 files changed, 108 insertions(+), 52 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 05744f9cb096..652d380ae563 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2119,20 +2119,34 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 extern void
-ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next);
+ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next);
+extern void
+ftrace_wake_up_task(void *rq, struct task_struct *wakee,
+		    struct task_struct *curr);
+extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data);
+extern void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3);
 #else
 static inline void
-ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next)
+{
+}
+static inline void
+sched_trace_special(unsigned long p1, unsigned long p2, unsigned long p3)
+{
+}
+static inline void
+ftrace_wake_up_task(void *rq, struct task_struct *wakee,
+		    struct task_struct *curr)
+{
+}
+static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 {
 }
-#endif
-
-#ifdef CONFIG_SCHED_TRACER
-extern void
-ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr);
-#else
 static inline void
-ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
 }
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 53ab1174664f..b9208a0e33a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2394,6 +2394,35 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+
+void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
+{
+	struct sched_entity *se;
+	struct task_struct *p;
+	struct rb_node *curr;
+	struct rq *rq = __rq;
+
+	curr = first_fair(&rq->cfs);
+	if (!curr)
+		return;
+
+	while (curr) {
+		se = rb_entry(curr, struct sched_entity, run_node);
+		if (!entity_is_task(se))
+			continue;
+
+		p = task_of(se);
+
+		__trace_special(__tr, __data,
+			      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+
+		curr = rb_next(curr);
+	}
+}
+
+#endif
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2468,7 +2497,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
-	ftrace_wake_up_task(p, rq->curr);
+	ftrace_wake_up_task(rq, p, rq->curr);
 	schedstat_inc(p, se.nr_wakeups);
 	if (sync)
 		schedstat_inc(p, se.nr_wakeups_sync);
@@ -2613,7 +2642,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	ftrace_wake_up_task(p, rq->curr);
+	ftrace_wake_up_task(rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2786,7 +2815,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	ftrace_ctx_switch(prev, next);
+	ftrace_ctx_switch(rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e4b7119e263..65173b14b914 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -66,7 +66,18 @@ static struct tracer		*current_trace __read_mostly;
 static int			max_tracer_type_len;
 
 static DEFINE_MUTEX(trace_types_lock);
-static DECLARE_WAIT_QUEUE_HEAD (trace_wait);
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+
+/*
+ * FIXME: where should this be called?
+ */
+void trace_wake_up(void)
+{
+	if (!(trace_flags & TRACE_ITER_BLOCK))
+		wake_up(&trace_wait);
+}
 
 #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
 
@@ -103,18 +114,6 @@ enum trace_flag_type {
 	TRACE_FLAG_SOFTIRQ		= 0x08,
 };
 
-enum trace_iterator_flags {
-	TRACE_ITER_PRINT_PARENT		= 0x01,
-	TRACE_ITER_SYM_OFFSET		= 0x02,
-	TRACE_ITER_SYM_ADDR		= 0x04,
-	TRACE_ITER_VERBOSE		= 0x08,
-	TRACE_ITER_RAW			= 0x10,
-	TRACE_ITER_HEX			= 0x20,
-	TRACE_ITER_BIN			= 0x40,
-	TRACE_ITER_BLOCK		= 0x80,
-	TRACE_ITER_STACKTRACE		= 0x100,
-};
-
 #define TRACE_ITER_SYM_MASK \
 	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
 
@@ -132,8 +131,6 @@ static const char *trace_options[] = {
 	NULL
 };
 
-static unsigned trace_flags = TRACE_ITER_PRINT_PARENT;
-
 static DEFINE_SPINLOCK(ftrace_max_lock);
 
 /*
@@ -660,9 +657,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up(&trace_wait);
 }
 
 void
@@ -673,10 +667,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 		trace_function(tr, data, ip, parent_ip, flags);
 }
 
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+
 void
-trace_special(struct trace_array *tr, struct trace_array_cpu *data,
-	      unsigned long arg1, unsigned long arg2, unsigned long arg3)
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
+	struct trace_array_cpu *data = __data;
+	struct trace_array *tr = __tr;
 	struct trace_entry *entry;
 	unsigned long irq_flags;
 
@@ -688,11 +686,10 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->special.arg2	= arg2;
 	entry->special.arg3	= arg3;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up(&trace_wait);
 }
 
+#endif
+
 void __trace_stack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags,
@@ -739,9 +736,6 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.next_prio	= next->prio;
 	__trace_stack(tr, data, flags, 4);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up(&trace_wait);
 }
 
 void
@@ -765,9 +759,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.next_prio	= wakee->prio;
 	__trace_stack(tr, data, flags, 5);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up(&trace_wait);
 }
 
 #ifdef CONFIG_FTRACE
@@ -1258,7 +1249,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 comm);
 		break;
 	case TRACE_SPECIAL:
-		trace_seq_printf(s, " %lx %lx %lx\n",
+		trace_seq_printf(s, " %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
@@ -1344,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter)
 			return 0;
 		break;
 	case TRACE_SPECIAL:
-		ret = trace_seq_printf(s, " %lx %lx %lx\n",
+		ret = trace_seq_printf(s, " %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
@@ -1409,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter)
 		break;
 	case TRACE_SPECIAL:
 	case TRACE_STACK:
-		ret = trace_seq_printf(s, " %lx %lx %lx\n",
+		ret = trace_seq_printf(s, " %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 387bdcf45e28..75e237475674 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -274,4 +274,18 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 
 extern void *head_page(struct trace_array_cpu *data);
 
+extern unsigned long trace_flags;
+
+enum trace_iterator_flags {
+	TRACE_ITER_PRINT_PARENT		= 0x01,
+	TRACE_ITER_SYM_OFFSET		= 0x02,
+	TRACE_ITER_SYM_ADDR		= 0x04,
+	TRACE_ITER_VERBOSE		= 0x08,
+	TRACE_ITER_RAW			= 0x10,
+	TRACE_ITER_HEX			= 0x20,
+	TRACE_ITER_BIN			= 0x40,
+	TRACE_ITER_BLOCK		= 0x80,
+	TRACE_ITER_STACKTRACE		= 0x100,
+};
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8b1cf1a3aee0..12658b3f2b28 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -18,7 +18,7 @@ static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
 
 static void
-ctx_switch_func(struct task_struct *prev, struct task_struct *next)
+ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array *tr = ctx_trace;
 	struct trace_array_cpu *data;
@@ -34,14 +34,17 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next)
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1))
+	if (likely(disabled == 1)) {
 		tracing_sched_switch_trace(tr, data, prev, next, flags);
+		ftrace_all_fair_tasks(__rq, tr, data);
+	}
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
 
-static void wakeup_func(struct task_struct *wakee, struct task_struct *curr)
+static void
+wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
 {
 	struct trace_array *tr = ctx_trace;
 	struct trace_array_cpu *data;
@@ -57,14 +60,18 @@ static void wakeup_func(struct task_struct *wakee, struct task_struct *curr)
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1))
+	if (likely(disabled == 1)) {
 		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+		ftrace_all_fair_tasks(__rq, tr, data);
+	}
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
 
-void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+void
+ftrace_ctx_switch(void *__rq, struct task_struct *prev,
+		  struct task_struct *next)
 {
 	tracing_record_cmdline(prev);
 
@@ -72,7 +79,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
 	 * If tracer_switch_func only points to the local
 	 * switch func, it still needs the ptr passed to it.
 	 */
-	ctx_switch_func(prev, next);
+	ctx_switch_func(__rq, prev, next);
 
 	/*
 	 * Chain to the wakeup tracer (this is a NOP if disabled):
@@ -81,11 +88,12 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
 }
 
 void
-ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+ftrace_wake_up_task(void *__rq, struct task_struct *wakee,
+		    struct task_struct *curr)
 {
 	tracing_record_cmdline(curr);
 
-	wakeup_func(wakee, curr);
+	wakeup_func(__rq, wakee, curr);
 
 	/*
 	 * Chain to the wakeup tracer (this is a NOP if disabled):
-- 
cgit v1.2.3-55-g7522


From 24cd5d111e8c713e62cda7ca1d01232402e3d3c9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: trace curr/next tasks

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index b9208a0e33a0..673b588b713b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2398,8 +2398,8 @@ static int sched_balance_self(int cpu, int flag)
 
 void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 {
-	struct sched_entity *se;
 	struct task_struct *p;
+	struct sched_entity *se;
 	struct rb_node *curr;
 	struct rq *rq = __rq;
 
@@ -2407,6 +2407,17 @@ void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 	if (!curr)
 		return;
 
+	if (rq->cfs.curr) {
+		p = task_of(rq->cfs.curr);
+		__trace_special(__tr, __data,
+		      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+	}
+	if (rq->cfs.next) {
+		p = task_of(rq->cfs.next);
+		__trace_special(__tr, __data,
+		      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+	}
+
 	while (curr) {
 		se = rb_entry(curr, struct sched_entity, run_node);
 		if (!entity_is_task(se))
-- 
cgit v1.2.3-55-g7522


From 017730c11241e26577673eb9d957cfc66172ea91 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: fix wakeups

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h |  2 ++
 kernel/sched.c        | 18 ++++++++++++++++++
 kernel/trace/trace.c  | 15 +++++++++++----
 3 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 652d380ae563..a3970b563757 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -246,6 +246,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
+extern int runqueue_is_locked(void);
+
 extern cpumask_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
diff --git a/kernel/sched.c b/kernel/sched.c
index 673b588b713b..9ca4a2e6a236 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -642,6 +642,24 @@ static inline void update_rq_clock(struct rq *rq)
 # define const_debug static const
 #endif
 
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+int runqueue_is_locked(void)
+{
+	int cpu = get_cpu();
+	struct rq *rq = cpu_rq(cpu);
+	int ret;
+
+	ret = spin_is_locked(&rq->lock);
+	put_cpu();
+	return ret;
+}
+
 /*
  * Debugging: various feature bits
  */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 65173b14b914..2ca9d66aa74e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -70,12 +70,13 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
 
-/*
- * FIXME: where should this be called?
- */
 void trace_wake_up(void)
 {
-	if (!(trace_flags & TRACE_ITER_BLOCK))
+	/*
+	 * The runqueue_is_locked() can fail, but this is the best we
+	 * have for now:
+	 */
+	if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
 		wake_up(&trace_wait);
 }
 
@@ -657,6 +658,8 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	trace_wake_up();
 }
 
 void
@@ -686,6 +689,8 @@ __trace_special(void *__tr, void *__data,
 	entry->special.arg2	= arg2;
 	entry->special.arg3	= arg3;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	trace_wake_up();
 }
 
 #endif
@@ -759,6 +764,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.next_prio	= wakee->prio;
 	__trace_stack(tr, data, flags, 5);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	trace_wake_up();
 }
 
 #ifdef CONFIG_FTRACE
-- 
cgit v1.2.3-55-g7522


From 36fc25a9f48deacd8aa08cd2d1c186a4e412604f Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: sched tree fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9ca4a2e6a236..3bc7c5362998 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2414,6 +2414,23 @@ static int sched_balance_self(int cpu, int flag)
 
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 
+void ftrace_task(struct task_struct *p, void *__tr, void *__data)
+{
+#if 0
+	/*  
+	 * trace timeline tree
+	 */
+	__trace_special(__tr, __data,
+			p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+#else
+	/*
+	 * trace balance metrics
+	 */
+	__trace_special(__tr, __data,
+			p->pid, p->se.avg_overlap, 0);
+#endif
+}
+
 void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 {
 	struct task_struct *p;
@@ -2421,32 +2438,22 @@ void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 	struct rb_node *curr;
 	struct rq *rq = __rq;
 
-	curr = first_fair(&rq->cfs);
-	if (!curr)
-		return;
-
 	if (rq->cfs.curr) {
 		p = task_of(rq->cfs.curr);
-		__trace_special(__tr, __data,
-		      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+		ftrace_task(p, __tr, __data);
 	}
 	if (rq->cfs.next) {
 		p = task_of(rq->cfs.next);
-		__trace_special(__tr, __data,
-		      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+		ftrace_task(p, __tr, __data);
 	}
 
-	while (curr) {
+	for (curr = first_fair(&rq->cfs); curr; curr = rb_next(curr)) {
 		se = rb_entry(curr, struct sched_entity, run_node);
 		if (!entity_is_task(se))
 			continue;
 
 		p = task_of(se);
-
-		__trace_special(__tr, __data,
-			      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
-
-		curr = rb_next(curr);
+		ftrace_task(p, __tr, __data);
 	}
 }
 
-- 
cgit v1.2.3-55-g7522


From 5429db2d26a59903c81a4f6c6dae7eb9daaea5fc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: fix wakeup callback

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3bc7c5362998..1ec3fb2efee6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2533,7 +2533,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
-	ftrace_wake_up_task(rq, p, rq->curr);
 	schedstat_inc(p, se.nr_wakeups);
 	if (sync)
 		schedstat_inc(p, se.nr_wakeups_sync);
@@ -2548,6 +2547,7 @@ out_activate:
 	success = 1;
 
 out_running:
+	ftrace_wake_up_task(rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 
 	p->state = TASK_RUNNING;
-- 
cgit v1.2.3-55-g7522


From 4d9493c90f8e6e1b164aede3814010a290161abb Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: ftrace: remove add-hoc code

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c                    | 47 ---------------------------------------
 kernel/sched_fair.c               |  3 ---
 kernel/trace/trace_sched_switch.c | 10 ++-------
 3 files changed, 2 insertions(+), 58 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1ec3fb2efee6..ad95cca4e42e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2412,53 +2412,6 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-
-void ftrace_task(struct task_struct *p, void *__tr, void *__data)
-{
-#if 0
-	/*  
-	 * trace timeline tree
-	 */
-	__trace_special(__tr, __data,
-			p->pid, p->se.vruntime, p->se.sum_exec_runtime);
-#else
-	/*
-	 * trace balance metrics
-	 */
-	__trace_special(__tr, __data,
-			p->pid, p->se.avg_overlap, 0);
-#endif
-}
-
-void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
-{
-	struct task_struct *p;
-	struct sched_entity *se;
-	struct rb_node *curr;
-	struct rq *rq = __rq;
-
-	if (rq->cfs.curr) {
-		p = task_of(rq->cfs.curr);
-		ftrace_task(p, __tr, __data);
-	}
-	if (rq->cfs.next) {
-		p = task_of(rq->cfs.next);
-		ftrace_task(p, __tr, __data);
-	}
-
-	for (curr = first_fair(&rq->cfs); curr; curr = rb_next(curr)) {
-		se = rb_entry(curr, struct sched_entity, run_node);
-		if (!entity_is_task(se))
-			continue;
-
-		p = task_of(se);
-		ftrace_task(p, __tr, __data);
-	}
-}
-
-#endif
-
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index dc1856f10795..e24ecd39c4b8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1061,8 +1061,6 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE))
 		return 0;
 
-	ftrace_special(__LINE__, curr->se.avg_overlap, sync);
-	ftrace_special(__LINE__, p->se.avg_overlap, -1);
 	/*
 	 * If the currently running task will sleep within
 	 * a reasonable amount of time then attract this newly
@@ -1240,7 +1238,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(se == pse))
 		return;
 
-	ftrace_special(__LINE__, p->pid, se->last_wakeup);
 	cfs_rq_of(pse)->next = pse;
 
 	/*
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index bddf676914ed..5671db0e1827 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -36,11 +36,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1)) {
+	if (likely(disabled == 1))
 		tracing_sched_switch_trace(tr, data, prev, next, flags);
-		if (trace_flags & TRACE_ITER_SCHED_TREE)
-			ftrace_all_fair_tasks(__rq, tr, data);
-	}
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
@@ -65,11 +62,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1)) {
+	if (likely(disabled == 1))
 		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
-		if (trace_flags & TRACE_ITER_SCHED_TREE)
-			ftrace_all_fair_tasks(__rq, tr, data);
-	}
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
-- 
cgit v1.2.3-55-g7522


From 5b82a1b08a00b2adca3d9dd9777efff40b7aaaa1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers
Date: Mon, 12 May 2008 21:21:10 +0200
Subject: Port ftrace to markers

Porting ftrace to the marker infrastructure.

Don't need to chain to the wakeup tracer from the sched tracer, because markers
support multiple probes connected.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h             |  32 -------
 kernel/sched.c                    |  14 +++-
 kernel/trace/trace.h              |  20 +----
 kernel/trace/trace_sched_switch.c | 171 +++++++++++++++++++++++++++++++-------
 kernel/trace/trace_sched_wakeup.c | 106 +++++++++++++++++++++--
 5 files changed, 255 insertions(+), 88 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 360ca99033d2..c0b1c69b55ce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2131,38 +2131,6 @@ __trace_special(void *__tr, void *__data,
 }
 #endif
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-extern void
-ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next);
-extern void
-ftrace_wake_up_task(void *rq, struct task_struct *wakee,
-		    struct task_struct *curr);
-extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data);
-extern void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
-#else
-static inline void
-ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next)
-{
-}
-static inline void
-sched_trace_special(unsigned long p1, unsigned long p2, unsigned long p3)
-{
-}
-static inline void
-ftrace_wake_up_task(void *rq, struct task_struct *wakee,
-		    struct task_struct *curr)
-{
-}
-static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
-{
-}
-static inline void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-}
-#endif
-
 extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index ad95cca4e42e..e2e985eeee78 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2500,7 +2500,9 @@ out_activate:
 	success = 1;
 
 out_running:
-	ftrace_wake_up_task(rq, p, rq->curr);
+	trace_mark(kernel_sched_wakeup,
+		"pid %d state %ld ## rq %p task %p rq->curr %p",
+		p->pid, p->state, rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 
 	p->state = TASK_RUNNING;
@@ -2631,7 +2633,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	ftrace_wake_up_task(rq, p, rq->curr);
+	trace_mark(kernel_sched_wakeup_new,
+		"pid %d state %ld ## rq %p task %p rq->curr %p",
+		p->pid, p->state, rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2804,7 +2808,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	ftrace_ctx_switch(rq, prev, next);
+	trace_mark(kernel_sched_schedule,
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		prev->pid, next->pid, prev->state,
+		rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8845033ab49d..f5de0601b408 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -234,25 +234,10 @@ void update_max_tr_single(struct trace_array *tr,
 
 extern cycle_t ftrace_now(int cpu);
 
-#ifdef CONFIG_SCHED_TRACER
-extern void
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next);
-extern void
-wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr);
-#else
-static inline void
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
-{
-}
-static inline void
-wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
-{
-}
-#endif
-
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 typedef void
 (*tracer_switch_func_t)(void *private,
+			void *__rq,
 			struct task_struct *prev,
 			struct task_struct *next);
 
@@ -262,9 +247,6 @@ struct tracer_switch_ops {
 	struct tracer_switch_ops	*next;
 };
 
-extern int register_tracer_switch(struct tracer_switch_ops *ops);
-extern int unregister_tracer_switch(struct tracer_switch_ops *ops);
-
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a3376478fc2c..d25ffa5eaf2b 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,11 +16,14 @@
 
 static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
+static atomic_t			sched_ref;
 
 static void
-ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
+sched_switch_func(void *private, void *__rq, struct task_struct *prev,
+			struct task_struct *next)
 {
-	struct trace_array *tr = ctx_trace;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
@@ -41,10 +44,40 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 	local_irq_restore(flags);
 }
 
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	if (!atomic_read(&sched_ref))
+		return;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	sched_switch_func(probe_data, __rq, prev, next);
+}
+
 static void
-wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
+wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
+			task_struct *curr)
 {
-	struct trace_array *tr = ctx_trace;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
@@ -67,35 +100,29 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
 	local_irq_restore(flags);
 }
 
-void
-ftrace_ctx_switch(void *__rq, struct task_struct *prev,
-		  struct task_struct *next)
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
 {
-	if (unlikely(atomic_read(&trace_record_cmdline_enabled)))
-		tracing_record_cmdline(prev);
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
 
-	/*
-	 * If tracer_switch_func only points to the local
-	 * switch func, it still needs the ptr passed to it.
-	 */
-	ctx_switch_func(__rq, prev, next);
+	if (likely(!tracer_enabled))
+		return;
 
-	/*
-	 * Chain to the wakeup tracer (this is a NOP if disabled):
-	 */
-	wakeup_sched_switch(prev, next);
-}
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
 
-void
-ftrace_wake_up_task(void *__rq, struct task_struct *wakee,
-		    struct task_struct *curr)
-{
-	wakeup_func(__rq, wakee, curr);
+	tracing_record_cmdline(task);
+	tracing_record_cmdline(curr);
 
-	/*
-	 * Chain to the wakeup tracer (this is a NOP if disabled):
-	 */
-	wakeup_sched_wakeup(wakee, curr);
+	wakeup_func(probe_data, __rq, task, curr);
 }
 
 void
@@ -132,15 +159,95 @@ static void sched_switch_reset(struct trace_array *tr)
 		tracing_reset(tr->data[cpu]);
 }
 
+static int tracing_sched_register(void)
+{
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&ctx_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return ret;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&ctx_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&ctx_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
+	return ret;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&ctx_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&ctx_trace);
+	return ret;
+}
+
+static void tracing_sched_unregister(void)
+{
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&ctx_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&ctx_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&ctx_trace);
+}
+
+void tracing_start_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_inc_return(&sched_ref);
+	if (ref == 1)
+		tracing_sched_register();
+}
+
+void tracing_stop_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_dec_and_test(&sched_ref);
+	if (ref)
+		tracing_sched_unregister();
+}
+
 static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
 	atomic_inc(&trace_record_cmdline_enabled);
 	tracer_enabled = 1;
+	tracing_start_sched_switch();
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
+	tracing_stop_sched_switch();
 	atomic_dec(&trace_record_cmdline_enabled);
 	tracer_enabled = 0;
 }
@@ -181,6 +288,14 @@ static struct tracer sched_switch_trace __read_mostly =
 
 __init static int init_sched_switch_trace(void)
 {
+	int ret = 0;
+
+	if (atomic_read(&sched_ref))
+		ret = tracing_sched_register();
+	if (ret) {
+		pr_info("error registering scheduler trace\n");
+		return ret;
+	}
 	return register_tracer(&sched_switch_trace);
 }
 device_initcall(init_sched_switch_trace);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5948011006bc..5d2fb48e47f8 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,6 +15,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
+#include <linux/marker.h>
 
 #include "trace.h"
 
@@ -44,11 +45,13 @@ static int report_latency(cycle_t delta)
 	return 1;
 }
 
-void
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
+static void notrace
+wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
+	struct task_struct *next)
 {
 	unsigned long latency = 0, t0 = 0, t1 = 0;
-	struct trace_array *tr = wakeup_trace;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
 	struct trace_array_cpu *data;
 	cycle_t T0, T1, delta;
 	unsigned long flags;
@@ -113,6 +116,31 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	wakeup_sched_switch(probe_data, __rq, prev, next);
+}
+
 static void __wakeup_reset(struct trace_array *tr)
 {
 	struct trace_array_cpu *data;
@@ -188,19 +216,68 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
-void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
 {
+	struct trace_array **ptr = probe_data;
+	struct trace_array *tr = *ptr;
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
+
 	if (likely(!tracer_enabled))
 		return;
 
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
+
+	tracing_record_cmdline(task);
 	tracing_record_cmdline(curr);
-	tracing_record_cmdline(wakee);
 
-	wakeup_check_start(wakeup_trace, wakee, curr);
+	wakeup_check_start(tr, task, curr);
 }
 
 static void start_wakeup_tracer(struct trace_array *tr)
 {
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&wakeup_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
 	wakeup_reset(tr);
 
 	/*
@@ -215,11 +292,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
 	tracer_enabled = 1;
 
 	return;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
 }
 
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
 }
 
 static void wakeup_tracer_init(struct trace_array *tr)
-- 
cgit v1.2.3-55-g7522


From 962cf36c5bf6d2840b8d66ee9a606fae2f540bbd Mon Sep 17 00:00:00 2001
From: Carlos R. Mafra
Date: Thu, 15 May 2008 11:15:37 -0300
Subject: Remove argument from open_softirq which is always NULL

As git-grep shows, open_softirq() is always called with the last argument
being NULL

block/blk-core.c:       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
kernel/hrtimer.c:       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
kernel/rcuclassic.c:    open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
kernel/rcupreempt.c:    open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
kernel/sched.c: open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
kernel/softirq.c:       open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
kernel/softirq.c:       open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
kernel/timer.c: open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
net/core/dev.c: open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
net/core/dev.c: open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

This observation has already been made by Matthew Wilcox in June 2002
(http://www.cs.helsinki.fi/linux/linux-kernel/2002-25/0687.html)

"I notice that none of the current softirq routines use the data element
passed to them."

and the situation hasn't changed since them. So it appears we can safely
remove that extra argument to save 128 (54) bytes of kernel data (text).

Signed-off-by: Carlos R. Mafra <crmafra@ift.unesp.br>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 block/blk-core.c          | 2 +-
 include/linux/interrupt.h | 3 +--
 kernel/hrtimer.c          | 2 +-
 kernel/rcuclassic.c       | 2 +-
 kernel/rcupreempt.c       | 2 +-
 kernel/sched.c            | 2 +-
 kernel/softirq.c          | 7 +++----
 kernel/timer.c            | 2 +-
 net/core/dev.c            | 4 ++--
 9 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6a9cc0d22a61..75fdc65136e8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2048,7 +2048,7 @@ int __init blk_dev_init(void)
 	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
-	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
+	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
 	register_hotcpu_notifier(&blk_cpu_notifier);
 
 	return 0;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f1fc7470d26c..a86186dd0474 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -285,12 +285,11 @@ enum
 struct softirq_action
 {
 	void	(*action)(struct softirq_action *);
-	void	*data;
 };
 
 asmlinkage void do_softirq(void);
 asmlinkage void __do_softirq(void);
-extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data);
+extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
 #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
 extern void raise_softirq_irqoff(unsigned int nr);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..861b4088092a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1669,7 +1669,7 @@ void __init hrtimers_init(void)
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
 #ifdef CONFIG_HIGH_RES_TIMERS
-	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
+	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
 #endif
 }
 
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..f6e01f3ae9c6 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -529,7 +529,7 @@ static void __cpuinit rcu_online_cpu(int cpu)
 
 	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
 	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..9dd827db359f 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1125,7 +1125,7 @@ void __init __rcu_init(void)
 	for_each_online_cpu(cpu)
 		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,	(void *)(long) cpu);
 
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
 /*
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..56ea3a203a5a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8154,7 +8154,7 @@ void __init sched_init(void)
 #endif
 
 #ifdef CONFIG_SMP
-	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..059256874e9b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,9 +347,8 @@ void raise_softirq(unsigned int nr)
 	local_irq_restore(flags);
 }
 
-void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
+void open_softirq(int nr, void (*action)(struct softirq_action *))
 {
-	softirq_vec[nr].data = data;
 	softirq_vec[nr].action = action;
 }
 
@@ -503,8 +502,8 @@ void __init softirq_init(void)
 			&per_cpu(tasklet_hi_vec, cpu).head;
 	}
 
-	open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
-	open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
+	open_softirq(TASKLET_SOFTIRQ, tasklet_action);
+	open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
 
 static int ksoftirqd(void * __bind_cpu)
diff --git a/kernel/timer.c b/kernel/timer.c
index ceacc6626572..b4da888497fa 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1502,7 +1502,7 @@ void __init init_timers(void)
 
 	BUG_ON(err == NOTIFY_BAD);
 	register_cpu_notifier(&timers_nb);
-	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
+	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 
 /**
diff --git a/net/core/dev.c b/net/core/dev.c
index 582963077877..cf0e16731dc7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4563,8 +4563,8 @@ static int __init net_dev_init(void)
 
 	dev_boot_phase = 0;
 
-	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
-	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
+	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
 	hotcpu_notifier(dev_cpu_callback, 0);
 	dst_init();
-- 
cgit v1.2.3-55-g7522


From c6531cce6e6e4b99bcda46b6268d6f2d9e30aea4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: sched: do not trace sched_clock

The tracer uses sched_clock, so do not trace it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index e2e985eeee78..6590a828138f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -884,12 +884,12 @@ static unsigned long long __cpu_clock(int cpu)
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
  */
-unsigned long long cpu_clock(int cpu)
+unsigned long long notrace cpu_clock(int cpu)
 {
 	unsigned long long prev_cpu_time, time, delta_time;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
 	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
 	delta_time = time-prev_cpu_time;
@@ -898,7 +898,7 @@ unsigned long long cpu_clock(int cpu)
 		time = __sync_cpu_clock(time, cpu);
 		per_cpu(prev_cpu_time, cpu) = time;
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return time;
 }
-- 
cgit v1.2.3-55-g7522


From 45c01e824991b2dd0a332e19efc4901acb31209f Mon Sep 17 00:00:00 2001
From: Gregory Haskins
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: sched: prioritize non-migratable tasks over migratable ones

Dmitry Adamushko pointed out a known flaw in the rt-balancing algorithm
that could allow suboptimal balancing if a non-migratable task gets
queued behind a running migratable one.  It is discussed in this thread:

http://lkml.org/lkml/2008/4/22/296

This issue has been further exacerbated by a recent checkin to
sched-devel (git-id 5eee63a5ebc19a870ac40055c0be49457f3a89a3).

>From a pure priority standpoint, the run-queue is doing the "right"
thing. Using Dmitry's nomenclature, if T0 is on cpu1 first, and T1
wakes up at equal or lower priority (affined only to cpu1) later, it
*should* wait for T0 to finish.  However, in reality that is likely
suboptimal from a system perspective if there are other cores that
could allow T0 and T1 to run concurrently.  Since T1 can not migrate,
the only choice for higher concurrency is to try to move T0.  This is
not something we addessed in the recent rt-balancing re-work.

This patch tries to enhance the balancing algorithm by accomodating this
scenario.  It accomplishes this by incorporating the migratability of a
task into its priority calculation.  Within a numerical tsk->prio, a
non-migratable task is logically higher than a migratable one.  We
maintain this by introducing a new per-priority queue (xqueue, or
exclusive-queue) for holding non-migratable tasks.  The scheduler will
draw from the xqueue over the standard shared-queue (squeue) when
available.

There are several details for utilizing this properly.

1) During task-wake-up, we not only need to check if the priority
   preempts the current task, but we also need to check for this
   non-migratable condition.  Therefore, if a non-migratable task wakes
   up and sees an equal priority migratable task already running, it
   will attempt to preempt it *if* there is a likelyhood that the
   current task will find an immediate home.

2) Tasks only get this non-migratable "priority boost" on wake-up.  Any
   requeuing will result in the non-migratable task being queued to the
   end of the shared queue.  This is an attempt to prevent the system
   from being completely unfair to migratable tasks during things like
   SCHED_RR timeslicing.

I am sure this patch introduces potentially "odd" behavior if you
concoct a scenario where a bunch of non-migratable threads could starve
migratable ones given the right pattern.  I am not yet convinced that
this is a problem since we are talking about tasks of equal RT priority
anyway, and there never is much in the way of guarantees against
starvation under that scenario anyway. (e.g. you could come up with a
similar scenario with a specific timing environment verses an affinity
environment).  I can be convinced otherwise, but for now I think this is
"ok".

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c    |  6 +++--
 kernel/sched_rt.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 72 insertions(+), 9 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index bfb8ad8ed171..7178b8c2351c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -151,7 +151,8 @@ static inline int task_has_rt_policy(struct task_struct *p)
  */
 struct rt_prio_array {
 	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
-	struct list_head queue[MAX_RT_PRIO];
+	struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */
+	struct list_head squeue[MAX_RT_PRIO];  /* shared queue */
 };
 
 struct rt_bandwidth {
@@ -7542,7 +7543,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 	array = &rt_rq->active;
 	for (i = 0; i < MAX_RT_PRIO; i++) {
-		INIT_LIST_HEAD(array->queue + i);
+		INIT_LIST_HEAD(array->xqueue + i);
+		INIT_LIST_HEAD(array->squeue + i);
 		__clear_bit(i, array->bitmap);
 	}
 	/* delimiter for bitsearch: */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3432d573205d..fefed39fafd8 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -458,7 +458,13 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
-	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	if (rt_se->nr_cpus_allowed == 1)
+		list_add_tail(&rt_se->run_list,
+			      array->xqueue + rt_se_prio(rt_se));
+	else
+		list_add_tail(&rt_se->run_list,
+			      array->squeue + rt_se_prio(rt_se));
+
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
 	inc_rt_tasks(rt_se, rt_rq);
@@ -470,7 +476,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 	struct rt_prio_array *array = &rt_rq->active;
 
 	list_del_init(&rt_se->run_list);
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
+	if (list_empty(array->squeue + rt_se_prio(rt_se))
+	    && list_empty(array->xqueue + rt_se_prio(rt_se)))
 		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 
 	dec_rt_tasks(rt_se, rt_rq);
@@ -537,13 +544,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
+ *
+ * Note: We always enqueue the task to the shared-queue, regardless of its
+ * previous position w.r.t. exclusive vs shared.  This is so that exclusive RR
+ * tasks fairly round-robin with all tasks on the runqueue, not just other
+ * exclusive tasks.
  */
 static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
 	struct rt_prio_array *array = &rt_rq->active;
 
-	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	list_del_init(&rt_se->run_list);
+	list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se));
 }
 
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -601,13 +614,46 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 }
 #endif /* CONFIG_SMP */
 
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+						   struct rt_rq *rt_rq);
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 {
-	if (p->prio < rq->curr->prio)
+	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
+		return;
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * If:
+	 *
+	 * - the newly woken task is of equal priority to the current task
+	 * - the newly woken task is non-migratable while current is migratable
+	 * - current will be preempted on the next reschedule
+	 *
+	 * we should check to see if current can readily move to a different
+	 * cpu.  If so, we will reschedule to allow the push logic to try
+	 * to move current somewhere else, making room for our non-migratable
+	 * task.
+	 */
+	if((p->prio == rq->curr->prio)
+	   && p->rt.nr_cpus_allowed == 1
+	   && rq->curr->rt.nr_cpus_allowed != 1
+	   && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) {
+		cpumask_t mask;
+
+		if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+			/*
+			 * There appears to be other cpus that can accept
+			 * current, so lets reschedule to try and push it away
+			 */
+			resched_task(rq->curr);
+	}
+#endif
 }
 
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -621,8 +667,15 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	idx = sched_find_first_bit(array->bitmap);
 	BUG_ON(idx >= MAX_RT_PRIO);
 
-	queue = array->queue + idx;
-	next = list_entry(queue->next, struct sched_rt_entity, run_list);
+	queue = array->xqueue + idx;
+	if (!list_empty(queue))
+		next = list_entry(queue->next, struct sched_rt_entity,
+				  run_list);
+	else {
+		queue = array->squeue + idx;
+		next = list_entry(queue->next, struct sched_rt_entity,
+				  run_list);
+	}
 
 	return next;
 }
@@ -692,7 +745,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 			continue;
 		if (next && next->prio < idx)
 			continue;
-		list_for_each_entry(rt_se, array->queue + idx, run_list) {
+		list_for_each_entry(rt_se, array->squeue + idx, run_list) {
 			struct task_struct *p = rt_task_of(rt_se);
 			if (pick_rt_task(rq, p, cpu)) {
 				next = p;
@@ -1146,6 +1199,14 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 		}
 
 		update_rt_migration(rq);
+
+		if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1))
+			/*
+			 * If either the new or old weight is a "1", we need
+			 * to requeue to properly move between shared and
+			 * exclusive queues.
+			 */
+			requeue_task_rt(rq, p);
 	}
 
 	p->cpus_allowed    = *new_mask;
-- 
cgit v1.2.3-55-g7522


From f333fdc9098b71e2687e4e9b6349fcb352960d66 Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Mon, 12 May 2008 21:20:55 +0200
Subject: sched: make !hrtick faster

it is safe to ignore timers and flags when the feature is disabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7178b8c2351c..aa960b84b881 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4134,7 +4134,7 @@ asmlinkage void __sched schedule(void)
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
-	int cpu;
+	int cpu, hrtick = sched_feat(HRTICK);
 
 need_resched:
 	preempt_disable();
@@ -4149,7 +4149,8 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
-	hrtick_clear(rq);
+	if (hrtick)
+		hrtick_clear(rq);
 
 	/*
 	 * Do the rq-clock update outside the rq lock:
@@ -4197,7 +4198,8 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	hrtick_set(rq);
+	if (hrtick)
+		hrtick_set(rq);
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-- 
cgit v1.2.3-55-g7522


From 6e0534f278199f1e3dd1049b9bc19a7a5b87ada1 Mon Sep 17 00:00:00 2001
From: Gregory Haskins
Date: Mon, 12 May 2008 21:21:01 +0200
Subject: sched: use a 2-d bitmap for searching lowest-pri CPU

The current code use a linear algorithm which causes scaling issues
on larger SMP machines.  This patch replaces that algorithm with a
2-dimensional bitmap to reduce latencies in the wake-up path.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile       |   1 +
 kernel/sched.c        |   7 ++
 kernel/sched_cpupri.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_cpupri.h |  36 +++++++++++
 kernel/sched_rt.c     |  98 ++++++----------------------
 5 files changed, 239 insertions(+), 77 deletions(-)
 create mode 100644 kernel/sched_cpupri.c
 create mode 100644 kernel/sched_cpupri.h

(limited to 'kernel/sched.c')

diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..ecdd2d335639 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_SMP) += sched_cpupri.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/sched.c b/kernel/sched.c
index aa960b84b881..8a1257b65560 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,6 +74,8 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
+#include "sched_cpupri.h"
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -450,6 +452,9 @@ struct root_domain {
 	 */
 	cpumask_t rto_mask;
 	atomic_t rto_count;
+#ifdef CONFIG_SMP
+	struct cpupri cpupri;
+#endif
 };
 
 /*
@@ -6392,6 +6397,8 @@ static void init_rootdomain(struct root_domain *rd)
 
 	cpus_clear(rd->span);
 	cpus_clear(rd->online);
+
+	cpupri_init(&rd->cpupri);
 }
 
 static void init_defrootdomain(void)
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
+/*
+ *  kernel/sched_cpupri.c
+ *
+ *  CPU priority management
+ *
+ *  Copyright (C) 2007-2008 Novell
+ *
+ *  Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ *  This code tracks the priority of each CPU so that global migration
+ *  decisions are easy to calculate.  Each CPU can be in a state as follows:
+ *
+ *                 (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ *  going from the lowest priority to the highest.  CPUs in the INVALID state
+ *  are not eligible for routing.  The system maintains this state with
+ *  a 2 dimensional bitmap (the first for priority class, the second for cpus
+ *  in that class).  Therefore a typical application without affinity
+ *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ *  searches).  For tasks with affinity restrictions, the algorithm has a
+ *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ *  yields the worst case search is fairly contrived.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; version 2
+ *  of the License.
+ */
+
+#include "sched_cpupri.h"
+
+/* Convert between a 140 based task->prio, and our 102 based cpupri */
+static int convert_prio(int prio)
+{
+	int cpupri;
+
+	if (prio == CPUPRI_INVALID)
+		cpupri = CPUPRI_INVALID;
+	else if (prio == MAX_PRIO)
+		cpupri = CPUPRI_IDLE;
+	else if (prio >= MAX_RT_PRIO)
+		cpupri = CPUPRI_NORMAL;
+	else
+		cpupri = MAX_RT_PRIO - prio + 1;
+
+	return cpupri;
+}
+
+#define for_each_cpupri_active(array, idx)                    \
+  for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES);     \
+       idx < CPUPRI_NR_PRIORITIES;                            \
+       idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invokation.  By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times.  While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+		cpumask_t *lowest_mask)
+{
+	int                  idx      = 0;
+	int                  task_pri = convert_prio(p->prio);
+
+	for_each_cpupri_active(cp->pri_active, idx) {
+		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+		cpumask_t mask;
+
+		if (idx >= task_pri)
+			break;
+
+		cpus_and(mask, p->cpus_allowed, vec->mask);
+
+		if (cpus_empty(mask))
+			continue;
+
+		*lowest_mask = mask;
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @pri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+	int                 *currpri = &cp->cpu_to_pri[cpu];
+	int                  oldpri  = *currpri;
+	unsigned long        flags;
+
+	newpri = convert_prio(newpri);
+
+	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+	if (newpri == oldpri)
+		return;
+
+	/*
+	 * If the cpu was currently mapped to a different value, we
+	 * first need to unmap the old value
+	 */
+	if (likely(oldpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		vec->count--;
+		if (!vec->count)
+			clear_bit(oldpri, cp->pri_active);
+		cpu_clear(cpu, vec->mask);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
+
+	if (likely(newpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		cpu_set(cpu, vec->mask);
+		vec->count++;
+		if (vec->count == 1)
+			set_bit(newpri, cp->pri_active);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
+
+	*currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Returns: (void)
+ */
+void cpupri_init(struct cpupri *cp)
+{
+	int i;
+
+	memset(cp, 0, sizeof(*cp));
+
+	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+		spin_lock_init(&vec->lock);
+		vec->count = 0;
+		cpus_clear(vec->mask);
+	}
+
+	for_each_possible_cpu(i)
+		cp->cpu_to_pri[i] = CPUPRI_INVALID;
+}
+
+
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..0b6a3d110fac
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO
+#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE     0
+#define CPUPRI_NORMAL   1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+	spinlock_t lock;
+	int        count;
+	cpumask_t  mask;
+};
+
+struct cpupri {
+	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+	long              pri_active[CPUPRI_NR_PRI_WORDS];
+	int               cpu_to_pri[NR_CPUS];
+};
+
+#ifdef CONFIG_SMP
+int  cpupri_find(struct cpupri *cp,
+		 struct task_struct *p, cpumask_t *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+void cpupri_init(struct cpupri *cp);
+#else
+#define cpupri_set(cp, cpu, pri) do { } while (0)
+#define cpupri_init() do { } while (0)
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fefed39fafd8..44b06d75416e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -391,8 +391,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
 		rt_rq->highest_prio = rt_se_prio(rt_se);
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se));
+	}
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1) {
@@ -416,6 +419,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+#ifdef CONFIG_SMP
+	int highest_prio = rt_rq->highest_prio;
+#endif
+
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running--;
@@ -439,6 +446,11 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		rq->rt.rt_nr_migratory--;
 	}
 
+	if (rt_rq->highest_prio != highest_prio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+	}
+
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -763,73 +775,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
 
-static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
-{
-	int       lowest_prio = -1;
-	int       lowest_cpu  = -1;
-	int       count       = 0;
-	int       cpu;
-
-	cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-
-	/*
-	 * Scan each rq for the lowest prio.
-	 */
-	for_each_cpu_mask(cpu, *lowest_mask) {
-		struct rq *rq = cpu_rq(cpu);
-
-		/* We look for lowest RT prio or non-rt CPU */
-		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-			/*
-			 * if we already found a low RT queue
-			 * and now we found this non-rt queue
-			 * clear the mask and set our bit.
-			 * Otherwise just return the queue as is
-			 * and the count==1 will cause the algorithm
-			 * to use the first bit found.
-			 */
-			if (lowest_cpu != -1) {
-				cpus_clear(*lowest_mask);
-				cpu_set(rq->cpu, *lowest_mask);
-			}
-			return 1;
-		}
-
-		/* no locking for now */
-		if ((rq->rt.highest_prio > task->prio)
-		    && (rq->rt.highest_prio >= lowest_prio)) {
-			if (rq->rt.highest_prio > lowest_prio) {
-				/* new low - clear old data */
-				lowest_prio = rq->rt.highest_prio;
-				lowest_cpu = cpu;
-				count = 0;
-			}
-			count++;
-		} else
-			cpu_clear(cpu, *lowest_mask);
-	}
-
-	/*
-	 * Clear out all the set bits that represent
-	 * runqueues that were of higher prio than
-	 * the lowest_prio.
-	 */
-	if (lowest_cpu > 0) {
-		/*
-		 * Perhaps we could add another cpumask op to
-		 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
-		 * Then that could be optimized to use memset and such.
-		 */
-		for_each_cpu_mask(cpu, *lowest_mask) {
-			if (cpu >= lowest_cpu)
-				break;
-			cpu_clear(cpu, *lowest_mask);
-		}
-	}
-
-	return count;
-}
-
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
 	int first;
@@ -851,17 +796,12 @@ static int find_lowest_rq(struct task_struct *task)
 	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
-	int count    = find_lowest_cpus(task, lowest_mask);
 
-	if (!count)
-		return -1; /* No targets found */
+	if (task->rt.nr_cpus_allowed == 1)
+		return -1; /* No other targets possible */
 
-	/*
-	 * There is no sense in performing an optimal search if only one
-	 * target is found.
-	 */
-	if (count == 1)
-		return first_cpu(*lowest_mask);
+	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+		return -1; /* No targets found */
 
 	/*
 	 * At this point we have built a mask of cpus representing the
@@ -1218,6 +1158,8 @@ static void join_domain_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_set_overload(rq);
+
+	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
 }
 
 /* Assumes rq->lock is held */
@@ -1225,6 +1167,8 @@ static void leave_domain_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
+
+	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From d07355f5def74d060333563b36ab51b89fd44cdd Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Mon, 12 May 2008 21:21:15 +0200
Subject: sched: check for SD_SERIALIZE atomically in rebalance_domains()

Nothing really serious here, mainly just a matter of nit-picking :-/

From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
For CONFIG_SCHED_DEBUG && CONFIG_SYSCT configs, sd->flags can be altered
while being manipulated in rebalance_domains(). Let's do an atomic check.
We rely here on the atomicity of read/write accesses for aligned words.

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8a1257b65560..90329f1f8941 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3668,6 +3668,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
+	int need_serialize;
 	cpumask_t tmp;
 
 	for_each_domain(cpu, sd) {
@@ -3685,8 +3686,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (interval > HZ*NR_CPUS/10)
 			interval = HZ*NR_CPUS/10;
 
+		need_serialize = sd->flags & SD_SERIALIZE;
 
-		if (sd->flags & SD_SERIALIZE) {
+		if (need_serialize) {
 			if (!spin_trylock(&balancing))
 				goto out;
 		}
@@ -3702,7 +3704,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 			}
 			sd->last_balance = jiffies;
 		}
-		if (sd->flags & SD_SERIALIZE)
+		if (need_serialize)
 			spin_unlock(&balancing);
 out:
 		if (time_after(next_balance, sd->last_balance + interval)) {
-- 
cgit v1.2.3-55-g7522


From f7dcd80bbc8e7032443e6539ea1b830364f82200 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 24 May 2008 23:20:38 +0200
Subject: namespacecheck: fixes in kernel/sched.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 90329f1f8941..02a5eeedcb94 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1633,7 +1633,7 @@ inline int task_curr(const struct task_struct *p)
 }
 
 /* Used instead of source_load when we know the type == 0 */
-unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(const int cpu)
 {
 	return cpu_rq(cpu)->load.weight;
 }
-- 
cgit v1.2.3-55-g7522


From 81d41d7ece23a1c3b4bcd1604026d3a06cc4dc79 Mon Sep 17 00:00:00 2001
From: Rabin Vincent
Date: Sun, 11 May 2008 05:55:33 +0530
Subject: sched: fix defined-but-unused warning

Fix this warning, which appears with !CONFIG_SMP:
kernel/sched.c:1216: warning: `init_hrtick' defined but not used

Signed-off-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 02a5eeedcb94..f3faec52c5ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1130,6 +1130,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+#ifdef CONFIG_SMP
 static void hotplug_hrtick_disable(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -1185,6 +1186,7 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
+#endif /* CONFIG_SMP */
 
 static void init_rq_hrtick(struct rq *rq)
 {
-- 
cgit v1.2.3-55-g7522


From e21f5b153b9b4a6775d7d41964e372e13a9178ab Mon Sep 17 00:00:00 2001
From: Arjan van de Ven
Date: Fri, 23 May 2008 09:05:58 -0700
Subject: sched: print module list in the "scheduling while atomic" warning

For the normal WARN_ON() etc we added a print-the-modules-list already,
which is very useful to figure out candidates for certain types of bugs.

This patch adds the same print to the "scheduling while atomic" BUG warning,
for the same reason: when we get here it's very useful to see which modules
are loaded, to narrow down the candidate code list.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: mingo@elte.hu
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index f3faec52c5ab..84a360670b9d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4070,6 +4070,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 		prev->comm, prev->pid, preempt_count());
 
 	debug_show_held_locks(prev);
+	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 
-- 
cgit v1.2.3-55-g7522


From 6d6bc0ad867c46896d0994bb039e7550ecb9b51d Mon Sep 17 00:00:00 2001
From: Dhaval Giani
Date: Fri, 30 May 2008 14:23:45 +0200
Subject: sched: add comments for ifdefs in sched.c

make sched.c easier to read.

Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 76 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 84a360670b9d..ef4e25604bbe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -292,15 +292,15 @@ struct task_group root_task_group;
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-#endif
-#else
+#endif /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 #define root_task_group init_task_group
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -310,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
-#else
+#else /* !CONFIG_USER_SCHED */
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
-#endif
+#endif /* CONFIG_USER_SCHED */
 
 /*
  * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
@@ -1316,15 +1316,15 @@ void wake_up_idle_cpu(int cpu)
 	if (!tsk_is_polling(rq->idle))
 		smp_send_reschedule(cpu);
 }
-#endif
+#endif /* CONFIG_NO_HZ */
 
-#else
+#else /* !CONFIG_SMP */
 static void __resched_task(struct task_struct *p, int tif_bit)
 {
 	assert_spin_locked(&task_rq(p)->lock);
 	set_tsk_thread_flag(p, tif_bit);
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #if BITS_PER_LONG == 32
 # define WMULT_CONST	(~0UL)
@@ -2129,7 +2129,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 			}
 		}
 	}
-#endif
+#endif /* CONFIG_SCHEDSTATS */
 
 out_activate:
 #endif /* CONFIG_SMP */
@@ -2329,7 +2329,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 		notifier->ops->sched_out(notifier, next);
 }
 
-#else
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
 
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
@@ -2341,7 +2341,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 {
 }
 
-#endif
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
 
 /**
  * prepare_task_switch - prepare to switch tasks
@@ -6300,9 +6300,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 	}
 	kfree(groupmask);
 }
-#else
+#else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
 
 static int sd_degenerate(struct sched_domain *sd)
 {
@@ -6598,7 +6598,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
 		cpus_or(*span, *span, *nodemask);
 	}
 }
-#endif
+#endif /* CONFIG_NUMA */
 
 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
@@ -6617,7 +6617,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 		*sg = &per_cpu(sched_group_cpus, cpu);
 	return cpu;
 }
-#endif
+#endif /* CONFIG_SCHED_SMT */
 
 /*
  * multi-core sched-domains:
@@ -6625,7 +6625,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
-#endif
+#endif /* CONFIG_SCHED_MC */
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
@@ -6727,7 +6727,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		sg = sg->next;
 	} while (sg != group_head);
 }
-#endif
+#endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
@@ -6764,11 +6764,11 @@ next_sg:
 		sched_group_nodes_bycpu[cpu] = NULL;
 	}
 }
-#else
+#else /* !CONFIG_NUMA */
 static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
 }
-#endif
+#endif /* CONFIG_NUMA */
 
 /*
  * Initialize sched groups cpu_power.
@@ -7459,7 +7459,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 #endif
 	return err;
 }
-#endif
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
 /*
  * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7677,8 +7677,8 @@ void __init sched_init(void)
 
 		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 		init_task_group.rt_se = (struct sched_rt_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
@@ -7692,8 +7692,8 @@ void __init sched_init(void)
 
 		root_task_group.rt_rq = (struct rt_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
 	}
 
 #ifdef CONFIG_SMP
@@ -7709,8 +7709,8 @@ void __init sched_init(void)
 #ifdef CONFIG_USER_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), RUNTIME_INF);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_GROUP_SCHED
 	list_add(&init_task_group.list, &task_groups);
@@ -7720,8 +7720,8 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.children);
 	init_task_group.parent = &root_task_group;
 	list_add(&init_task_group.siblings, &root_task_group.children);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_GROUP_SCHED */
 
 	for_each_possible_cpu(i) {
 		struct rq *rq;
@@ -8040,7 +8040,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 	list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
 }
-#else
+#else /* !CONFG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8058,7 +8058,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static void free_rt_sched_group(struct task_group *tg)
@@ -8129,7 +8129,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 {
 	list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
 }
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
 }
@@ -8147,7 +8147,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 {
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_GROUP_SCHED
 static void free_sched_group(struct task_group *tg)
@@ -8258,7 +8258,7 @@ void sched_move_task(struct task_struct *tsk)
 
 	task_rq_unlock(rq, &flags);
 }
-#endif
+#endif /* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -8508,7 +8508,7 @@ static int sched_rt_global_constraints(void)
 
 	return ret;
 }
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
@@ -8526,7 +8526,7 @@ static int sched_rt_global_constraints(void)
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 int sched_rt_handler(struct ctl_table *table, int write,
 		struct file *filp, void __user *buffer, size_t *lenp,
@@ -8634,7 +8634,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) tg->shares;
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8658,7 +8658,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
 {
 	return sched_group_rt_period(cgroup_tg(cgrp));
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
cgit v1.2.3-55-g7522


From 099f98c8a1f13501a98afbfff4756395a610581c Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy
Date: Thu, 29 May 2008 20:56:32 +0530
Subject: sched: print the sd->level in sched_domain_debug code

While printing out the visual representation of the sched-domains, print
the level (MC, SMT, CPU, NODE, ... ) of each of the sched_domains.

Credit: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index ef4e25604bbe..dc0be113f41d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6197,6 +6197,28 @@ void __init migration_init(void)
 
 #ifdef CONFIG_SCHED_DEBUG
 
+static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+{
+	switch (lvl) {
+	case SD_LV_NONE:
+			return "NONE";
+	case SD_LV_SIBLING:
+			return "SIBLING";
+	case SD_LV_MC:
+			return "MC";
+	case SD_LV_CPU:
+			return "CPU";
+	case SD_LV_NODE:
+			return "NODE";
+	case SD_LV_ALLNODES:
+			return "ALLNODES";
+	case SD_LV_MAX:
+			return "MAX";
+
+	}
+	return "MAX";
+}
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  cpumask_t *groupmask)
 {
@@ -6216,7 +6238,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %s\n", str);
+	printk(KERN_CONT "span %s level %s\n",
+		str, sd_level_to_string(sd->level));
 
 	if (!cpu_isset(cpu, sd->span)) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
-- 
cgit v1.2.3-55-g7522


From 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 Mon Sep 17 00:00:00 2001
From: Gregory Haskins
Date: Wed, 4 Jun 2008 15:04:05 -0400
Subject: sched: fix cpupri hotplug support

The RT folks over at RedHat found an issue w.r.t. hotplug support which
was traced to problems with the cpupri infrastructure in the scheduler:

https://bugzilla.redhat.com/show_bug.cgi?id=449676

This bug affects 23-rt12+, 24-rtX, 25-rtX, and sched-devel.  This patch
applies to 25.4-rt4, though it should trivially apply to most cpupri enabled
kernels mentioned above.

It turned out that the issue was that offline cpus could get inadvertently
registered with cpupri so that they were erroneously selected during
migration decisions.  The end result would be an OOPS as the offline cpu
had tasks routed to it.

This patch generalizes the old join/leave domain interface into an
online/offline interface, and adjusts the root-domain/hotplug code to
utilize it.

I was able to easily reproduce the issue prior to this patch, and am no
longer able to reproduce it after this patch.  I can offline cpus
indefinately and everything seems to be in working order.

Thanks to Arnaldo (acme), Thomas, and Peter for doing the legwork to point
me in the right direction.  Also thank you to Peter for reviewing the
early iterations of this patch.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h |  4 ++--
 kernel/sched.c        | 54 ++++++++++++++++++++++++++++++++++++++-------------
 kernel/sched_rt.c     | 24 +++++++++++++++++------
 3 files changed, 60 insertions(+), 22 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ea2857b99596..d25acf600a32 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -903,8 +903,8 @@ struct sched_class {
 	void (*set_cpus_allowed)(struct task_struct *p,
 				 const cpumask_t *newmask);
 
-	void (*join_domain)(struct rq *rq);
-	void (*leave_domain)(struct rq *rq);
+	void (*rq_online)(struct rq *rq);
+	void (*rq_offline)(struct rq *rq);
 
 	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
 			       int running);
diff --git a/kernel/sched.c b/kernel/sched.c
index dc0be113f41d..f0ed81b71282 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -529,6 +529,7 @@ struct rq {
 	int push_cpu;
 	/* cpu of this runqueue: */
 	int cpu;
+	int online;
 
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
@@ -1498,6 +1499,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 
 #define sched_class_highest (&rt_sched_class)
+#define for_each_class(class) \
+   for (class = sched_class_highest; class; class = class->next)
 
 static inline void inc_load(struct rq *rq, const struct task_struct *p)
 {
@@ -6065,6 +6068,36 @@ static void unregister_sched_domain_sysctl(void)
 }
 #endif
 
+static void set_rq_online(struct rq *rq)
+{
+	if (!rq->online) {
+		const struct sched_class *class;
+
+		cpu_set(rq->cpu, rq->rd->online);
+		rq->online = 1;
+
+		for_each_class(class) {
+			if (class->rq_online)
+				class->rq_online(rq);
+		}
+	}
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+	if (rq->online) {
+		const struct sched_class *class;
+
+		for_each_class(class) {
+			if (class->rq_offline)
+				class->rq_offline(rq);
+		}
+
+		cpu_clear(rq->cpu, rq->rd->online);
+		rq->online = 0;
+	}
+}
+
 /*
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
@@ -6102,7 +6135,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpu_isset(cpu, rq->rd->span));
-			cpu_set(cpu, rq->rd->online);
+
+			set_rq_online(rq);
 		}
 		spin_unlock_irqrestore(&rq->lock, flags);
 		break;
@@ -6163,7 +6197,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpu_isset(cpu, rq->rd->span));
-			cpu_clear(cpu, rq->rd->online);
+			set_rq_offline(rq);
 		}
 		spin_unlock_irqrestore(&rq->lock, flags);
 		break;
@@ -6385,20 +6419,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
 	unsigned long flags;
-	const struct sched_class *class;
 
 	spin_lock_irqsave(&rq->lock, flags);
 
 	if (rq->rd) {
 		struct root_domain *old_rd = rq->rd;
 
-		for (class = sched_class_highest; class; class = class->next) {
-			if (class->leave_domain)
-				class->leave_domain(rq);
-		}
+		if (cpu_isset(rq->cpu, old_rd->online))
+			set_rq_offline(rq);
 
 		cpu_clear(rq->cpu, old_rd->span);
-		cpu_clear(rq->cpu, old_rd->online);
 
 		if (atomic_dec_and_test(&old_rd->refcount))
 			kfree(old_rd);
@@ -6409,12 +6439,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 	cpu_set(rq->cpu, rd->span);
 	if (cpu_isset(rq->cpu, cpu_online_map))
-		cpu_set(rq->cpu, rd->online);
-
-	for (class = sched_class_highest; class; class = class->next) {
-		if (class->join_domain)
-			class->join_domain(rq);
-	}
+		set_rq_online(rq);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -7824,6 +7849,7 @@ void __init sched_init(void)
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
 		rq->cpu = i;
+		rq->online = 0;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq_attach_root(rq, &def_root_domain);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 44b06d75416e..e4821593d4de 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
 
 static inline void rt_set_overload(struct rq *rq)
 {
+	if (!rq->online)
+		return;
+
 	cpu_set(rq->cpu, rq->rd->rto_mask);
 	/*
 	 * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
 
 static inline void rt_clear_overload(struct rq *rq)
 {
+	if (!rq->online)
+		return;
+
 	/* the order here really doesn't matter */
 	atomic_dec(&rq->rd->rto_count);
 	cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -394,7 +400,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
 		struct rq *rq = rq_of_rt_rq(rt_rq);
 		rt_rq->highest_prio = rt_se_prio(rt_se);
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se));
+
+		if (rq->online)
+			cpupri_set(&rq->rd->cpupri, rq->cpu,
+				   rt_se_prio(rt_se));
 	}
 #endif
 #ifdef CONFIG_SMP
@@ -448,7 +457,10 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
 	if (rt_rq->highest_prio != highest_prio) {
 		struct rq *rq = rq_of_rt_rq(rt_rq);
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+
+		if (rq->online)
+			cpupri_set(&rq->rd->cpupri, rq->cpu,
+				   rt_rq->highest_prio);
 	}
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
@@ -1154,7 +1166,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 }
 
 /* Assumes rq->lock is held */
-static void join_domain_rt(struct rq *rq)
+static void rq_online_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_set_overload(rq);
@@ -1163,7 +1175,7 @@ static void join_domain_rt(struct rq *rq)
 }
 
 /* Assumes rq->lock is held */
-static void leave_domain_rt(struct rq *rq)
+static void rq_offline_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
@@ -1331,8 +1343,8 @@ static const struct sched_class rt_sched_class = {
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
 	.set_cpus_allowed       = set_cpus_allowed_rt,
-	.join_domain            = join_domain_rt,
-	.leave_domain           = leave_domain_rt,
+	.rq_online              = rq_online_rt,
+	.rq_offline             = rq_offline_rt,
 	.pre_schedule		= pre_schedule_rt,
 	.post_schedule		= post_schedule_rt,
 	.task_wake_up		= task_wake_up_rt,
-- 
cgit v1.2.3-55-g7522


From 5c8e1ed1d204a6770ca2854cd3b3597070fe7e5a Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Thu, 29 May 2008 11:17:01 -0700
Subject: sched: CPU hotplug events must not destroy scheduler domains created
 by the cpusets

First issue is not related to the cpusets. We're simply leaking doms_cur.
It's allocated in arch_init_sched_domains() which is called for every
hotplug event. So we just keep reallocation doms_cur without freeing it.
I introduced free_sched_domains() function that cleans things up.

Second issue is that sched domains created by the cpusets are
completely destroyed by the CPU hotplug events. For all CPU hotplug
events scheduler attaches all CPUs to the NULL domain and then puts
them all into the single domain thereby destroying domains created
by the cpusets (partition_sched_domains).
The solution is simple, when cpusets are enabled scheduler should not
create default domain and instead let cpusets do that. Which is
exactly what the patch does.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Cc: pj@sgi.com
Cc: menage@google.com
Cc: rostedt@goodmis.org
Cc: mingo@elte.hu
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpuset.c |  6 ++++++
 kernel/sched.c  | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 86ea9e34e326..6090d18b58a9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1886,6 +1886,12 @@ static void common_cpu_mem_hotplug_unplug(void)
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 	scan_for_empty_cpusets(&top_cpuset);
 
+	/*
+	 * Scheduler destroys domains on hotplug events.
+	 * Rebuild them based on the current settings.
+	 */
+	rebuild_sched_domains();
+
 	cgroup_unlock();
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index f0ed81b71282..1ddb0a8c7976 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7292,6 +7292,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 {
 }
 
+/*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+	ndoms_cur = 0;
+	if (doms_cur != &fallback_doms)
+		kfree(doms_cur);
+	doms_cur = &fallback_doms;
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -7439,6 +7451,7 @@ int arch_reinit_sched_domains(void)
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	detach_destroy_domains(&cpu_online_map);
+	free_sched_domains();
 	err = arch_init_sched_domains(&cpu_online_map);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
@@ -7524,6 +7537,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		detach_destroy_domains(&cpu_online_map);
+		free_sched_domains();
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
@@ -7542,8 +7556,16 @@ static int update_sched_domains(struct notifier_block *nfb,
 		return NOTIFY_DONE;
 	}
 
+#ifndef CONFIG_CPUSETS
+	/*
+	 * Create default domain partitioning if cpusets are disabled.
+	 * Otherwise we let cpusets rebuild the domains based on the
+	 * current setup.
+	 */
+
 	/* The hotplug lock is already held by cpu_up/cpu_down */
 	arch_init_sched_domains(&cpu_online_map);
+#endif
 
 	return NOTIFY_OK;
 }
-- 
cgit v1.2.3-55-g7522


From 68f4f1ec08e3d95730a2693b99df8260aa0d06ae Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Thu, 29 May 2008 11:17:02 -0700
Subject: sched: Move cpu masks from kernel/sched.c into kernel/cpu.c

kernel/cpu.c seems a more logical place for those maps since they do not really
have much to do with the scheduler these days.

kernel/cpu.c is now built for the UP kernel too, but it does not affect the size
the kernel sections.

$ size vmlinux

before
   text       data        bss        dec        hex    filename
3313797     307060     310352    3931209     3bfc49    vmlinux

after
   text       data        bss        dec        hex    filename
3313797     307060     310352    3931209     3bfc49    vmlinux

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Cc: pj@sgi.com
Cc: menage@google.com
Cc: rostedt@goodmis.org
Cc: mingo@elte.hu
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile |  4 ++--
 kernel/cpu.c    | 24 ++++++++++++++++++++++++
 kernel/sched.c  | 18 ------------------
 3 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/Makefile b/kernel/Makefile
index ecdd2d335639..6c55301112e0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
-	    exit.o itimer.o time.o softirq.o resource.o \
+	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..b11f06dc149a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 
+/*
+ * Represents all cpu's present in the system
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new cpu's are detected in the system via any platform specific
+ * method, such as ACPI for e.g.
+ */
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+
+/*
+ * Represents all cpu's that are currently online.
+ */
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
+
+#else /* CONFIG_SMP */
+
 /* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
@@ -403,3 +425,5 @@ out:
 	cpu_maps_update_done();
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
+
+#endif /* CONFIG_SMP */
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ddb0a8c7976..f36f549e5744 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5080,24 +5080,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 	return sched_setaffinity(pid, &new_mask);
 }
 
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-
-#ifndef CONFIG_SMP
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-#endif
-
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
 {
 	struct task_struct *p;
-- 
cgit v1.2.3-55-g7522


From e958b3600484533ff801920290468adc8135f89d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 4 Jun 2008 23:22:32 +0200
Subject: sched: move weighted_cpuload into #ifdef CONFIG_SMP section

weighted_cpuload is only used on SMP. move it into the CONFIG_SMP
section.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index f36f549e5744..727bdef76161 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1637,12 +1637,6 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
-	return cpu_rq(cpu)->load.weight;
-}
-
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	set_task_rq(p, cpu);
@@ -1671,6 +1665,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 
 #ifdef CONFIG_SMP
 
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->load.weight;
+}
+
 /*
  * Is this task likely cache-hot:
  */
-- 
cgit v1.2.3-55-g7522


From 7def2be1dc679984f4c4fb3ef19a8a081b2454ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 5 Jun 2008 14:49:58 +0200
Subject: sched: fix hotplug cpus on ia64

Cliff Wickman wrote:

> I built an ia64 kernel from Andrew's tree (2.6.26-rc2-mm1)
> and get a very predictable hotplug cpu problem.
> billberry1:/tmp/cpw # ./dis
> disabled cpu 17
> enabled cpu 17
> billberry1:/tmp/cpw # ./dis
> disabled cpu 17
> enabled cpu 17
> billberry1:/tmp/cpw # ./dis
>
> The script that disables the cpu always hangs (unkillable)
> on the 3rd attempt.
>
> And a bit further:
> The kstopmachine thread always sits on the run queue (real time) for about
> 30 minutes before running.

this fix solves some (but not all) issues between CPU hotplug and
RT bandwidth throttling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  15 ++++++--
 kernel/sched_rt.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 115 insertions(+), 9 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 727bdef76161..e9c24a128655 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7513,21 +7513,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
+	int cpu = (int)(long)hcpu;
+
 	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
+		disable_runtime(cpu_rq(cpu));
+		/* fall-through */
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		detach_destroy_domains(&cpu_online_map);
 		free_sched_domains();
 		return NOTIFY_OK;
 
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
+
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
+		enable_runtime(cpu_rq(cpu));
+		/* fall-through */
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index eaa606071d51..8ae3416e0bb4 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -286,6 +286,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
 			continue;
 
 		spin_lock(&iter->rt_runtime_lock);
+		if (iter->rt_runtime == RUNTIME_INF)
+			goto next;
+
 		diff = iter->rt_runtime - iter->rt_time;
 		if (diff > 0) {
 			do_div(diff, weight);
@@ -299,12 +302,105 @@ static int balance_runtime(struct rt_rq *rt_rq)
 				break;
 			}
 		}
+next:
 		spin_unlock(&iter->rt_runtime_lock);
 	}
 	spin_unlock(&rt_b->rt_runtime_lock);
 
 	return more;
 }
+
+static void __disable_runtime(struct rq *rq)
+{
+	struct root_domain *rd = rq->rd;
+	struct rt_rq *rt_rq;
+
+	if (unlikely(!scheduler_running))
+		return;
+
+	for_each_leaf_rt_rq(rt_rq, rq) {
+		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+		s64 want;
+		int i;
+
+		spin_lock(&rt_b->rt_runtime_lock);
+		spin_lock(&rt_rq->rt_runtime_lock);
+		if (rt_rq->rt_runtime == RUNTIME_INF ||
+				rt_rq->rt_runtime == rt_b->rt_runtime)
+			goto balanced;
+		spin_unlock(&rt_rq->rt_runtime_lock);
+
+		want = rt_b->rt_runtime - rt_rq->rt_runtime;
+
+		for_each_cpu_mask(i, rd->span) {
+			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+			s64 diff;
+
+			if (iter == rt_rq)
+				continue;
+
+			spin_lock(&iter->rt_runtime_lock);
+			if (want > 0) {
+				diff = min_t(s64, iter->rt_runtime, want);
+				iter->rt_runtime -= diff;
+				want -= diff;
+			} else {
+				iter->rt_runtime -= want;
+				want -= want;
+			}
+			spin_unlock(&iter->rt_runtime_lock);
+
+			if (!want)
+				break;
+		}
+
+		spin_lock(&rt_rq->rt_runtime_lock);
+		BUG_ON(want);
+balanced:
+		rt_rq->rt_runtime = RUNTIME_INF;
+		spin_unlock(&rt_rq->rt_runtime_lock);
+		spin_unlock(&rt_b->rt_runtime_lock);
+	}
+}
+
+static void disable_runtime(struct rq *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__disable_runtime(rq);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void __enable_runtime(struct rq *rq)
+{
+	struct root_domain *rd = rq->rd;
+	struct rt_rq *rt_rq;
+
+	if (unlikely(!scheduler_running))
+		return;
+
+	for_each_leaf_rt_rq(rt_rq, rq) {
+		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+		spin_lock(&rt_b->rt_runtime_lock);
+		spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_runtime = rt_b->rt_runtime;
+		rt_rq->rt_time = 0;
+		spin_unlock(&rt_rq->rt_runtime_lock);
+		spin_unlock(&rt_b->rt_runtime_lock);
+	}
+}
+
+static void enable_runtime(struct rq *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__enable_runtime(rq);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
 #endif
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -334,14 +430,13 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 
 #ifdef CONFIG_SMP
 	if (rt_rq->rt_time > runtime) {
-		int more;
-
 		spin_unlock(&rt_rq->rt_runtime_lock);
-		more = balance_runtime(rt_rq);
+		balance_runtime(rt_rq);
 		spin_lock(&rt_rq->rt_runtime_lock);
 
-		if (more)
-			runtime = sched_rt_runtime(rt_rq);
+		runtime = sched_rt_runtime(rt_rq);
+		if (runtime == RUNTIME_INF)
+			return 0;
 	}
 #endif
 
@@ -1174,6 +1269,8 @@ static void rq_online_rt(struct rq *rq)
 	if (rq->rt.overloaded)
 		rt_set_overload(rq);
 
+	__enable_runtime(rq);
+
 	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
 }
 
@@ -1183,6 +1280,8 @@ static void rq_offline_rt(struct rq *rq)
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
 
+	__disable_runtime(rq);
+
 	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
 }
 
-- 
cgit v1.2.3-55-g7522


From 9985b0bab332289f14837eff3c6e0bcc658b58f7 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Thu, 5 Jun 2008 12:57:11 -0700
Subject: sched: prevent bound kthreads from changing cpus_allowed

Kthreads that have called kthread_bind() are bound to specific cpus, so
other tasks should not be able to change their cpus_allowed from under
them.  Otherwise, it is possible to move kthreads, such as the migration
or software watchdog threads, so they are not allowed access to the cpu
they work on.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 +
 kernel/cpuset.c       | 14 +++++++++++++-
 kernel/kthread.c      |  1 +
 kernel/sched.c        |  6 ++++++
 4 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d25acf600a32..2db1485f865d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1486,6 +1486,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_THREAD_BOUND	0x04000000	/* Thread bound to specific cpu */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6090d18b58a9..b84354f4de36 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1190,6 +1190,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 
 	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
+	if (tsk->flags & PF_THREAD_BOUND) {
+		cpumask_t mask;
+
+		mutex_lock(&callback_mutex);
+		mask = cs->cpus_allowed;
+		mutex_unlock(&callback_mutex);
+		if (!cpus_equal(tsk->cpus_allowed, mask))
+			return -EINVAL;
+	}
 
 	return security_task_setscheduler(tsk, 0, NULL);
 }
@@ -1203,11 +1212,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
+	int err;
 
 	mutex_lock(&callback_mutex);
 	guarantee_online_cpus(cs, &cpus);
-	set_cpus_allowed_ptr(tsk, &cpus);
+	err = set_cpus_allowed_ptr(tsk, &cpus);
 	mutex_unlock(&callback_mutex);
+	if (err)
+		return;
 
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..97747cdd37c9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
 	k->rt.nr_cpus_allowed = 1;
+	k->flags |= PF_THREAD_BOUND;
 }
 EXPORT_SYMBOL(kthread_bind);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e9c24a128655..164fe7fe0d89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5563,6 +5563,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 		goto out;
 	}
 
+	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+		     !cpus_equal(p->cpus_allowed, *new_mask))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 	else {
-- 
cgit v1.2.3-55-g7522


From e9886ca3a93d7d041d3de8e5acebe213da777d59 Mon Sep 17 00:00:00 2001
From: Paul Mundt
Date: Mon, 9 Jun 2008 17:12:24 +0900
Subject: sched: kill off dead cfs_rq_set_shares()

Building with CONFIG_FAIR_GROUP_SCHED=y on UP results in an unused
cfs_rq_set_shares() reference. As nothing is using this dummy function
in the first place, just kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 164fe7fe0d89..07d5472dee99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1480,16 +1480,8 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-#else /* CONFIG_SMP */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-}
 #endif
 
-#endif /* CONFIG_SMP */
-
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
-- 
cgit v1.2.3-55-g7522


From 040ec23d07f95285e9777a85cda29cb339a3065b Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 9 Jun 2008 01:45:29 -0700
Subject: sched: sched_clock() lockdep fix

Sitsofe Wheeler bisected the following commit to cause a lockdep to
warn about itself and turn itself off:

> commit c6531cce6e6e4b99bcda46b6268d6f2d9e30aea4
> Author: Ingo Molnar <mingo@elte.hu>
> Date:   Mon May 12 21:21:14 2008 +0200
>
>     sched: do not trace sched_clock

do not use raw irq flags in cpu_clock() as it causes lockdep to lose
track of the true state of the IRQ flag.

Reported-and-bisected-by: Sitsofe Wheeler <sitsofe@yahoo.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6590a828138f..b8c9fe676221 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -889,7 +889,7 @@ unsigned long long notrace cpu_clock(int cpu)
 	unsigned long long prev_cpu_time, time, delta_time;
 	unsigned long flags;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
 	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
 	delta_time = time-prev_cpu_time;
@@ -898,7 +898,7 @@ unsigned long long notrace cpu_clock(int cpu)
 		time = __sync_cpu_clock(time, cpu);
 		per_cpu(prev_cpu_time, cpu) = time;
 	}
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 
 	return time;
 }
-- 
cgit v1.2.3-55-g7522


From 20b6331bfed1f07ba1e5006889a5d64adc53615e Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Wed, 11 Jun 2008 00:58:30 +0200
Subject: sched: rework of "prioritize non-migratable tasks over migratable
 ones"

regarding this commit: 45c01e824991b2dd0a332e19efc4901acb31209f

I think we can do it simpler. Please take a look at the patch below.

Instead of having 2 separate arrays (which is + ~800 bytes on x86_32 and
twice so on x86_64), let's add "exclusive" (the ones that are bound to
this CPU) tasks to the head of the queue and "shared" ones -- to the
end.

In case of a few newly woken up "exclusive" tasks, they are 'stacked'
(not queued as now), meaning that a task {i+1} is being placed in front
of the previously woken up task {i}. But I don't think that this
behavior may cause any realistic problems.

There are a couple of changes on top of this one.

(1) in check_preempt_curr_rt()

I don't think there is a need for the "pick_next_rt_entity(rq, &rq->rt)
!= &rq->curr->rt" check.

enqueue_task_rt(p) and check_preempt_curr_rt() are always called one
after another with rq->lock being held so the following check
"p->rt.nr_cpus_allowed == 1 && rq->curr->rt.nr_cpus_allowed != 1" should
be enough (well, just its left part) to guarantee that 'p' has been
queued in front of the 'curr'.

(2) in set_cpus_allowed_rt()

I don't thinks there is a need for requeue_task_rt() here.

Perhaps, the only case when 'requeue' (+ reschedule) might be useful is
as follows:

i) weight == 1 && cpu_isset(task_cpu(p), *new_mask)

i.e. a task is being bound to this CPU);

ii) 'p' != rq->curr

but here, 'p' has already been on this CPU for a while and was not
migrated. i.e. it's possible that 'rq->curr' would not have high chances
to be migrated right at this particular moment (although, has chance in
a bit longer term), should we allow it to be preempted.

Anyway, I think we should not perhaps make it more complex trying to
address some rare corner cases. For instance, that's why a single queue
approach would be preferable. Unless I'm missing something obvious, this
approach gives us similar functionality at lower cost.

Verified only compilation-wise.

(Almost)-Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  6 ++----
 kernel/sched_rt.c | 44 +++++++++-----------------------------------
 2 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 554de4009803..cc1d558406f8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -153,8 +153,7 @@ static inline int task_has_rt_policy(struct task_struct *p)
  */
 struct rt_prio_array {
 	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
-	struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */
-	struct list_head squeue[MAX_RT_PRIO];  /* shared queue */
+	struct list_head queue[MAX_RT_PRIO];
 };
 
 struct rt_bandwidth {
@@ -7620,8 +7619,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 	array = &rt_rq->active;
 	for (i = 0; i < MAX_RT_PRIO; i++) {
-		INIT_LIST_HEAD(array->xqueue + i);
-		INIT_LIST_HEAD(array->squeue + i);
+		INIT_LIST_HEAD(array->queue + i);
 		__clear_bit(i, array->bitmap);
 	}
 	/* delimiter for bitsearch: */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8ae3416e0bb4..f721b52acd8d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -576,16 +576,15 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 	struct rt_rq *group_rq = group_rt_rq(rt_se);
+	struct list_head *queue = array->queue + rt_se_prio(rt_se);
 
 	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
 	if (rt_se->nr_cpus_allowed == 1)
-		list_add_tail(&rt_se->run_list,
-			      array->xqueue + rt_se_prio(rt_se));
+		list_add(&rt_se->run_list, queue);
 	else
-		list_add_tail(&rt_se->run_list,
-			      array->squeue + rt_se_prio(rt_se));
+		list_add_tail(&rt_se->run_list, queue);
 
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
@@ -598,8 +597,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 	struct rt_prio_array *array = &rt_rq->active;
 
 	list_del_init(&rt_se->run_list);
-	if (list_empty(array->squeue + rt_se_prio(rt_se))
-	    && list_empty(array->xqueue + rt_se_prio(rt_se)))
+	if (list_empty(array->queue + rt_se_prio(rt_se)))
 		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 
 	dec_rt_tasks(rt_se, rt_rq);
@@ -666,11 +664,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
- *
- * Note: We always enqueue the task to the shared-queue, regardless of its
- * previous position w.r.t. exclusive vs shared.  This is so that exclusive RR
- * tasks fairly round-robin with all tasks on the runqueue, not just other
- * exclusive tasks.
  */
 static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
@@ -678,7 +671,7 @@ void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 	struct rt_prio_array *array = &rt_rq->active;
 
 	list_del_init(&rt_se->run_list);
-	list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se));
+	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
 }
 
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -736,9 +729,6 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 }
 #endif /* CONFIG_SMP */
 
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
-						   struct rt_rq *rt_rq);
-
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -764,8 +754,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 	 */
 	if((p->prio == rq->curr->prio)
 	   && p->rt.nr_cpus_allowed == 1
-	   && rq->curr->rt.nr_cpus_allowed != 1
-	   && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) {
+	   && rq->curr->rt.nr_cpus_allowed != 1) {
 		cpumask_t mask;
 
 		if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
@@ -789,15 +778,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	idx = sched_find_first_bit(array->bitmap);
 	BUG_ON(idx >= MAX_RT_PRIO);
 
-	queue = array->xqueue + idx;
-	if (!list_empty(queue))
-		next = list_entry(queue->next, struct sched_rt_entity,
-				  run_list);
-	else {
-		queue = array->squeue + idx;
-		next = list_entry(queue->next, struct sched_rt_entity,
-				  run_list);
-	}
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct sched_rt_entity, run_list);
 
 	return next;
 }
@@ -867,7 +849,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 			continue;
 		if (next && next->prio < idx)
 			continue;
-		list_for_each_entry(rt_se, array->squeue + idx, run_list) {
+		list_for_each_entry(rt_se, array->queue + idx, run_list) {
 			struct task_struct *p = rt_task_of(rt_se);
 			if (pick_rt_task(rq, p, cpu)) {
 				next = p;
@@ -1249,14 +1231,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 		}
 
 		update_rt_migration(rq);
-
-		if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1))
-			/*
-			 * If either the new or old weight is a "1", we need
-			 * to requeue to properly move between shared and
-			 * exclusive queues.
-			 */
-			requeue_task_rt(rq, p);
 	}
 
 	p->cpus_allowed    = *new_mask;
-- 
cgit v1.2.3-55-g7522


From 10b612f440a22a294e87ec7e8f03f9eea3338628 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 19 Jun 2008 14:22:27 +0200
Subject: sched: rt: fix the bandwidth contraint computations

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Daniel K." <dk@uw.no>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5b307da827ef..1f711a58a2b4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8419,7 +8419,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_CGROUP_SCHED
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_group *tgi, *parent = tg ? tg->parent : NULL;
+	struct task_group *tgi, *parent = tg->parent;
 	unsigned long total = 0;
 
 	if (!parent) {
@@ -8443,7 +8443,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	}
 	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <
+	return total + to_ratio(period, runtime) <=
 		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
 				parent->rt_bandwidth.rt_runtime);
 }
@@ -8560,10 +8560,15 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
+	struct task_group *tg = &root_task_group;
+	u64 rt_runtime, rt_period;
 	int ret = 0;
 
+	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_runtime = tg->rt_bandwidth.rt_runtime;
+
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(NULL, 1, 0))
+	if (!__rt_schedulable(tg, rt_period, rt_runtime))
 		ret = -EINVAL;
 	mutex_unlock(&rt_constraints_mutex);
 
-- 
cgit v1.2.3-55-g7522


From 961ccddd59d627b89bd3dc284b6517833bbdf25d Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Mon, 23 Jun 2008 13:55:38 +1000
Subject: sched: add new API sched_setscheduler_nocheck: add a flag to control
 access checks

Hidehiro Kawai noticed that sched_setscheduler() can fail in
stop_machine: it calls sched_setscheduler() from insmod, which can
have CAP_SYS_MODULE without CAP_SYS_NICE.

Two cases could have failed, so are changed to sched_setscheduler_nocheck:
  kernel/softirq.c:cpu_callback()
	- CPU hotplug callback
  kernel/stop_machine.c:__stop_machine_run()
	- Called from various places, including modprobe()

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Cc: sugita <yumiko.sugita.yf@hitachi.com>
Cc: Satoshi OSHIMA <satoshi.oshima.fk@hitachi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 ++
 kernel/sched.c        | 48 ++++++++++++++++++++++++++++++++++++------------
 kernel/softirq.c      |  2 +-
 kernel/stop_machine.c |  2 +-
 4 files changed, 40 insertions(+), 14 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c5d3f847ca8d..fe3b9b5d7390 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1655,6 +1655,8 @@ extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
+extern int sched_setscheduler_nocheck(struct task_struct *, int,
+				      struct sched_param *);
 extern struct task_struct *idle_task(int cpu);
 extern struct task_struct *curr_task(int cpu);
 extern void set_curr_task(int cpu, struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index b048ad8a11af..8d7c246ab864 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4746,16 +4746,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	set_load_weight(p);
 }
 
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
-		       struct sched_param *param)
+static int __sched_setscheduler(struct task_struct *p, int policy,
+				struct sched_param *param, bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	unsigned long flags;
@@ -4787,7 +4779,7 @@ recheck:
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
-	if (!capable(CAP_SYS_NICE)) {
+	if (user && !capable(CAP_SYS_NICE)) {
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio;
 
@@ -4823,7 +4815,8 @@ recheck:
 	 * Do not allow realtime tasks into groups that have no runtime
 	 * assigned.
 	 */
-	if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+	if (user
+	    && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
 		return -EPERM;
 #endif
 
@@ -4872,8 +4865,39 @@ recheck:
 
 	return 0;
 }
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true);
+}
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+			       struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false);
+}
+
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..afd9120c2fc4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -645,7 +645,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 
 		p = per_cpu(ksoftirqd, hotcpu);
 		per_cpu(ksoftirqd, hotcpu) = NULL;
-		sched_setscheduler(p, SCHED_FIFO, &param);
+		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 		kthread_stop(p);
 		takeover_tasklets(hotcpu);
 		break;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b7350bbfb076..ba9b2054ecbd 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -187,7 +187,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
 		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
 		/* One high-prio thread per cpu.  We'll do this one. */
-		sched_setscheduler(p, SCHED_FIFO, &param);
+		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 		kthread_bind(p, cpu);
 		wake_up_process(p);
 		wait_for_completion(&smdata.done);
-- 
cgit v1.2.3-55-g7522


From a7be37ac8e1565e00880531f4e2aff421a21c803 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:11 +0200
Subject: sched: revert the revert of: weight calculations

Try again..

initial commit: 8f1bc385cfbab474db6c27b5af1e439614f3025c
revert: f9305d4a0968201b2818dbed0dc8cb0d4ee7aeb3

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          |   9 ++---
 kernel/sched_fair.c     | 105 +++++++++++++++++++++++++++++++++---------------
 kernel/sched_features.h |   1 +
 3 files changed, 76 insertions(+), 39 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index c51d9fae8cd8..f653af684fb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1342,6 +1342,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
+/*
+ * delta *= weight / lw
+ */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1369,12 +1372,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
-static inline unsigned long
-calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
-{
-	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
-}
-
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1fe4c65a8170..496500988ce5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 }
 #endif
 
+/*
+ * delta *= w / rw
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				se->load.weight, &cfs_rq_of(se)->load);
+	}
+
+	return delta;
+}
+
+/*
+ * delta *= rw / w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, &se->load);
+	}
+
+	return delta;
+}
+
 /*
  * The idea is to set a period in which each task runs once.
  *
@@ -362,47 +390,54 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running);
-
-	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
-
-		slice *= se->load.weight;
-		do_div(slice, cfs_rq->load.weight);
-	}
-
-
-	return slice;
+	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
-	unsigned long weight;
-	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	vslice = __sched_period(nr_running);
+	return __sched_period(nr_running);
+}
+
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+	struct load_weight lw = {
+		.weight = NICE_0_LOAD,
+		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+	};
 
 	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
+		struct load_weight *se_lw = &se->load;
 
-		weight = cfs_rq->load.weight;
-		if (!se->on_rq)
-			weight += se->load.weight;
+		if (se->load.weight < NICE_0_LOAD)
+			se_lw = &lw;
 
-		vslice *= NICE_0_LOAD;
-		do_div(vslice, weight);
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, se_lw);
 	}
 
-	return vslice;
+	return delta;
 }
 
 /*
@@ -419,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = delta_exec;
-	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-							&curr->load);
-	}
+	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
 }
 
@@ -609,8 +640,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			unsigned long thresh = sysctl_sched_latency;
+
+			/*
+			 * convert the sleeper threshold into virtual time
+			 */
+			if (sched_feat(NORMALIZED_SLEEPER))
+				thresh = calc_delta_fair(thresh, se);
+
+			vruntime -= thresh;
+		}
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1111,11 +1151,10 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
 	/*
-	 * More easily preempt - nice tasks, while not making
-	 * it harder for + nice tasks.
+	 * More easily preempt - nice tasks, while not making it harder for
+	 * + nice tasks.
 	 */
-	if (unlikely(se->load.weight > NICE_0_LOAD))
-		gran = calc_delta_fair(gran, &se->load);
+	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 62b39ca92ebd..afa549166d8d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,5 @@
 SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
+SCHED_FEAT(NORMALIZED_SLEEPER, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 SCHED_FEAT(START_DEBIT, 1)
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
-- 
cgit v1.2.3-55-g7522


From c09595f63bb1909c5dc4dca288f4fe818561b5f3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:14 +0200
Subject: sched: revert revert of: fair-group: SMP-nice for group scheduling

Try again..

Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e
Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |   1 +
 kernel/sched.c        | 430 ++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched_debug.c  |   5 +
 kernel/sched_fair.c   | 124 +++++++++------
 kernel/sched_rt.c     |   4 +
 5 files changed, 489 insertions(+), 75 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaf821072dbd..97a58b622ee1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -765,6 +765,7 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
+	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index f653af684fb3..874b6da15430 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -403,6 +403,43 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	unsigned long task_weight;
+	unsigned long shares;
+	/*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree, in order to avoid depending on dynamic memory allocation
+	 * during the load balancing we place this in the per cpu task group
+	 * hierarchy. This limits the load balancing to one instance per cpu,
+	 * but more should not be needed anyway.
+	 */
+	struct aggregate_struct {
+		/*
+		 *   load = weight(cpus) * f(tg)
+		 *
+		 * Where f(tg) is the recursive weight fraction assigned to
+		 * this group.
+		 */
+		unsigned long load;
+
+		/*
+		 * part of the group weight distributed to this span.
+		 */
+		unsigned long shares;
+
+		/*
+		 * The sum of all runqueue weights within this span.
+		 */
+		unsigned long rq_weight;
+
+		/*
+		 * Weight contributed by tasks; this is the part we can
+		 * influence by moving tasks around.
+		 */
+		unsigned long task_weight;
+	} aggregate;
+#endif
 #endif
 };
 
@@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ *         root          1 - thread
+ *         / | \         A - group
+ *        A  1  B
+ *       /|\   / \
+ *      C 2 D 3   4
+ *      |   |
+ *      5   6
+ *
+ * load:
+ *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ *    which equals 1/9-th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+			 struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long task_weight = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		task_weight += tg->cfs_rq[i]->task_weight;
+	}
+
+	aggregate(tg, sd)->rq_weight = rq_weight;
+	aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span)
+		shares += tg->cfs_rq[i]->shares;
+
+	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		int i;
+
+		load = 0;
+		for_each_cpu_mask(i, sd->span)
+			load += cpu_rq(i)->load.weight;
+
+	} else {
+		load = aggregate(tg->parent, sd)->load;
+
+		/*
+		 * shares is our weight in the parent's rq so
+		 * shares/parent->rq_weight gives our fraction of the load
+		 */
+		load *= aggregate(tg, sd)->shares;
+		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+	}
+
+	aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+			  int tcpu)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[tcpu])
+		return;
+
+	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = aggregate(tg, sd)->shares * rq_weight;
+	shares /= aggregate(tg, sd)->rq_weight + 1;
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
+
+	__set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		    int scpu, int dcpu)
+{
+	unsigned long shares;
+
+	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+	__update_group_shares_cpu(tg, sd, scpu);
+	__update_group_shares_cpu(tg, sd, dcpu);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+	if (shares)
+		tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		  int scpu, int dcpu)
+{
+	while (tg) {
+		__move_group_shares(tg, sd, scpu, dcpu);
+		tg = tg->parent;
+	}
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = aggregate(tg, sd)->shares;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, sd, i);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	aggregate_group_shares(tg, sd);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= aggregate(tg, sd)->shares;
+	if (shares) {
+		tg->cfs_rq[sd->first_cpu]->shares += shares;
+		aggregate(tg, sd)->shares += shares;
+	}
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_weight(tg, sd);
+	aggregate_group_shares(tg, sd);
+	aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+		return 0;
+
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+	cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
 #endif
 
 #include "sched_stats.h"
@@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
+	unlock_aggregate = get_aggregate(sd);
+
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3383,8 +3731,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3399,8 +3748,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (unlock_aggregate)
+		put_aggregate(sd);
+	return ld_moved;
 }
 
 /*
@@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
+			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7757,6 +8113,7 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
+	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif /* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8e077b9c91cb..04394ccac88d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+#endif
+#endif
 }
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2e197b8e43f1..183388c4dead 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -580,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
-
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
+
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		imbalance = (busiest_weight - this_weight) / 2;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-					       maxload, sd, idle, all_pinned,
-					       this_best_prio,
-					       &cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 6b4a6b5a4167..765932d0399d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 		rt_se->timeout = 0;
 
 	enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From 76a2a6ee8a0660a29127f05989ac59ae1ce865fa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:15 +0200
Subject: sched: sched_clock_cpu() based cpu_clock()

with sched_clock_cpu() being reasonably in sync between cpus (max 1 jiffy
difference) use this to provide cpu_clock().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 76 ----------------------------------------------------
 kernel/sched_clock.c | 12 +++++++++
 2 files changed, 12 insertions(+), 76 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 874b6da15430..eb3454c410fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -818,82 +818,6 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-unsigned long long time_sync_thresh = 100000;
-
-static DEFINE_PER_CPU(unsigned long long, time_offset);
-static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
-
-/*
- * Global lock which we take every now and then to synchronize
- * the CPUs time. This method is not warp-safe, but it's good
- * enough to synchronize slowly diverging time sources and thus
- * it's good enough for tracing:
- */
-static DEFINE_SPINLOCK(time_sync_lock);
-static unsigned long long prev_global_time;
-
-static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
-{
-	/*
-	 * We want this inlined, to not get tracer function calls
-	 * in this critical section:
-	 */
-	spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
-	__raw_spin_lock(&time_sync_lock.raw_lock);
-
-	if (time < prev_global_time) {
-		per_cpu(time_offset, cpu) += prev_global_time - time;
-		time = prev_global_time;
-	} else {
-		prev_global_time = time;
-	}
-
-	__raw_spin_unlock(&time_sync_lock.raw_lock);
-	spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
-
-	return time;
-}
-
-static unsigned long long __cpu_clock(int cpu)
-{
-	unsigned long long now;
-
-	/*
-	 * Only call sched_clock() if the scheduler has already been
-	 * initialized (some code might call cpu_clock() very early):
-	 */
-	if (unlikely(!scheduler_running))
-		return 0;
-
-	now = sched_clock_cpu(cpu);
-
-	return now;
-}
-
-/*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
- */
-unsigned long long cpu_clock(int cpu)
-{
-	unsigned long long prev_cpu_time, time, delta_time;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
-	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
-	delta_time = time-prev_cpu_time;
-
-	if (unlikely(delta_time > time_sync_thresh)) {
-		time = __sync_cpu_clock(time, cpu);
-		per_cpu(prev_cpu_time, cpu) = time;
-	}
-	local_irq_restore(flags);
-
-	return time;
-}
-EXPORT_SYMBOL_GPL(cpu_clock);
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ce05271219ab..3c696db59452 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -244,3 +244,15 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 {
 	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 }
+
+unsigned long long cpu_clock(int cpu)
+{
+	unsigned long long clock;
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
+	clock = sched_clock_cpu(cpu);
+	raw_local_irq_restore(flags);
+
+	return clock;
+}
-- 
cgit v1.2.3-55-g7522


From 103638d95ba5b0c53c8d9c0cb581156ccc8513ee Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:16 +0200
Subject: sched: fix wakeup granularity and buddy granularity

Uncouple buddy selection from wakeup granularity.

The initial idea was that buddies could run ahead as far as a normal task
can - do this by measuring a pair 'slice' just as we do for a normal task.

This means we can drop the wakeup_granularity back to 5ms.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  1 +
 kernel/sched_fair.c | 15 +++++++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index eb3454c410fa..7d282c52bd42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -375,6 +375,7 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
+	u64 pair_start;
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 183388c4dead..509092af0330 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -813,17 +813,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (!cfs_rq->next)
-		return se;
+	struct rq *rq = rq_of(cfs_rq);
+	u64 pair_slice = rq->clock - cfs_rq->pair_start;
 
-	if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
+	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+		cfs_rq->pair_start = rq->clock;
 		return se;
+	}
 
 	return cfs_rq->next;
 }
-- 
cgit v1.2.3-55-g7522


From b6a86c746f5b708012809958462234d19e9c8177 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:18 +0200
Subject: sched: fix sched_domain aggregation

Keeping the aggregate on the first cpu of the sched domain has two problems:
 - it could collide between different sched domains on different cpus
 - it could slow things down because of the remote accesses

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |   1 -
 kernel/sched.c        | 113 ++++++++++++++++++++++++--------------------------
 kernel/sched_fair.c   |  12 +++---
 3 files changed, 60 insertions(+), 66 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 97a58b622ee1..eaf821072dbd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -765,7 +765,6 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
-	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index 7d282c52bd42..160d3c209b8f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  */
 
 static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
+aggregate(struct task_group *tg, int cpu)
 {
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+	return &tg->cfs_rq[cpu]->aggregate;
 }
 
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
@@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
  */
 static
 void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
+			 int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, sd);
+	(*down)(parent, cpu, sd);
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1508,7 +1508,7 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, sd);
+	(*up)(parent, cpu, sd);
 
 	child = parent;
 	parent = parent->parent;
@@ -1520,8 +1520,8 @@ up:
 /*
  * Calculate the aggregate runqueue weight.
  */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long rq_weight = 0;
 	unsigned long task_weight = 0;
@@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
 		task_weight += tg->cfs_rq[i]->task_weight;
 	}
 
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
+	aggregate(tg, cpu)->rq_weight = rq_weight;
+	aggregate(tg, cpu)->task_weight = task_weight;
 }
 
 /*
  * Compute the weight of this group on the given cpus.
  */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long shares = 0;
 	int i;
@@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
 	for_each_cpu_mask(i, sd->span)
 		shares += tg->cfs_rq[i]->shares;
 
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
 		shares = tg->shares;
 
-	aggregate(tg, sd)->shares = shares;
+	aggregate(tg, cpu)->shares = shares;
 }
 
 /*
  * Compute the load fraction assigned to this group, relies on the aggregate
  * weight and this group's parent's load, i.e. top-down.
  */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long load;
 
@@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
 			load += cpu_rq(i)->load.weight;
 
 	} else {
-		load = aggregate(tg->parent, sd)->load;
+		load = aggregate(tg->parent, cpu)->load;
 
 		/*
 		 * shares is our weight in the parent's rq so
 		 * shares/parent->rq_weight gives our fraction of the load
 		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+		load *= aggregate(tg, cpu)->shares;
+		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
 	}
 
-	aggregate(tg, sd)->load = load;
+	aggregate(tg, cpu)->load = load;
 }
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
+__update_group_shares_cpu(struct task_group *tg, int cpu,
+			  struct sched_domain *sd, int tcpu)
 {
 	int boost = 0;
 	unsigned long shares;
@@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
+	shares = aggregate(tg, cpu)->shares * rq_weight;
+	shares /= aggregate(tg, cpu)->rq_weight + 1;
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
@@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
  * task went to.
  */
 static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		    int scpu, int dcpu)
 {
 	unsigned long shares;
 
 	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
 
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
+	__update_group_shares_cpu(tg, cpu, sd, scpu);
+	__update_group_shares_cpu(tg, cpu, sd, dcpu);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
@@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd,
  * we need to walk up the tree and change all shares until we hit the root.
  */
 static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
+move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		  int scpu, int dcpu)
 {
 	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
+		__move_group_shares(tg, cpu, sd, scpu, dcpu);
 		tg = tg->parent;
 	}
 }
 
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	unsigned long shares = aggregate(tg, sd)->shares;
+	unsigned long shares = aggregate(tg, cpu)->shares;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
+		__update_group_shares_cpu(tg, cpu, sd, i);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-	aggregate_group_shares(tg, sd);
+	aggregate_group_shares(tg, cpu, sd);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
 	 * above redistribution.
 	 */
-	shares -= aggregate(tg, sd)->shares;
+	shares -= aggregate(tg, cpu)->shares;
 	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
+		tg->cfs_rq[cpu]->shares += shares;
+		aggregate(tg, cpu)->shares += shares;
 	}
 }
 
@@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
  * Calculate the accumulative weight and recursive load of each task group
  * while walking down the tree.
  */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
+	aggregate_group_weight(tg, cpu, sd);
+	aggregate_group_shares(tg, cpu, sd);
+	aggregate_group_load(tg, cpu, sd);
 }
 
 /*
  * Rebalance the cpu shares while walking back up the tree.
  */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_set_shares(tg, sd);
+	aggregate_group_set_shares(tg, cpu, sd);
 }
 
 static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
@@ -1731,18 +1731,18 @@ static void __init init_aggregate(void)
 		spin_lock_init(&per_cpu(aggregate_lock, i));
 }
 
-static int get_aggregate(struct sched_domain *sd)
+static int get_aggregate(int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
 		return 0;
 
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
 	return 1;
 }
 
-static void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(int cpu, struct sched_domain *sd)
 {
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+	spin_unlock(&per_cpu(aggregate_lock, cpu));
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1756,12 +1756,12 @@ static inline void init_aggregate(void)
 {
 }
 
-static inline int get_aggregate(struct sched_domain *sd)
+static inline int get_aggregate(int cpu, struct sched_domain *sd)
 {
 	return 0;
 }
 
-static inline void put_aggregate(struct sched_domain *sd)
+static inline void put_aggregate(int cpu, struct sched_domain *sd)
 {
 }
 #endif
@@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
+	unlock_aggregate = get_aggregate(this_cpu, sd);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3678,7 +3678,7 @@ out_one_pinned:
 		ld_moved = 0;
 out:
 	if (unlock_aggregate)
-		put_aggregate(sd);
+		put_aggregate(this_cpu, sd);
 	return ld_moved;
 }
 
@@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
-			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 509092af0330..40cf24ab4de8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		/*
 		 * empty group
 		 */
-		if (!aggregate(tg, sd)->task_weight)
+		if (!aggregate(tg, this_cpu)->task_weight)
 			continue;
 
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
+		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
+		rem_load /= aggregate(tg, this_cpu)->load + 1;
 
 		this_weight = tg->cfs_rq[this_cpu]->task_weight;
 		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
@@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		move_group_shares(tg, sd, busiest_cpu, this_cpu);
+		move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		moved_load *= aggregate(tg, this_cpu)->load;
+		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3-55-g7522


From 4d8d595dfa69e1c807bf928f364668a7f30da5dc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:19 +0200
Subject: sched: update aggregate when holding the RQs

It was observed that in __update_group_shares_cpu()

  rq_weight > aggregate()->rq_weight

This is caused by forks/wakeups in between the initial aggregate pass and
locking of the RQs for load balance. To avoid this situation partially re-do
the aggregation once we have the RQs locked (which avoids new tasks from
appearing).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 160d3c209b8f..dae20199dc9c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1721,6 +1721,11 @@ aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 	aggregate_group_set_shares(tg, cpu, sd);
 }
 
+static void
+aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
+{
+}
+
 static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
 
 static void __init init_aggregate(void)
@@ -1740,6 +1745,11 @@ static int get_aggregate(int cpu, struct sched_domain *sd)
 	return 1;
 }
 
+static void update_aggregate(int cpu, struct sched_domain *sd)
+{
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+}
+
 static void put_aggregate(int cpu, struct sched_domain *sd)
 {
 	spin_unlock(&per_cpu(aggregate_lock, cpu));
@@ -1761,6 +1771,10 @@ static inline int get_aggregate(int cpu, struct sched_domain *sd)
 	return 0;
 }
 
+static inline void update_aggregate(int cpu, struct sched_domain *sd)
+{
+}
+
 static inline void put_aggregate(int cpu, struct sched_domain *sd)
 {
 }
@@ -2192,6 +2206,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
+	/*
+	 * now that we have both rqs locked the rq weight won't change
+	 * anymore - so update the stats.
+	 */
+	update_aggregate(this_cpu, sd);
+
 	do {
 		unsigned long load, avg_load;
 		int local_group;
-- 
cgit v1.2.3-55-g7522


From d3f40dbab954d83383b6a516582d5c09cc216dcc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:21 +0200
Subject: sched: dont micro manage share losses

We used to try and contain the loss of 'shares' by playing arithmetic
games. Replace that by noticing that at the top sched_domain we'll
always have the full weight in shares to distribute.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index dae20199dc9c..28229c5d4983 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1551,6 +1551,9 @@ aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
 		shares = tg->shares;
 
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
+
 	aggregate(tg, cpu)->shares = shares;
 }
 
@@ -1642,20 +1645,8 @@ static void
 __move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		    int scpu, int dcpu)
 {
-	unsigned long shares;
-
-	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
 	__update_group_shares_cpu(tg, cpu, sd, scpu);
 	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-	if (shares)
-		tg->cfs_rq[dcpu]->shares += shares;
 }
 
 /*
@@ -1675,7 +1666,6 @@ move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 static void
 aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	unsigned long shares = aggregate(tg, cpu)->shares;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1688,16 +1678,6 @@ aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *
 	}
 
 	aggregate_group_shares(tg, cpu, sd);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= aggregate(tg, cpu)->shares;
-	if (shares) {
-		tg->cfs_rq[cpu]->shares += shares;
-		aggregate(tg, cpu)->shares += shares;
-	}
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From a25b5aca8740ea99d5e18dfc71235a52b685dcf7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:22 +0200
Subject: sched: no need to aggregate task_weight

We only need to know the task_weight of the busiest rq - nothing to do
if there are no tasks there.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 16 +---------------
 kernel/sched_fair.c |  2 +-
 2 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 28229c5d4983..716cfc8e099e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -433,12 +433,6 @@ struct cfs_rq {
 		 * The sum of all runqueue weights within this span.
 		 */
 		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
 	} aggregate;
 #endif
 #endif
@@ -1473,10 +1467,6 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  * rq_weight:
  *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
  *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
  */
 
 static inline struct aggregate_struct *
@@ -1524,16 +1514,12 @@ static void
 aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
 	int i;
 
-	for_each_cpu_mask(i, sd->span) {
+	for_each_cpu_mask(i, sd->span)
 		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
 
 	aggregate(tg, cpu)->rq_weight = rq_weight;
-	aggregate(tg, cpu)->task_weight = task_weight;
 }
 
 /*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b10c0d61a2a9..03b9fbd9d648 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1427,7 +1427,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		/*
 		 * empty group
 		 */
-		if (!aggregate(tg, this_cpu)->task_weight)
+		if (!tg->cfs_rq[busiest_cpu]->task_weight)
 			continue;
 
 		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
-- 
cgit v1.2.3-55-g7522


From c8cba857b4997d5b00451d01474638f6a153f713 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:23 +0200
Subject: sched: simplify the group load balancer

While thinking about the previous patch - I realized that using per domain
aggregate load values in load_balance_fair() is wrong. We should use the
load value for that CPU.

By not needing per domain hierarchical load values we don't need to store
per domain aggregate shares, which greatly simplifies all the math.

It basically falls apart in two separate computations:
 - per domain update of the shares
 - per CPU update of the hierarchical load

Also get rid of the move_group_shares() stuff - just re-compute the shares
again after a successful load balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 286 ++++++++++++----------------------------------------
 kernel/sched_fair.c |  15 +--
 2 files changed, 72 insertions(+), 229 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 716cfc8e099e..f864b751fd19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
 	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
+	 * the part of load.weight contributed by tasks
 	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
+	unsigned long task_weight;
 
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
 
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
 #endif
 #endif
 };
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 int cpu, struct sched_domain *sd)
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
@@ -1507,72 +1463,6 @@ up:
 	rcu_read_unlock();
 }
 
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	aggregate(tg, cpu)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-
-	aggregate(tg, cpu)->load = load;
-}
-
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
 
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
 
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
  */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		    int scpu, int dcpu)
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-}
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
 
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
 	}
-}
 
-static void
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	int i;
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-
-	aggregate_group_shares(tg, cpu, sd);
 }
 
 /*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
  */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+	unsigned long load;
 
-static void __init init_aggregate(void)
-{
-	int i;
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
 
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
+	tg->cfs_rq[cpu]->h_load = load;
 }
 
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
 
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #else
 
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
+static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
-
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(this_cpu, sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3663,8 +3506,8 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(this_cpu, sd);
+	if (ld_moved)
+		update_shares(sd);
 	return ld_moved;
 }
 
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 03b9fbd9d648..7b8d664d6f22 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1421,17 +1421,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct task_group *tg;
 
 	rcu_read_lock();
+	update_h_load(busiest_cpu);
+
 	list_for_each_entry(tg, &task_groups, list) {
+		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		long rem_load, moved_load;
 
 		/*
 		 * empty group
 		 */
-		if (!tg->cfs_rq[busiest_cpu]->task_weight)
+		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
-		rem_load /= aggregate(tg, this_cpu)->load + 1;
+		rem_load = rem_load_move * busiest_cfs_rq->load.weight;
+		rem_load /= busiest_cfs_rq->h_load + 1;
 
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1440,10 +1443,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);
-
-		moved_load *= aggregate(tg, this_cpu)->load;
-		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
+		moved_load *= busiest_cfs_rq->h_load;
+		moved_load /= busiest_cfs_rq->load.weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3-55-g7522


From 3e5459b4bea3ca2618cc02d56d12639f2cba531d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:24 +0200
Subject: sched: fix newidle smp group balancing

Re-compute the shares on newidle - so we can make a decision based on
recent data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index f864b751fd19..cdd09462fc98 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1579,6 +1579,13 @@ static void update_shares(struct sched_domain *sd)
 	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
+static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+	spin_unlock(&rq->lock);
+	update_shares(sd);
+	spin_lock(&rq->lock);
+}
+
 static void update_h_load(int cpu)
 {
 	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
@@ -1595,6 +1602,10 @@ static inline void update_shares(struct sched_domain *sd)
 {
 }
 
+static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+}
+
 #endif
 
 #endif
@@ -3543,6 +3554,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
 
 	schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
 redo:
+	update_shares_locked(this_rq, sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
 				   &sd_idle, cpus, NULL);
 	if (!group) {
@@ -3586,6 +3598,7 @@ redo:
 	} else
 		sd->nr_balance_failed = 0;
 
+	update_shares_locked(this_rq, sd);
 	return ld_moved;
 
 out_balanced:
-- 
cgit v1.2.3-55-g7522


From 039a1c41b3a489e34593ea1e1687f6fdad6b13ab Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:25 +0200
Subject: sched: fix sched_balance_self() smp group balancing

Finding the least idle cpu is more accurate when done with updated shares.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index cdd09462fc98..39d5495540d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2128,6 +2128,9 @@ static int sched_balance_self(int cpu, int flag)
 			sd = tmp;
 	}
 
+	if (sd)
+		update_shares(sd);
+
 	while (sd) {
 		cpumask_t span, tmpmask;
 		struct sched_group *group;
-- 
cgit v1.2.3-55-g7522


From a8a51d5e59561aa5b4d66e19eca819b537783e8f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:26 +0200
Subject: sched: persistent average load per task

Remove the fall-back to SCHED_LOAD_SCALE by remembering the previous value of
cpu_avg_load_per_task() - this is useful because of the hierarchical group
model in which task weight can be much smaller.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 39d5495540d2..6a6b0139eb32 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -554,6 +554,8 @@ struct rq {
 	int cpu;
 	int online;
 
+	unsigned long avg_load_per_task;
+
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
 #endif
@@ -1427,9 +1429,18 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
@@ -2010,18 +2021,6 @@ static unsigned long target_load(int cpu, int type)
 	return max(rq->cpu_load[type-1], total);
 }
 
-/*
- * Return the average load per task on the cpu's run queue
- */
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
-	unsigned long n = rq->nr_running;
-
-	return n ? total / n : SCHED_LOAD_SCALE;
-}
-
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
-- 
cgit v1.2.3-55-g7522


From 408ed066b11cf9ee4536573b4269ee3613bd735e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:28 +0200
Subject: sched: hierarchical load vs find_busiest_group

find_busiest_group() has some assumptions about task weight being in the
NICE_0_LOAD range. Hierarchical task groups break this assumption - fix this
by replacing it with the average task weight, which will adapt the situation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6a6b0139eb32..5e2aa394a812 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3050,6 +3050,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
+
 	if (idle == CPU_NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == CPU_NEWLY_IDLE)
@@ -3064,6 +3065,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
+		unsigned long sum_avg_load_per_task;
+		unsigned long avg_load_per_task;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -3072,6 +3075,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
+		sum_avg_load_per_task = avg_load_per_task = 0;
+
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
@@ -3105,6 +3110,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			avg_load += load;
 			sum_nr_running += rq->nr_running;
 			sum_weighted_load += weighted_cpuload(i);
+
+			sum_avg_load_per_task += cpu_avg_load_per_task(i);
 		}
 
 		/*
@@ -3126,7 +3133,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = sg_div_cpu_power(group,
 				avg_load * SCHED_LOAD_SCALE);
 
-		if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+
+		/*
+		 * Consider the group unbalanced when the imbalance is larger
+		 * than the average weight of two tasks.
+		 *
+		 * APZ: with cgroup the avg task weight can vary wildly and
+		 *      might not be a suitable number - should we keep a
+		 *      normalized nr_running number somewhere that negates
+		 *      the hierarchy?
+		 */
+		avg_load_per_task = sg_div_cpu_power(group,
+				sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 			__group_imb = 1;
 
 		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@ -3267,9 +3287,9 @@ small_imbalance:
 			if (busiest_load_per_task > this_load_per_task)
 				imbn = 1;
 		} else
-			this_load_per_task = SCHED_LOAD_SCALE;
+			this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-		if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+		if (max_load - this_load + 2*busiest_load_per_task >=
 					busiest_load_per_task * imbn) {
 			*imbalance = busiest_load_per_task;
 			return busiest;
-- 
cgit v1.2.3-55-g7522


From 051c67640e771fd6ad1b676fc0c16c379b3c6f80 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:31 +0200
Subject: sched: remove prio preference from balance decisions

Priority looses much of its meaning in a hierarchical context. So don't
use it in balance decisions.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5e2aa394a812..10d43f5bf0fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2896,7 +2896,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct rq_iterator *iterator)
 {
-	int loops = 0, pulled = 0, pinned = 0, skip_for_load;
+	int loops = 0, pulled = 0, pinned = 0;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
 
@@ -2912,14 +2912,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 next:
 	if (!p || loops++ > sysctl_sched_nr_migrate)
 		goto out;
-	/*
-	 * To help distribute high priority tasks across CPUs we don't
-	 * skip a task if it will be the highest priority task (i.e. smallest
-	 * prio value) on its new queue regardless of its load weight
-	 */
-	skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
-							 SCHED_LOAD_SCALE_FUZZ;
-	if ((skip_for_load && p->prio >= *this_best_prio) ||
+
+	if ((p->se.load.weight >> 1) > rem_load_move ||
 	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
 		p = iterator->next(iterator->arg);
 		goto next;
-- 
cgit v1.2.3-55-g7522


From 93b75217df39e6d75889cc6f8050343286aff4a5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:33 +0200
Subject: sched: disable source/target_load bias

The bias given by source/target_load functions can be very large, disable
it by default to get faster convergence.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 4 ++--
 kernel/sched_features.h | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 10d43f5bf0fc..6c5eb3bc37e0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2000,7 +2000,7 @@ static unsigned long source_load(int cpu, int type)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 
-	if (type == 0)
+	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
 
 	return min(rq->cpu_load[type-1], total);
@@ -2015,7 +2015,7 @@ static unsigned long target_load(int cpu, int type)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 
-	if (type == 0)
+	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
 
 	return max(rq->cpu_load[type-1], total);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 04123af2e678..d56e3053e746 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,3 +8,4 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
+SCHED_FEAT(LB_BIAS, 0)
\ No newline at end of file
-- 
cgit v1.2.3-55-g7522


From cd80917e4ff465ea77106f8e4fb631eedc4cf426 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:34 +0200
Subject: sched: fix shares boost logic

In case the domain is empty, pretend there is a single task on each cpu, so
that together with the boost logic we end up giving 1/n shares to each
cpu.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6c5eb3bc37e0..1cff969f6646 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1549,6 +1549,9 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
+	if (!rq_weight)
+		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
+
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
-- 
cgit v1.2.3-55-g7522


From 2398f2c6d34b43025f274fc42eaca34d23ec2320 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:35 +0200
Subject: sched: update shares on wakeup

We found that the affine wakeup code needs rather accurate load figures
to be effective. The trouble is that updating the load figures is fairly
expensive with group scheduling. Therefore ratelimit the updating.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  3 +++
 kernel/sched.c          | 30 +++++++++++++++++++++++++++++-
 kernel/sched_features.h |  3 ++-
 kernel/sysctl.c         |  8 ++++++++
 4 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaf821072dbd..835b6c6fcc56 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -783,6 +783,8 @@ struct sched_domain {
 	unsigned int balance_interval;	/* initialise to 1. units in ms. */
 	unsigned int nr_balance_failed; /* initialise to 0 */
 
+	u64 last_update;
+
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -1605,6 +1607,7 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_shares_ratelimit;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index 1cff969f6646..62db0891025a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -777,6 +777,12 @@ late_initcall(sched_init_debug);
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
+/*
+ * ratelimit for updating the group shares.
+ * default: 0.5ms
+ */
+const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
@@ -1590,7 +1596,13 @@ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 
 static void update_shares(struct sched_domain *sd)
 {
-	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+	u64 now = cpu_clock(raw_smp_processor_id());
+	s64 elapsed = now - sd->last_update;
+
+	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+		sd->last_update = now;
+		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+	}
 }
 
 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
@@ -2199,6 +2211,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
+#ifdef CONFIG_SMP
+	if (sched_feat(LB_WAKEUP_UPDATE)) {
+		struct sched_domain *sd;
+
+		this_cpu = raw_smp_processor_id();
+		cpu = task_cpu(p);
+
+		for_each_domain(this_cpu, sd) {
+			if (cpu_isset(cpu, sd->span)) {
+				update_shares(sd);
+				break;
+			}
+		}
+	}
+#endif
+
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d56e3053e746..7d616d2a2a3f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,4 +8,5 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
-SCHED_FEAT(LB_BIAS, 0)
\ No newline at end of file
+SCHED_FEAT(LB_BIAS, 0)
+SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..fe8cdc80ff02 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_shares_ratelimit",
+		.data		= &sysctl_sched_shares_ratelimit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_child_runs_first",
-- 
cgit v1.2.3-55-g7522


From 83378269a5fad98f562ebc0f09c349575e6cbfe1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:37 +0200
Subject: sched: correct wakeup weight calculations

rw_i = {2, 4, 1, 0}
s_i = {2/7, 4/7, 1/7, 0}

wakeup on cpu0, weight=1

rw'_i = {3, 4, 1, 0}
s'_i = {3/8, 4/8, 1/8, 0}

s_0 = S * rw_0 / \Sum rw_j ->
  \Sum rw_j = S*rw_0/s_0 = 1*2*7/2 = 7 (correct)

s'_0 = S * (rw_0 + 1) / (\Sum rw_j + 1) =
       1 * (2+1) / (7+1) = 3/8 (correct

so we find that adding 1 to cpu0 gains 5/56 in weight
if say the other cpu were, cpu1, we'd also have to calculate its 4/56 loss

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  4 ++++
 kernel/sched_fair.c | 48 ++++++++++++++++++++++++++----------------------
 2 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 62db0891025a..01d3e51b7116 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 #else
 
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	return NULL;
+}
 
 #endif	/* CONFIG_GROUP_SCHED */
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 26ebe180cdea..bed2f71e63d9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,10 +1074,10 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
+static unsigned long effective_load(struct task_group *tg, int cpu,
+		unsigned long wl, unsigned long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	long wg = wl;
 
 	for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
@@ -1092,6 +1092,13 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 		b = S*rw + s*wg;
 
 		wl = s*(a-b)/D(b);
+		/*
+		 * Assume the group is already running and will
+		 * thus already be accounted for in the weight.
+		 *
+		 * That is, moving shares between CPUs, does not
+		 * alter the group weight.
+		 */
 		wg = 0;
 #undef D
 	}
@@ -1099,26 +1106,12 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 	return wl;
 }
 
-static unsigned long task_load_sub(struct task_struct *p)
-{
-	return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p));
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
-	return effective_load(task_group(p), p->se.load.weight, cpu);
-}
-
 #else
 
-static unsigned long task_load_sub(struct task_struct *p)
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+		unsigned long wl, unsigned long wg)
 {
-	return -p->se.load.weight;
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
-	return p->se.load.weight;
+	return wl;
 }
 
 #endif
@@ -1130,8 +1123,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	    unsigned int imbalance)
 {
 	struct task_struct *curr = this_rq->curr;
+	struct task_group *tg;
 	unsigned long tl = this_load;
 	unsigned long tl_per_task;
+	unsigned long weight;
 	int balanced;
 
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1142,10 +1137,19 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
-	if (sync)
-		tl += task_load_sub(current);
+	if (sync) {
+		tg = task_group(current);
+		weight = current->se.load.weight;
+
+		tl += effective_load(tg, this_cpu, -weight, -weight);
+		load += effective_load(tg, prev_cpu, 0, -weight);
+	}
+
+	tg = task_group(p);
+	weight = p->se.load.weight;
 
-	balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load;
+	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
 	 * If the currently running task will sleep within
-- 
cgit v1.2.3-55-g7522


From f1d239f73200a5803a89e5929fb3abc1596b7589 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 27 Jun 2008 13:41:38 +0200
Subject: sched: incremental effective_load()

Increase the accuracy of the effective_load values.

Not only consider the current increment (as per the attempted wakeup), but
also consider the delta between when we last adjusted the shares and the
current situation.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  6 ++++++
 kernel/sched_fair.c | 18 +++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 01d3e51b7116..7613f69f0978 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -427,6 +427,11 @@ struct cfs_rq {
 	 * this cpu's part of tg->shares
 	 */
 	unsigned long shares;
+
+	/*
+	 * load.weight at the time we set shares
+	 */
+	unsigned long rq_weight;
 #endif
 #endif
 };
@@ -1527,6 +1532,7 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 	 * record the actual number of shares, not the boosted amount.
 	 */
 	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bed2f71e63d9..e87f1a52f625 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,10 +1074,22 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, int cpu,
-		unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu,
+		long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
+	long more_w;
+
+	if (!tg->parent)
+		return wl;
+
+	/*
+	 * Instead of using this increment, also add the difference
+	 * between when the shares were last updated and now.
+	 */
+	more_w = se->my_q->load.weight - se->my_q->rq_weight;
+	wl += more_w;
+	wg += more_w;
 
 	for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
@@ -1086,7 +1098,7 @@ static unsigned long effective_load(struct task_group *tg, int cpu,
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
-		rw = se->my_q->load.weight;
+		rw = se->my_q->rq_weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
-- 
cgit v1.2.3-55-g7522


From 79c537998d143b127c8c662a403c3356cb885f1c Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Sun, 29 Jun 2008 00:16:56 +0200
Subject: sched: fix cpu hotplug

the CPU hotplug problems (crashes under high-volume unplug+replug
tests) seem to be related to migrate_dead_tasks().

Firstly I added traces to see all tasks being migrated with
migrate_live_tasks() and migrate_dead_tasks(). On my setup the problem
pops up (the one with "se == NULL" in the loop of
pick_next_task_fair()) shortly after the traces indicate that some has
been migrated with migrate_dead_tasks()). btw., I can reproduce it
much faster now with just a plain cpu down/up loop.

[disclaimer] Well, unless I'm really missing something important in
this late hour [/desclaimer] pick_next_task() is not something
appropriate for migrate_dead_tasks() :-)

the following change seems to eliminate the problem on my setup
(although, I kept it running only for a few minutes to get a few
messages indicating migrate_dead_tasks() does move tasks and the
system is still ok)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3aaa5c8cb421..a66e85639de2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5887,6 +5887,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 		next = pick_next_task(rq, rq->curr);
 		if (!next)
 			break;
+		next->sched_class->put_prev_task(rq, next);
 		migrate_dead(dead_cpu, next);
 
 	}
-- 
cgit v1.2.3-55-g7522


From 34e83e850f5e5ee2a18cd77a5d70d31972a632e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 27 Jun 2008 15:42:36 +0200
Subject: sched: build fix

fix:

kernel/sched.c: In function ‘sched_group_set_shares':
kernel/sched.c:8635: error: implicit declaration of function ‘cfs_rq_set_shares'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7613f69f0978..058250a63b64 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1627,11 +1627,6 @@ static void update_h_load(int cpu)
 	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-	cfs_rq->shares = shares;
-}
-
 #else
 
 static inline void update_shares(struct sched_domain *sd)
@@ -1646,6 +1641,13 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+	cfs_rq->shares = shares;
+#endif
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
-- 
cgit v1.2.3-55-g7522


From 30432094a7f506ad24997a3ba6aed913ab61c01d Mon Sep 17 00:00:00 2001
From: Vegard Nossum
Date: Fri, 27 Jun 2008 21:35:50 +0200
Subject: sched: fix warning

This patch fixes the following warning:

kernel/sched.c:1667: warning: 'cfs_rq_set_shares' defined but not used

This seems the correct way to fix this; cfs_rq_set_shares() is only used
in a single place, which is also inside #ifdef CONFIG_FAIR_GROUP_SCHED.

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 058250a63b64..677c80b9a6b5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1641,12 +1641,14 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 {
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 	cfs_rq->shares = shares;
 #endif
 }
+#endif
 
 #include "sched_stats.h"
 #include "sched_idletask.c"
-- 
cgit v1.2.3-55-g7522


From 619b0488038224391e64fa03854651ca0f5efe56 Mon Sep 17 00:00:00 2001
From: Raistlin
Date: Thu, 26 Jun 2008 18:54:09 +0200
Subject: sched: fix divide error when trying to configure rt_period to zero

Here it is another little Oops we found while configuring invalid values
via cgroups:

echo 0 > /dev/cgroups/0/cpu.rt_period_us
or
echo 4294967296 > /dev/cgroups/0/cpu.rt_period_us

[  205.509825] divide error: 0000 [#1]
[  205.510151] Modules linked in:
[  205.510151]
[  205.510151] Pid: 2339, comm: bash Not tainted (2.6.26-rc8 #33)
[  205.510151] EIP: 0060:[<c030c6ef>] EFLAGS: 00000293 CPU: 0
[  205.510151] EIP is at div64_u64+0x5f/0x70
[  205.510151] EAX: 0000389f EBX: 00000000 ECX: 00000000 EDX: 00000000
[  205.510151] ESI: d9800000 EDI: 00000000 EBP: c6cede60 ESP: c6cede50
[  205.510151]  DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
[  205.510151] Process bash (pid: 2339, ti=c6cec000 task=c79be370 task.ti=c6cec000)
[  205.510151] Stack: d9800000 0000389f c05971a0 d9800000 c6cedeb4 c0214dbd 00000000 00000000
[  205.510151]        c6cede88 c0242bd8 c05377c0 c7a41b40 00000000 00000000 00000000 c05971a0
[  205.510151]        c780ed20 c7508494 c7a41b40 00000000 00000002 c6cedebc c05971a0 ffffffea
[  205.510151] Call Trace:
[  205.510151]  [<c0214dbd>] ? __rt_schedulable+0x1cd/0x240
[  205.510151]  [<c0242bd8>] ? cgroup_file_open+0x18/0xe0
[  205.510151]  [<c0214fe4>] ? tg_set_bandwidth+0xa4/0xf0
[  205.510151]  [<c0215066>] ? sched_group_set_rt_period+0x36/0x50
[  205.510151]  [<c021508e>] ? cpu_rt_period_write_uint+0xe/0x10
[  205.510151]  [<c0242dc5>] ? cgroup_file_write+0x125/0x160
[  205.510151]  [<c0232c15>] ? hrtimer_interrupt+0x155/0x190
[  205.510151]  [<c02f047f>] ? security_file_permission+0xf/0x20
[  205.510151]  [<c0277ad8>] ? rw_verify_area+0x48/0xc0
[  205.510151]  [<c0283744>] ? dupfd+0x104/0x130
[  205.510151]  [<c027838c>] ? vfs_write+0x9c/0x160
[  205.510151]  [<c0242ca0>] ? cgroup_file_write+0x0/0x160
[  205.510151]  [<c027850d>] ? sys_write+0x3d/0x70
[  205.510151]  [<c0203019>] ? sysenter_past_esp+0x6a/0x91
[  205.510151]  =======================
[  205.510151] Code: 0f 45 de 31 f6 0f ad d0 d3 ea f6 c1 20 0f 45 c2 0f 45 d6 89 45 f0 89 55 f4 8b 55 f4 31 c9 8b 45 f0 39 d3 89 c6 77 08 89 d0 31 d2 <f7> f3 89 c1 83 c4 08 89 f0 f7 f3 89 ca 5b 5e 5d c3 55 89 e5 56
[  205.510151] EIP: [<c030c6ef>] div64_u64+0x5f/0x70 SS:ESP 0068:c6cede50

The attached patch solves the issue for me.

I'm checking as soon as possible for the period not being zero since, if
it is, going ahead is useless. This way we also save a mutex_lock() and
a read_lock() wrt doing it inside tg_set_bandwidth() or
__rt_schedulable().

Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <trimarchimichael@yahoo.it>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index a66e85639de2..94ead43eda62 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8502,6 +8502,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+	if (rt_period == 0)
+		return -EINVAL;
+
 	return tg_set_bandwidth(tg, rt_period, rt_runtime);
 }
 
-- 
cgit v1.2.3-55-g7522


From c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 Mon Sep 17 00:00:00 2001
From: Gregory Haskins
Date: Fri, 27 Jun 2008 14:29:55 -0600
Subject: sched: terminate newidle balancing once at least one task has moved
 over

Inspired by Peter Zijlstra.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Cc: npiggin@suse.de
Cc: rostedt@goodmis.org
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 677c80b9a6b5..d99aeabeb72f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3013,6 +3013,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 				max_load_move - total_load_moved,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
+
+		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
+			break;
+
 	} while (class && max_load_move > total_load_moved);
 
 	return total_load_moved > 0;
-- 
cgit v1.2.3-55-g7522


From 2087a1ad822cd3a68b73338457047fcc54da726b Mon Sep 17 00:00:00 2001
From: Gregory Haskins
Date: Fri, 27 Jun 2008 14:30:00 -0600
Subject: sched: add avg-overlap support to RT tasks

We have the notion of tracking process-coupling (a.k.a. buddy-wake) via
the p->se.last_wake / p->se.avg_overlap facilities, but it is only used
for cfs to cfs interactions.  There is no reason why an rt to cfs
interaction cannot share in establishing a relationhip in a similar
manner.

Because PREEMPT_RT runs many kernel threads as FIFO priority, we often
times have heavy interaction between RT threads waking CFS applications.
This patch offers a substantial boost (50-60%+) in perfomance under those
circumstances.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Cc: npiggin@suse.de
Cc: rostedt@goodmis.org
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 14 ++++++++++++++
 kernel/sched_fair.c | 21 ++-------------------
 2 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index d99aeabeb72f..bbc40c3a0657 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1693,6 +1693,12 @@ static void set_load_weight(struct task_struct *p)
 	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	sched_info_queued(p);
@@ -1702,6 +1708,12 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
+	if (sleep && p->se.last_wakeup) {
+		update_avg(&p->se.avg_overlap,
+			   p->se.sum_exec_runtime - p->se.last_wakeup);
+		p->se.last_wakeup = 0;
+	}
+
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
 }
@@ -2313,6 +2325,8 @@ out_running:
 		p->sched_class->task_wake_up(rq, p);
 #endif
 out:
+	current->se.last_wakeup = current->se.sum_exec_runtime;
+
 	task_rq_unlock(rq, &flags);
 
 	return success;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2e43d4a748c3..f2aa987027d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -726,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 		__enqueue_entity(cfs_rq, se);
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-	s64 diff = sample - *avg;
-	*avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	if (!se->last_wakeup)
-		return;
-
-	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-	se->last_wakeup = 0;
-}
-
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -751,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
-		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -1196,9 +1180,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && balanced && curr->sched_class == &fair_sched_class) {
+	if (sync && balanced) {
 		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-				p->se.avg_overlap < sysctl_sched_migration_cost)
+		    p->se.avg_overlap < sysctl_sched_migration_cost)
 			return 1;
 	}
 
@@ -1359,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		return;
 	}
 
-	se->last_wakeup = se->sum_exec_runtime;
 	if (unlikely(se == pse))
 		return;
 
-- 
cgit v1.2.3-55-g7522


From 46ac22bab42cc868b9c1d0e915ddbc8e8065a44d Mon Sep 17 00:00:00 2001
From: Ankita Garg
Date: Tue, 1 Jul 2008 14:30:06 +0530
Subject: sched: fix accounting in task delay accounting & migration

On Thu, Jun 19, 2008 at 12:27:14PM +0200, Peter Zijlstra wrote:
> On Thu, 2008-06-05 at 10:50 +0530, Ankita Garg wrote:
>
> > Thanks Peter for the explanation...
> >
> > I agree with the above and that is the reason why I did not see weird
> > values with cpu_time. But, run_delay still would suffer skews as the end
> > points for delta could be taken on different cpus due to migration (more
> > so on RT kernel due to the push-pull operations). With the below patch,
> > I could not reproduce the issue I had seen earlier. After every dequeue,
> > we take the delta and start wait measurements from zero when moved to a
> > different rq.
>
> OK, so task delay delay accounting is broken because it doesn't take
> migration into account.
>
> What you've done is make it symmetric wrt enqueue, and account it like
>
>   cpu0      cpu1
>
> enqueue
>  <wait-d1>
> dequeue
>             enqueue
>              <wait-d2>
>             run
>
> Where you add both d1 and d2 to the run_delay,.. right?
>

Thanks for reviewing the patch. The above is exactly what I have done.

> This seems like a good fix, however it looks like the patch will break
> compilation in !CONFIG_SCHEDSTATS && !CONFIG_TASK_DELAY_ACCT, of it
> failing to provide a stub for sched_info_dequeue() in that case.

Fixed. Pl. find the new patch below.

Signed-off-by: Ankita Garg <ankita@in.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Gregory Haskins <ghaskins@novell.com>
Cc: rostedt@goodmis.org
Cc: suresh.b.siddha@intel.com
Cc: aneesh.kumar@linux.vnet.ibm.com
Cc: dhaval@linux.vnet.ibm.com
Cc: vatsa@linux.vnet.ibm.com
Cc: David Bahi <DBahi@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       |  1 +
 kernel/sched_stats.h | 42 +++++++++++++++++++++++++++++++++---------
 2 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index bbc40c3a0657..996bc15196a5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1714,6 +1714,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 		p->se.last_wakeup = 0;
 	}
 
+	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
 }
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 80179ef7450e..8385d43987e2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 	if (rq)
 		rq->rq_sched_info.cpu_time += delta;
 }
+
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+	if (rq)
+		rq->rq_sched_info.run_delay += delta;
+}
 # define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
 # define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
 # define schedstat_set(var, val)	do { var = (val); } while (0)
@@ -126,6 +133,9 @@ static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
 {}
 static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{}
+static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
 # define schedstat_inc(rq, field)	do { } while (0)
@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 #endif
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+	t->sched_info.last_queued = 0;
+}
+
 /*
  * Called when a process is dequeued from the active array and given
  * the cpu.  We should note that with the exception of interactive
@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
  * active queue, thus delaying tasks in the expired queue from running;
  * see scheduler_tick()).
  *
- * This function is only called from sched_info_arrive(), rather than
- * dequeue_task(). Even though a task may be queued and dequeued multiple
- * times as it is shuffled about, we're really interested in knowing how
- * long it was from the *first* time it was queued to the time that it
- * finally hit a cpu.
+ * Though we are interested in knowing how long it was from the *first* time a
+ * task was queued to the time that it finally hit a cpu, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across cpus. The
+ * delta taken on each cpu would annul the skew.
  */
 static inline void sched_info_dequeued(struct task_struct *t)
 {
-	t->sched_info.last_queued = 0;
+	unsigned long long now = task_rq(t)->clock, delta = 0;
+
+	if (unlikely(sched_info_on()))
+		if (t->sched_info.last_queued)
+			delta = now - t->sched_info.last_queued;
+	sched_info_reset_dequeued(t);
+	t->sched_info.run_delay += delta;
+
+	rq_sched_info_dequeued(task_rq(t), delta);
 }
 
 /*
@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
 
 	if (t->sched_info.last_queued)
 		delta = now - t->sched_info.last_queued;
-	sched_info_dequeued(t);
+	sched_info_reset_dequeued(t);
 	t->sched_info.run_delay += delta;
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcount++;
@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 		__sched_info_switch(prev, next);
 }
 #else
-#define sched_info_queued(t)		do { } while (0)
-#define sched_info_switch(t, next)	do { } while (0)
+#define sched_info_queued(t)			do { } while (0)
+#define sched_info_reset_dequeued(t)	do { } while (0)
+#define sched_info_dequeued(t)			do { } while (0)
+#define sched_info_switch(t, next)		do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
-- 
cgit v1.2.3-55-g7522


From 076ac2af86c3b7f89ac31bc50a7508d3e035b786 Mon Sep 17 00:00:00 2001
From: Mike Travis
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: sched, numa: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c

  * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c,
    where appropriate.  This saves some allocated space as well as many
    wasted cycles going through node entries that are non-existent.

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 94ead43eda62..bcc22b569ee9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6538,9 +6538,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
 
 	min_val = INT_MAX;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
+		n = (node + i) % nr_node_ids;
 
 		if (!nr_cpus_node(n))
 			continue;
@@ -6734,7 +6734,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 		if (!sched_group_nodes)
 			continue;
 
-		for (i = 0; i < MAX_NUMNODES; i++) {
+		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
 			*nodemask = node_to_cpumask(i);
@@ -6927,7 +6927,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
 				    GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7066,7 +7066,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7090,7 +7090,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 					send_covered, tmpmask);
 	}
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Set up node groups */
 		struct sched_group *sg, *prev;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7129,9 +7129,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpus_or(*covered, *covered, *nodemask);
 		prev = sg;
 
-		for (j = 0; j < MAX_NUMNODES; j++) {
+		for (j = 0; j < nr_node_ids; j++) {
 			SCHED_CPUMASK_VAR(notcovered, allmasks);
-			int n = (i + j) % MAX_NUMNODES;
+			int n = (i + j) % nr_node_ids;
 			node_to_cpumask_ptr(pnodemask, n);
 
 			cpus_complement(*notcovered, *covered);
@@ -7184,7 +7184,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++)
+	for (i = 0; i < nr_node_ids; i++)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
 	if (sd_allnodes) {
-- 
cgit v1.2.3-55-g7522


From dc7fab8b3bb388c57c6c4a43ba68c8a32ca25204 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Thu, 10 Jul 2008 00:32:40 +0200
Subject: sched: fix cpu hotplug

I think we may have a race between try_to_wake_up() and
migrate_live_tasks() -> move_task_off_dead_cpu() when the later one
may end up looping endlessly.

Interrupts are enabled on other CPUs when migration_call(CPU_DEAD, ...) is
called so we may get a race between try_to_wake_up() and
migrate_live_tasks() -> move_task_off_dead_cpu(). The former one may push
a task out of a dead CPU causing the later one to loop endlessly.

Heiko Carstens observed:

| That's exactly what explains a dump I got yesterday. Thanks for fixing! :)

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: miaox@cn.fujitsu.com
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 94ead43eda62..9397b8710138 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5621,8 +5621,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 
 	double_rq_lock(rq_src, rq_dest);
 	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
+	if (task_cpu(p) != src_cpu) {
+		ret = 1;
 		goto out;
+	}
 	/* Affinity changed (again). */
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 		goto out;
-- 
cgit v1.2.3-55-g7522


From b1e387348a2a70954312b102d0589c3e2ca3dba1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Thu, 10 Jul 2008 11:25:03 -0700
Subject: sched: fix cpu hotplug, cleanup

Clean up __migrate_task(): to just have separate "done" and "fail"
cases, instead of that "out" case with random error behavior.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9397b8710138..4e2f60335656 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5621,13 +5621,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 
 	double_rq_lock(rq_src, rq_dest);
 	/* Already moved. */
-	if (task_cpu(p) != src_cpu) {
-		ret = 1;
-		goto out;
-	}
+	if (task_cpu(p) != src_cpu)
+		goto done;
 	/* Affinity changed (again). */
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
-		goto out;
+		goto fail;
 
 	on_rq = p->se.on_rq;
 	if (on_rq)
@@ -5638,8 +5636,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 		activate_task(rq_dest, p, 0);
 		check_preempt_curr(rq_dest, p);
 	}
+done:
 	ret = 1;
-out:
+fail:
 	double_rq_unlock(rq_src, rq_dest);
 	return ret;
 }
-- 
cgit v1.2.3-55-g7522


From 13b40c1e40f3261e83ee514a08b77dbecb93021b Mon Sep 17 00:00:00 2001
From: Mike Travis
Date: Tue, 1 Jul 2008 10:32:50 -0700
Subject: sched: reduce stack size in isolated_cpu_setup()

  * Remove 16k stack requirements in isolated_cpu_setup when NR_CPUS=4096.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 99e6d850ecab..1ee18dbb4516 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6768,7 +6768,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-	int ints[NR_CPUS], i;
+	static int __initdata ints[NR_CPUS];
+	int i;
 
 	str = get_options(str, ARRAY_SIZE(ints), ints);
 	cpus_clear(cpu_isolated_map);
-- 
cgit v1.2.3-55-g7522


From e761b7725234276a802322549cee5255305a0930 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Tue, 15 Jul 2008 04:43:49 -0700
Subject: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain
 managment (take 2)

This is based on Linus' idea of creating cpu_active_map that prevents
scheduler load balancer from migrating tasks to the cpu that is going
down.

It allows us to simplify domain management code and avoid unecessary
domain rebuilds during cpu hotplug event handling.

Please ignore the cpusets part for now. It needs some more work in order
to avoid crazy lock nesting. Although I did simplfy and unify domain
reinitialization logic. We now simply call partition_sched_domains() in
all the cases. This means that we're using exact same code paths as in
cpusets case and hence the test below cover cpusets too.
Cpuset changes to make rebuild_sched_domains() callable from various
contexts are in the separate patch (right next after this one).

This not only boots but also easily handles
	while true; do make clean; make -j 8; done
and
	while true; do on-off-cpu 1; done
at the same time.
(on-off-cpu 1 simple does echo 0/1 > /sys/.../cpu1/online thing).

Suprisingly the box (dual-core Core2) is quite usable. In fact I'm typing
this on right now in gnome-terminal and things are moving just fine.

Also this is running with most of the debug features enabled (lockdep,
mutex, etc) no BUG_ONs or lockdep complaints so far.

I believe I addressed all of the Dmitry's comments for original Linus'
version. I changed both fair and rt balancer to mask out non-active cpus.
And replaced cpu_is_offline() with !cpu_active() in the main scheduler
code where it made sense (to me).

Signed-off-by: Max Krasnyanskiy <maxk@qualcomm.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Cc: dmitry.adamushko@gmail.com
Cc: pj@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpumask.h |   6 ++-
 include/linux/cpuset.h  |   7 ++++
 init/main.c             |   7 ++++
 kernel/cpu.c            |  30 +++++++++++---
 kernel/cpuset.c         |   2 +-
 kernel/sched.c          | 108 ++++++++++++++++++++----------------------------
 kernel/sched_fair.c     |   3 ++
 kernel/sched_rt.c       |   7 ++++
 8 files changed, 99 insertions(+), 71 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c24875bd9c5b..d614d2472798 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -359,13 +359,14 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 
 /*
  * The following particular system cpumasks and operations manage
- * possible, present and online cpus.  Each of them is a fixed size
+ * possible, present, active and online cpus.  Each of them is a fixed size
  * bitmap of size NR_CPUS.
  *
  *  #ifdef CONFIG_HOTPLUG_CPU
  *     cpu_possible_map - has bit 'cpu' set iff cpu is populatable
  *     cpu_present_map  - has bit 'cpu' set iff cpu is populated
  *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
+ *     cpu_active_map   - has bit 'cpu' set iff cpu available to migration
  *  #else
  *     cpu_possible_map - has bit 'cpu' set iff cpu is populated
  *     cpu_present_map  - copy of cpu_possible_map
@@ -416,6 +417,7 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 extern cpumask_t cpu_possible_map;
 extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_present_map;
+extern cpumask_t cpu_active_map;
 
 #if NR_CPUS > 1
 #define num_online_cpus()	cpus_weight(cpu_online_map)
@@ -424,6 +426,7 @@ extern cpumask_t cpu_present_map;
 #define cpu_online(cpu)		cpu_isset((cpu), cpu_online_map)
 #define cpu_possible(cpu)	cpu_isset((cpu), cpu_possible_map)
 #define cpu_present(cpu)	cpu_isset((cpu), cpu_present_map)
+#define cpu_active(cpu)		cpu_isset((cpu), cpu_active_map)
 #else
 #define num_online_cpus()	1
 #define num_possible_cpus()	1
@@ -431,6 +434,7 @@ extern cpumask_t cpu_present_map;
 #define cpu_online(cpu)		((cpu) == 0)
 #define cpu_possible(cpu)	((cpu) == 0)
 #define cpu_present(cpu)	((cpu) == 0)
+#define cpu_active(cpu)		((cpu) == 0)
 #endif
 
 #define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 038578362b47..e8f450c499b0 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -78,6 +78,8 @@ extern void cpuset_track_online_nodes(void);
 
 extern int current_cpuset_is_being_rebound(void);
 
+extern void rebuild_sched_domains(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -156,6 +158,11 @@ static inline int current_cpuset_is_being_rebound(void)
 	return 0;
 }
 
+static inline void rebuild_sched_domains(void)
+{
+	partition_sched_domains(0, NULL, NULL);
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/init/main.c b/init/main.c
index edeace036fd9..dd25259530ea 100644
--- a/init/main.c
+++ b/init/main.c
@@ -415,6 +415,13 @@ static void __init smp_init(void)
 {
 	unsigned int cpu;
 
+	/*
+	 * Set up the current CPU as possible to migrate to.
+	 * The other ones will be done by cpu_up/cpu_down()
+	 */
+	cpu = smp_processor_id();
+	cpu_set(cpu, cpu_active_map);
+
 	/* FIXME: This should be done in userspace --RR */
 	for_each_present_cpu(cpu) {
 		if (num_online_cpus() >= setup_max_cpus)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index cfb1d43ab801..a1ac7ea245d7 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
 	cpu_hotplug.refcount = 0;
 }
 
+cpumask_t cpu_active_map;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu)
 	int err = 0;
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_down(cpu, 0);
+		goto out;
+	}
+
+	cpu_clear(cpu, cpu_active_map);
+
+	err = _cpu_down(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
 
+out:
 	cpu_maps_update_done();
 	return err;
 }
@@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu)
 	}
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_up(cpu, 0);
+		goto out;
+	}
 
+	err = _cpu_up(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
+
+out:
 	cpu_maps_update_done();
 	return err;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..3c3ef02f65f1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  *	partition_sched_domains().
  */
 
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
 	struct kfifo *q;	/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ee18dbb4516..c237624a8a04 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 
 	rq = task_rq_lock(p, &flags);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
-	    || unlikely(cpu_is_offline(dest_cpu)))
+	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
 	/* force the process onto the specified CPU */
@@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick)
 		/*
 		 * If we are going offline and still the leader, give up!
 		 */
-		if (cpu_is_offline(cpu) &&
+		if (!cpu_active(cpu) &&
 		    atomic_read(&nohz.load_balancer) == cpu) {
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
@@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	struct rq *rq_dest, *rq_src;
 	int ret = 0, on_rq;
 
-	if (unlikely(cpu_is_offline(dest_cpu)))
+	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 
 	rq_src = cpu_rq(src_cpu);
@@ -7553,18 +7553,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 {
 }
 
-/*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-	ndoms_cur = 0;
-	if (doms_cur != &fallback_doms)
-		kfree(doms_cur);
-	doms_cur = &fallback_doms;
-}
-
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * ownership of it and will kfree it when done with it. If the caller
  * failed the kmalloc call, then it can pass in doms_new == NULL,
  * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * Call with hotplug lock held
  */
@@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL) {
-		ndoms_new = 1;
-		doms_new = &fallback_doms;
-		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
-	}
+	if (doms_new == NULL)
+		ndoms_new = 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
@@ -7677,6 +7661,14 @@ match1:
 		;
 	}
 
+	if (doms_new == NULL) {
+		ndoms_cur = 0;
+		ndoms_new = 1;
+		doms_new = &fallback_doms;
+		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+		dattr_new = NULL;
+	}
+
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
@@ -7707,17 +7699,10 @@ match2:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
-	int err;
-
 	get_online_cpus();
-	mutex_lock(&sched_domains_mutex);
-	detach_destroy_domains(&cpu_online_map);
-	free_sched_domains();
-	err = arch_init_sched_domains(&cpu_online_map);
-	mutex_unlock(&sched_domains_mutex);
+	rebuild_sched_domains();
 	put_online_cpus();
-
-	return err;
+	return 0;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7783,14 +7768,30 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+#ifndef CONFIG_CPUSETS
 /*
- * Force a reinitialization of the sched domains hierarchy. The domains
- * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
+ * Add online and remove offline CPUs from the scheduler domains.
+ * When cpusets are enabled they take over this function.
  */
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		partition_sched_domains(0, NULL, NULL);
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
 {
 	int cpu = (int)(long)hcpu;
 
@@ -7798,44 +7799,18 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		disable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		detach_destroy_domains(&cpu_online_map);
-		free_sched_domains();
 		return NOTIFY_OK;
 
-
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		enable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		/*
-		 * Fall through and re-initialise the domains.
-		 */
-		break;
+		return NOTIFY_OK;
+
 	default:
 		return NOTIFY_DONE;
 	}
-
-#ifndef CONFIG_CPUSETS
-	/*
-	 * Create default domain partitioning if cpusets are disabled.
-	 * Otherwise we let cpusets rebuild the domains based on the
-	 * current setup.
-	 */
-
-	/* The hotplug lock is already held by cpu_up/cpu_down */
-	arch_init_sched_domains(&cpu_online_map);
-#endif
-
-	return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7855,8 +7830,15 @@ void __init sched_init_smp(void)
 		cpu_set(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+	/* RT runtime code needs to handle some hotplug events */
+	hotcpu_notifier(update_runtime, 0);
+
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d6..d924c679dfac 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq)
  * not idle and an idle cpu is available.  The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
  *
  * Returns the CPU we should wake onto.
  */
@@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
+			cpus_and(tmp, tmp, cpu_active_map);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d3d1cccb3d7b..50735bb96149 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -933,6 +933,13 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return -1; /* No targets found */
 
+	/*
+	 * Only consider CPUs that are usable for migration.
+	 * I guess we might want to change cpupri_find() to ignore those
+	 * in the first place.
+	 */
+	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system.  Now we want to elect
-- 
cgit v1.2.3-55-g7522


From 31656519e132f6612584815f128c83976a9aaaef Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 18 Jul 2008 18:01:23 +0200
Subject: sched, x86: clean up hrtick implementation

random uvesafb failures were reported against Gentoo:

  http://bugs.gentoo.org/show_bug.cgi?id=222799

and Mihai Moldovan bisected it back to:

> 8f4d37ec073c17e2d4aa8851df5837d798606d6f is first bad commit
> commit 8f4d37ec073c17e2d4aa8851df5837d798606d6f
> Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date:   Fri Jan 25 21:08:29 2008 +0100
>
>    sched: high-res preemption tick

Linus suspected it to be hrtick + vm86 interaction and observed:

> Btw, Peter, Ingo: I think that commit is doing bad things. They aren't
> _incorrect_ per se, but they are definitely bad.
>
> Why?
>
> Using random _TIF_WORK_MASK flags is really impolite for doing
> "scheduling" work. There's a reason that arch/x86/kernel/entry_32.S
> special-cases the _TIF_NEED_RESCHED flag: we don't want to exit out of
> vm86 mode unnecessarily.
>
> See the "work_notifysig_v86" label, and how it does that
> "save_v86_state()" thing etc etc.

Right, I never liked having to fiddle with those TIF flags. Initially I
needed it because the hrtimer base lock could not nest in the rq lock.
That however is fixed these days.

Currently the only reason left to fiddle with the TIF flags is remote
wakeups. We cannot program a remote cpu's hrtimer. I've been thinking
about using the new and improved IPI function call stuff to implement
hrtimer_start_on().

However that does require that smp_call_function_single(.wait=0) works
from interrupt context - /me looks at the latest series from Jens - Yes
that does seem to be supported, good.

Here's a stab at cleaning this stuff up ...

Mihai reported test success as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Mihai Moldovan <ionic@ionic.de>
Cc: Michal Januszewski <spock@gentoo.org>
Cc: Antonino Daplas <adaplas@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/signal_32.c   |   3 -
 arch/x86/kernel/signal_64.c   |   3 -
 include/asm-x86/thread_info.h |   4 +-
 kernel/Kconfig.hz             |   2 +-
 kernel/sched.c                | 202 +++++++++++++-----------------------------
 kernel/sched_fair.c           |   5 +-
 6 files changed, 64 insertions(+), 155 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..e1fc7bd57bfe 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -667,8 +667,5 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
 
-	if (thread_info_flags & _TIF_HRTICK_RESCHED)
-		hrtick_resched();
-
 	clear_thread_flag(TIF_IRET);
 }
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..88023fcc7049 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -502,9 +502,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
-
-	if (thread_info_flags & _TIF_HRTICK_RESCHED)
-		hrtick_resched();
 }
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h
index 895339d2bc0b..d7012634ace4 100644
--- a/include/asm-x86/thread_info.h
+++ b/include/asm-x86/thread_info.h
@@ -81,7 +81,6 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
-#define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -108,7 +107,6 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
-#define _TIF_HRTICK_RESCHED	(1 << TIF_HRTICK_RESCHED)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -132,7 +130,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
+	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 526128a2e622..2a202a846757 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && X86
+	def_bool HIGH_RES_TIMERS
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ee18dbb4516..c13c75e9f9f7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -571,8 +571,10 @@ struct rq {
 #endif
 
 #ifdef CONFIG_SCHED_HRTICK
-	unsigned long hrtick_flags;
-	ktime_t hrtick_expire;
+#ifdef CONFIG_SMP
+	int hrtick_csd_pending;
+	struct call_single_data hrtick_csd;
+#endif
 	struct hrtimer hrtick_timer;
 #endif
 
@@ -983,13 +985,6 @@ static struct rq *this_rq_lock(void)
 	return rq;
 }
 
-static void __resched_task(struct task_struct *p, int tif_bit);
-
-static inline void resched_task(struct task_struct *p)
-{
-	__resched_task(p, TIF_NEED_RESCHED);
-}
-
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
@@ -1001,25 +996,6 @@ static inline void resched_task(struct task_struct *p)
  * When we get rescheduled we reprogram the hrtick_timer outside of the
  * rq->lock.
  */
-static inline void resched_hrt(struct task_struct *p)
-{
-	__resched_task(p, TIF_HRTICK_RESCHED);
-}
-
-static inline void resched_rq(struct rq *rq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	resched_task(rq->curr);
-	spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-enum {
-	HRTICK_SET,		/* re-programm hrtick_timer */
-	HRTICK_RESET,		/* not a new slice */
-	HRTICK_BLOCK,		/* stop hrtick operations */
-};
 
 /*
  * Use hrtick when:
@@ -1030,72 +1006,17 @@ static inline int hrtick_enabled(struct rq *rq)
 {
 	if (!sched_feat(HRTICK))
 		return 0;
-	if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
+	if (!cpu_online(cpu_of(rq)))
 		return 0;
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
 
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay, int reset)
-{
-	assert_spin_locked(&rq->lock);
-
-	/*
-	 * preempt at: now + delay
-	 */
-	rq->hrtick_expire =
-		ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
-	/*
-	 * indicate we need to program the timer
-	 */
-	__set_bit(HRTICK_SET, &rq->hrtick_flags);
-	if (reset)
-		__set_bit(HRTICK_RESET, &rq->hrtick_flags);
-
-	/*
-	 * New slices are called from the schedule path and don't need a
-	 * forced reschedule.
-	 */
-	if (reset)
-		resched_hrt(rq->curr);
-}
-
 static void hrtick_clear(struct rq *rq)
 {
 	if (hrtimer_active(&rq->hrtick_timer))
 		hrtimer_cancel(&rq->hrtick_timer);
 }
 
-/*
- * Update the timer from the possible pending state.
- */
-static void hrtick_set(struct rq *rq)
-{
-	ktime_t time;
-	int set, reset;
-	unsigned long flags;
-
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-
-	spin_lock_irqsave(&rq->lock, flags);
-	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
-	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
-	time = rq->hrtick_expire;
-	clear_thread_flag(TIF_HRTICK_RESCHED);
-	spin_unlock_irqrestore(&rq->lock, flags);
-
-	if (set) {
-		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
-		if (reset && !hrtimer_active(&rq->hrtick_timer))
-			resched_rq(rq);
-	} else
-		hrtick_clear(rq);
-}
-
 /*
  * High-resolution timer tick.
  * Runs from hardirq context with interrupts disabled.
@@ -1115,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 }
 
 #ifdef CONFIG_SMP
-static void hotplug_hrtick_disable(int cpu)
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	rq->hrtick_flags = 0;
-	__set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	struct rq *rq = arg;
 
-	hrtick_clear(rq);
+	spin_lock(&rq->lock);
+	hrtimer_restart(&rq->hrtick_timer);
+	rq->hrtick_csd_pending = 0;
+	spin_unlock(&rq->lock);
 }
 
-static void hotplug_hrtick_enable(int cpu)
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
+	struct hrtimer *timer = &rq->hrtick_timer;
+	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-	spin_lock_irqsave(&rq->lock, flags);
-	__clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	timer->expires = time;
+
+	if (rq == this_rq()) {
+		hrtimer_restart(timer);
+	} else if (!rq->hrtick_csd_pending) {
+		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+		rq->hrtick_csd_pending = 1;
+	}
 }
 
 static int
@@ -1150,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		hotplug_hrtick_disable(cpu);
-		return NOTIFY_OK;
-
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		hotplug_hrtick_enable(cpu);
+		hrtick_clear(cpu_rq(cpu));
 		return NOTIFY_OK;
 	}
 
@@ -1170,46 +1092,45 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
-#endif /* CONFIG_SMP */
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
+{
+	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+}
 
-static void init_rq_hrtick(struct rq *rq)
+static void init_hrtick(void)
 {
-	rq->hrtick_flags = 0;
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
+#endif /* CONFIG_SMP */
 
-void hrtick_resched(void)
+static void init_rq_hrtick(struct rq *rq)
 {
-	struct rq *rq;
-	unsigned long flags;
+#ifdef CONFIG_SMP
+	rq->hrtick_csd_pending = 0;
 
-	if (!test_thread_flag(TIF_HRTICK_RESCHED))
-		return;
+	rq->hrtick_csd.flags = 0;
+	rq->hrtick_csd.func = __hrtick_start;
+	rq->hrtick_csd.info = rq;
+#endif
 
-	local_irq_save(flags);
-	rq = cpu_rq(smp_processor_id());
-	hrtick_set(rq);
-	local_irq_restore(flags);
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
 {
 }
 
-static inline void hrtick_set(struct rq *rq)
-{
-}
-
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
 
-void hrtick_resched(void)
-{
-}
-
 static inline void init_hrtick(void)
 {
 }
@@ -1228,16 +1149,16 @@ static inline void init_hrtick(void)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
 	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 		return;
 
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -1303,10 +1224,10 @@ void wake_up_idle_cpu(int cpu)
 #endif /* CONFIG_NO_HZ */
 
 #else /* !CONFIG_SMP */
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_need_resched(p);
 }
 #endif /* CONFIG_SMP */
 
@@ -4395,7 +4316,7 @@ asmlinkage void __sched schedule(void)
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
-	int cpu, hrtick = sched_feat(HRTICK);
+	int cpu;
 
 need_resched:
 	preempt_disable();
@@ -4410,7 +4331,7 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
-	if (hrtick)
+	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
 
 	/*
@@ -4457,9 +4378,6 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	if (hrtick)
-		hrtick_set(rq);
-
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d6..6893b3ed65fe 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -878,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
-	int requeue = rq->curr == p;
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
@@ -899,10 +898,10 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		 * Don't schedule slices shorter than 10000ns, that just
 		 * doesn't make sense. Rely on vruntime for fairness.
 		 */
-		if (!requeue)
+		if (rq->curr != p)
 			delta = max(10000LL, delta);
 
-		hrtick_start(rq, delta, requeue);
+		hrtick_start(rq, delta);
 	}
 }
 #else /* !CONFIG_SCHED_HRTICK */
-- 
cgit v1.2.3-55-g7522


From ba42059fbd0aa1ac91b582412b5fedb1258f241f Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Sun, 20 Jul 2008 11:02:06 +0200
Subject: sched: hrtick_enabled() should use cpu_active()

Peter pointed out that hrtick_enabled() should use cpu_active().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 85cf246cfdf5..62b1b8ecb5c7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1006,7 +1006,7 @@ static inline int hrtick_enabled(struct rq *rq)
 {
 	if (!sched_feat(HRTICK))
 		return 0;
-	if (!cpu_online(cpu_of(rq)))
+	if (!cpu_active(cpu_of(rq)))
 		return 0;
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
-- 
cgit v1.2.3-55-g7522


From 4a0b2b4dbe1335b8b9886ba3dc85a145d5d938ed Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 1 Jul 2008 18:48:41 +0200
Subject: sysdev: Pass the attribute to the low level sysdev show/store
 function

This allow to dynamically generate attributes and share show/store
functions between attributes. Right now most attributes are generated
by special macros and lots of duplicated code. With the attribute
passed it's instead possible to attach some data to the attribute
and then use that in shared low level functions to do different things.

I need this for the dynamically generated bank attributes in the x86
machine check code, but it'll allow some further cleanups.

I converted all users in tree to the new show/store prototype. It's a single
huge patch to avoid unbisectable sections.

Runtime tested: x86-32, x86-64
Compiled only: ia64, powerpc
Not compile tested/only grep converted: sh, arm, avr32

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 arch/arm/kernel/time.c                    |  4 ++-
 arch/avr32/kernel/cpu.c                   | 38 +++++++++++++++++---------
 arch/ia64/kernel/err_inject.c             | 22 ++++++++++-----
 arch/powerpc/kernel/sysfs.c               | 15 ++++++++---
 arch/powerpc/platforms/cell/cbe_thermal.c | 45 ++++++++++++++++++++-----------
 arch/powerpc/platforms/cell/spu_base.c    |  3 ++-
 arch/s390/kernel/smp.c                    | 36 ++++++++++++++++---------
 arch/s390/kernel/time.c                   | 35 ++++++++++++++++--------
 arch/sh/drivers/dma/dma-sysfs.c           | 15 ++++++++---
 arch/sparc64/kernel/sysfs.c               | 16 +++++++----
 arch/x86/kernel/cpu/mcheck/mce_64.c       | 14 +++++++---
 arch/x86/kernel/cpu/mcheck/therm_throt.c  |  1 +
 arch/x86/kernel/microcode.c               | 10 ++++---
 drivers/base/cpu.c                        | 10 ++++---
 drivers/base/memory.c                     | 12 ++++++---
 drivers/base/node.c                       | 15 +++++++----
 drivers/base/sys.c                        |  4 +--
 drivers/base/topology.c                   | 17 ++++++++----
 drivers/cpuidle/sysfs.c                   | 10 ++++---
 drivers/xen/balloon.c                     |  1 +
 include/linux/sysdev.h                    |  5 ++--
 kernel/rtmutex-tester.c                   |  7 ++---
 kernel/sched.c                            |  8 ++++--
 kernel/time/clocksource.c                 |  8 ++++--
 24 files changed, 239 insertions(+), 112 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index cc5145b28e7f..368d171754cf 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -130,7 +130,9 @@ static const struct leds_evt_name evt_names[] = {
 	{ "red",   led_red_on,   led_red_off   },
 };
 
-static ssize_t leds_store(struct sys_device *dev, const char *buf, size_t size)
+static ssize_t leds_store(struct sys_device *dev,
+			struct sysdev_attribute *attr,
+			const char *buf, size_t size)
 {
 	int ret = -EINVAL, len = strcspn(buf, " ");
 
diff --git a/arch/avr32/kernel/cpu.c b/arch/avr32/kernel/cpu.c
index b8409caeb23d..e84faffbbeca 100644
--- a/arch/avr32/kernel/cpu.c
+++ b/arch/avr32/kernel/cpu.c
@@ -26,14 +26,16 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
  * XXX: If/when a SMP-capable implementation of AVR32 will ever be
  * made, we must make sure that the code executes on the correct CPU.
  */
-static ssize_t show_pc0event(struct sys_device *dev, char *buf)
+static ssize_t show_pc0event(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "0x%lx\n", (pccr >> 12) & 0x3f);
 }
-static ssize_t store_pc0event(struct sys_device *dev, const char *buf,
+static ssize_t store_pc0event(struct sys_device *dev,
+			struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -46,15 +48,17 @@ static ssize_t store_pc0event(struct sys_device *dev, const char *buf,
 	sysreg_write(PCCR, val);
 	return count;
 }
-static ssize_t show_pc0count(struct sys_device *dev, char *buf)
+static ssize_t show_pc0count(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pcnt0;
 
 	pcnt0 = sysreg_read(PCNT0);
 	return sprintf(buf, "%lu\n", pcnt0);
 }
-static ssize_t store_pc0count(struct sys_device *dev, const char *buf,
-			      size_t count)
+static ssize_t store_pc0count(struct sys_device *dev,
+				struct sysdev_attribute *attr,
+				const char *buf, size_t count)
 {
 	unsigned long val;
 	char *endp;
@@ -67,14 +71,16 @@ static ssize_t store_pc0count(struct sys_device *dev, const char *buf,
 	return count;
 }
 
-static ssize_t show_pc1event(struct sys_device *dev, char *buf)
+static ssize_t show_pc1event(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "0x%lx\n", (pccr >> 18) & 0x3f);
 }
-static ssize_t store_pc1event(struct sys_device *dev, const char *buf,
+static ssize_t store_pc1event(struct sys_device *dev,
+			      struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -87,14 +93,16 @@ static ssize_t store_pc1event(struct sys_device *dev, const char *buf,
 	sysreg_write(PCCR, val);
 	return count;
 }
-static ssize_t show_pc1count(struct sys_device *dev, char *buf)
+static ssize_t show_pc1count(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pcnt1;
 
 	pcnt1 = sysreg_read(PCNT1);
 	return sprintf(buf, "%lu\n", pcnt1);
 }
-static ssize_t store_pc1count(struct sys_device *dev, const char *buf,
+static ssize_t store_pc1count(struct sys_device *dev,
+				struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -108,14 +116,16 @@ static ssize_t store_pc1count(struct sys_device *dev, const char *buf,
 	return count;
 }
 
-static ssize_t show_pccycles(struct sys_device *dev, char *buf)
+static ssize_t show_pccycles(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pccnt;
 
 	pccnt = sysreg_read(PCCNT);
 	return sprintf(buf, "%lu\n", pccnt);
 }
-static ssize_t store_pccycles(struct sys_device *dev, const char *buf,
+static ssize_t store_pccycles(struct sys_device *dev,
+				struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long val;
@@ -129,14 +139,16 @@ static ssize_t store_pccycles(struct sys_device *dev, const char *buf,
 	return count;
 }
 
-static ssize_t show_pcenable(struct sys_device *dev, char *buf)
+static ssize_t show_pcenable(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long pccr;
 
 	pccr = sysreg_read(PCCR);
 	return sprintf(buf, "%c\n", (pccr & 1)?'1':'0');
 }
-static ssize_t store_pcenable(struct sys_device *dev, const char *buf,
+static ssize_t store_pcenable(struct sys_device *dev,
+			      struct sysdev_attribute *attr, const char *buf,
 			      size_t count)
 {
 	unsigned long pccr, val;
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index b642648cc2ac..c539c689493b 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -55,7 +55,8 @@ static u64 resources[NR_CPUS];
 
 #define show(name) 							\
 static ssize_t 								\
-show_##name(struct sys_device *dev, char *buf)				\
+show_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
+		char *buf)						\
 {									\
 	u32 cpu=dev->id;						\
 	return sprintf(buf, "%lx\n", name[cpu]);			\
@@ -63,7 +64,8 @@ show_##name(struct sys_device *dev, char *buf)				\
 
 #define store(name)							\
 static ssize_t 								\
-store_##name(struct sys_device *dev, const char *buf, size_t size)	\
+store_##name(struct sys_device *dev, struct sysdev_attribute *attr,	\
+					const char *buf, size_t size)	\
 {									\
 	unsigned int cpu=dev->id;					\
 	name[cpu] = simple_strtoull(buf, NULL, 16);			\
@@ -76,7 +78,8 @@ show(call_start)
  * processor. The cpu number in driver is only used for storing data.
  */
 static ssize_t
-store_call_start(struct sys_device *dev, const char *buf, size_t size)
+store_call_start(struct sys_device *dev, struct sysdev_attribute *attr,
+		const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
 	unsigned long call_start = simple_strtoull(buf, NULL, 16);
@@ -124,14 +127,16 @@ show(err_type_info)
 store(err_type_info)
 
 static ssize_t
-show_virtual_to_phys(struct sys_device *dev, char *buf)
+show_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
+			char *buf)
 {
 	unsigned int cpu=dev->id;
 	return sprintf(buf, "%lx\n", phys_addr[cpu]);
 }
 
 static ssize_t
-store_virtual_to_phys(struct sys_device *dev, const char *buf, size_t size)
+store_virtual_to_phys(struct sys_device *dev, struct sysdev_attribute *attr,
+			const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
 	u64 virt_addr=simple_strtoull(buf, NULL, 16);
@@ -154,7 +159,8 @@ show(err_struct_info)
 store(err_struct_info)
 
 static ssize_t
-show_err_data_buffer(struct sys_device *dev, char *buf)
+show_err_data_buffer(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	unsigned int cpu=dev->id;
 
@@ -165,7 +171,9 @@ show_err_data_buffer(struct sys_device *dev, char *buf)
 }
 
 static ssize_t
-store_err_data_buffer(struct sys_device *dev, const char *buf, size_t size)
+store_err_data_buffer(struct sys_device *dev,
+			struct sysdev_attribute *attr,
+			const char *buf, size_t size)
 {
 	unsigned int cpu=dev->id;
 	int ret;
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c8127f832df0..aba0ba95f062 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -28,7 +28,9 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
 /* Time in microseconds we delay before sleeping in the idle loop */
 DEFINE_PER_CPU(unsigned long, smt_snooze_delay) = { 100 };
 
-static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf,
+static ssize_t store_smt_snooze_delay(struct sys_device *dev,
+				      struct sysdev_attribute *attr,
+				      const char *buf,
 				      size_t count)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -44,7 +46,9 @@ static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf,
 	return count;
 }
 
-static ssize_t show_smt_snooze_delay(struct sys_device *dev, char *buf)
+static ssize_t show_smt_snooze_delay(struct sys_device *dev,
+				     struct sysdev_attribute *attr,
+				     char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 
@@ -152,14 +156,17 @@ static unsigned long write_##NAME(unsigned long val) \
 	mtspr(ADDRESS, val); \
 	return 0; \
 } \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+			struct sysdev_attribute *attr, \
+			char *buf) \
 { \
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
 	unsigned long val = run_on_cpu(cpu->sysdev.id, read_##NAME, 0); \
 	return sprintf(buf, "%lx\n", val); \
 } \
 static ssize_t __used \
-	store_##NAME(struct sys_device *dev, const char *buf, size_t count) \
+	store_##NAME(struct sys_device *dev, struct sysdev_attribute *attr, \
+			const char *buf, size_t count) \
 { \
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
 	unsigned long val; \
diff --git a/arch/powerpc/platforms/cell/cbe_thermal.c b/arch/powerpc/platforms/cell/cbe_thermal.c
index 4852bf312d83..4d4c8c169124 100644
--- a/arch/powerpc/platforms/cell/cbe_thermal.c
+++ b/arch/powerpc/platforms/cell/cbe_thermal.c
@@ -97,7 +97,8 @@ static u8 spu_read_register_value(struct sys_device *sysdev, union spe_reg __iom
 	return value.spe[spu->spe_id];
 }
 
-static ssize_t spu_show_temp(struct sys_device *sysdev, char *buf)
+static ssize_t spu_show_temp(struct sys_device *sysdev, struct sysdev_attribute *attr,
+			char *buf)
 {
 	u8 value;
 	struct cbe_pmd_regs __iomem *pmd_regs;
@@ -146,32 +147,38 @@ static ssize_t store_throttle(struct cbe_pmd_regs __iomem *pmd_regs, const char
 	return size;
 }
 
-static ssize_t spu_show_throttle_end(struct sys_device *sysdev, char *buf)
+static ssize_t spu_show_throttle_end(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(get_pmd_regs(sysdev), buf, 0);
 }
 
-static ssize_t spu_show_throttle_begin(struct sys_device *sysdev, char *buf)
+static ssize_t spu_show_throttle_begin(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(get_pmd_regs(sysdev), buf, 8);
 }
 
-static ssize_t spu_show_throttle_full_stop(struct sys_device *sysdev, char *buf)
+static ssize_t spu_show_throttle_full_stop(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(get_pmd_regs(sysdev), buf, 16);
 }
 
-static ssize_t spu_store_throttle_end(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t spu_store_throttle_end(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(get_pmd_regs(sysdev), buf, size, 0);
 }
 
-static ssize_t spu_store_throttle_begin(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t spu_store_throttle_begin(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(get_pmd_regs(sysdev), buf, size, 8);
 }
 
-static ssize_t spu_store_throttle_full_stop(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t spu_store_throttle_full_stop(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(get_pmd_regs(sysdev), buf, size, 16);
 }
@@ -192,43 +199,51 @@ static ssize_t ppe_show_temp(struct sys_device *sysdev, char *buf, int pos)
 
 /* shows the temperature of the DTS on the PPE,
  * located near the linear thermal sensor */
-static ssize_t ppe_show_temp0(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_temp0(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return ppe_show_temp(sysdev, buf, 32);
 }
 
 /* shows the temperature of the second DTS on the PPE */
-static ssize_t ppe_show_temp1(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_temp1(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return ppe_show_temp(sysdev, buf, 0);
 }
 
-static ssize_t ppe_show_throttle_end(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_throttle_end(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 32);
 }
 
-static ssize_t ppe_show_throttle_begin(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_throttle_begin(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 40);
 }
 
-static ssize_t ppe_show_throttle_full_stop(struct sys_device *sysdev, char *buf)
+static ssize_t ppe_show_throttle_full_stop(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	return show_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, 48);
 }
 
-static ssize_t ppe_store_throttle_end(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_end(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 32);
 }
 
-static ssize_t ppe_store_throttle_begin(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_begin(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 40);
 }
 
-static ssize_t ppe_store_throttle_full_stop(struct sys_device *sysdev, const char *buf, size_t size)
+static ssize_t ppe_store_throttle_full_stop(struct sys_device *sysdev,
+			struct sysdev_attribute *attr, const char *buf, size_t size)
 {
 	return store_throttle(cbe_get_cpu_pmd_regs(sysdev->id), buf, size, 48);
 }
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 78f905bc6a42..a5bdb89a17c3 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -703,7 +703,8 @@ static unsigned long long spu_acct_time(struct spu *spu,
 }
 
 
-static ssize_t spu_stat_show(struct sys_device *sysdev, char *buf)
+static ssize_t spu_stat_show(struct sys_device *sysdev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct spu *spu = container_of(sysdev, struct spu, sysdev);
 
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b6781030cfbd..b795b3e24afd 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -864,7 +864,8 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t cpu_configure_show(struct sys_device *dev, char *buf)
+static ssize_t cpu_configure_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t count;
 
@@ -874,8 +875,9 @@ static ssize_t cpu_configure_show(struct sys_device *dev, char *buf)
 	return count;
 }
 
-static ssize_t cpu_configure_store(struct sys_device *dev, const char *buf,
-				   size_t count)
+static ssize_t cpu_configure_store(struct sys_device *dev,
+				  struct sysdev_attribute *attr,
+				  const char *buf, size_t count)
 {
 	int cpu = dev->id;
 	int val, rc;
@@ -922,7 +924,8 @@ out:
 static SYSDEV_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static ssize_t cpu_polarization_show(struct sys_device *dev, char *buf)
+static ssize_t cpu_polarization_show(struct sys_device *dev,
+				     struct sysdev_attribute *attr, char *buf)
 {
 	int cpu = dev->id;
 	ssize_t count;
@@ -950,7 +953,8 @@ static ssize_t cpu_polarization_show(struct sys_device *dev, char *buf)
 }
 static SYSDEV_ATTR(polarization, 0444, cpu_polarization_show, NULL);
 
-static ssize_t show_cpu_address(struct sys_device *dev, char *buf)
+static ssize_t show_cpu_address(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%d\n", __cpu_logical_map[dev->id]);
 }
@@ -970,7 +974,8 @@ static struct attribute_group cpu_common_attr_group = {
 	.attrs = cpu_common_attrs,
 };
 
-static ssize_t show_capability(struct sys_device *dev, char *buf)
+static ssize_t show_capability(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned int capability;
 	int rc;
@@ -982,7 +987,8 @@ static ssize_t show_capability(struct sys_device *dev, char *buf)
 }
 static SYSDEV_ATTR(capability, 0444, show_capability, NULL);
 
-static ssize_t show_idle_count(struct sys_device *dev, char *buf)
+static ssize_t show_idle_count(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
 	unsigned long long idle_count;
@@ -995,7 +1001,8 @@ static ssize_t show_idle_count(struct sys_device *dev, char *buf)
 }
 static SYSDEV_ATTR(idle_count, 0444, show_idle_count, NULL);
 
-static ssize_t show_idle_time(struct sys_device *dev, char *buf)
+static ssize_t show_idle_time(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
 	unsigned long long new_time;
@@ -1112,7 +1119,9 @@ out:
 	return rc;
 }
 
-static ssize_t __ref rescan_store(struct sys_device *dev, const char *buf,
+static ssize_t __ref rescan_store(struct sys_device *dev,
+				  struct sysdev_attribute *attr,
+				  const char *buf,
 				  size_t count)
 {
 	int rc;
@@ -1123,7 +1132,9 @@ static ssize_t __ref rescan_store(struct sys_device *dev, const char *buf,
 static SYSDEV_ATTR(rescan, 0200, NULL, rescan_store);
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static ssize_t dispatching_show(struct sys_device *dev, char *buf)
+static ssize_t dispatching_show(struct sys_device *dev,
+				struct sysdev_attribute *attr,
+				char *buf)
 {
 	ssize_t count;
 
@@ -1133,8 +1144,9 @@ static ssize_t dispatching_show(struct sys_device *dev, char *buf)
 	return count;
 }
 
-static ssize_t dispatching_store(struct sys_device *dev, const char *buf,
-				 size_t count)
+static ssize_t dispatching_store(struct sys_device *dev,
+				 struct sysdev_attribute *attr,
+				 const char *buf, size_t count)
 {
 	int val, rc;
 	char delim;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index f2cede3947b2..ab70d9bd9261 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -1100,7 +1100,9 @@ static inline struct etr_aib *etr_aib_from_dev(struct sys_device *dev)
 		return etr_port1_online ? &etr_port1 : NULL;
 }
 
-static ssize_t etr_online_show(struct sys_device *dev, char *buf)
+static ssize_t etr_online_show(struct sys_device *dev,
+				struct sysdev_attribute *attr,
+				char *buf)
 {
 	unsigned int online;
 
@@ -1109,7 +1111,8 @@ static ssize_t etr_online_show(struct sys_device *dev, char *buf)
 }
 
 static ssize_t etr_online_store(struct sys_device *dev,
-			      const char *buf, size_t count)
+				struct sysdev_attribute *attr,
+				const char *buf, size_t count)
 {
 	unsigned int value;
 
@@ -1136,7 +1139,9 @@ static ssize_t etr_online_store(struct sys_device *dev,
 
 static SYSDEV_ATTR(online, 0600, etr_online_show, etr_online_store);
 
-static ssize_t etr_stepping_control_show(struct sys_device *dev, char *buf)
+static ssize_t etr_stepping_control_show(struct sys_device *dev,
+					struct sysdev_attribute *attr,
+					char *buf)
 {
 	return sprintf(buf, "%i\n", (dev == &etr_port0_dev) ?
 		       etr_eacr.e0 : etr_eacr.e1);
@@ -1144,7 +1149,8 @@ static ssize_t etr_stepping_control_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(stepping_control, 0400, etr_stepping_control_show, NULL);
 
-static ssize_t etr_mode_code_show(struct sys_device *dev, char *buf)
+static ssize_t etr_mode_code_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	if (!etr_port0_online && !etr_port1_online)
 		/* Status word is not uptodate if both ports are offline. */
@@ -1155,7 +1161,8 @@ static ssize_t etr_mode_code_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(state_code, 0400, etr_mode_code_show, NULL);
 
-static ssize_t etr_untuned_show(struct sys_device *dev, char *buf)
+static ssize_t etr_untuned_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1166,7 +1173,8 @@ static ssize_t etr_untuned_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(untuned, 0400, etr_untuned_show, NULL);
 
-static ssize_t etr_network_id_show(struct sys_device *dev, char *buf)
+static ssize_t etr_network_id_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1177,7 +1185,8 @@ static ssize_t etr_network_id_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(network, 0400, etr_network_id_show, NULL);
 
-static ssize_t etr_id_show(struct sys_device *dev, char *buf)
+static ssize_t etr_id_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1188,7 +1197,8 @@ static ssize_t etr_id_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(id, 0400, etr_id_show, NULL);
 
-static ssize_t etr_port_number_show(struct sys_device *dev, char *buf)
+static ssize_t etr_port_number_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1199,7 +1209,8 @@ static ssize_t etr_port_number_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(port, 0400, etr_port_number_show, NULL);
 
-static ssize_t etr_coupled_show(struct sys_device *dev, char *buf)
+static ssize_t etr_coupled_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1210,7 +1221,8 @@ static ssize_t etr_coupled_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(coupled, 0400, etr_coupled_show, NULL);
 
-static ssize_t etr_local_time_show(struct sys_device *dev, char *buf)
+static ssize_t etr_local_time_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
@@ -1221,7 +1233,8 @@ static ssize_t etr_local_time_show(struct sys_device *dev, char *buf)
 
 static SYSDEV_ATTR(local_time, 0400, etr_local_time_show, NULL);
 
-static ssize_t etr_utc_offset_show(struct sys_device *dev, char *buf)
+static ssize_t etr_utc_offset_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct etr_aib *aib = etr_aib_from_dev(dev);
 
diff --git a/arch/sh/drivers/dma/dma-sysfs.c b/arch/sh/drivers/dma/dma-sysfs.c
index 51b57c0d1a3c..347ee11351ec 100644
--- a/arch/sh/drivers/dma/dma-sysfs.c
+++ b/arch/sh/drivers/dma/dma-sysfs.c
@@ -23,7 +23,8 @@ static struct sysdev_class dma_sysclass = {
 };
 EXPORT_SYMBOL(dma_sysclass);
 
-static ssize_t dma_show_devices(struct sys_device *dev, char *buf)
+static ssize_t dma_show_devices(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t len = 0;
 	int i;
@@ -57,13 +58,15 @@ static int __init dma_sysclass_init(void)
 }
 postcore_initcall(dma_sysclass_init);
 
-static ssize_t dma_show_dev_id(struct sys_device *dev, char *buf)
+static ssize_t dma_show_dev_id(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
 	return sprintf(buf, "%s\n", channel->dev_id);
 }
 
 static ssize_t dma_store_dev_id(struct sys_device *dev,
+				struct sysdev_attribute *attr,
 				const char *buf, size_t count)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
@@ -74,6 +77,7 @@ static ssize_t dma_store_dev_id(struct sys_device *dev,
 static SYSDEV_ATTR(dev_id, S_IRUGO | S_IWUSR, dma_show_dev_id, dma_store_dev_id);
 
 static ssize_t dma_store_config(struct sys_device *dev,
+				struct sysdev_attribute *attr,
 				const char *buf, size_t count)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
@@ -87,13 +91,15 @@ static ssize_t dma_store_config(struct sys_device *dev,
 
 static SYSDEV_ATTR(config, S_IWUSR, NULL, dma_store_config);
 
-static ssize_t dma_show_mode(struct sys_device *dev, char *buf)
+static ssize_t dma_show_mode(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
 	return sprintf(buf, "0x%08x\n", channel->mode);
 }
 
 static ssize_t dma_store_mode(struct sys_device *dev,
+			      struct sysdev_attribute *attr,
 			      const char *buf, size_t count)
 {
 	struct dma_channel *channel = to_dma_channel(dev);
@@ -104,7 +110,8 @@ static ssize_t dma_store_mode(struct sys_device *dev,
 static SYSDEV_ATTR(mode, S_IRUGO | S_IWUSR, dma_show_mode, dma_store_mode);
 
 #define dma_ro_attr(field, fmt)						\
-static ssize_t dma_show_##field(struct sys_device *dev, char *buf)	\
+static ssize_t dma_show_##field(struct sys_device *dev, 		\
+				struct sysdev_attribute *attr, char *buf)\
 {									\
 	struct dma_channel *channel = to_dma_channel(dev);		\
 	return sprintf(buf, fmt, channel->field);			\
diff --git a/arch/sparc64/kernel/sysfs.c b/arch/sparc64/kernel/sysfs.c
index e885034a6b73..84e5ce146713 100644
--- a/arch/sparc64/kernel/sysfs.c
+++ b/arch/sparc64/kernel/sysfs.c
@@ -14,7 +14,8 @@
 static DEFINE_PER_CPU(struct hv_mmu_statistics, mmu_stats) __attribute__((aligned(64)));
 
 #define SHOW_MMUSTAT_ULONG(NAME) \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+			struct sysdev_attribute *attr, char *buf) \
 { \
 	struct hv_mmu_statistics *p = &per_cpu(mmu_stats, dev->id); \
 	return sprintf(buf, "%lu\n", p->NAME); \
@@ -135,13 +136,16 @@ static unsigned long write_mmustat_enable(unsigned long val)
 	return sun4v_mmustat_conf(ra, &orig_ra);
 }
 
-static ssize_t show_mmustat_enable(struct sys_device *s, char *buf)
+static ssize_t show_mmustat_enable(struct sys_device *s,
+				struct sysdev_attribute *attr, char *buf)
 {
 	unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0);
 	return sprintf(buf, "%lx\n", val);
 }
 
-static ssize_t store_mmustat_enable(struct sys_device *s, const char *buf, size_t count)
+static ssize_t store_mmustat_enable(struct sys_device *s,
+			struct sysdev_attribute *attr, const char *buf,
+			size_t count)
 {
 	unsigned long val, err;
 	int ret = sscanf(buf, "%ld", &val);
@@ -179,14 +183,16 @@ static void unregister_mmu_stats(struct sys_device *s)
 #endif
 
 #define SHOW_CPUDATA_ULONG_NAME(NAME, MEMBER) \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+		struct sysdev_attribute *attr, char *buf) \
 { \
 	cpuinfo_sparc *c = &cpu_data(dev->id); \
 	return sprintf(buf, "%lu\n", c->MEMBER); \
 }
 
 #define SHOW_CPUDATA_UINT_NAME(NAME, MEMBER) \
-static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+static ssize_t show_##NAME(struct sys_device *dev, \
+		struct sysdev_attribute *attr, char *buf) \
 { \
 	cpuinfo_sparc *c = &cpu_data(dev->id); \
 	return sprintf(buf, "%u\n", c->MEMBER); \
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394c..e6a4d5f67643 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -762,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
 
 /* Why are there no generic functions for this? */
 #define ACCESSOR(name, var, start) \
-	static ssize_t show_ ## name(struct sys_device *s, char *buf) {	\
+	static ssize_t show_ ## name(struct sys_device *s,		\
+				     struct sysdev_attribute *attr,	\
+				     char *buf) {			\
 		return sprintf(buf, "%lx\n", (unsigned long)var);	\
 	}								\
-	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
+	static ssize_t set_ ## name(struct sys_device *s,		\
+				    struct sysdev_attribute *attr,	\
+				    const char *buf, size_t siz) {	\
 		char *end;						\
 		unsigned long new = simple_strtoul(buf, &end, 0);	\
 		if (end == buf) return -EINVAL;				\
@@ -786,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
 ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(bank5ctl,bank[5],mce_restart())
 
-static ssize_t show_trigger(struct sys_device *s, char *buf)
+static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+				char *buf)
 {
 	strcpy(buf, trigger);
 	strcat(buf, "\n");
 	return strlen(trigger) + 1;
 }
 
-static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+				const char *buf,size_t siz)
 {
 	char *p;
 	int len;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
 
 #define define_therm_throt_sysdev_show_func(name)                            \
 static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,        \
+					struct sysdev_attribute *attr,	     \
                                               char *buf)                     \
 {                                                                            \
 	unsigned int cpu = dev->id;                                          \
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 56b933119a04..fc4790638b69 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -644,7 +644,9 @@ static void microcode_fini_cpu(int cpu)
 	mutex_unlock(&microcode_mutex);
 }
 
-static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
+static ssize_t reload_store(struct sys_device *dev,
+			    struct sysdev_attribute *attr,
+			    const char *buf, size_t sz)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 	char *end;
@@ -674,14 +676,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
 	return sz;
 }
 
-static ssize_t version_show(struct sys_device *dev, char *buf)
+static ssize_t version_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
 	return sprintf(buf, "0x%x\n", uci->rev);
 }
 
-static ssize_t pf_show(struct sys_device *dev, char *buf)
+static ssize_t pf_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index e38dfed41d80..20537d507909 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -21,15 +21,16 @@ EXPORT_SYMBOL(cpu_sysdev_class);
 static DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static ssize_t show_online(struct sys_device *dev, char *buf)
+static ssize_t show_online(struct sys_device *dev, struct sysdev_attribute *attr,
+			   char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 
 	return sprintf(buf, "%u\n", !!cpu_online(cpu->sysdev.id));
 }
 
-static ssize_t __ref store_online(struct sys_device *dev, const char *buf,
-			    size_t count)
+static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribute *attr,
+				 const char *buf, size_t count)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 	ssize_t ret;
@@ -80,7 +81,8 @@ static inline void register_cpu_control(struct cpu *cpu)
 #ifdef CONFIG_KEXEC
 #include <linux/kexec.h>
 
-static ssize_t show_crash_notes(struct sys_device *dev, char *buf)
+static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute *attr,
+				char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
 	ssize_t rc;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 937e8258981d..4d4e0e7b6e92 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -92,7 +92,8 @@ unregister_memory(struct memory_block *memory, struct mem_section *section)
  * uses.
  */
 
-static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
+static ssize_t show_mem_phys_index(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct memory_block *mem =
 		container_of(dev, struct memory_block, sysdev);
@@ -102,7 +103,8 @@ static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
 /*
  * online, offline, going offline, etc.
  */
-static ssize_t show_mem_state(struct sys_device *dev, char *buf)
+static ssize_t show_mem_state(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	struct memory_block *mem =
 		container_of(dev, struct memory_block, sysdev);
@@ -217,7 +219,8 @@ out:
 }
 
 static ssize_t
-store_mem_state(struct sys_device *dev, const char *buf, size_t count)
+store_mem_state(struct sys_device *dev,
+		struct sysdev_attribute *attr, const char *buf, size_t count)
 {
 	struct memory_block *mem;
 	unsigned int phys_section_nr;
@@ -248,7 +251,8 @@ out:
  * s.t. if I offline all of these sections I can then
  * remove the physical device?
  */
-static ssize_t show_phys_device(struct sys_device *dev, char *buf)
+static ssize_t show_phys_device(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	struct memory_block *mem =
 		container_of(dev, struct memory_block, sysdev);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0f867a083338..5116b78c6325 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -36,11 +36,13 @@ static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf)
 	return len;
 }
 
-static inline ssize_t node_read_cpumask(struct sys_device *dev, char *buf)
+static inline ssize_t node_read_cpumask(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	return node_read_cpumap(dev, 0, buf);
 }
-static inline ssize_t node_read_cpulist(struct sys_device *dev, char *buf)
+static inline ssize_t node_read_cpulist(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	return node_read_cpumap(dev, 1, buf);
 }
@@ -49,7 +51,8 @@ static SYSDEV_ATTR(cpumap,  S_IRUGO, node_read_cpumask, NULL);
 static SYSDEV_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
-static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
+static ssize_t node_read_meminfo(struct sys_device * dev,
+			struct sysdev_attribute *attr, char * buf)
 {
 	int n;
 	int nid = dev->id;
@@ -112,7 +115,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
 #undef K
 static SYSDEV_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL);
 
-static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
+static ssize_t node_read_numastat(struct sys_device * dev,
+				struct sysdev_attribute *attr, char * buf)
 {
 	return sprintf(buf,
 		       "numa_hit %lu\n"
@@ -130,7 +134,8 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
 }
 static SYSDEV_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
-static ssize_t node_read_distance(struct sys_device * dev, char * buf)
+static ssize_t node_read_distance(struct sys_device * dev,
+			struct sysdev_attribute *attr, char * buf)
 {
 	int nid = dev->id;
 	int len = 0;
diff --git a/drivers/base/sys.c b/drivers/base/sys.c
index 50690d9df248..dc7dace14e1c 100644
--- a/drivers/base/sys.c
+++ b/drivers/base/sys.c
@@ -36,7 +36,7 @@ sysdev_show(struct kobject * kobj, struct attribute * attr, char * buffer)
 	struct sysdev_attribute * sysdev_attr = to_sysdev_attr(attr);
 
 	if (sysdev_attr->show)
-		return sysdev_attr->show(sysdev, buffer);
+		return sysdev_attr->show(sysdev, sysdev_attr, buffer);
 	return -EIO;
 }
 
@@ -49,7 +49,7 @@ sysdev_store(struct kobject * kobj, struct attribute * attr,
 	struct sysdev_attribute * sysdev_attr = to_sysdev_attr(attr);
 
 	if (sysdev_attr->store)
-		return sysdev_attr->store(sysdev, buffer, count);
+		return sysdev_attr->store(sysdev, sysdev_attr, buffer, count);
 	return -EIO;
 }
 
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 3f6d9b0a6abe..199cd97e32e6 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -34,7 +34,8 @@
 static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
 
 #define define_id_show_func(name)				\
-static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+static ssize_t show_##name(struct sys_device *dev,		\
+		struct sysdev_attribute *attr, char *buf)	\
 {								\
 	unsigned int cpu = dev->id;				\
 	return sprintf(buf, "%d\n", topology_##name(cpu));	\
@@ -59,14 +60,17 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 
 #ifdef arch_provides_topology_pointers
 #define define_siblings_show_map(name)					\
-static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+static ssize_t show_##name(struct sys_device *dev,			\
+			   struct sysdev_attribute *attr, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(0, &(topology_##name(cpu)), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
-static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
+static ssize_t show_##name##_list(struct sys_device *dev,		\
+				  struct sysdev_attribute *attr,	\
+				  char *buf)				\
 {									\
 	unsigned int cpu = dev->id;					\
 	return show_cpumap(1, &(topology_##name(cpu)), buf);		\
@@ -74,7 +78,8 @@ static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
 
 #else
 #define define_siblings_show_map(name)					\
-static ssize_t show_##name(struct sys_device *dev, char *buf)	\
+static ssize_t show_##name(struct sys_device *dev,			\
+			   struct sysdev_attribute *attr, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
 	cpumask_t mask = topology_##name(cpu);				\
@@ -82,7 +87,9 @@ static ssize_t show_##name(struct sys_device *dev, char *buf)	\
 }
 
 #define define_siblings_show_list(name)					\
-static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
+static ssize_t show_##name##_list(struct sys_device *dev,		\
+				  struct sysdev_attribute *attr,	\
+				  char *buf)				\
 {									\
 	unsigned int cpu = dev->id;					\
 	cpumask_t mask = topology_##name(cpu);				\
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index e949618b9be0..31a0e0b455b6 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -21,7 +21,8 @@ static int __init cpuidle_sysfs_setup(char *unused)
 }
 __setup("cpuidle_sysfs_switch", cpuidle_sysfs_setup);
 
-static ssize_t show_available_governors(struct sys_device *dev, char *buf)
+static ssize_t show_available_governors(struct sys_device *dev,
+		struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t i = 0;
 	struct cpuidle_governor *tmp;
@@ -39,7 +40,8 @@ out:
 	return i;
 }
 
-static ssize_t show_current_driver(struct sys_device *dev, char *buf)
+static ssize_t show_current_driver(struct sys_device *dev,
+		struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t ret;
 
@@ -53,7 +55,8 @@ static ssize_t show_current_driver(struct sys_device *dev, char *buf)
 	return ret;
 }
 
-static ssize_t show_current_governor(struct sys_device *dev, char *buf)
+static ssize_t show_current_governor(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t ret;
 
@@ -68,6 +71,7 @@ static ssize_t show_current_governor(struct sys_device *dev, char *buf)
 }
 
 static ssize_t store_current_governor(struct sys_device *dev,
+	struct sysdev_attribute *attr,
 	const char *buf, size_t count)
 {
 	char gov_name[CPUIDLE_NAME_LEN];
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 591bc29b55f5..d4427cb86979 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -610,6 +610,7 @@ static ssize_t show_target_kb(struct sys_device *dev, char *buf)
 }
 
 static ssize_t store_target_kb(struct sys_device *dev,
+			       struct sysdev_attribute *attr,
 			       const char *buf,
 			       size_t count)
 {
diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h
index f2767bc6b735..8dcf3162b21b 100644
--- a/include/linux/sysdev.h
+++ b/include/linux/sysdev.h
@@ -99,8 +99,9 @@ extern void sysdev_unregister(struct sys_device *);
 
 struct sysdev_attribute { 
 	struct attribute	attr;
-	ssize_t (*show)(struct sys_device *, char *);
-	ssize_t (*store)(struct sys_device *, const char *, size_t);
+	ssize_t (*show)(struct sys_device *, struct sysdev_attribute *, char *);
+	ssize_t (*store)(struct sys_device *, struct sysdev_attribute *,
+			 const char *, size_t);
 };
 
 
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 092e4c620af9..a56f629b057a 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -297,8 +297,8 @@ static int test_func(void *data)
  *
  * opcode:data
  */
-static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
-				  size_t count)
+static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
+				  const char *buf, size_t count)
 {
 	struct sched_param schedpar;
 	struct test_thread_data *td;
@@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
  * @dev:	thread to query
  * @buf:	char buffer to be filled with thread status info
  */
-static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
+				 char *buf)
 {
 	struct test_thread_data *td;
 	struct task_struct *tsk;
diff --git a/kernel/sched.c b/kernel/sched.c
index 99e6d850ecab..b1104ea5d255 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7737,11 +7737,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *page)
 {
 	return sprintf(page, "%u\n", sched_mc_power_savings);
 }
 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
+					    struct sysdev_attribute *attr,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
@@ -7751,11 +7753,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *page)
 {
 	return sprintf(page, "%u\n", sched_smt_power_savings);
 }
 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
+					     struct sysdev_attribute *attr,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index dadde5361f32..b1c2da81b050 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs)
  * Provides sysfs interface for listing current clocksource.
  */
 static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_current_clocksources(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t count = 0;
 
@@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
  * clocksource selction.
  */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+					  struct sysdev_attribute *attr,
 					  const char *buf, size_t count)
 {
 	struct clocksource *ovr = NULL;
@@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
  * Provides sysfs interface for listing registered clocksources
  */
 static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_available_clocksources(struct sys_device *dev,
+				  struct sysdev_attribute *attr,
+				  char *buf)
 {
 	struct clocksource *src;
 	ssize_t count = 0;
-- 
cgit v1.2.3-55-g7522


From 49b5cf34727a6c1be1568ab28e89a2d9a6bf51e0 Mon Sep 17 00:00:00 2001
From: Jonathan Lim
Date: Fri, 25 Jul 2008 01:48:40 -0700
Subject: accounting: account for user time when updating memory integrals

Adapt acct_update_integrals() to include user time when calculating the time
difference.  The units of acct_rss_mem1 and acct_vm_mem1 are also changed from
pages-jiffies to pages-usecs to avoid calling jiffies_to_usecs() in
xacct_add_tsk() which might overflow.

Signed-off-by: Jonathan Lim <jlim@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  2 +-
 kernel/sched.c        |  2 ++
 kernel/tsacct.c       | 21 ++++++++++++++-------
 3 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 354ef478a80d..af780f299c7c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1257,7 +1257,7 @@ struct task_struct {
 #if defined(CONFIG_TASK_XACCT)
 	u64 acct_rss_mem1;	/* accumulated rss usage */
 	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
-	cputime_t acct_stimexpd;/* stime since last update */
+	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
 	nodemask_t mems_allowed;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6acf749d3336..0047bd9b96aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4046,6 +4046,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+	/* Account for user time used */
+	acct_update_integrals(p);
 }
 
 /*
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..1da6990af8e0 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
 	struct mm_struct *mm;
 
-	/* convert pages-jiffies to Mbyte-usec */
-	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
-	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+	/* convert pages-usec to Mbyte-usec */
+	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
+	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
 	mm = get_task_mm(p);
 	if (mm) {
 		/* adjust to KB unit */
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta = cputime_to_jiffies(
-			cputime_sub(tsk->stime, tsk->acct_stimexpd));
+		cputime_t time, dtime;
+		struct timeval value;
+		u64 delta;
+
+		time = tsk->stime + tsk->utime;
+		dtime = cputime_sub(time, tsk->acct_timexpd);
+		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+		delta = value.tv_sec;
+		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
 			return;
-		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	tsk->acct_stimexpd = 0;
+	tsk->acct_timexpd = 0;
 	tsk->acct_rss_mem1 = 0;
 	tsk->acct_vm_mem1 = 0;
 }
-- 
cgit v1.2.3-55-g7522


From 7babe8db99d305340cf4828ce1f5a1481d5622ef Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu
Date: Fri, 25 Jul 2008 19:45:11 -0700
Subject: Full conversion to early_initcall() interface, remove old interface

A previous patch added the early_initcall(), to allow a cleaner hooking of
pre-SMP initcalls.  Now we remove the older interface, converting all
existing users to the new one.

[akpm@linux-foundation.org: cleanups]
[akpm@linux-foundation.org: build fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  9 ---------
 include/linux/smp.h   |  5 -----
 init/main.c           | 23 +----------------------
 kernel/sched.c        |  5 ++++-
 kernel/smp.c          |  4 +++-
 kernel/softirq.c      |  3 ++-
 kernel/softlockup.c   | 25 ++++++++++++++++++++++---
 7 files changed, 32 insertions(+), 42 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3260a5c42b91..adb8077dc463 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -292,7 +292,6 @@ extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
-extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
@@ -2222,14 +2221,6 @@ static inline void inc_syscw(struct task_struct *tsk)
 }
 #endif
 
-#ifdef CONFIG_SMP
-void migration_init(void);
-#else
-static inline void migration_init(void)
-{
-}
-#endif
-
 #ifndef TASK_SIZE_OF
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 48262f86c969..66484d4a8459 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -74,15 +74,10 @@ void __smp_call_function_single(int cpuid, struct call_single_data *data);
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 void generic_smp_call_function_single_interrupt(void);
 void generic_smp_call_function_interrupt(void);
-void init_call_single_data(void);
 void ipi_call_lock(void);
 void ipi_call_unlock(void);
 void ipi_call_lock_irq(void);
 void ipi_call_unlock_irq(void);
-#else
-static inline void init_call_single_data(void)
-{
-}
 #endif
 
 /*
diff --git a/init/main.c b/init/main.c
index b6fec08dbbef..20fdc9884b77 100644
--- a/init/main.c
+++ b/init/main.c
@@ -774,16 +774,7 @@ static void __init do_basic_setup(void)
 	do_initcalls();
 }
 
-static int __initdata nosoftlockup;
-
-static int __init nosoftlockup_setup(char *str)
-{
-	nosoftlockup = 1;
-	return 1;
-}
-__setup("nosoftlockup", nosoftlockup_setup);
-
-static void __init __do_pre_smp_initcalls(void)
+static void __init do_pre_smp_initcalls(void)
 {
 	initcall_t *call;
 
@@ -791,17 +782,6 @@ static void __init __do_pre_smp_initcalls(void)
 		do_one_initcall(*call);
 }
 
-static void __init do_pre_smp_initcalls(void)
-{
-	extern int spawn_ksoftirqd(void);
-
-	init_call_single_data();
-	migration_init();
-	spawn_ksoftirqd();
-	if (!nosoftlockup)
-		spawn_softlockup_task();
-}
-
 static void run_init_process(char *init_filename)
 {
 	argv_init[0] = init_filename;
@@ -873,7 +853,6 @@ static int __init kernel_init(void * unused)
 
 	smp_prepare_cpus(setup_max_cpus);
 
-	__do_pre_smp_initcalls();
 	do_pre_smp_initcalls();
 
 	smp_init();
diff --git a/kernel/sched.c b/kernel/sched.c
index 0047bd9b96aa..fde1a1026359 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6389,7 +6389,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = 10
 };
 
-void __init migration_init(void)
+static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
@@ -6399,7 +6399,10 @@ void __init migration_init(void)
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
+
+	return err;
 }
+early_initcall(migration_init);
 #endif
 
 #ifdef CONFIG_SMP
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1ee..96fc7c0edc59 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
 	spinlock_t lock;
 };
 
-void __cpuinit init_call_single_data(void)
+static int __cpuinit init_call_single_data(void)
 {
 	int i;
 
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
 		spin_lock_init(&q->lock);
 		INIT_LIST_HEAD(&q->list);
 	}
+	return 0;
 }
+early_initcall(init_call_single_data);
 
 static void csd_flag_wait(struct call_single_data *data)
 {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f6b03d56c2bf..c506f266a6b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init int spawn_ksoftirqd(void)
+static __init int spawn_ksoftirqd(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+early_initcall(spawn_ksoftirqd);
 
 #ifdef CONFIG_SMP
 /*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7bd8d1aadd5d..b75b492fbfcf 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init void spawn_softlockup_task(void)
+static int __initdata nosoftlockup;
+
+static int __init nosoftlockup_setup(char *str)
+{
+	nosoftlockup = 1;
+	return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+
+static int __init spawn_softlockup_task(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
-	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	int err;
 
-	BUG_ON(err == NOTIFY_BAD);
+	if (nosoftlockup)
+		return 0;
+
+	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	if (err == NOTIFY_BAD) {
+		BUG();
+		return 1;
+	}
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+	return 0;
 }
+early_initcall(spawn_softlockup_task);
-- 
cgit v1.2.3-55-g7522


From 85ba2d862e521375a8ee01526c5c46b1f24bb4af Mon Sep 17 00:00:00 2001
From: Roland McGrath
Date: Fri, 25 Jul 2008 19:45:58 -0700
Subject: tracehook: wait_task_inactive

This extends wait_task_inactive() with a new argument so it can be used in
a "soft" mode where it will check for the task changing state unexpectedly
and back off.  There is no change to existing callers.  This lays the
groundwork to allow robust, noninvasive tracing that can try to sample a
blocked thread but back off safely if it wakes up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c |  4 ++--
 include/linux/sched.h      |  8 ++++++--
 kernel/kthread.c           |  2 +-
 kernel/ptrace.c            |  2 +-
 kernel/sched.c             | 29 +++++++++++++++++++++++++++--
 5 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 19d4493c6193..fc8f3509df27 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2626,7 +2626,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
 	/*
 	 * make sure the task is off any CPU
 	 */
-	wait_task_inactive(task);
+	wait_task_inactive(task, 0);
 
 	/* more to come... */
 
@@ -4774,7 +4774,7 @@ recheck:
 
 		UNPROTECT_CTX(ctx, flags);
 
-		wait_task_inactive(task);
+		wait_task_inactive(task, 0);
 
 		PROTECT_CTX(ctx, flags);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a95d84d0da95..f59318a0099b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1882,9 +1882,13 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
-extern void wait_task_inactive(struct task_struct * p);
+extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
-#define wait_task_inactive(p)	do { } while (0)
+static inline unsigned long wait_task_inactive(struct task_struct *p,
+					       long match_state)
+{
+	return 1;
+}
 #endif
 
 #define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6111c27491b1..96cff2f8710b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 		return;
 	}
 	/* Must have done schedule() in kthread() before we set_task_cpu */
-	wait_task_inactive(k);
+	wait_task_inactive(k, 0);
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
 	k->rt.nr_cpus_allowed = 1;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da6450..082b3fcb32a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	read_unlock(&tasklist_lock);
 
 	if (!ret && !kill)
-		wait_task_inactive(child);
+		ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
 
 	/* All systems go.. */
 	return ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index fde1a1026359..0236958addcb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-void wait_task_inactive(struct task_struct *p)
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
 	int running, on_rq;
+	unsigned long ncsw;
 	struct rq *rq;
 
 	for (;;) {
@@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
 		 * return false if the runqueue has changed and p
 		 * is actually now running somewhere else!
 		 */
-		while (task_running(rq, p))
+		while (task_running(rq, p)) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
 			cpu_relax();
+		}
 
 		/*
 		 * Ok, time to look more closely! We need the rq
@@ -1910,8 +1921,20 @@ void wait_task_inactive(struct task_struct *p)
 		rq = task_rq_lock(p, &flags);
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
+		ncsw = 0;
+		if (!match_state || p->state == match_state) {
+			ncsw = p->nivcsw + p->nvcsw;
+			if (unlikely(!ncsw))
+				ncsw = 1;
+		}
 		task_rq_unlock(rq, &flags);
 
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
 		/*
 		 * Was it really running after all now that we
 		 * checked with the proper locks actually held?
@@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
 		 */
 		break;
 	}
+
+	return ncsw;
 }
 
 /***
-- 
cgit v1.2.3-55-g7522


From e26873bb10f722f10b1af9de05119a3d7cbc07b2 Mon Sep 17 00:00:00 2001
From: roel kluin
Date: Tue, 22 Jul 2008 16:51:15 -0400
Subject: sched: test runtime rather than period in global_rt_runtime()

Test runtime rather than period

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addcb..0d1717b00225 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -834,7 +834,7 @@ static inline u64 global_rt_period(void)
 
 static inline u64 global_rt_runtime(void)
 {
-	if (sysctl_sched_rt_period < 0)
+	if (sysctl_sched_rt_runtime < 0)
 		return RUNTIME_INF;
 
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
-- 
cgit v1.2.3-55-g7522


From f718cd4add5aea9d379faff92f162571e356cc5f Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 29 Jul 2008 22:33:52 -0700
Subject: sched: make scheduler sysfs attributes sysdev class devices

They are really class devices, but were incorrectly declared.  This
leads to crashes with the recent changes that makes non normal sysdevs
use a different prototype.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Pierre Ossman <drzeus-list@drzeus.cx>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addcb..21f7da94662e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7671,34 +7671,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *page)
+static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+					   char *page)
 {
 	return sprintf(page, "%u\n", sched_mc_power_savings);
 }
-static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
-					    struct sysdev_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
 }
-static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
-		   sched_mc_power_savings_store);
+static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
+			 sched_mc_power_savings_show,
+			 sched_mc_power_savings_store);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *page)
+static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+					    char *page)
 {
 	return sprintf(page, "%u\n", sched_smt_power_savings);
 }
-static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
-					     struct sysdev_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
 }
-static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+		   sched_smt_power_savings_show,
 		   sched_smt_power_savings_store);
 #endif
 
-- 
cgit v1.2.3-55-g7522


From 5e710e37bde120bb069f691bee68e69ef4393173 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 30 Jul 2008 13:26:57 +0200
Subject: lockdep: change scheduler annotation

While thinking about David's graph walk lockdep patch it _finally_
dawned on me that there is no reason we have a lock class per cpu ...

Sorry for being dense :-/

The below changes the annotation from a lock class per cpu, to a single
nested lock, as the scheduler never holds more that 2 rq locks at a time
anyway.

If there was code requiring holding all rq locks this would not work and
the original annotation would be the only option, but that not being the
case, this is a much lighter one.

Compiles and boots on a 2-way x86_64.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addcb..655f1db26b12 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -600,7 +600,6 @@ struct rq {
 	/* BKL stats */
 	unsigned int bkl_count;
 #endif
-	struct lock_class_key rq_lock_key;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
 	} else {
 		if (rq1 < rq2) {
 			spin_lock(&rq1->lock);
-			spin_lock(&rq2->lock);
+			spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
 		} else {
 			spin_lock(&rq2->lock);
-			spin_lock(&rq1->lock);
+			spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
 		}
 	}
 	update_rq_clock(rq1);
@@ -2805,10 +2804,10 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
 			spin_lock(&busiest->lock);
-			spin_lock(&this_rq->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
 			ret = 1;
 		} else
-			spin_lock(&busiest->lock);
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
 	}
 	return ret;
 }
@@ -7998,7 +7997,6 @@ void __init sched_init(void)
 
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
-		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
-- 
cgit v1.2.3-55-g7522


From 725aad24c3ba96a7c06448c14c265a466cdbd663 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Sun, 3 Aug 2008 09:33:03 -0700
Subject: __sched_setscheduler: don't do any policy checks when not "user"

The "user" parameter to __sched_setscheduler indicates whether the
change is being done on behalf of a user process or not.  If not, we
shouldn't apply any permissions checks, so don't call
security_task_setscheduler().

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 21f7da94662e..04160d277e7a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5004,19 +5004,21 @@ recheck:
 			return -EPERM;
 	}
 
+	if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
-	/*
-	 * Do not allow realtime tasks into groups that have no runtime
-	 * assigned.
-	 */
-	if (user
-	    && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
-		return -EPERM;
+		/*
+		 * Do not allow realtime tasks into groups that have no runtime
+		 * assigned.
+		 */
+		if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+			return -EPERM;
 #endif
 
-	retval = security_task_setscheduler(p, policy, param);
-	if (retval)
-		return retval;
+		retval = security_task_setscheduler(p, policy, param);
+		if (retval)
+			return retval;
+	}
+
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
-- 
cgit v1.2.3-55-g7522


From 1b12bbc747560ea68bcc132c3d05699e52271da0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 11 Aug 2008 09:30:22 +0200
Subject: lockdep: re-annotate scheduler runqueues

Instead of using a per-rq lock class, use the regular nesting operations.

However, take extra care with double_lock_balance() as it can release the
already held rq->lock (and therefore change its nesting class).

So what can happen is:

 spin_lock(rq->lock);	// this rq subclass 0

 double_lock_balance(rq, other_rq);
   // release rq
   // acquire other_rq->lock subclass 0
   // acquire rq->lock subclass 1

 spin_unlock(other_rq->lock);

leaving you with rq->lock in subclass 1

So a subsequent double_lock_balance() call can try to nest a subclass 1
lock while already holding a subclass 1 lock.

Fix this by introducing double_unlock_balance() which releases the other
rq's lock, but also re-sets the subclass for this rq's lock to 0.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 11 +++++++++--
 kernel/sched_rt.c |  8 +++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 655f1db26b12..9b2b6a85577d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2812,6 +2812,13 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -3636,7 +3643,7 @@ redo:
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,
 					imbalance, sd, CPU_NEWLY_IDLE,
 					&all_pinned);
-		spin_unlock(&busiest->lock);
+		double_unlock_balance(this_rq, busiest);
 
 		if (unlikely(all_pinned)) {
 			cpu_clear(cpu_of(busiest), *cpus);
@@ -3751,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 		else
 			schedstat_inc(sd, alb_failed);
 	}
-	spin_unlock(&target_rq->lock);
+	double_unlock_balance(busiest_rq, target_rq);
 }
 
 #ifdef CONFIG_NO_HZ
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 908c04f9dad0..6163e4cf885b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -861,6 +861,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 #define RT_MAX_TRIES 3
 
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
+
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -1022,7 +1024,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			break;
 
 		/* try again */
-		spin_unlock(&lowest_rq->lock);
+		double_unlock_balance(rq, lowest_rq);
 		lowest_rq = NULL;
 	}
 
@@ -1091,7 +1093,7 @@ static int push_rt_task(struct rq *rq)
 
 	resched_task(lowest_rq->curr);
 
-	spin_unlock(&lowest_rq->lock);
+	double_unlock_balance(rq, lowest_rq);
 
 	ret = 1;
 out:
@@ -1197,7 +1199,7 @@ static int pull_rt_task(struct rq *this_rq)
 
 		}
  skip:
-		spin_unlock(&src_rq->lock);
+		double_unlock_balance(this_rq, src_rq);
 	}
 
 	return ret;
-- 
cgit v1.2.3-55-g7522


From 09f2724a786f76475ef2985cf84f5359c553aade Mon Sep 17 00:00:00 2001
From: Zhang, Yanmin
Date: Wed, 14 Aug 2030 15:56:40 +0800
Subject: sched: fix the race between walk_tg_tree and sched_create_group

With 2.6.27-rc3, I hit a kernel panic when running volanoMark on my
new x86_64 machine. I also hit it with other 2.6.27-rc kernels.
See below log.

Basically, function walk_tg_tree and sched_create_group have a race
between accessing and initiating tg->children. Below patch fixes it
by moving tg->children initiation to the front of linking tg->siblings
to parent->children.

{----------------panic log------------}

BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
IP: [<ffffffff802292ab>] walk_tg_tree+0x45/0x7f
PGD 1be1c4067 PUD 1bdd8d067 PMD 0
Oops: 0000 [1] SMP
CPU 11
Modules linked in: igb
Pid: 22979, comm: java Not tainted 2.6.27-rc3 #1
RIP: 0010:[<ffffffff802292ab>]  [<ffffffff802292ab>] walk_tg_tree+0x45/0x7f
RSP: 0018:ffff8801bfbbbd18  EFLAGS: 00010083
RAX: 0000000000000000 RBX: ffff8800be0dce40 RCX: ffffffffffffffc0
RDX: ffff880102c43740 RSI: 0000000000000000 RDI: ffff8800be0dce40
RBP: ffff8801bfbbbd48 R08: ffff8800ba437bc8 R09: 0000000000001f40
R10: ffff8801be812100 R11: ffffffff805fdf44 R12: ffff880102c43740
R13: 0000000000000000 R14: ffffffff8022cf0f R15: ffffffff8022749f
FS:  00000000568ac950(0063) GS:ffff8801bfa26d00(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000000 CR3: 00000001bd848000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process java (pid: 22979, threadinfo ffff8801b145a000, task ffff8801bf18e450)
Stack:  0000000000000001 ffff8800ba5c8d60 0000000000000001 0000000000000001
 ffff8800bad1ccb8 0000000000000000 ffff8801bfbbbd98 ffffffff8022ed37
 0000000000000001 0000000000000286 ffff8801bd5ee180 ffff8800ba437bc8
Call Trace:
 <IRQ>  [<ffffffff8022ed37>] try_to_wake_up+0x71/0x24c
 [<ffffffff80247177>] autoremove_wake_function+0x9/0x2e
 [<ffffffff80228039>] ? __wake_up_common+0x46/0x76
 [<ffffffff802296d5>] __wake_up+0x38/0x4f
 [<ffffffff806169cc>] tcp_v4_rcv+0x380/0x62e

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..8bf8a5528bc7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8462,8 +8462,8 @@ struct task_group *sched_create_group(struct task_group *parent)
 	WARN_ON(!parent); /* root should already exist */
 
 	tg->parent = parent;
-	list_add_rcu(&tg->siblings, &parent->children);
 	INIT_LIST_HEAD(&tg->children);
+	list_add_rcu(&tg->siblings, &parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	return tg;
-- 
cgit v1.2.3-55-g7522


From be4de35263f59ca1f4740edfffbfb02cc3f2189e Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Fri, 15 Aug 2008 00:40:44 -0700
Subject: completions: uninline try_wait_for_completion and completion_done

m68k fails to build with these functions inlined in completion.h.  Move
them out of line into sched.c and export them to avoid this problem.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/completion.h | 46 ++--------------------------------------------
 kernel/sched.c             | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 44 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/completion.h b/include/linux/completion.h
index 57faa60de9bd..02ef8835999c 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -49,6 +49,8 @@ extern unsigned long wait_for_completion_timeout(struct completion *x,
 						   unsigned long timeout);
 extern unsigned long wait_for_completion_interruptible_timeout(
 			struct completion *x, unsigned long timeout);
+extern bool try_wait_for_completion(struct completion *x);
+extern bool completion_done(struct completion *x);
 
 extern void complete(struct completion *);
 extern void complete_all(struct completion *);
@@ -56,48 +58,4 @@ extern void complete_all(struct completion *);
 #define INIT_COMPLETION(x)	((x).done = 0)
 
 
-/**
- *	try_wait_for_completion - try to decrement a completion without blocking
- *	@x:	completion structure
- *
- *	Returns: 0 if a decrement cannot be done without blocking
- *		 1 if a decrement succeeded.
- *
- *	If a completion is being used as a counting completion,
- *	attempt to decrement the counter without blocking. This
- *	enables us to avoid waiting if the resource the completion
- *	is protecting is not available.
- */
-static inline bool try_wait_for_completion(struct completion *x)
-{
-	int ret = 1;
-
-	spin_lock_irq(&x->wait.lock);
-	if (!x->done)
-		ret = 0;
-	else
-		x->done--;
-	spin_unlock_irq(&x->wait.lock);
-	return ret;
-}
-
-/**
- *	completion_done - Test to see if a completion has any waiters
- *	@x:	completion structure
- *
- *	Returns: 0 if there are waiters (wait_for_completion() in progress)
- *		 1 if there are no waiters.
- *
- */
-static inline bool completion_done(struct completion *x)
-{
-	int ret = 1;
-
-	spin_lock_irq(&x->wait.lock);
-	if (!x->done)
-		ret = 0;
-	spin_unlock_irq(&x->wait.lock);
-	return ret;
-}
-
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..95e6ad3c231d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4669,6 +4669,52 @@ int __sched wait_for_completion_killable(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_killable);
 
+/**
+ *	try_wait_for_completion - try to decrement a completion without blocking
+ *	@x:	completion structure
+ *
+ *	Returns: 0 if a decrement cannot be done without blocking
+ *		 1 if a decrement succeeded.
+ *
+ *	If a completion is being used as a counting completion,
+ *	attempt to decrement the counter without blocking. This
+ *	enables us to avoid waiting if the resource the completion
+ *	is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+	int ret = 1;
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done)
+		ret = 0;
+	else
+		x->done--;
+	spin_unlock_irq(&x->wait.lock);
+	return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ *	completion_done - Test to see if a completion has any waiters
+ *	@x:	completion structure
+ *
+ *	Returns: 0 if there are waiters (wait_for_completion() in progress)
+ *		 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+	int ret = 1;
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done)
+		ret = 0;
+	spin_unlock_irq(&x->wait.lock);
+	return ret;
+}
+EXPORT_SYMBOL(completion_done);
+
 static long __sched
 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
 {
-- 
cgit v1.2.3-55-g7522


From 55cd53404c5cc5fd94708232e3b4aa4a9388917b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 4 Aug 2008 08:54:26 +0200
Subject: sched: scale sysctl_sched_shares_ratelimit with nr_cpus

David reported that his Niagra spend a little too much time in
tg_shares_up(), which considering he has a large cpu count makes sense.

So scale the ratelimit value with the number of cpus like we do for
other controls as well.

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8bf8a5528bc7..040807196b33 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -808,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
  * ratelimit for updating the group shares.
- * default: 0.5ms
+ * default: 0.25ms
  */
-const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+unsigned int sysctl_sched_shares_ratelimit = 250000;
 
 /*
  * period over which we measure -rt task cpu usage in us.
@@ -5740,6 +5740,8 @@ static inline void sched_init_granularity(void)
 		sysctl_sched_latency = limit;
 
 	sysctl_sched_wakeup_granularity *= factor;
+
+	sysctl_sched_shares_ratelimit *= factor;
 }
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3-55-g7522


From 0b148fa04852859972abbf848177b92daeef138a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 19 Aug 2008 12:33:04 +0200
Subject: sched: rt-bandwidth group disable fixes

More extensive disable of bandwidth control. It allows sysctl_sched_rt_runtime
to disable full group bandwidth control.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 9 ++++++++-
 kernel/sched_rt.c | 5 ++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9a1ddb84e26d..c1bee5fb8154 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 
+static inline int rt_bandwidth_enabled(void);
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 77340b04a538..94daace5ee15 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 	int i, idle = 1;
 	cpumask_t span;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return 1;
 
 	span = sched_rt_period_mask();
@@ -484,6 +484,9 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
+	if (!rt_bandwidth_enabled())
+		return;
+
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 
-- 
cgit v1.2.3-55-g7522


From eb755805f21bd5ded84026e167b7a90887ac42e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 19 Aug 2008 12:33:05 +0200
Subject: sched: extract walk_tg_tree()

Extract walk_tg_tree() and make it a little more generic so we can use it
in the schedulablity test.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 33 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index c1bee5fb8154..8c019a19d052 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1387,38 +1387,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (rq->nr_running)
-		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-	return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED))
+typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
+	int ret;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, cpu, sd);
+	ret = (*down)(parent, data);
+	if (ret)
+		goto out_unlock;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1426,14 +1412,42 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, cpu, sd);
+	ret = (*up)(parent, data);
+	if (ret)
+		goto out_unlock;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
+out_unlock:
 	rcu_read_unlock();
+
+	return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+	return 0;
 }
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -1493,11 +1507,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long rq_weight = 0;
 	unsigned long shares = 0;
+	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1522,6 +1536,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+	return 0;
 }
 
 /*
@@ -1529,10 +1545,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
 	unsigned long load;
+	long cpu = (long)data;
 
 	if (!tg->parent) {
 		load = cpu_rq(cpu)->load.weight;
@@ -1543,11 +1559,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 	}
 
 	tg->cfs_rq[cpu]->h_load = load;
-}
 
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+	return 0;
 }
 
 static void update_shares(struct sched_domain *sd)
@@ -1557,7 +1570,7 @@ static void update_shares(struct sched_domain *sd)
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+		walk_tg_tree(tg_nop, tg_shares_up, sd);
 	}
 }
 
@@ -1568,9 +1581,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 	spin_lock(&rq->lock);
 }
 
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
 #else
-- 
cgit v1.2.3-55-g7522


From 9a7e0b180da21885988d47558671cf580279f9d6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 19 Aug 2008 12:33:06 +0200
Subject: sched: rt-bandwidth fixes

The last patch allows sysctl_sched_rt_runtime to disable bandwidth accounting
for the group scheduler - however it doesn't deal with sched_setscheduler(),
which will keep tasks out of groups that have no assigned runtime.

If we relax this, we get into the situation where RT tasks can get into a group
when we disable bandwidth control, and then starve them by enabling it again.

Rework the schedulability code to check for this condition and fail to turn
on bandwidth control with -EBUSY when this situation is found.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 125 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 63 insertions(+), 62 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8c019a19d052..e41bdae2778d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -300,9 +300,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED))
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED)
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
@@ -5082,7 +5082,8 @@ recheck:
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
 		 */
-		if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+		if (rt_bandwidth_enabled() && rt_policy(policy) &&
+				task_group(p)->rt_bandwidth.rt_runtime == 0)
 			return -EPERM;
 #endif
 
@@ -8707,73 +8708,77 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
-		return 1ULL << 16;
+		return 1ULL << 20;
 
-	return div64_u64(runtime << 16, period);
+	return div64_u64(runtime << 20, period);
 }
 
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_group *tgi, *parent = tg->parent;
-	unsigned long total = 0;
+	struct task_struct *g, *p;
 
-	if (!parent) {
-		if (global_rt_period() < period)
-			return 0;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
 
-		return to_ratio(period, runtime) <
-			to_ratio(global_rt_period(), global_rt_runtime());
-	}
+	return 0;
+}
 
-	if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-		return 0;
+struct rt_schedulable_data {
+	struct task_group *tg;
+	u64 rt_period;
+	u64 rt_runtime;
+};
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-		if (tgi == tg)
-			continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+	struct rt_schedulable_data *d = data;
+	struct task_group *child;
+	unsigned long total, sum = 0;
+	u64 period, runtime;
+
+	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	runtime = tg->rt_bandwidth.rt_runtime;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	if (tg == d->tg) {
+		period = d->rt_period;
+		runtime = d->rt_runtime;
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <=
-		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-				parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-	struct task_group *tgi;
-	unsigned long total = 0;
-	unsigned long global_ratio =
-		to_ratio(global_rt_period(), global_rt_runtime());
+	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+		return -EBUSY;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list) {
-		if (tgi == tg)
-			continue;
+	total = to_ratio(period, runtime);
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		period = ktime_to_ns(child->rt_bandwidth.rt_period);
+		runtime = child->rt_bandwidth.rt_runtime;
+
+		if (child == d->tg) {
+			period = d->rt_period;
+			runtime = d->rt_runtime;
+		}
+
+		sum += to_ratio(period, runtime);
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) < global_ratio;
+	if (sum > total)
+		return -EINVAL;
+
+	return 0;
 }
-#endif
 
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_struct *g, *p;
-	do_each_thread(g, p) {
-		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-			return 1;
-	} while_each_thread(g, p);
-	return 0;
+	struct rt_schedulable_data data = {
+		.tg = tg,
+		.rt_period = period,
+		.rt_runtime = runtime,
+	};
+
+	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
 static int tg_set_bandwidth(struct task_group *tg,
@@ -8783,14 +8788,9 @@ static int tg_set_bandwidth(struct task_group *tg,
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-		err = -EBUSY;
+	err = __rt_schedulable(tg, rt_period, rt_runtime);
+	if (err)
 		goto unlock;
-	}
-	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-		err = -EINVAL;
-		goto unlock;
-	}
 
 	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8867,8 +8867,9 @@ static int sched_rt_global_constraints(void)
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(tg, rt_period, rt_runtime))
-		ret = -EINVAL;
+	read_lock(&tasklist_lock);
+	ret = __rt_schedulable(tg, rt_period, rt_runtime);
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;
-- 
cgit v1.2.3-55-g7522


From 94d3d8247de22c5b0624aa00616ceca459498e55 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 20 Aug 2008 16:54:41 -0700
Subject: sched: do_wait_for_common: use signal_pending_state()

Change do_wait_for_common() to use signal_pending_state() instead of open
coding.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..da7c5d23cc03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4599,10 +4599,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 		wait.flags |= WQ_FLAG_EXCLUSIVE;
 		__add_wait_queue_tail(&x->wait, &wait);
 		do {
-			if ((state == TASK_INTERRUPTIBLE &&
-			     signal_pending(current)) ||
-			    (state == TASK_KILLABLE &&
-			     fatal_signal_pending(current))) {
+			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
-- 
cgit v1.2.3-55-g7522


From f31e11d87a5d7601636710195891ba462ad99f11 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 20 Aug 2008 16:54:44 -0700
Subject: wait_task_inactive(): don't consider task->nivcsw

If wait_task_inactive() returns success the task was deactivated.  In that
case schedule() always increments ->nvcsw which alone can be used as a
"generation counter".

If the next call returns the same number, we can be sure that the task was
unscheduled.  Otherwise, because we know that .on_rq == 0 again, ->nvcsw
should have been changed in between.

Q: perhaps it is better to do "ncsw = (p->nvcsw << 1) | 1" ?  This
decreases the possibility of "was it unscheduled" false positive when
->nvcsw == 0.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index da7c5d23cc03..908670aa215a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1921,11 +1921,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
 		ncsw = 0;
-		if (!match_state || p->state == match_state) {
-			ncsw = p->nivcsw + p->nvcsw;
-			if (unlikely(!ncsw))
-				ncsw = 1;
-		}
+		if (!match_state || p->state == match_state)
+			ncsw = p->nvcsw ?: 1;
 		task_rq_unlock(rq, &flags);
 
 		/*
-- 
cgit v1.2.3-55-g7522


From 93dcf55f828b035fc93fc19eb03c1390e1e6d570 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 20 Aug 2008 16:54:44 -0700
Subject: wait_task_inactive: "improve" the returned value for ->nvcsw == 0

wait_task_inactive() returns 1 when p->nvcsw == 0 || p->nvcsw == 1.  This
means that two subsequent calls can return the same number while the task
was scheduled in between.

Change the code to return "nvcsw | LONG_MIN" instead of "nvcsw ?: 1", now
the overlap always needs LONG_MAX schedules.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 908670aa215a..6a43c8942b05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1922,7 +1922,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		on_rq = p->se.on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
-			ncsw = p->nvcsw ?: 1;
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, &flags);
 
 		/*
-- 
cgit v1.2.3-55-g7522


From 65eb3dc609dec17deea48dcd4de2e549d29a9824 Mon Sep 17 00:00:00 2001
From: Kevin Diggs
Date: Tue, 26 Aug 2008 10:26:54 +0200
Subject: sched: add kernel doc for the completion, fix
 kernel-doc-nano-HOWTO.txt

This patch adds kernel doc for the completion feature.

An error in the split-man.pl PERL snippet in kernel-doc-nano-HOWTO.txt is
also fixed.

Signed-off-by: Kevin Diggs <kevdig@hypersurf.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-doc-nano-HOWTO.txt |  4 +--
 include/linux/completion.h              | 41 ++++++++++++++++++++++++
 kernel/sched.c                          | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt
index 0bd32748a467..c6841eee9598 100644
--- a/Documentation/kernel-doc-nano-HOWTO.txt
+++ b/Documentation/kernel-doc-nano-HOWTO.txt
@@ -168,10 +168,10 @@ if ($#ARGV < 0) {
 mkdir $ARGV[0],0777;
 $state = 0;
 while (<STDIN>) {
-    if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) {
+    if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
 	if ($state == 1) { close OUT }
 	$state = 1;
-	$fn = "$ARGV[0]/$1.4";
+	$fn = "$ARGV[0]/$1.9";
 	print STDERR "Creating $fn\n";
 	open OUT, ">$fn" or die "can't open $fn: $!\n";
 	print OUT $_;
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 02ef8835999c..4a6b604ef7e4 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -10,6 +10,18 @@
 
 #include <linux/wait.h>
 
+/**
+ * struct completion - structure used to maintain state for a "completion"
+ *
+ * This is the opaque structure used to maintain the state for a "completion".
+ * Completions currently use a FIFO to queue threads that have to wait for
+ * the "completion" event.
+ *
+ * See also:  complete(), wait_for_completion() (and friends _timeout,
+ * _interruptible, _interruptible_timeout, and _killable), init_completion(),
+ * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
+ * INIT_COMPLETION().
+ */
 struct completion {
 	unsigned int done;
 	wait_queue_head_t wait;
@@ -21,6 +33,14 @@ struct completion {
 #define COMPLETION_INITIALIZER_ONSTACK(work) \
 	({ init_completion(&work); work; })
 
+/**
+ * DECLARE_COMPLETION: - declare and initialize a completion structure
+ * @work:  identifier for the completion structure
+ *
+ * This macro declares and initializes a completion structure. Generally used
+ * for static declarations. You should use the _ONSTACK variant for automatic
+ * variables.
+ */
 #define DECLARE_COMPLETION(work) \
 	struct completion work = COMPLETION_INITIALIZER(work)
 
@@ -29,6 +49,13 @@ struct completion {
  * completions - so we use the _ONSTACK() variant for those that
  * are on the kernel stack:
  */
+/**
+ * DECLARE_COMPLETION_ONSTACK: - declare and initialize a completion structure
+ * @work:  identifier for the completion structure
+ *
+ * This macro declares and initializes a completion structure on the kernel
+ * stack.
+ */
 #ifdef CONFIG_LOCKDEP
 # define DECLARE_COMPLETION_ONSTACK(work) \
 	struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
@@ -36,6 +63,13 @@ struct completion {
 # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
 #endif
 
+/**
+ * init_completion: - Initialize a dynamically allocated completion
+ * @x:  completion structure that is to be initialized
+ *
+ * This inline function will initialize a dynamically created completion
+ * structure.
+ */
 static inline void init_completion(struct completion *x)
 {
 	x->done = 0;
@@ -55,6 +89,13 @@ extern bool completion_done(struct completion *x);
 extern void complete(struct completion *);
 extern void complete_all(struct completion *);
 
+/**
+ * INIT_COMPLETION: - reinitialize a completion structure
+ * @x:  completion structure to be reinitialized
+ *
+ * This macro should be used to reinitialize a completion structure so it can
+ * be reused. This is especially important after complete_all() is used.
+ */
 #define INIT_COMPLETION(x)	((x).done = 0)
 
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 29e2ec0bd831..93f5ea08be97 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4565,6 +4565,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
 	unsigned long flags;
@@ -4576,6 +4585,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
 	unsigned long flags;
@@ -4624,12 +4639,31 @@ wait_for_common(struct completion *x, long timeout, int state)
 	return timeout;
 }
 
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4637,6 +4671,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4646,6 +4687,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
@@ -4654,6 +4703,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
 
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
-- 
cgit v1.2.3-55-g7522


From 29cbef4869bf288256ab76c7dc674cb132b35de2 Mon Sep 17 00:00:00 2001
From: Joe Korty
Date: Wed, 27 Aug 2008 11:21:39 -0400
Subject: make might_sleep() display the oopsing process

Expand might_sleep's printk to indicate the oopsing process.

Signed-off-by: Joe Korty <joe.korty@ccur.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 93f5ea08be97..6e283dc7679a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8233,8 +8233,8 @@ void __might_sleep(char *file, int line)
 		prev_jiffy = jiffies;
 		printk(KERN_ERR "BUG: sleeping function called from invalid"
 				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
+		printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(), current->pid, current->comm);
 		debug_show_held_locks(current);
 		if (irqs_disabled())
 			print_irqtrace_events(current);
-- 
cgit v1.2.3-55-g7522


From aef745fca016aea45adae5c98e8698904dd8ad51 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 28 Aug 2008 11:34:43 +0200
Subject: sched: clean up __might_sleep()

add KERN_ to the printout and clean up the flow a bit.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6e283dc7679a..b112caaa400a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8226,20 +8226,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) &&
-	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
-		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-			return;
-		prev_jiffy = jiffies;
-		printk(KERN_ERR "BUG: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n",
-			in_atomic(), irqs_disabled(), current->pid, current->comm);
-		debug_show_held_locks(current);
-		if (irqs_disabled())
-			print_irqtrace_events(current);
-		dump_stack();
-	}
+	if ((!in_atomic() && !irqs_disabled()) ||
+		    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
-- 
cgit v1.2.3-55-g7522


From 7940ca3605b77f20cc6e9852e4ca6f2d725b5653 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 19 Aug 2008 13:40:47 +0200
Subject: sched: extract walk_tg_tree(), fix

fix:

 kernel/sched.c: In function '__rt_schedulable':
 kernel/sched.c:8771: error: implicit declaration of function 'walk_tg_tree'
 kernel/sched.c:8771: error: 'tg_nop' undeclared (first use in this function)
 kernel/sched.c:8771: error: (Each undeclared identifier is reported only once
 kernel/sched.c:8771: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index e41bdae2778d..703f56d5db5f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED)
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
-- 
cgit v1.2.3-55-g7522


From 49048622eae698e5c4ae61f7e71200f265ccc529 Mon Sep 17 00:00:00 2001
From: Balbir Singh
Date: Fri, 5 Sep 2008 18:12:23 +0200
Subject: sched: fix process time monotonicity

Spencer reported a problem where utime and stime were going negative despite
the fixes in commit b27f03d4bdc145a09fb7b0c0e004b29f1ee555fa. The suspected
reason for the problem is that signal_struct maintains it's own utime and
stime (of exited tasks), these are not updated using the new task_utime()
routine, hence sig->utime can go backwards and cause the same problem
to occur (sig->utime, adds tsk->utime and not task_utime()). This patch
fixes the problem

TODO: using max(task->prev_utime, derived utime) works for now, but a more
generic solution is to implement cputime_max() and use the cputime_gt()
function for comparison.

Reported-by: spencer@bluehost.com
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/array.c       | 59 ---------------------------------------------------
 include/linux/sched.h |  4 ++++
 kernel/exit.c         |  6 +++---
 kernel/sched.c        | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 62 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0d6eb33597c6..71c9be59c9c2 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -337,65 +337,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-static cputime_t task_utime(struct task_struct *p)
-{
-	return p->utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
-	return p->stime;
-}
-#else
-static cputime_t task_utime(struct task_struct *p)
-{
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
-	u64 temp;
-
-	/*
-	 * Use CFS's precise accounting:
-	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
-
-	if (total) {
-		temp *= utime;
-		do_div(temp, total);
-	}
-	utime = (clock_t)temp;
-
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
-	return p->prev_utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
-	clock_t stime;
-
-	/*
-	 * Use CFS's precise accounting. (we subtract utime from
-	 * the total, to make sure the total observed by userspace
-	 * grows monotonically - apps rely on that):
-	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
-
-	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
-
-	return p->prev_stime;
-}
-#endif
-
-static cputime_t task_gtime(struct task_struct *p)
-{
-	return p->gtime;
-}
-
 static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task, int whole)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cfb0d87b99fc..3d9120c5ad15 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1475,6 +1475,10 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
+extern cputime_t task_utime(struct task_struct *p);
+extern cputime_t task_stime(struct task_struct *p);
+extern cputime_t task_gtime(struct task_struct *p);
+
 /*
  * Per process flags
  */
diff --git a/kernel/exit.c b/kernel/exit.c
index 25ed2ad986df..16395644a98f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, tsk->utime);
-		sig->stime = cputime_add(sig->stime, tsk->stime);
-		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+		sig->utime = cputime_add(sig->utime, task_utime(tsk));
+		sig->stime = cputime_add(sig->stime, task_stime(tsk));
+		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
diff --git a/kernel/sched.c b/kernel/sched.c
index 9a1ddb84e26d..1a5f73c1fcdc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4178,6 +4178,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }
 
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+	return p->utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+	return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+	clock_t utime = cputime_to_clock_t(p->utime),
+		total = utime + cputime_to_clock_t(p->stime);
+	u64 temp;
+
+	/*
+	 * Use CFS's precise accounting:
+	 */
+	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+	if (total) {
+		temp *= utime;
+		do_div(temp, total);
+	}
+	utime = (clock_t)temp;
+
+	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	return p->prev_utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+	clock_t stime;
+
+	/*
+	 * Use CFS's precise accounting. (we subtract utime from
+	 * the total, to make sure the total observed by userspace
+	 * grows monotonically - apps rely on that):
+	 */
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+			cputime_to_clock_t(task_utime(p));
+
+	if (stime >= 0)
+		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+	return p->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+	return p->gtime;
+}
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
-- 
cgit v1.2.3-55-g7522


From c8bfff6dd4d41834f4952cbc49e28e31906a6188 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt
Date: Fri, 5 Sep 2008 23:46:19 +0200
Subject: sched: compilation fix with gcc 3.4.6

I found that 2.6.27-rc5-mm1 does not compile with gcc 3.4.6.
The error is:
  CC      kernel/sched.o
kernel/sched.c: In function `start_rt_bandwidth':
kernel/sched.c:208: sorry, unimplemented: inlining failed in call to 'rt_bandwidth_enabled': function body not available
kernel/sched.c:214: sorry, unimplemented: called from here
make[1]: *** [kernel/sched.o] Error 1
make: *** [kernel] Error 2

It seems that the gcc 3.4.6 requires full inline definition before first usage.
The patch below fixes the compilation problem.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl> (if needed>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 703f56d5db5f..4de2bfb28c58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -204,7 +204,10 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 
-static inline int rt_bandwidth_enabled(void);
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
@@ -841,11 +844,6 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-static inline int rt_bandwidth_enabled(void)
-{
-	return sysctl_sched_rt_runtime >= 0;
-}
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
-- 
cgit v1.2.3-55-g7522


From dfb512ec4834116124da61d6c1ee10fd0aa32bd6 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Fri, 29 Aug 2008 13:11:41 -0700
Subject: sched: arch_reinit_sched_domains() must destroy domains to force
 rebuild

What I realized recently is that calling rebuild_sched_domains() in
arch_reinit_sched_domains() by itself is not enough when cpusets are enabled.
partition_sched_domains() code is trying to avoid unnecessary domain rebuilds
and will not actually rebuild anything if new domain masks match the old ones.

What this means is that doing
     echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
on a system with cpusets enabled will not take affect untill something changes
in the cpuset setup (ie new sets created or deleted).

This patch fixes restore correct behaviour where domains must be rebuilt in
order to enable MC powersaving flags.

Test on quad-core Core2 box with both CONFIG_CPUSETS and !CONFIG_CPUSETS.
Also tested on dual-core Core2 laptop. Lockdep is happy and things are working
as expected.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Tested-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpuset.h |  2 +-
 kernel/sched.c         | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index e8f450c499b0..2691926fb506 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -160,7 +160,7 @@ static inline int current_cpuset_is_being_rebound(void)
 
 static inline void rebuild_sched_domains(void)
 {
-	partition_sched_domains(0, NULL, NULL);
+	partition_sched_domains(1, NULL, NULL);
 }
 
 #endif /* !CONFIG_CPUSETS */
diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..d72ee9a0eacd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7589,24 +7589,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * and partition_sched_domains() will fallback to the single partition
  * 'fallback_doms', it also forces the domains to be rebuilt.
  *
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
  * Call with hotplug lock held
  */
 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
-	int i, j;
+	int i, j, n;
 
 	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL)
-		ndoms_new = 0;
+	n = doms_new ? ndoms_new : 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < ndoms_new; j++) {
+		for (j = 0; j < n; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
@@ -7619,7 +7622,6 @@ match1:
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		ndoms_new = 1;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
 		dattr_new = NULL;
@@ -7656,8 +7658,13 @@ match2:
 int arch_reinit_sched_domains(void)
 {
 	get_online_cpus();
+
+	/* Destroy domains first to force the rebuild */
+	partition_sched_domains(0, NULL, NULL);
+
 	rebuild_sched_domains();
 	put_online_cpus();
+
 	return 0;
 }
 
@@ -7741,7 +7748,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_ONLINE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		partition_sched_domains(0, NULL, NULL);
+		partition_sched_domains(1, NULL, NULL);
 		return NOTIFY_OK;
 
 	default:
-- 
cgit v1.2.3-55-g7522


From ec5d498991e87c74730509508b25c3959192b7e7 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto
Date: Wed, 10 Sep 2008 17:00:19 -0700
Subject: sched: fix deadlock in setting scheduler parameter to zero

Andrei Gusev wrote:

> I played witch scheduler settings. After doing something like:
> echo -n 1000000 >sched_rt_period_us
>
> command is locked. I found in kernel.log:
>
> Sep 11 00:39:34 zaratustra
> Sep 11 00:39:34 zaratustra Pid: 4495, comm: bash Tainted: G        W
> (2.6.26.3 #12)
> Sep 11 00:39:34 zaratustra EIP: 0060:[<c0213fc7>] EFLAGS: 00210246 CPU: 0
> Sep 11 00:39:34 zaratustra EIP is at div64_u64+0x57/0x80
> Sep 11 00:39:34 zaratustra EAX: 0000389f EBX: 00000000 ECX: 00000000
> EDX: 00000000
> Sep 11 00:39:34 zaratustra ESI: d9800000 EDI: d9800000 EBP: 0000389f
> ESP: ea7a6edc
> Sep 11 00:39:34 zaratustra DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
> Sep 11 00:39:34 zaratustra Process bash (pid: 4495, ti=ea7a6000
> task=ea744000 task.ti=ea7a6000)
> Sep 11 00:39:34 zaratustra Stack: 00000000 000003e8 d9800000 0000389f
> c0119042 00000000 00000000 00000001
> Sep 11 00:39:34 zaratustra 00000000 00000000 ea7a6f54 00010000 00000000
> c04d2e80 00000001 000e7ef0
> Sep 11 00:39:34 zaratustra c01191a3 00000000 00000000 ea7a6fa0 00000001
> ffffffff c04d2e80 ea5b2480
> Sep 11 00:39:34 zaratustra Call Trace:
> Sep 11 00:39:34 zaratustra [<c0119042>] __rt_schedulable+0x52/0x130
> Sep 11 00:39:34 zaratustra [<c01191a3>] sched_rt_handler+0x83/0x120
> Sep 11 00:39:34 zaratustra [<c01a76a6>] proc_sys_call_handler+0xb6/0xd0
> Sep 11 00:39:34 zaratustra [<c01a76c0>] proc_sys_write+0x0/0x20
> Sep 11 00:39:34 zaratustra [<c01a76d9>] proc_sys_write+0x19/0x20
> Sep 11 00:39:34 zaratustra [<c016cc68>] vfs_write+0xa8/0x140
> Sep 11 00:39:34 zaratustra [<c016cdd1>] sys_write+0x41/0x80
> Sep 11 00:39:34 zaratustra [<c0103051>] sysenter_past_esp+0x6a/0x91
> Sep 11 00:39:34 zaratustra =======================
> Sep 11 00:39:34 zaratustra Code: c8 41 0f ad f3 d3 ee f6 c1 20 0f 45 de
> 31 f6 0f ad ef d3 ed f6 c1 20 0f 45 fd 0f 45 ee 31 c9 39 eb 89 fe 89 ea
> 77 08 89 e8 31 d2 <f7> f3 89 c1 89 f0 8b 7c 24 08 f7 f3 8b 74 24 04 89
> ca 8b 1c 24
> Sep 11 00:39:34 zaratustra EIP: [<c0213fc7>] div64_u64+0x57/0x80 SS:ESP
> 0068:ea7a6edc
> Sep 11 00:39:34 zaratustra ---[ end trace 4eaa2a86a8e2da22 ]---

fix the boundary condition.

sysctl_sched_rt_period=0 makes exception at to_ratio().

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index cc1f81b50b82..98890807375b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8909,6 +8909,9 @@ static int sched_rt_global_constraints(void)
 	u64 rt_runtime, rt_period;
 	int ret = 0;
 
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
 	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
@@ -8925,6 +8928,9 @@ static int sched_rt_global_constraints(void)
 	unsigned long flags;
 	int i;
 
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-- 
cgit v1.2.3-55-g7522


From 15afe09bf496ae10c989e1a375a6b5da7bd3e16e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Sat, 20 Sep 2008 23:38:02 +0200
Subject: sched: wakeup preempt when small overlap

Lin Ming reported a 10% OLTP regression against 2.6.27-rc4.

The difference seems to come from different preemption agressiveness,
which affects the cache footprint of the workload and its effective
cache trashing.

Aggresively preempt a task if its avg overlap is very small, this should
avoid the task going to sleep and find it still running when we schedule
back to it - saving a wakeup.

Reported-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  2 +-
 kernel/sched.c          | 12 ++++++------
 kernel/sched_fair.c     | 13 ++++++++++---
 kernel/sched_features.h |  1 +
 kernel/sched_idletask.c |  6 +++---
 kernel/sched_rt.c       |  2 +-
 6 files changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3b7a8f32477..d8e699b55858 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -897,7 +897,7 @@ struct sched_class {
 	void (*yield_task) (struct rq *rq);
 	int  (*select_task_rq)(struct task_struct *p, int sync);
 
-	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
 
 	struct task_struct * (*pick_next_task) (struct rq *rq);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index 0d8905a1b8ca..ad9d39b021f8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -604,9 +604,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p);
+	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -2282,7 +2282,7 @@ out_running:
 	trace_mark(kernel_sched_wakeup,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2417,7 +2417,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	trace_mark(kernel_sched_wakeup_new,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -2877,7 +2877,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	check_preempt_curr(this_rq, p);
+	check_preempt_curr(this_rq, p, 0);
 }
 
 /*
@@ -6007,7 +6007,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
 		activate_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p);
+		check_preempt_curr(rq_dest, p, 0);
 	}
 done:
 	ret = 1;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a10ac0bcee64..7328383690f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,7 +1331,7 @@ static inline int depth_se(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
@@ -1367,6 +1367,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	if (sched_feat(WAKEUP_OVERLAP) && sync &&
+			se->avg_overlap < sysctl_sched_migration_cost &&
+			pse->avg_overlap < sysctl_sched_migration_cost) {
+		resched_task(curr);
+		return;
+	}
+
 	/*
 	 * preemption test can be made between sibling entities who are in the
 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
@@ -1649,7 +1656,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /*
@@ -1666,7 +1673,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
 	if (running)
 		resched_task(rq->curr);
 	else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca78154e..bf027a7accf8 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
+SCHED_FEAT(WAKEUP_OVERLAP, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
 /*
  * Idle tasks are unconditionally rescheduled:
  */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
 {
 	resched_task(rq->idle);
 }
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
 	if (running)
 		resched_task(rq->curr);
 	else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..6d2d0a5d030b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -783,7 +783,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
 {
 	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
-- 
cgit v1.2.3-55-g7522


From fa748203175de7c08f2df80e5a0eeca40329b5e2 Mon Sep 17 00:00:00 2001
From: Rakib Mullick
Date: Mon, 22 Sep 2008 14:55:45 -0700
Subject: sched: fix init_hrtick() section mismatch warning

LD      kernel/built-in.o
WARNING: kernel/built-in.o(.text+0x326): Section mismatch in reference
from the function init_hrtick() to the variable
.cpuinit.data:hotplug_hrtick_nb.8
The function init_hrtick() references
the variable __cpuinitdata hotplug_hrtick_nb.8.
This is often because init_hrtick lacks a __cpuinitdata
annotation or the annotation of hotplug_hrtick_nb.8 is wrong.

Signed-off-by: Md.Rakib H. Mullick <rakib.mullick@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 98890807375b..13dd2db9fb2d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_DONE;
 }
 
-static void init_hrtick(void)
+static __init void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
-- 
cgit v1.2.3-55-g7522


From 006c75f146e58e080d2b2725a6664f71886e112b Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Mon, 22 Sep 2008 14:55:46 -0700
Subject: sched: clarify ifdef tangle

- Add some comments to try to make the ifdef puzzle a bit clearer

- Explicitly inline one of the three init_hrtick() implementations.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index ad9d39b021f8..927c9307cd00 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1102,7 +1102,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
 
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1121,7 +1121,7 @@ static void init_rq_hrtick(struct rq *rq)
 	rq->hrtick_timer.function = hrtick;
 	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
-#else
+#else	/* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1133,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif	/* CONFIG_SCHED_HRTICK */
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
-- 
cgit v1.2.3-55-g7522


From 4653f803e6e0d970ffeac0efd2c01743eb6c5228 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 23 Sep 2008 15:33:44 +0200
Subject: sched: more sanity checks on the bandwidth settings

While playing around with it, I noticed we missed some sanity checks.
Also add some comments while we're there.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 669c49aa57f0..e1299de1765e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8866,11 +8866,29 @@ static int tg_schedulable(struct task_group *tg, void *data)
 		runtime = d->rt_runtime;
 	}
 
+	/*
+	 * Cannot have more runtime than the period.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
+
+	/*
+	 * Ensure we don't starve existing RT tasks.
+	 */
 	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	total = to_ratio(period, runtime);
 
+	/*
+	 * Nobody can have more than the global setting allows.
+	 */
+	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+		return -EINVAL;
+
+	/*
+	 * The sum of our children's runtime should not exceed our own.
+	 */
 	list_for_each_entry_rcu(child, &tg->children, siblings) {
 		period = ktime_to_ns(child->rt_bandwidth.rt_period);
 		runtime = child->rt_bandwidth.rt_runtime;
@@ -8978,19 +8996,24 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
-	struct task_group *tg = &root_task_group;
-	u64 rt_runtime, rt_period;
+	u64 runtime, period;
 	int ret = 0;
 
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	runtime = global_rt_runtime();
+	period = global_rt_period();
+
+	/*
+	 * Sanity check on the sysctl variables.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	ret = __rt_schedulable(tg, rt_period, rt_runtime);
+	ret = __rt_schedulable(NULL, 0, 0);
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
-- 
cgit v1.2.3-55-g7522


From ccc7dadf736639da86f3e0c86832c11a66fc8221 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 29 Sep 2008 15:47:42 +0200
Subject: hrtimer: prevent migration of per CPU hrtimers

Impact: per CPU hrtimers can be migrated from a dead CPU

The hrtimer code has no knowledge about per CPU timers, but we need to
prevent the migration of such timers and warn when such a timer is
active at migration time.

Explicitely mark the timers as per CPU and use a more understandable
mode descriptor for the interrupts safe unlocked callback mode, which
is used by hrtimer_sleeper and the scheduler code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h      | 14 +++++++++++---
 kernel/hrtimer.c             | 37 +++++++++++++++++++++++++------------
 kernel/sched.c               |  4 ++--
 kernel/time/tick-sched.c     |  2 +-
 kernel/trace/trace_sysprof.c |  2 +-
 5 files changed, 40 insertions(+), 19 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bdd88df1b4e5..2f245fe63bda 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -47,14 +47,22 @@ enum hrtimer_restart {
  *	HRTIMER_CB_IRQSAFE:		Callback may run in hardirq context
  *	HRTIMER_CB_IRQSAFE_NO_RESTART:	Callback may run in hardirq context and
  *					does not restart the timer
- *	HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:	Callback must run in hardirq context
- *					Special mode for tick emultation
+ *	HRTIMER_CB_IRQSAFE_PERCPU:	Callback must run in hardirq context
+ *					Special mode for tick emulation and
+ *					scheduler timer. Such timers are per
+ *					cpu and not allowed to be migrated on
+ *					cpu unplug.
+ *	HRTIMER_CB_IRQSAFE_UNLOCKED:	Callback should run in hardirq context
+ *					with timer->base lock unlocked
+ *					used for timers which call wakeup to
+ *					avoid lock order problems with rq->lock
  */
 enum hrtimer_cb_mode {
 	HRTIMER_CB_SOFTIRQ,
 	HRTIMER_CB_IRQSAFE,
 	HRTIMER_CB_IRQSAFE_NO_RESTART,
-	HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
+	HRTIMER_CB_IRQSAFE_PERCPU,
+	HRTIMER_CB_IRQSAFE_UNLOCKED,
 };
 
 /*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ace723dd1e52..cdec83e722fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 			 */
 			BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
 			return 1;
-		case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+		case HRTIMER_CB_IRQSAFE_PERCPU:
+		case HRTIMER_CB_IRQSAFE_UNLOCKED:
 			/*
 			 * This is solely for the sched tick emulation with
 			 * dynamic tick support to ensure that we do not
 			 * restart the tick right on the edge and end up with
 			 * the tick timer in the softirq ! The calling site
-			 * takes care of this.
+			 * takes care of this. Also used for hrtimer sleeper !
 			 */
 			debug_hrtimer_deactivate(timer);
 			return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
 	timer_stats_account_hrtimer(timer);
 
 	fn = timer->function;
-	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+	    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
 		/*
 		 * Used for scheduler timers, avoid lock inversion with
 		 * rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 #endif
 }
 
@@ -1592,7 +1594,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 #ifdef CONFIG_HOTPLUG_CPU
 
 static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				struct hrtimer_clock_base *new_base)
+				struct hrtimer_clock_base *new_base, int dcpu)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
@@ -1603,6 +1605,18 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
 
+		/*
+		 * Should not happen. Per CPU timers should be
+		 * canceled _before_ the migration code is called
+		 */
+		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+			__remove_hrtimer(timer, old_base,
+					 HRTIMER_STATE_INACTIVE, 0);
+			WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+			     timer, timer->function, dcpu);
+			continue;
+		}
+
 		/*
 		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
@@ -1619,12 +1633,11 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		/*
 		 * Happens with high res enabled when the timer was
 		 * already expired and the callback mode is
-		 * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
-		 * (hrtimer_sleeper). The enqueue code does not move
-		 * them to the soft irq pending list for
-		 * performance/latency reasons, but in the migration
-		 * state, we need to do that otherwise we end up with
-		 * a stale timer.
+		 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+		 * enqueue code does not move them to the soft irq
+		 * pending list for performance/latency reasons, but
+		 * in the migration state, we need to do that
+		 * otherwise we end up with a stale timer.
 		 */
 		if (timer->state == HRTIMER_STATE_MIGRATE) {
 			timer->state = HRTIMER_STATE_PENDING;
@@ -1682,7 +1695,7 @@ static void migrate_hrtimers(int cpu)
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		if (migrate_hrtimer_list(&old_base->clock_base[i],
-					 &new_base->clock_base[i]))
+					 &new_base->clock_base[i], cpu))
 			raise = 1;
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 13dd2db9fb2d..ad1962dc0aa2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
@@ -1119,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq)
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 39019b3f7621..cb02324bdb88 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -625,7 +625,7 @@ void tick_setup_sched_timer(void)
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	ts->sched_timer.function = tick_sched_timer;
-	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	/* Get the next period (per cpu) */
 	ts->sched_timer.expires = tick_init_jiffy_update();
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
 
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
-	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
 }
-- 
cgit v1.2.3-55-g7522


From 34b3ede2353604ec9861c1d900b2a835ff85de47 Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Mon, 6 Oct 2008 09:27:00 +0800
Subject: sched: remove redundant code in cpu_cgroup_create()

css will be initialized by cgroup core.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2caedc47e764..9715f4ce6cfe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9088,7 +9088,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
-		init_task_group.css.cgroup = cgrp;
 		return &init_task_group.css;
 	}
 
@@ -9097,9 +9096,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
-	/* Bind the cgroup to task_group object we just created */
-	tg->css.cgroup = cgrp;
-
 	return &tg->css;
 }
 
-- 
cgit v1.2.3-55-g7522


From a5d8c3483a6e19aca95ef6a2c5890e33bfa5b293 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 9 Oct 2008 11:35:51 +0200
Subject: sched debug: add name to sched_domain sysctl entries

add /proc/sys/kernel/sched_domain/cpu0/domain0/name, to make
it easier to see which specific scheduler domain remained at
that entry.

Since we process the scheduler domain tree and
simplify it, it's not always immediately clear during debugging
which domain came from where.

depends on CONFIG_SCHED_DEBUG=y.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  3 +++
 kernel/sched.c        | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d8e699b55858..5d0819ee442a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -824,6 +824,9 @@ struct sched_domain {
 	unsigned int ttwu_move_affine;
 	unsigned int ttwu_move_balance;
 #endif
+#ifdef CONFIG_SCHED_DEBUG
+	char *name;
+#endif
 };
 
 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
diff --git a/kernel/sched.c b/kernel/sched.c
index 9715f4ce6cfe..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6351,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(12);
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
 
 	if (table == NULL)
 		return NULL;
@@ -6379,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax);
-	/* &table[11] is terminator */
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
 
 	return table;
 }
@@ -7263,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)		sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)		do { } while (0)
+#endif
+
 #define	SD_INIT(sd, type)	sd_init_##type(sd)
+
 #define SD_INIT_FUNC(type)	\
 static noinline void sd_init_##type(struct sched_domain *sd)	\
 {								\
 	memset(sd, 0, sizeof(*sd));				\
 	*sd = SD_##type##_INIT;					\
 	sd->level = SD_LV_##type;				\
+	SD_INIT_NAME(sd, type);					\
 }
 
 SD_INIT_FUNC(CPU)
-- 
cgit v1.2.3-55-g7522