From 0a565f7919cfb3d3df2c97d45751cbb83d858f97 Mon Sep 17 00:00:00 2001 From: Peter Williams Date: Mon, 10 Jul 2006 04:43:51 -0700 Subject: [PATCH] sched: fix bug in __migrate_task() Problem: In the function __migrate_task(), deactivate_task() followed by activate_task() is used to move the task from one run queue to another. This has two undesirable effects: 1. The task's priority is recalculated. (Nowhere else in the scheduler code is the priority recalculated for a change of CPU.) 2. The task's time stamp is set to the current time. At the very least, this makes the adjustment of the time stamp before the call to deactivate_task() redundant but I believe the problem is more serious as the time stamp now holds the time of the queue change instead of the time at which the task was woken. In addition, unless dest_rq is the same queue as "current" is on the time stamp could be inaccurate due to inter CPU drift. Solution: Replace the call to activate_task() with one to __activate_task(). Signed-off-by: Peter Williams Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4ee400f9d56b..b47819b676fa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4877,7 +4877,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); - activate_task(p, rq_dest, 0); + __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } -- cgit v1.2.3-55-g7522 From 2ed6e34f88a0d896a6f889b00693cae0fadacfd0 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Mon, 10 Jul 2006 04:43:52 -0700 Subject: [PATCH] small kernel/sched.c cleanup - constify and optimize stat_nam (thanks to Michael Tokarev!) - spelling and comment fixes Signed-off-by: Andreas Mohr Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b47819b676fa..d714611f1691 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3384,7 +3384,7 @@ EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT /* - * this is is the entry point to schedule() from in-kernel preemption + * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ @@ -3427,7 +3427,7 @@ need_resched: EXPORT_SYMBOL(preempt_schedule); /* - * this is is the entry point to schedule() from kernel preemption + * this is the entry point to schedule() from kernel preemption * off of irq context. * Note, that this is called and return with irqs disabled. This will * protect us against recursive calling from irq. 
@@ -3439,7 +3439,7 @@ asmlinkage void __sched preempt_schedule_irq(void) struct task_struct *task = current; int saved_lock_depth; #endif - /* Catch callers which need to be fixed*/ + /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); need_resched: @@ -4650,7 +4650,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) return list_entry(p->sibling.next,struct task_struct,sibling); } -static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; +static const char stat_nam[] = "RSDTtZX"; static void show_task(struct task_struct *p) { @@ -4658,12 +4658,9 @@ static void show_task(struct task_struct *p) unsigned long free = 0; unsigned state; - printk("%-13.13s ", p->comm); state = p->state ? __ffs(p->state) + 1 : 0; - if (state < ARRAY_SIZE(stat_nam)) - printk(stat_nam[state]); - else - printk("?"); + printk("%-13.13s %c", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if (BITS_PER_LONG == 32) if (state == TASK_RUNNING) printk(" running "); @@ -5776,7 +5773,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) cache = vmalloc(max_size); if (!cache) { printk("could not vmalloc %d bytes for cache!\n", 2*max_size); - return 1000000; // return 1 msec on very small boxen + return 1000000; /* return 1 msec on very small boxen */ } while (size <= max_size) { -- cgit v1.2.3-55-g7522 From f9829cceb686f3719215fe43c8593e5f3efe1710 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 10 Jul 2006 04:44:01 -0700 Subject: [PATCH] Minor cleanup to lockdep.c - Use printk formatting for indentation - Don't leave NTFS in the default event filter Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 44 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f32ca78c198d..42af5cc31a49 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -169,22 +169,17 @@ EXPORT_SYMBOL(lockdep_internal); */ static int class_filter(struct lock_class *class) { +#if 0 + /* Example */ if (class->name_version == 1 && - !strcmp(class->name, "&rl->lock")) + !strcmp(class->name, "lockname")) return 1; if (class->name_version == 1 && - !strcmp(class->name, "&ni->mrec_lock")) + !strcmp(class->name, "&struct->lockfield")) return 1; - if (class->name_version == 1 && - !strcmp(class->name, "mft_ni_runlist_lock")) - return 1; - if (class->name_version == 1 && - !strcmp(class->name, "mft_ni_mrec_lock")) - return 1; - if (class->name_version == 1 && - !strcmp(class->name, "&vol->lcnbmp_lock")) - return 1; - return 0; +#endif + /* Allow everything else. 
0 would be filter everything else */ + return 1; } #endif @@ -408,23 +403,12 @@ static void lockdep_print_held_locks(struct task_struct *curr) print_lock(curr->held_locks + i); } } -/* - * Helper to print a nice hierarchy of lock dependencies: - */ -static void print_spaces(int nr) -{ - int i; - - for (i = 0; i < nr; i++) - printk(" "); -} static void print_lock_class_header(struct lock_class *class, int depth) { int bit; - print_spaces(depth); - printk("->"); + printk("%*s->", depth, ""); print_lock_name(class); printk(" ops: %lu", class->ops); printk(" {\n"); @@ -433,17 +417,14 @@ static void print_lock_class_header(struct lock_class *class, int depth) if (class->usage_mask & (1 << bit)) { int len = depth; - print_spaces(depth); - len += printk(" %s", usage_str[bit]); + len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(" at:\n"); print_stack_trace(class->usage_traces + bit, len); } } - print_spaces(depth); - printk(" }\n"); + printk("%*s }\n", depth, ""); - print_spaces(depth); - printk(" ... key at: "); + printk("%*s ... key at: ",depth,""); print_ip_sym((unsigned long)class->key); } @@ -463,8 +444,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth) DEBUG_LOCKS_WARN_ON(!entry->class); print_lock_dependencies(entry->class, depth + 1); - print_spaces(depth); - printk(" ... acquired at:\n"); + printk("%*s ... acquired at:\n",depth,""); print_stack_trace(&entry->trace, 2); printk("\n"); } -- cgit v1.2.3-55-g7522 From 55794a412fdf9af1744800e5020a4ec6b21e3cdc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 10 Jul 2006 04:44:03 -0700 Subject: [PATCH] lockdep: improve debug output Make lockdep print which lock is held, in the "kfree() of a live lock" scenario. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42af5cc31a49..c1f34addd003 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2551,7 +2551,7 @@ static inline int in_range(const void *start, const void *addr, const void *end) static void print_freed_lock_bug(struct task_struct *curr, const void *mem_from, - const void *mem_to) + const void *mem_to, struct held_lock *hlock) { if (!debug_locks_off()) return; @@ -2563,6 +2563,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, printk( "-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, curr->pid, mem_from, mem_to-1); + print_lock(hlock); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); @@ -2596,7 +2597,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) !in_range(mem_from, lock_to, mem_to)) continue; - print_freed_lock_bug(curr, mem_from, mem_to); + print_freed_lock_bug(curr, mem_from, mem_to, hlock); break; } local_irq_restore(flags); -- cgit v1.2.3-55-g7522 From d6d897cec29252b8d0785198cfa6ca16d30c739d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Jul 2006 04:44:04 -0700 Subject: [PATCH] lockdep: core, reduce per-lock class-cache size lockdep_map is embedded into every lock, which blows up data structure sizes all around the kernel. Reduce the class-cache to be for the default class only - that is used in 99.9% of the cases and even if we dont have a class cached, the lookup in the class-hash is lockless. 
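Rough arithmetic behind the saving, as a sketch only (assuming MAX_LOCKDEP_SUBCLASSES is 8, which is consistent with the byte counts quoted just below; these are not the literal headers):

struct lock_class;
struct lock_class_key;

/* Before: the per-lock cache is an array of MAX_LOCKDEP_SUBCLASSES pointers. */
struct lockdep_map_before {
	struct lock_class_key *key;
	struct lock_class *class[8];	/* 8 * 8 = 64 bytes on 64-bit */
	const char *name;
};

/* After: only the default (subclass 0) class is cached. */
struct lockdep_map_after {
	struct lock_class_key *key;
	struct lock_class *class_cache;	/* 8 bytes on 64-bit */
	const char *name;
};

/* Saved per embedded map: 7 pointers = 56 bytes on 64-bit, 28 on 32-bit. */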
This change reduces the per-lock dep_map overhead by 56 bytes on 64-bit platforms and by 28 bytes on 32-bit platforms. Signed-off-by: Ingo Molnar Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 2 +- kernel/lockdep.c | 87 ++++++++++++++++++++++++++++++------------------- 2 files changed, 55 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 316e0fb8d7b1..c040a8c969aa 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -120,7 +120,7 @@ struct lock_class { */ struct lockdep_map { struct lock_class_key *key; - struct lock_class *class[MAX_LOCKDEP_SUBCLASSES]; + struct lock_class *class_cache; const char *name; }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c1f34addd003..9bad17884513 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1104,7 +1104,7 @@ extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); * itself, so actual lookup of the hash should be once per lock object. */ static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass) +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; struct list_head *hash_head; @@ -1148,7 +1148,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) */ list_for_each_entry(class, hash_head, hash_entry) if (class->key == key) - goto out_set; + return class; + + return NULL; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + + class = look_up_lock_class(lock, subclass); + if (likely(class)) + return class; /* * Debug-check: all keys must be persistent! @@ -1163,6 +1182,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) return NULL; } + key = lock->key->subkeys + subclass; + hash_head = classhashentry(key); + __raw_spin_lock(&hash_lock); /* * We have to do the hash-walk again, to avoid races @@ -1209,8 +1231,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass) out_unlock_set: __raw_spin_unlock(&hash_lock); -out_set: - lock->class[subclass] = class; + if (!subclass) + lock->class_cache = class; DEBUG_LOCKS_WARN_ON(class->subclass != subclass); @@ -1914,7 +1936,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, } lock->name = name; lock->key = key; - memset(lock->class, 0, sizeof(lock->class[0])*MAX_LOCKDEP_SUBCLASSES); + lock->class_cache = NULL; } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -1928,8 +1950,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { struct task_struct *curr = current; + struct lock_class *class = NULL; struct held_lock *hlock; - struct lock_class *class; unsigned int depth, id; int chain_head = 0; u64 chain_key; @@ -1947,8 +1969,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; } - class = lock->class[subclass]; - /* not cached yet? */ + if (!subclass) + class = lock->class_cache; + /* + * Not cached yet or subclass? 
+ */ if (unlikely(!class)) { class = register_lock_class(lock, subclass); if (!class) @@ -2449,48 +2474,44 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { - struct lock_class *class, *next, *entry; + struct lock_class *class, *next; struct list_head *head; unsigned long flags; int i, j; raw_local_irq_save(flags); - __raw_spin_lock(&hash_lock); /* - * Remove all classes this lock has: + * Remove all classes this lock might have: + */ + for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { + /* + * If the class exists we look it up and zap it: + */ + class = look_up_lock_class(lock, j); + if (class) + zap_class(class); + } + /* + * Debug check: in the end all mapped classes should + * be gone. */ + __raw_spin_lock(&hash_lock); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { - for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { - entry = lock->class[j]; - if (class == entry) { - zap_class(class); - lock->class[j] = NULL; - break; - } + if (unlikely(class == lock->class_cache)) { + __raw_spin_unlock(&hash_lock); + DEBUG_LOCKS_WARN_ON(1); + goto out_restore; } } } - - /* - * Debug check: in the end all mapped classes should - * be gone. - */ - for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { - entry = lock->class[j]; - if (!entry) - continue; - __raw_spin_unlock(&hash_lock); - DEBUG_LOCKS_WARN_ON(1); - raw_local_irq_restore(flags); - return; - } - __raw_spin_unlock(&hash_lock); + +out_restore: raw_local_irq_restore(flags); } -- cgit v1.2.3-55-g7522 From c0fc84d2e5bb4a9e3ae470812a00cccba85a48b8 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:21 -0700 Subject: [PATCH] kernel/printk.c: EXPORT_SYMBOL_UNUSED This patch marks unused exports as EXPORT_SYMBOL_UNUSED. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index bdba5d80496c..65ca0688f86f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -52,7 +52,7 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; -EXPORT_SYMBOL(console_printk); +EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ /* * Low lever drivers may need that to know if they can schedule in @@ -773,7 +773,7 @@ int is_console_locked(void) { return console_locked; } -EXPORT_SYMBOL(is_console_locked); +EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ /** * release_console_sem - unlock the console system -- cgit v1.2.3-55-g7522 From 80d6679a62fe45f440d042099d997a42e4e8c59d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:24 -0700 Subject: [PATCH] kernel/softirq.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. 
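For readers unfamiliar with the unused-symbol machinery, a sketch of the intent (the description is an assumption here; the real macro lives in include/linux/module.h behind CONFIG_UNUSED_SYMBOLS):

/*
 * Stage 1: the export still links (when CONFIG_UNUSED_SYMBOLS=y) but is
 * flagged as having no in-tree modular users, giving out-of-tree code a
 * grace period to migrate. Stage 2, in a later release, deletes the
 * export line entirely.
 */
int console_printk[4];
EXPORT_UNUSED_SYMBOL(console_printk);	/* June 2006 */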
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 215541e26c1a..fd12f2556f0d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,7 +311,7 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) softirq_vec[nr].action = action; } -EXPORT_SYMBOL(open_softirq); +EXPORT_UNUSED_SYMBOL(open_softirq); /* June 2006 */ /* Tasklets */ struct tasklet_head -- cgit v1.2.3-55-g7522 From 06a9ec291b3aec9c7e36af0a10ad2b556bd7e84f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jul 2006 04:44:30 -0700 Subject: [PATCH] pi-futex: Validate futex type instead of oopsing If futex_lock_pi() is called with a reference to a non-PI futex and waiters already exist, lookup_pi_state() oopses due to pi_state == NULL. Check this condition and return -EINVAL to userspace. Signed-off-by: Thomas Gleixner Cc: Ingo Molnar Cc: Jakub Jelinek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1dc98e4dd287..cf0c8e21d1ab 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -476,6 +476,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) * the refcount and return its pi_state: */ pi_state = this->pi_state; + /* + * Userspace might have messed up non PI and PI futexes + */ + if (unlikely(!pi_state)) + return -EINVAL; + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; -- cgit v1.2.3-55-g7522 From e154ff3d2c5ad313ef0c66e6217502361cad2799 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Mon, 10 Jul 2006 04:44:32 -0700 Subject: [PATCH] adjust clock for lost ticks A large number of lost ticks can cause an overadjustment of the clock. To compensate for this we look at the current error: the larger the error already is, the more careful we are when adjusting it. As a small extra fix, reset the error when the clock is set. Signed-off-by: Roman Zippel Acked-by: john stultz Cc: Uwe Bugla Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 85 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 396a3c024c2c..2a87430a58d4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -891,6 +891,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + clock->error = 0; ntp_clear(); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -1008,52 +1009,52 @@ static int __init timekeeping_init_device(void) device_initcall(timekeeping_init_device); /* - * If the error is already larger, we look ahead another tick, + * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) { - int adj; + s64 tick_error, i; + u32 look_ahead, adj; + s32 error2, mult; /* - * As soon as the machine is synchronized to the external time - * source this should be the common case. + * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems + * with losing too many ticks, otherwise we would overadjust and + * produce an even larger error. The smaller the adjustment the + * faster we try to adjust for it, as lost ticks can do less harm + * here. This is tuned so that an error of about 1 msec is adusted + * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error >>= 2; - if (likely(sign > 0 ? error <= *interval : error >= *interval)) - return sign; + error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = abs(error2); + for (look_ahead = 0; error2 > 0; look_ahead++) + error2 >>= 2; /* - * An extra look ahead dampens the effect of the current error, - * which can grow quite large with continously late updates, as - * it would dominate the adjustment value and can lead to - * oscillation. + * Now calculate the error in (1 << look_ahead) ticks, but first + * remove the single look ahead already included in the error. */ - error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); - error -= clock->xtime_interval >> 1; - - adj = 0; - while (1) { - error >>= 1; - if (sign > 0 ? error <= *interval : error >= *interval) - break; - adj++; + tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error -= clock->xtime_interval >> 1; + error = ((error - tick_error) >> look_ahead) + tick_error; + + /* Finally calculate the adjustment shift value. */ + i = *interval; + mult = 1; + if (error < 0) { + error = -error; + *interval = -*interval; + *offset = -*offset; + mult = -1; } - - /* - * Add the current adjustments to the error and take the offset - * into account, the latter can cause the error to be hardly - * reduced at the next tick. Check the error again if there's - * room for another adjustment, thus further reducing the error - * which otherwise had to be corrected at the next update. - */ - error = (error << 1) - *interval + *offset; - if (sign > 0 ? error > *interval : error < *interval) - adj++; + for (adj = 0; error > i; adj++) + error >>= 1; *interval <<= adj; *offset <<= adj; - return sign << adj; + return mult << adj; } /* @@ -1068,11 +1069,19 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); if (error > interval) { - adj = clocksource_bigadjust(1, error, &interval, &offset); + error >>= 2; + if (likely(error <= interval)) + adj = 1; + else + adj = clocksource_bigadjust(error, &interval, &offset); } else if (error < -interval) { - interval = -interval; - offset = -offset; - adj = clocksource_bigadjust(-1, error, &interval, &offset); + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else + adj = clocksource_bigadjust(error, &interval, &offset); } else return; @@ -1129,7 +1138,7 @@ static void update_wall_time(void) clocksource_adjust(clock, offset); /* store full nanoseconds into xtime */ - xtime.tv_nsec = clock->xtime_nsec >> clock->shift; + xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; /* check to see if there is a new clocksource to use */ -- cgit v1.2.3-55-g7522 From 95018f7c94cbe4e78fc014b6ce52004714c06e2a Mon Sep 17 00:00:00 2001 From: Rafael J. 
Wysocki Date: Mon, 10 Jul 2006 04:45:00 -0700 Subject: [PATCH] swsusp: do not use memcpy for snapshotting memory swsusp should not use memcpy for snapshotting memory, because on some architectures memcpy may increase preempt_count (i386 does this when CONFIG_X86_USE_3DNOW is set). Then, as a result, wrong value of preempt_count is stored in the image. Replace memcpy in copy_data_pages with an open-coded loop. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 24c96f354231..75d4886e648e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -227,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist) for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { if (saveable(zone, &zone_pfn)) { struct page *page; + long *src, *dst; + int n; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); BUG_ON(!pbe); pbe->orig_address = (unsigned long)page_address(page); - /* copy_page is not usable for copying task structs. */ - memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + /* copy_page and memcpy are not usable for copying task structs. */ + dst = (long *)pbe->address; + src = (long *)pbe->orig_address; + for (n = PAGE_SIZE / sizeof(long); n; n--) + *dst++ = *src++; pbe = pbe->next; } } -- cgit v1.2.3-55-g7522 From 712f403af6682c942d8ff8bfbd54eed03643a796 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:45:00 -0700 Subject: [PATCH] swsusp warning fix kernel/power/swap.c: In function 'swsusp_write': kernel/power/swap.c:275: warning: 'start' may be used uninitialized in this function gcc isn't smart enough, so help it. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 044b8e0c1025..a57c661c7e8a 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -263,7 +263,6 @@ int swsusp_write(void) struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; - unsigned long start; int error; if ((error = swsusp_swap_check())) { @@ -281,16 +280,17 @@ int swsusp_write(void) } error = get_swap_writer(&handle); if (!error) { - start = handle.cur_swap; + unsigned long start = handle.cur_swap; error = swap_write_page(&handle, header); - } - if (!error) - error = save_image(&handle, &snapshot, header->pages - 1); - if (!error) { - flush_swap_writer(&handle); - printk("S"); - error = mark_swapfiles(swp_entry(root_swap, start)); - printk("|\n"); + if (!error) + error = save_image(&handle, &snapshot, + header->pages - 1); + if (!error) { + flush_swap_writer(&handle); + printk("S"); + error = mark_swapfiles(swp_entry(root_swap, start)); + printk("|\n"); + } } if (error) free_all_swap_pages(root_swap, handle.bitmap); -- cgit v1.2.3-55-g7522 From aeceb15738958fe59cd9fe537f40317b1a3bc731 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Jul 2006 04:45:01 -0700 Subject: [PATCH] swsusp: fix panic when signature can't be read Do not panic a machine when swsusp signature can't be read. 
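The fix that follows applies a general rule: bio completion handlers run in interrupt context and should report failure rather than kill the box, since an unreadable signature simply means there is nothing to resume. A minimal sketch of the pattern, with hypothetical names but the bi_end_io signature this era's code uses:

#include <linux/bio.h>
#include <linux/kernel.h>

static int example_end_io(struct bio *bio, unsigned int bytes_done, int err)
{
	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		printk(KERN_ERR "I/O error reading image\n");
		return -EIO;		/* report the error, don't panic() */
	}
	return 0;
}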
Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a57c661c7e8a..f1dd146bd64d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0); static int end_io(struct bio *bio, unsigned int num, int err) { - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - panic("I/O error reading memory image"); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_ERR "I/O error reading swsusp image.\n"); + return -EIO; + } atomic_set(&io_done, 0); return 0; } -- cgit v1.2.3-55-g7522 From 21d71f513b6221f482ed6ad45e05f073ae67f319 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Jul 2006 04:45:32 -0700 Subject: [PATCH] uninline init_waitqueue_head() allyesconfig vmlinux size delta: text data bss dec filename 20736884 6073834 3075176 29885894 vmlinux.before 20721009 6073966 3075176 29870151 vmlinux.after ~18 bytes per callsite, 15K of text size (~0.1%) saved. (as an added bonus this also removes a lockdep annotation.) Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 12 +----------- kernel/wait.c | 8 ++++++-- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 794be7af58ae..b3b9048421d8 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,17 +77,7 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -/* - * lockdep: we want one lock-class for all waitqueue locks. - */ -extern struct lock_class_key waitqueue_lock_key; - -static inline void init_waitqueue_head(wait_queue_head_t *q) -{ - spin_lock_init(&q->lock); - lockdep_set_class(&q->lock, &waitqueue_lock_key); - INIT_LIST_HEAD(&q->task_list); -} +extern void init_waitqueue_head(wait_queue_head_t *q); static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { diff --git a/kernel/wait.c b/kernel/wait.c index a1d57aeb7f75..59a82f63275d 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,9 +10,13 @@ #include #include -struct lock_class_key waitqueue_lock_key; +void init_waitqueue_head(wait_queue_head_t *q) +{ + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->task_list); +} -EXPORT_SYMBOL(waitqueue_lock_key); +EXPORT_SYMBOL(init_waitqueue_head); void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { -- cgit v1.2.3-55-g7522 From c59923a15c12d2b3597af913bf234a0ef264a38b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Jul 2006 04:45:40 -0700 Subject: [PATCH] remove the tasklist_lock export As announced half a year ago this patch will remove the tasklist_lock export. The previous two patches got rid of the remaining modular users. 
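For reference, the idiom that remains available to built-in code (a sketch with a hypothetical helper name; the whole point of the unexport is that modules can no longer do this at all):

#include <linux/sched.h>
#include <linux/kernel.h>

static void count_tasks(void)		/* hypothetical in-kernel helper */
{
	struct task_struct *p;
	int nr = 0;

	read_lock(&tasklist_lock);
	for_each_process(p)
		nr++;			/* inspect each task under the lock */
	read_unlock(&tasklist_lock);
	printk(KERN_INFO "%d processes\n", nr);
}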
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 11 ----------- kernel/fork.c | 4 +--- 2 files changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 99f219a01e0e..ee287988934e 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -166,17 +166,6 @@ Who: Arjan van de Ven --------------------------- -What: remove EXPORT_SYMBOL(tasklist_lock) -When: August 2006 -Files: kernel/fork.c -Why: tasklist_lock protects the kernel internal task list. Modules have - no business looking at it, and all instances in drivers have been due - to use of too-lowlevel APIs. Having this symbol exported prevents - moving to more scalable locking schemes for the task list. -Who: Christoph Hellwig - ---------------------------- - What: mount/umount uevents When: February 2007 Why: These events are not correct, and do not properly let userspace know diff --git a/kernel/fork.c b/kernel/fork.c index 56e4e07e45f7..926e5a68ea9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,9 +61,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; - __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ - -EXPORT_SYMBOL(tasklist_lock); +__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ int nr_processes(void) { -- cgit v1.2.3-55-g7522 From 2c16e9c888985761511bd1905b00fb271169c3c0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 10 Jul 2006 04:45:42 -0700 Subject: [PATCH] lockdep: disable lock debugging when kernel state becomes untrusted Disable lockdep debugging in two situations where the integrity of the kernel is no longer guaranteed: when oopsing and when hitting a tainting condition. The goal is to avoid weird lockdep traces that don't make sense or are otherwise undebuggable, and to avoid wasting time. Lockdep assumes that the previous state it knows about is valid to operate on, which is why lockdep turns itself off after the first violation it reports; after that point it can no longer make that assumption. A kernel oops means that the integrity of the kernel is compromised; in addition, anything lockdep would report is of lesser importance than the oops. All the tainting conditions are of a similar integrity-violating nature and also make debugging/diagnosing more difficult.
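A quick sketch of the mechanism (simplified and assumed here, not the literal lib/debug_locks.c source; the call site and taint flag are illustrative):

#include <linux/kernel.h>
#include <linux/debug_locks.h>

/*
 * debug_locks_off() is a one-way switch: every lockdep check tests the
 * debug_locks flag first, so after the flip all later checking becomes
 * a cheap no-op. A hypothetical tainting path now gets this for free:
 */
static void example_force_load(void)
{
	add_taint(TAINT_FORCED_MODULE);	/* also switches lockdep off now */
}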
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index ab13f0f668b5..d8a0bca21233 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -172,6 +172,7 @@ const char *print_tainted(void) void add_taint(unsigned flag) { + debug_locks_off(); /* can't trust the integrity of the kernel anymore */ tainted |= flag; } EXPORT_SYMBOL(add_taint); @@ -256,6 +257,7 @@ int oops_may_print(void) */ void oops_enter(void) { + debug_locks_off(); /* can't trust the integrity of the kernel anymore */ do_oops_enter_exit(); } -- cgit v1.2.3-55-g7522 From abf75a5033d4da7b8a7e92321d74021d1fcfb502 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 12 Jul 2006 13:12:00 +0200 Subject: [PATCH] Fix prctl privilege escalation and suid_dumpable (CVE-2006-2451) Based on a patch from Ernie Petrides During security research, Red Hat discovered a behavioral flaw in core dump handling. A local user could create a program that would cause a core file to be dumped into a directory they would not normally have permissions to write to. This could lead to a denial of service (disk consumption), or allow the local user to gain root privileges. The prctl() system call should never allow to set "dumpable" to the value 2. Especially not for non-privileged users. This can be split into three cases: 1) running as root -- then core dumps will already be done as root, and so prctl(PR_SET_DUMPABLE, 2) is not useful 2) running as non-root w/setuid-to-root -- this is the debatable case 3) running as non-root w/setuid-to-non-root -- then you definitely do NOT want "dumpable" to get set to 2 because you have the privilege escalation vulnerability With case #2, the only potential usefulness is for a program that has designed to run with higher privilege (than the user invoking it) that wants to be able to create root-owned root-validated core dumps. This might be useful as a debugging aid, but would only be safe if the program had done a chdir() to a safe directory. There is no benefit to a production setuid-to-root utility, because it shouldn't be dumping core in the first place. If this is true, then the same debugging aid could also be accomplished with the "suid_dumpable" sysctl. Signed-off-by: Marcel Holtmann Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index dbb3b9c7ea64..e236f98f7ec5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1983,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = current->mm->dumpable; break; case PR_SET_DUMPABLE: - if (arg2 < 0 || arg2 > 2) { + if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } -- cgit v1.2.3-55-g7522 From 26865e9c26d2d336f385b821b531ce2b31008e20 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 30 Jun 2006 02:15:43 -0700 Subject: [PATCH] remove kernel/power/pm.c:pm_unregister_all() Remove the deprecated and no longer used pm_unregister_all(). 
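The legacy pm_register() interface being removed here was superseded by driver-model suspend/resume callbacks; a hedged sketch of the replacement shape (driver name and bus choice are illustrative assumptions):

#include <linux/device.h>
#include <linux/platform_device.h>

static int mydrv_suspend(struct device *dev, pm_message_t state)
{
	/* quiesce the hardware */
	return 0;
}

static int mydrv_resume(struct device *dev)
{
	/* bring the hardware back up */
	return 0;
}

static struct device_driver mydrv_driver = {
	.name		= "mydrv",
	.bus		= &platform_bus_type,
	.suspend	= mydrv_suspend,
	.resume		= mydrv_resume,
};

/* driver_unregister(&mydrv_driver) tears everything down, so no bulk
 * per-callback removal like pm_unregister_all() is ever needed. */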
Signed-off-by: Adrian Bunk Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/linux/pm_legacy.h | 7 ------- kernel/power/pm.c | 37 ------------------------------------- 2 files changed, 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/pm_legacy.h b/include/linux/pm_legacy.h index 78027c533b94..514729a44688 100644 --- a/include/linux/pm_legacy.h +++ b/include/linux/pm_legacy.h @@ -14,11 +14,6 @@ extern int pm_active; struct pm_dev __deprecated * pm_register(pm_dev_t type, unsigned long id, pm_callback callback); -/* - * Unregister all devices with matching callback - */ -void __deprecated pm_unregister_all(pm_callback callback); - /* * Send a request to all devices */ @@ -35,8 +30,6 @@ static inline struct pm_dev *pm_register(pm_dev_t type, return NULL; } -static inline void pm_unregister_all(pm_callback callback) {} - static inline int pm_send_all(pm_request_t rqst, void *data) { return 0; diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 84063ac8fcfc..c50d15266c10 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type, return dev; } -static void __pm_unregister(struct pm_dev *dev) -{ - if (dev) { - list_del(&dev->entry); - kfree(dev); - } -} - -/** - * pm_unregister_all - unregister all devices with matching callback - * @callback: callback function pointer - * - * Unregister every device that would call the callback passed. This - * is primarily meant as a helper function for loadable modules. It - * enables a module to give up all its managed devices without keeping - * its own private list. - */ - -void pm_unregister_all(pm_callback callback) -{ - struct list_head *entry; - - if (!callback) - return; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - entry = entry->next; - if (dev->callback == callback) - __pm_unregister(dev); - } - mutex_unlock(&pm_devs_lock); -} - /** * pm_send - send request to a single device * @dev: device to send to @@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data) } EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_unregister_all); EXPORT_SYMBOL(pm_send_all); EXPORT_SYMBOL(pm_active); -- cgit v1.2.3-55-g7522 From cd6ef2ada54aa4788d5a3dee3cffaad41383a52a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 30 Jun 2006 02:15:42 -0700 Subject: [PATCH] The scheduled unexport of insert_resource Implement the scheduled unexport of insert_resource. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- Documentation/feature-removal-schedule.txt | 8 -------- include/linux/ioport.h | 2 +- kernel/resource.c | 2 -- 3 files changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index ee287988934e..47d714d70821 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -55,14 +55,6 @@ Who: Mauro Carvalho Chehab --------------------------- -What: remove EXPORT_SYMBOL(insert_resource) -When: April 2006 -Files: kernel/resource.c -Why: No modular usage in the kernel. 
-Who: Adrian Bunk - ---------------------------- - What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) When: November 2005 Files: drivers/pcmcia/: pcmcia_ioctl.c diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5612dfeeae50..d42c83399071 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -97,7 +97,7 @@ extern struct resource iomem_resource; extern int request_resource(struct resource *root, struct resource *new); extern struct resource * ____request_resource(struct resource *root, struct resource *new); extern int release_resource(struct resource *new); -extern __deprecated_for_modules int insert_resource(struct resource *parent, struct resource *new); +extern int insert_resource(struct resource *parent, struct resource *new); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, diff --git a/kernel/resource.c b/kernel/resource.c index 129cf046e561..0dd3a857579e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -404,8 +404,6 @@ int insert_resource(struct resource *parent, struct resource *new) return result; } -EXPORT_SYMBOL(insert_resource); - /* * Given an existing resource, change its start and size to match the * arguments. Returns -EBUSY if it can't fit. Existing children of -- cgit v1.2.3-55-g7522 From 098c5eea03de4707019a205140296893252b4130 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 14 Jul 2006 00:24:04 -0700 Subject: [PATCH] null-terminate over-long /proc/kallsyms symbols Got a customer bug report (https://bugzilla.novell.com/190296) about kernel symbols longer than 127 characters which end up in a string buffer that is not NULL terminated, leading to garbage in /proc/kallsyms. Using strlcpy prevents this from happening, even though such symbols still won't come out right. A better fix would be to not use a fixed-size buffer, but it's probably not worth the trouble. (Modversion'ed symbols even have a length limit of 60.) [bunk@stusta.de: build fix] Signed-off-by: Andreas Gruenbacher Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/module.h | 10 ++++------ kernel/kallsyms.c | 4 ++-- kernel/module.c | 11 ++++------- 3 files changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index d06c74fb8c26..0dfb794c52d3 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -362,10 +362,8 @@ int is_module_address(unsigned long addr); /* Returns module and fills in value, defined and namebuf, or NULL if symnum out of range. */ -struct module *module_get_kallsym(unsigned int symnum, - unsigned long *value, - char *type, - char namebuf[128]); +struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, size_t namelen); /* Look for this name: can be of form module:name. 
*/ unsigned long module_kallsyms_lookup_name(const char *name); @@ -535,8 +533,8 @@ static inline const char *module_address_lookup(unsigned long addr, static inline struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, - char namebuf[128]) + char *type, char *name, + size_t namelen) { return NULL; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 39277dd6bf90..ab16a5a4cfe9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter) static int get_ksymbol_mod(struct kallsym_iter *iter) { iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, - &iter->type, iter->name); + &iter->value, &iter->type, + iter->name, sizeof(iter->name)); if (iter->owner == NULL) return 0; diff --git a/kernel/module.c b/kernel/module.c index 35e1b1f859d7..2a19cd47c046 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2019,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, - unsigned long *value, - char *type, - char namebuf[128]) +struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, size_t namelen) { struct module *mod; @@ -2031,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum, if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; - strncpy(namebuf, - mod->strtab + mod->symtab[symnum].st_name, - 127); + strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, + namelen); mutex_unlock(&module_mutex); return mod; } -- cgit v1.2.3-55-g7522 From 52e92e5788139921352213fa6faf6e30ff1f2f5a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 14 Jul 2006 00:24:05 -0700 Subject: [PATCH] remove kernel/kthread.c:kthread_stop_sem() Remove the now-unneeded kthread_stop_sem(). Signed-off-by: Adrian Bunk Acked-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 1 - kernel/kthread.c | 24 ++---------------------- 2 files changed, 2 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 7cce5dfa092f..1c65e7a9f186 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -28,7 +28,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), void kthread_bind(struct task_struct *k, unsigned int cpu); int kthread_stop(struct task_struct *k); -int kthread_stop_sem(struct task_struct *k, struct semaphore *s); int kthread_should_stop(void); #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 24be714b04c7..4f9c60ef95e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -215,23 +215,6 @@ EXPORT_SYMBOL(kthread_bind); * was never called. */ int kthread_stop(struct task_struct *k) -{ - return kthread_stop_sem(k, NULL); -} -EXPORT_SYMBOL(kthread_stop); - -/** - * kthread_stop_sem - stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * @s: semaphore that @k waits on while idle. - * - * Does essentially the same thing as kthread_stop() above, but wakes - * @k by calling up(@s). - * - * Returns the result of threadfn(), or %-EINTR if wake_up_process() - * was never called. 
- */ -int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - if (s) - up(s); - else - wake_up_process(k); + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. */ @@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) return ret; } -EXPORT_SYMBOL(kthread_stop_sem); +EXPORT_SYMBOL(kthread_stop); static __init int helper_init(void) { -- cgit v1.2.3-55-g7522 From a0009652af385a42f0e0604136f772ead406c78d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 14 Jul 2006 00:24:06 -0700 Subject: [PATCH] del_timer_sync(): add cpu_relax() Relax the CPU in the del_timer_sync() busywait loop. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 2a87430a58d4..acfa557e685b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; + cpu_relax(); } } -- cgit v1.2.3-55-g7522 From 60198f9992db1e36d5b4cc1526ff29550f7d002c Mon Sep 17 00:00:00 2001 From: Luca Tettamanti Date: Fri, 14 Jul 2006 00:24:13 -0700 Subject: [PATCH] Add try_to_freeze() to rt-test kthreads When CONFIG_RT_MUTEX_TESTER is enabled kernel refuses to suspend the machine because it's unable to freeze the rt-test-* threads. Add try_to_freeze() after schedule() so that the threads will be freezed correctly; I've tested the patch and it lets the notebook suspends and resumes nicely. Signed-off-by: Luca Tettamanti Cc: Ingo Molnar Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex-tester.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 494dac872a13..948bd8f643e2 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -275,6 +275,7 @@ static int test_func(void *data) /* Wait for the next command to be executed */ schedule(); + try_to_freeze(); if (signal_pending(current)) flush_signals(current); -- cgit v1.2.3-55-g7522 From 6bc02d8412b422388f86b09ae40d762c0bc05290 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 14 Jul 2006 00:24:15 -0700 Subject: [PATCH] unexport open_softirq Christoph Hellwig: open_softirq just enables a softirq. The softirq array is statically allocated so to add a new one you would have to patch the kernel. So there's no point to keep this export at all as any user would have to patch the enum in include/linux/interrupt.h anyway. 
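To make the argument concrete: the softirq vector of this era is a fixed, compile-time enum (an abridged sketch of include/linux/interrupt.h; treat the exact member list as an assumption):

enum {
	HI_SOFTIRQ = 0,
	TIMER_SOFTIRQ,
	NET_TX_SOFTIRQ,
	NET_RX_SOFTIRQ,
	BLOCK_SOFTIRQ,
	TASKLET_SOFTIRQ,
};

/* Built-in code claims its slot at init time; a module could never get
 * a slot of its own without editing the enum above and rebuilding: */
open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);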
Signed-off-by: Adrian Bunk Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index fd12f2556f0d..0f08a84ae307 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,8 +311,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) softirq_vec[nr].action = action; } -EXPORT_UNUSED_SYMBOL(open_softirq); /* June 2006 */ - /* Tasklets */ struct tasklet_head { -- cgit v1.2.3-55-g7522 From 3e143475c22036847f898d7e76ba337c1d7dbf6f Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 14 Jul 2006 00:24:17 -0700 Subject: [PATCH] improve timekeeping resume robustness Resolve problems seen with APM suspend. Due to resume initialization ordering, it's possible we could get a timer interrupt before the timekeeping resume() function is called. This patch ensures we don't do any timekeeping accounting before we're fully resumed. (akpm: fixes the machine-freezes-on-APM-resume bug) Signed-off-by: John Stultz Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index acfa557e685b..05809c2e2fd6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -969,6 +969,7 @@ void __init timekeeping_init(void) } +static int timekeeping_suspended; /* * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused @@ -984,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock, flags); /* restart the last cycle value */ clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -991,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev) /* sysfs resume/suspend bits for timekeeping */ static struct sysdev_class timekeeping_sysclass = { .resume = timekeeping_resume, + .suspend = timekeeping_suspend, set_kset_name("timekeeping"), }; @@ -1101,13 +1115,16 @@ static void update_wall_time(void) { cycle_t offset; - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; #ifdef CONFIG_GENERIC_TIME offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; #else offset = clock->cycle_interval; #endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. -- cgit v1.2.3-55-g7522 From a4afee02a5dd4f20c08fca26e9b610e72d0bcbf0 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 14 Jul 2006 00:24:18 -0700 Subject: [PATCH] Fix sighand->siglock usage in kernel/acct.c IRQs must be disabled before taking ->siglock. Noticed by lockdep. Signed-off-by: OGAWA Hirofumi Cc: Arjan van de Ven Cc: Ingo Molnar Cc: "Eric W.
Biederman" Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f18e0b8df3e1..2a7c933651c7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -488,7 +488,7 @@ static void do_acct_process(struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - spin_lock(¤t->sighand->siglock); + spin_lock_irq(¤t->sighand->siglock); ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; @@ -496,7 +496,7 @@ static void do_acct_process(struct file *file) ac.ac_minflt = encode_comp_t(pacct->ac_minflt); ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock(¤t->sighand->siglock); + spin_unlock_irq(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_swaps = encode_comp_t(0); -- cgit v1.2.3-55-g7522 From 3a5f5e488ceee9e08df3dff3f01b12fafc9e7e68 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Jul 2006 00:24:27 -0700 Subject: [PATCH] lockdep: core, fix rq-lock handling on __ARCH_WANT_UNLOCKED_CTXSW On platforms that have __ARCH_WANT_UNLOCKED_CTXSW set and want to implement lock validator support there's a bug in rq->lock handling: in this case we dont 'carry over' the runqueue lock into another task - but still we did a spinlock_release() of it. Fix this by making the spinlock_release() in context_switch() dependent on !__ARCH_WANT_UNLOCKED_CTXSW. (Reported by Ralf Baechle on MIPS, which has __ARCH_WANT_UNLOCKED_CTXSW. This fixes a lockdep-internal BUG message on such platforms.) Signed-off-by: Ingo Molnar Cc: Ralf Baechle Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d714611f1691..e9a0b61f12ab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1788,7 +1788,15 @@ context_switch(struct rq *rq, struct task_struct *prev, WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); -- cgit v1.2.3-55-g7522 From ca74e92b4698276b6696f15a801759f50944f387 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:36 -0700 Subject: [PATCH] per-task-delay-accounting: setup Initialization code related to collection of per-task "delay" statistics which measure how long it had to wait for cpu, sync block io, swapping etc. The collection of statistics and the interface are in other patches. This patch sets up the data structures and allows the statistics collection to be disabled through a kernel boot parameter. 
Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 2 + include/linux/delayacct.h | 69 +++++++++++++++++++++++++++++ include/linux/sched.h | 20 +++++++++ include/linux/time.h | 12 +++++ init/Kconfig | 10 +++++ init/main.c | 2 + kernel/Makefile | 1 + kernel/delayacct.c | 87 +++++++++++++++++++++++++++++++++++++ kernel/exit.c | 2 + kernel/fork.c | 2 + 10 files changed, 207 insertions(+) create mode 100644 include/linux/delayacct.h create mode 100644 kernel/delayacct.c (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 149f62ba14a5..e11f7728ec6f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -448,6 +448,8 @@ running once the system is up. Format: [,] See also Documentation/networking/decnet.txt. + delayacct [KNL] Enable per-task delay accounting + dhash_entries= [KNL] Set number of hash buckets for dentry cache. diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h new file mode 100644 index 000000000000..9572cfa1f129 --- /dev/null +++ b/include/linux/delayacct.h @@ -0,0 +1,69 @@ +/* delayacct.h - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + */ + +#ifndef _LINUX_DELAYACCT_H +#define _LINUX_DELAYACCT_H + +#include + +#ifdef CONFIG_TASK_DELAY_ACCT + +extern int delayacct_on; /* Delay accounting turned on/off */ +extern kmem_cache_t *delayacct_cache; +extern void delayacct_init(void); +extern void __delayacct_tsk_init(struct task_struct *); +extern void __delayacct_tsk_exit(struct task_struct *); + +static inline void delayacct_set_flag(int flag) +{ + if (current->delays) + current->delays->flags |= flag; +} + +static inline void delayacct_clear_flag(int flag) +{ + if (current->delays) + current->delays->flags &= ~flag; +} + +static inline void delayacct_tsk_init(struct task_struct *tsk) +{ + /* reinitialize in case parent's non-null pointer was dup'ed*/ + tsk->delays = NULL; + if (unlikely(delayacct_on)) + __delayacct_tsk_init(tsk); +} + +static inline void delayacct_tsk_exit(struct task_struct *tsk) +{ + if (tsk->delays) + __delayacct_tsk_exit(tsk); +} + +#else +static inline void delayacct_set_flag(int flag) +{} +static inline void delayacct_clear_flag(int flag) +{} +static inline void delayacct_init(void) +{} +static inline void delayacct_tsk_init(struct task_struct *tsk) +{} +static inline void delayacct_tsk_exit(struct task_struct *tsk) +{} +#endif /* CONFIG_TASK_DELAY_ACCT */ + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c876e27ff93..7a54e62763c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -552,6 +552,23 @@ struct sched_info { extern struct file_operations proc_schedstat_operations; #endif +#ifdef CONFIG_TASK_DELAY_ACCT +struct task_delay_info { + spinlock_t lock; + unsigned int flags; /* Private per-task flags */ + + /* For each stat XXX, add following, aligned appropriately + * + * struct timespec XXX_start, XXX_end; + * u64 XXX_delay; + * u32 XXX_count; + * + * Atomicity of updates to XXX_delay, XXX_count protected by + * single lock above (split into XXX_lock if contention is an issue). + */ +}; +#endif + enum idle_type { SCHED_IDLE, @@ -945,6 +962,9 @@ struct task_struct { * cache last used pipe for splice */ struct pipe_inode_info *splice_pipe; +#ifdef CONFIG_TASK_DELAY_ACCT + struct task_delay_info *delays; +#endif }; static inline pid_t process_group(struct task_struct *tsk) diff --git a/include/linux/time.h b/include/linux/time.h index c05f8bb9a323..a5b739967b74 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -70,6 +70,18 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon, extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec); +/* + * sub = lhs - rhs, in normalized form + */ +static inline struct timespec timespec_sub(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, + lhs.tv_nsec - rhs.tv_nsec); + return ts_delta; +} + /* * Returns true if the timespec is norm, false if denorm: */ diff --git a/init/Kconfig b/init/Kconfig index a5b073a103e7..90498a3e53da 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -158,6 +158,16 @@ config BSD_PROCESS_ACCT_V3 for processing it. A preliminary version of these tools is available at . +config TASK_DELAY_ACCT + bool "Enable per-task delay accounting (EXPERIMENTAL)" + help + Collect information on time spent by a task waiting for system + resources like cpu, synchronous block I/O completion and swapping + in pages. Such statistics can help in setting a task's priorities + relative to other tasks for cpu, io, rss limits etc. + + Say N if unsure. 
+ config SYSCTL bool "Sysctl support" if EMBEDDED default y diff --git a/init/main.c b/init/main.c index 628b8e9e841a..9e8e8c152142 100644 --- a/init/main.c +++ b/init/main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -574,6 +575,7 @@ asmlinkage void __init start_kernel(void) proc_root_init(); #endif cpuset_init(); + delayacct_init(); check_bugs(); diff --git a/kernel/Makefile b/kernel/Makefile index 47dbcd570cd8..87bb34cc8938 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 000000000000..fbf7f2284952 --- /dev/null +++ b/kernel/delayacct.c @@ -0,0 +1,87 @@ +/* delayacct.c - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include + +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +kmem_cache_t *delayacct_cache; + +static int __init delayacct_setup_enable(char *str) +{ + delayacct_on = 1; + return 1; +} +__setup("delayacct", delayacct_setup_enable); + +void delayacct_init(void) +{ + delayacct_cache = kmem_cache_create("delayacct_cache", + sizeof(struct task_delay_info), + 0, + SLAB_PANIC, + NULL, NULL); + delayacct_tsk_init(&init_task); +} + +void __delayacct_tsk_init(struct task_struct *tsk) +{ + tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + if (tsk->delays) + spin_lock_init(&tsk->delays->lock); +} + +void __delayacct_tsk_exit(struct task_struct *tsk) +{ + kmem_cache_free(delayacct_cache, tsk->delays); + tsk->delays = NULL; +} + +/* + * Start accounting for a delay statistic using + * its starting timestamp (@start) + */ + +static inline void delayacct_start(struct timespec *start) +{ + do_posix_clock_monotonic_gettime(start); +} + +/* + * Finish delay accounting for a statistic using + * its timestamps (@start, @end), accumalator (@total) and @count + */ + +static void delayacct_end(struct timespec *start, struct timespec *end, + u64 *total, u32 *count) +{ + struct timespec ts; + s64 ns; + + do_posix_clock_monotonic_gettime(end); + ts = timespec_sub(*end, *start); + ns = timespec_to_ns(&ts); + if (ns < 0) + return; + + spin_lock(¤t->delays->lock); + *total += ns; + (*count)++; + spin_unlock(¤t->delays->lock); +} + diff --git a/kernel/exit.c b/kernel/exit.c index 6664c084783d..3c2cf91defa7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -900,6 +901,7 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + delayacct_tsk_exit(tsk); exit_mm(tsk); if (group_dead) diff --git a/kernel/fork.c 
b/kernel/fork.c index 926e5a68ea9e..451cfd35bf22 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1000,6 +1001,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; retval = -EFAULT; -- cgit v1.2.3-55-g7522 From 0ff922452df86f3e9a2c6f705c4588ec62d096a7 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:37 -0700 Subject: [PATCH] per-task-delay-accounting: sync block I/O and swapin delay collection Unlike earlier iterations of the delay accounting patches, now delays are only collected for the actual I/O waits rather than try and cover the delays seen in I/O submission paths. Account separately for block I/O delays incurred as a result of swapin page faults whose frequency can be affected by the task/process' rss limit. Hence swapin delays can act as feedback for rss limit changes independent of I/O priority changes. Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 25 +++++++++++++++++++++++++ include/linux/sched.h | 13 +++++++++++++ kernel/delayacct.c | 19 +++++++++++++++++++ kernel/sched.c | 5 +++++ mm/memory.c | 4 ++++ 5 files changed, 66 insertions(+) (limited to 'kernel') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 9572cfa1f129..0ecbf9aad8e1 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -19,6 +19,13 @@ #include +/* + * Per-task flags relevant to delay accounting + * maintained privately to avoid exhausting similar flags in sched.h:PF_* + * Used to set current->delays->flags + */ +#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ + #ifdef CONFIG_TASK_DELAY_ACCT extern int delayacct_on; /* Delay accounting turned on/off */ @@ -26,6 +33,8 @@ extern kmem_cache_t *delayacct_cache; extern void delayacct_init(void); extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); +extern void __delayacct_blkio_start(void); +extern void __delayacct_blkio_end(void); static inline void delayacct_set_flag(int flag) { @@ -53,6 +62,18 @@ static inline void delayacct_tsk_exit(struct task_struct *tsk) __delayacct_tsk_exit(tsk); } +static inline void delayacct_blkio_start(void) +{ + if (current->delays) + __delayacct_blkio_start(); +} + +static inline void delayacct_blkio_end(void) +{ + if (current->delays) + __delayacct_blkio_end(); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -64,6 +85,10 @@ static inline void delayacct_tsk_init(struct task_struct *tsk) {} static inline void delayacct_tsk_exit(struct task_struct *tsk) {} +static inline void delayacct_blkio_start(void) +{} +static inline void delayacct_blkio_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 7a54e62763c5..2f43f1fb7de7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -566,6 +566,19 @@ struct task_delay_info { * Atomicity of updates to XXX_delay, XXX_count protected by * single lock above (split into XXX_lock if contention is an issue). */ + + /* + * XXX_count is incremented on every XXX operation, the delay + * associated with the operation is added to XXX_delay. 
+ * XXX_delay contains the accumulated delay time in nanoseconds. + */ + struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ + u64 blkio_delay; /* wait for sync block io completion */ + u64 swapin_delay; /* wait for swapin block io completion */ + u32 blkio_count; /* total count of the number of sync block */ + /* io operations performed */ + u32 swapin_count; /* total count of the number of swapin block */ + /* io operations performed */ }; #endif diff --git a/kernel/delayacct.c b/kernel/delayacct.c index fbf7f2284952..3546b0800f9f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -85,3 +85,22 @@ static void delayacct_end(struct timespec *start, struct timespec *end, spin_unlock(¤t->delays->lock); } +void __delayacct_blkio_start(void) +{ + delayacct_start(¤t->delays->blkio_start); +} + +void __delayacct_blkio_end(void) +{ + if (current->delays->flags & DELAYACCT_PF_SWAPIN) + /* Swapin block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); + else /* Other block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->blkio_delay, + ¤t->delays->blkio_count); +} diff --git a/kernel/sched.c b/kernel/sched.c index e9a0b61f12ab..9d42cbfc4f8b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -4534,9 +4535,11 @@ void __sched io_schedule(void) { struct rq *rq = &__raw_get_cpu_var(runqueues); + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); schedule(); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); } EXPORT_SYMBOL(io_schedule); @@ -4545,9 +4548,11 @@ long __sched io_schedule_timeout(long timeout) struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); return ret; } diff --git a/mm/memory.c b/mm/memory.c index de8bc85dc8f3..109e9866237e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -1934,6 +1935,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, pmd, address); goto out; } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1946,6 +1948,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } @@ -1955,6 +1958,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, grab_swap_token(); } + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); mark_page_accessed(page); lock_page(page); -- cgit v1.2.3-55-g7522 From 52f17b6c2bd443e7806a161e9d10a983650db01d Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Fri, 14 Jul 2006 00:24:38 -0700 Subject: [PATCH] per-task-delay-accounting: cpu delay collection via schedstats Make the task-related schedstats functions callable by delay accounting even if schedstats collection isn't turned on. This removes the dependency of delay accounting on schedstats. 
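The block I/O instrumentation added by the previous patch composes as follows: io_schedule() brackets the actual sleep with delayacct_blkio_start()/end(), and the fault path marks itself with DELAYACCT_PF_SWAPIN so that __delayacct_blkio_end() charges the wait to the swapin counters instead of the plain block I/O ones. A minimal sketch (hypothetical wait path, not code from the patch):

#include <linux/delayacct.h>
#include <linux/sched.h>

/* Hypothetical page-fault path showing the classification flow.
 * A real caller has queued the I/O and set its task state first. */
static void demo_swapin_wait(void)
{
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);	/* "I am doing a swapin" */
	/*
	 * Any io_schedule() reached while the flag is set is accounted
	 * to swapin_delay/swapin_count by __delayacct_blkio_end();
	 * without the flag it would go to blkio_delay/blkio_count.
	 */
	io_schedule();
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
}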
Signed-off-by: Chandra Seetharaman Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 20 ++++++++++++--- kernel/sched.c | 71 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f43f1fb7de7..f751062d89a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -537,7 +537,7 @@ extern struct user_struct root_user; struct backing_dev_info; struct reclaim_state; -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ unsigned long cpu_time, /* time spent on the cpu */ @@ -548,9 +548,11 @@ struct sched_info { unsigned long last_arrival, /* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ }; +#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ +#ifdef CONFIG_SCHEDSTATS extern struct file_operations proc_schedstat_operations; -#endif +#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { @@ -580,7 +582,19 @@ struct task_delay_info { u32 swapin_count; /* total count of the number of swapin block */ /* io operations performed */ }; +#endif /* CONFIG_TASK_DELAY_ACCT */ + +static inline int sched_info_on(void) +{ +#ifdef CONFIG_SCHEDSTATS + return 1; +#elif defined(CONFIG_TASK_DELAY_ACCT) + extern int delayacct_on; + return delayacct_on; +#else + return 0; #endif +} enum idle_type { @@ -777,7 +791,7 @@ struct task_struct { cpumask_t cpus_allowed; unsigned int time_slice, first_time_slice; -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; #endif diff --git a/kernel/sched.c b/kernel/sched.c index 9d42cbfc4f8b..b44b9a43b0fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -502,9 +502,36 @@ struct file_operations proc_schedstat_operations = { .release = single_release, }; +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta_jiffies; + rq->rq_sched_info.pcnt++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta_jiffies; +} # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) #else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{} # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) #endif @@ -524,7 +551,7 @@ static inline struct rq *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given * the cpu. 
We should note that with the exception of interactive @@ -552,21 +579,16 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long now = jiffies, diff = 0; - struct rq *rq = task_rq(t); + unsigned long now = jiffies, delta_jiffies = 0; if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; + delta_jiffies = now - t->sched_info.last_queued; sched_info_dequeued(t); - t->sched_info.run_delay += diff; + t->sched_info.run_delay += delta_jiffies; t->sched_info.last_arrival = now; t->sched_info.pcnt++; - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; + rq_sched_info_arrive(task_rq(t), delta_jiffies); } /* @@ -586,8 +608,9 @@ static void sched_info_arrive(struct task_struct *t) */ static inline void sched_info_queued(struct task_struct *t) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; } /* @@ -596,13 +619,10 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - struct rq *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; - - t->sched_info.cpu_time += diff; + unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; - if (rq) - rq->rq_sched_info.cpu_time += diff; + t->sched_info.cpu_time += delta_jiffies; + rq_sched_info_depart(task_rq(t), delta_jiffies); } /* @@ -611,7 +631,7 @@ static inline void sched_info_depart(struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct task_struct *prev, struct task_struct *next) { struct rq *rq = task_rq(prev); @@ -626,10 +646,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) if (next != rq->idle) sched_info_arrive(next); } +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* * Adding/removing a task to/from a priority array: @@ -1531,8 +1557,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) INIT_LIST_HEAD(&p->run_list); p->array = NULL; -#ifdef CONFIG_SCHEDSTATS - memset(&p->sched_info, 0, sizeof(p->sched_info)); +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; -- cgit v1.2.3-55-g7522 From c757249af152c59fd74b85e52e8c090acb33d9c0 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:40 -0700 Subject: [PATCH] per-task-delay-accounting: taskstats interface Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC family), for getting statistics of tasks and thread groups during their lifetime and when they exit. The interface is intended for use by multiple accounting packages though it is being created in the context of delay accounting. 
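The intended extension model - each accounting package fills only its own fields - looks like this in sketch form (hypothetical subsystem; the hook points are the fill_pid()/fill_tgid() templates added in kernel/taskstats.c below):

/*
 * Hypothetical accounting package hooking into taskstats.
 * fill_pid() in kernel/taskstats.c (below) leaves a template:
 *
 *     rc = per-task-foo(stats, tsk);
 *     if (rc)
 *             goto err;
 *
 * A subsystem supplies that function, e.g.:
 */
static int demo_acct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
{
	/* fill only the fields this subsystem owns in struct taskstats */
	return 0;	/* nonzero aborts the reply */
}

The delayacct_add_tsk() hook added later in this series follows exactly this shape.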
This patch creates the interface without populating the fields of the data that is sent to the user in response to a command or upon the exit of a task. Each accounting package interested in using taskstats has to provide an additional patch to add its stats to the common structure. [akpm@osdl.org: cleanups, Kconfig fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/taskstats.txt | 146 ++++++++++++++ include/linux/taskstats.h | 84 +++++++++ include/linux/taskstats_kern.h | 57 ++++++ init/Kconfig | 13 ++ init/main.c | 2 + kernel/Makefile | 1 + kernel/exit.c | 7 + kernel/taskstats.c | 336 +++++++++++++++++++++++++++++++++ 8 files changed, 646 insertions(+) create mode 100644 Documentation/accounting/taskstats.txt create mode 100644 include/linux/taskstats.h create mode 100644 include/linux/taskstats_kern.h create mode 100644 kernel/taskstats.c (limited to 'kernel') diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt new file mode 100644 index 000000000000..ad9b6997e162 --- /dev/null +++ b/Documentation/accounting/taskstats.txt @@ -0,0 +1,146 @@ +Per-task statistics interface +----------------------------- + + +Taskstats is a netlink-based interface for sending per-task and +per-process statistics from the kernel to userspace. + +Taskstats was designed for the following benefits: + +- efficiently provide statistics during lifetime of a task and on its exit +- unified interface for multiple accounting subsystems +- extensibility for use by future accounting patches + +Terminology +----------- + +"pid", "tid" and "task" are used interchangeably and refer to the standard +Linux task defined by struct task_struct. per-pid stats are the same as +per-task stats. + +"tgid", "process" and "thread group" are used interchangeably and refer to the +tasks that share an mm_struct i.e. the traditional Unix process. Despite the +use of tgid, there is no special treatment for the task that is the thread group +leader - a process is deemed alive as long as it has any task belonging to it. + +Usage +----- + +To get statistics during a task's lifetime, userspace opens a unicast netlink +socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid. +The response contains statistics for a task (if pid is specified) or the sum of +statistics for all tasks of the process (if tgid is specified). + +To obtain statistics for tasks which are exiting, userspace opens a multicast +netlink socket. Each time a task exits, two records are sent by the kernel to +each listener on the multicast socket. The first is the per-pid task's statistics +and the second is the sum for all tasks of the process to which the task +belongs (the task does not need to be the thread group leader). The need for +per-tgid stats to be sent for each exiting task is explained in the per-tgid +stats section below. + + +Interface +--------- + +The user-kernel interface is encapsulated in include/linux/taskstats.h + +To avoid this documentation becoming obsolete as the interface evolves, only +an outline of the current version is given. taskstats.h always overrides the +description here. + +struct taskstats is the common accounting structure for both per-pid and +per-tgid data. It is versioned and can be extended by each accounting subsystem +that is added to the kernel.
The fields and their semantics are defined in the +taskstats.h file. + +The data exchanged between user and kernel space is a netlink message belonging +to the NETLINK_GENERIC family and using the netlink attributes interface. +The messages are in the format + + +----------+- - -+-------------+-------------------+ + | nlmsghdr | Pad | genlmsghdr | taskstats payload | + +----------+- - -+-------------+-------------------+ + + +The taskstats payload is one of the following three kinds: + +1. Commands: Sent from user to kernel. The payload is one attribute, of type +TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute +payload. The pid/tgid denotes the task/process for which userspace wants +statistics. + +2. Response for a command: sent from the kernel in response to a userspace +command. The payload is a series of three attributes of type: + +a) TASKSTATS_TYPE_AGGR_PID/TGID: attribute containing no payload but indicating +that a pid/tgid will be followed by some stats. + +b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats +are being returned. + +c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The +same structure is used for both per-pid and per-tgid stats. + +3. New message sent by kernel whenever a task exits. The payload consists of a + series of attributes of the following type: + +a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats +b) TASKSTATS_TYPE_PID: contains exiting task's pid +c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats +d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats +e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs +f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process + + +per-tgid stats +-------------- + +Taskstats provides per-process stats, in addition to per-task stats, since +resource management is often done at a process granularity and aggregating task +stats in userspace alone is inefficient and potentially inaccurate (due to lack +of atomicity). + +However, maintaining per-process, in addition to per-task stats, within the +kernel has space and time overheads. Hence the taskstats implementation +dynamically sums up the per-task stats for each task belonging to a process +whenever per-process stats are needed. + +Not maintaining per-tgid stats creates a problem when userspace is interested +in getting these stats when the process dies i.e. the last thread of +a process exits. It isn't possible to simply return some aggregated per-process +statistic from the kernel. + +The approach taken by taskstats is to return the per-tgid stats *each* time +a task exits, in addition to the per-pid stats for that task. Userspace can +maintain task<->process mappings and use them to maintain the per-process stats +in userspace, updating the aggregate appropriately as the tasks of a process +exit. + +Extending taskstats +------------------- + +There are two ways to extend the taskstats interface to export more +per-task/process stats as patches to collect them get added to the kernel +in future: + +1. Adding more fields to the end of the existing struct taskstats. Backward + compatibility is ensured by the version number within the + structure. Userspace will use only the fields of the struct that correspond + to the version it's using. + +2. Defining separate statistic structs and using the netlink attributes + interface to return them.
Since userspace processes each netlink attribute + independently, it can always ignore attributes whose type it does not + understand (because it is using an older version of the interface). + + +Choosing between 1. and 2. is a matter of trading off flexibility and +overhead. If only a few fields need to be added, then 1. is the preferable +path since the kernel and userspace don't need to incur the overhead of +processing new netlink attributes. But if the new fields expand the existing +struct too much, requiring disparate userspace accounting utilities to +unnecessarily receive large structures whose fields are of no interest, then +extending the attributes structure would be worthwhile. + +---- diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h new file mode 100644 index 000000000000..51f62759bea9 --- /dev/null +++ b/include/linux/taskstats.h @@ -0,0 +1,84 @@ +/* taskstats.h - exporting per-task statistics + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef _LINUX_TASKSTATS_H +#define _LINUX_TASKSTATS_H + +/* Format for per-task data returned to userland when + * - a task exits + * - listener requests stats for a task + * + * The struct is versioned. Newer versions should only add fields to + * the bottom of the struct to maintain backward compatibility. + * + * + * To add new fields + * a) bump up TASKSTATS_VERSION + * b) add comment indicating new version number at end of struct + * c) add new fields after version comment; maintain 64-bit alignment + */ + +#define TASKSTATS_VERSION 1 + +struct taskstats { + + /* Version 1 */ + __u64 version; +}; + + +#define TASKSTATS_LISTEN_GROUP 0x1 + +/* + * Commands sent from userspace + * Not versioned. New commands should only be inserted at the enum's end + * prior to __TASKSTATS_CMD_MAX + */ + +enum { + TASKSTATS_CMD_UNSPEC = 0, /* Reserved */ + TASKSTATS_CMD_GET, /* user->kernel request/get-response */ + TASKSTATS_CMD_NEW, /* kernel->user event */ + __TASKSTATS_CMD_MAX, +}; + +#define TASKSTATS_CMD_MAX (__TASKSTATS_CMD_MAX - 1) + +enum { + TASKSTATS_TYPE_UNSPEC = 0, /* Reserved */ + TASKSTATS_TYPE_PID, /* Process id */ + TASKSTATS_TYPE_TGID, /* Thread group id */ + TASKSTATS_TYPE_STATS, /* taskstats structure */ + TASKSTATS_TYPE_AGGR_PID, /* contains pid + stats */ + TASKSTATS_TYPE_AGGR_TGID, /* contains tgid + stats */ + __TASKSTATS_TYPE_MAX, +}; + +#define TASKSTATS_TYPE_MAX (__TASKSTATS_TYPE_MAX - 1) + +enum { + TASKSTATS_CMD_ATTR_UNSPEC = 0, + TASKSTATS_CMD_ATTR_PID, + TASKSTATS_CMD_ATTR_TGID, + __TASKSTATS_CMD_ATTR_MAX, +}; + +#define TASKSTATS_CMD_ATTR_MAX (__TASKSTATS_CMD_ATTR_MAX - 1) + +/* NETLINK_GENERIC related info */ + +#define TASKSTATS_GENL_NAME "TASKSTATS" +#define TASKSTATS_GENL_VERSION 0x1 + +#endif /* _LINUX_TASKSTATS_H */ diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h new file mode 100644 index 000000000000..bd0ecb969c26 --- /dev/null +++ b/include/linux/taskstats_kern.h @@ -0,0 +1,57 @@ +/* taskstats_kern.h - kernel header for per-task statistics interface + * + * Copyright (C) Shailabh Nagar, IBM Corp. 
2006 + * (C) Balbir Singh, IBM Corp. 2006 + */ + +#ifndef _LINUX_TASKSTATS_KERN_H +#define _LINUX_TASKSTATS_KERN_H + +#include +#include + +enum { + TASKSTATS_MSG_UNICAST, /* send data only to requester */ + TASKSTATS_MSG_MULTICAST, /* send data to a group */ +}; + +#ifdef CONFIG_TASKSTATS +extern kmem_cache_t *taskstats_cache; + +static inline void taskstats_exit_alloc(struct taskstats **ptidstats, + struct taskstats **ptgidstats) +{ + *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + *ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); +} + +static inline void taskstats_exit_free(struct taskstats *tidstats, + struct taskstats *tgidstats) +{ + if (tidstats) + kmem_cache_free(taskstats_cache, tidstats); + if (tgidstats) + kmem_cache_free(taskstats_cache, tgidstats); +} + +extern void taskstats_exit_send(struct task_struct *, struct taskstats *, + struct taskstats *); +extern void taskstats_init_early(void); + +#else +static inline void taskstats_exit_alloc(struct taskstats **ptidstats, + struct taskstats **ptgidstats) +{} +static inline void taskstats_exit_free(struct taskstats *ptidstats, + struct taskstats *ptgidstats) +{} +static inline void taskstats_exit_send(struct task_struct *tsk, + struct taskstats *tidstats, + struct taskstats *tgidstats) +{} +static inline void taskstats_init_early(void) +{} +#endif /* CONFIG_TASKSTATS */ + +#endif + diff --git a/init/Kconfig b/init/Kconfig index 90498a3e53da..56a7093b4e4c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -158,6 +158,19 @@ config BSD_PROCESS_ACCT_V3 for processing it. A preliminary version of these tools is available at . +config TASKSTATS + bool "Export task/process statistics through netlink (EXPERIMENTAL)" + depends on NET + default n + help + Export selected statistics for tasks/processes through the + generic netlink interface. Unlike BSD process accounting, the + statistics are available during the lifetime of tasks/processes as + responses to commands. Like BSD accounting, they are sent to user + space on task exit. + + Say N if unsure. 
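Putting the enums above together, a userspace query roughly follows the shape below. This is a hedged sketch, not part of the patch: error handling, NLMSG_ERROR checks and multi-part replies are omitted; GENLMSG_DATA()/NLA_DATA()/NLA_NEXT() are local conveniences rather than kernel-provided macros; and the getdelays.c utility mentioned later in this series is the complete version of this idea.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>

/* Local conveniences, not kernel API */
#define GENLMSG_DATA(msg) ((void *)((char *)NLMSG_DATA(&(msg)->n) + GENL_HDRLEN))
#define NLA_DATA(na)      ((void *)((char *)(na) + NLA_HDRLEN))
#define NLA_NEXT(na)      ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))

struct msgtemplate {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[1024];
};

static void send_cmd(int fd, __u16 type, __u8 cmd, __u16 attr,
		     void *data, __u16 len)
{
	struct sockaddr_nl dst = { .nl_family = AF_NETLINK };	/* the kernel */
	struct msgtemplate msg;
	struct nlattr *na;

	memset(&msg, 0, sizeof(msg));
	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	msg.n.nlmsg_type = type;
	msg.n.nlmsg_flags = NLM_F_REQUEST;
	msg.g.cmd = cmd;
	msg.g.version = 1;

	na = (struct nlattr *)GENLMSG_DATA(&msg);
	na->nla_type = attr;
	na->nla_len = NLA_HDRLEN + len;
	memcpy(NLA_DATA(na), data, len);
	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);

	sendto(fd, &msg, msg.n.nlmsg_len, 0,
	       (struct sockaddr *)&dst, sizeof(dst));
}

int main(void)
{
	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
	struct msgtemplate ans;
	struct nlattr *na;
	char name[] = TASKSTATS_GENL_NAME;
	__u32 pid = getpid();
	__u16 family_id = 0;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	bind(fd, (struct sockaddr *)&local, sizeof(local));

	/* Step 1: resolve the dynamic family id of "TASKSTATS" */
	send_cmd(fd, GENL_ID_CTRL, CTRL_CMD_GETFAMILY,
		 CTRL_ATTR_FAMILY_NAME, name, sizeof(name));
	recv(fd, &ans, sizeof(ans), 0);
	na = NLA_NEXT((struct nlattr *)GENLMSG_DATA(&ans)); /* skip FAMILY_NAME */
	if (na->nla_type == CTRL_ATTR_FAMILY_ID)
		family_id = *(__u16 *)NLA_DATA(na);

	/* Step 2: ask for our own per-pid stats */
	send_cmd(fd, family_id, TASKSTATS_CMD_GET,
		 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid));
	recv(fd, &ans, sizeof(ans), 0);

	/* Reply layout: AGGR_PID { TYPE_PID, TYPE_STATS } */
	na = (struct nlattr *)GENLMSG_DATA(&ans); /* TASKSTATS_TYPE_AGGR_PID */
	na = (struct nlattr *)NLA_DATA(na);       /* nested: TASKSTATS_TYPE_PID */
	na = NLA_NEXT(na);                        /* nested: TASKSTATS_TYPE_STATS */
	printf("taskstats version: %llu\n", (unsigned long long)
	       ((struct taskstats *)NLA_DATA(na))->version);

	close(fd);
	return 0;
}

At this point in the series struct taskstats holds only the __u64 version field; later patches widen it (and change the version field's type), which is exactly the versioning scheme the header comment describes.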
+ config TASK_DELAY_ACCT bool "Enable per-task delay accounting (EXPERIMENTAL)" help diff --git a/init/main.c b/init/main.c index 9e8e8c152142..8651a720a092 100644 --- a/init/main.c +++ b/init/main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -575,6 +576,7 @@ asmlinkage void __init start_kernel(void) proc_root_init(); #endif cpuset_init(); + taskstats_init_early(); delayacct_init(); check_bugs(); diff --git a/kernel/Makefile b/kernel/Makefile index 87bb34cc8938..d62ec66c1af2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o +obj-$(CONFIG_TASKSTATS) += taskstats.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/exit.c b/kernel/exit.c index 3c2cf91defa7..9852ed8c2988 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -844,6 +845,7 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; + struct taskstats *tidstats, *tgidstats; int group_dead; profile_task_exit(tsk); @@ -882,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); + taskstats_exit_alloc(&tidstats, &tgidstats); + acct_update_integrals(tsk); if (tsk->mm) { update_hiwater_rss(tsk->mm); @@ -901,7 +905,10 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + taskstats_exit_send(tsk, tidstats, tgidstats); + taskstats_exit_free(tidstats, tgidstats); delayacct_tsk_exit(tsk); + exit_mm(tsk); if (group_dead) diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..82ec9137d908 --- /dev/null +++ b/kernel/taskstats.c @@ -0,0 +1,336 @@ +/* + * taskstats.c - Export per-task statistics to userland + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include +#include +#include +#include + +static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static int family_registered; +kmem_cache_t *taskstats_cache; +static DEFINE_MUTEX(taskstats_exit_mutex); + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, +}; + +static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] +__read_mostly = { + [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, +}; + + +static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, + void **replyp, size_t size) +{ + struct sk_buff *skb; + void *reply; + + /* + * If new attributes are added, please revisit this allocation + */ + skb = nlmsg_new(size); + if (!skb) + return -ENOMEM; + + if (!info) { + int seq = get_cpu_var(taskstats_seqnum)++; + put_cpu_var(taskstats_seqnum); + + reply = genlmsg_put(skb, 0, seq, + family.id, 0, 0, + cmd, family.version); + } else + reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, + family.id, 0, 0, + cmd, family.version); + if (reply == NULL) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + *replyp = reply; + return 0; +} + +static int send_reply(struct sk_buff *skb, pid_t pid, int event) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *reply; + int rc; + + reply = genlmsg_data(genlhdr); + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + if (event == TASKSTATS_MSG_MULTICAST) + return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); + return genlmsg_unicast(skb, pid); +} + +static int fill_pid(pid_t pid, struct task_struct *pidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk = pidtsk; + + if (!pidtsk) { + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(tsk); + read_unlock(&tasklist_lock); + } else + get_task_struct(tsk); + + /* + * Each accounting subsystem adds calls to its functions to + * fill in relevant parts of struct taskstsats as follows + * + * rc = per-task-foo(stats, tsk); + * if (rc) + * goto err; + */ + +err: + put_task_struct(tsk); + return rc; + +} + +static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk, *first; + + first = tgidtsk; + read_lock(&tasklist_lock); + if (!first) { + first = find_task_by_pid(tgid); + if (!first) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + } + tsk = first; + do { + /* + * Each accounting subsystem adds calls its functions to + * fill in relevant parts of struct taskstsats as follows + * + * rc = per-task-foo(stats, tsk); + * if (rc) + * break; + */ + + } while_each_thread(first, tsk); + read_unlock(&tasklist_lock); + + /* + * Accounting subsytems can also add calls here if they don't + * wish to aggregate statistics for per-tgid stats + */ + + return rc; +} + +static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct taskstats stats; + void *reply; + size_t size; + struct nlattr *na; + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + memset(&stats, 0, sizeof(stats)); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + 
return rc; + + if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + rc = fill_pid(pid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { + u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + rc = fill_tgid(tgid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else { + rc = -EINVAL; + goto err; + } + + nla_nest_end(rep_skb, na); + + return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + +nla_put_failure: + return genlmsg_cancel(rep_skb, reply); +err: + nlmsg_free(rep_skb); + return rc; +} + +/* Send pid data out on exit */ +void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, + struct taskstats *tgidstats) +{ + int rc; + struct sk_buff *rep_skb; + void *reply; + size_t size; + int is_thread_group; + struct nlattr *na; + + if (!family_registered || !tidstats) + return; + + mutex_lock(&taskstats_exit_mutex); + + is_thread_group = !thread_group_empty(tsk); + rc = 0; + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + if (is_thread_group) + size = 2 * size; /* PID + STATS + TGID + STATS */ + + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + goto ret; + + rc = fill_pid(tsk->pid, tsk, tidstats); + if (rc < 0) + goto err_skb; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tidstats); + nla_nest_end(rep_skb, na); + + if (!is_thread_group || !tgidstats) { + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + } + + rc = fill_tgid(tsk->pid, tsk, tgidstats); + /* + * If fill_tgid() failed then one probable reason could be that the + * thread group leader has exited. fill_tgid() will fail, send out + * the pid statistics collected earlier. 
+ */ + if (rc < 0) { + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + } + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tgidstats); + nla_nest_end(rep_skb, na); + + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + +nla_put_failure: + genlmsg_cancel(rep_skb, reply); + goto ret; +err_skb: + nlmsg_free(rep_skb); +ret: + mutex_unlock(&taskstats_exit_mutex); + return; +} + +static struct genl_ops taskstats_ops = { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_send_stats, + .policy = taskstats_cmd_get_policy, +}; + +/* Needed early in initialization */ +void __init taskstats_init_early(void) +{ + taskstats_cache = kmem_cache_create("taskstats_cache", + sizeof(struct taskstats), + 0, SLAB_PANIC, NULL, NULL); +} + +static int __init taskstats_init(void) +{ + int rc; + + rc = genl_register_family(&family); + if (rc) + return rc; + + rc = genl_register_ops(&family, &taskstats_ops); + if (rc < 0) + goto err; + + family_registered = 1; + return 0; +err: + genl_unregister_family(&family); + return rc; +} + +/* + * late initcall ensures initialization of statistics collection + * mechanisms precedes initialization of the taskstats interface + */ +late_initcall(taskstats_init); -- cgit v1.2.3-55-g7522 From 6f44993fe1d7b2b097f6ac60cd5835c6f5ca0874 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:41 -0700 Subject: [PATCH] per-task-delay-accounting: delay accounting usage of taskstats interface Usage of taskstats interface by delay accounting. Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 15 ++++++++++ include/linux/sched.h | 1 + include/linux/taskstats.h | 55 ++++++++++++++++++++++++++++++++++++- include/linux/taskstats_kern.h | 1 + init/Kconfig | 1 + kernel/delayacct.c | 62 +++++++++++++++++++++++++++++++++++++++++- kernel/taskstats.c | 16 +++++++---- 7 files changed, 144 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 0ecbf9aad8e1..d955078a1441 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -18,6 +18,7 @@ #define _LINUX_DELAYACCT_H #include +#include /* * Per-task flags relevant to delay accounting @@ -35,6 +36,7 @@ extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(void); +extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); static inline void delayacct_set_flag(int flag) { @@ -74,6 +76,16 @@ static inline void delayacct_blkio_end(void) __delayacct_blkio_end(); } +static inline int delayacct_add_tsk(struct taskstats *d, + struct task_struct *tsk) +{ + if (likely(!delayacct_on)) + return -EINVAL; + if (!tsk->delays) + return 0; + return __delayacct_add_tsk(d, tsk); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -89,6 +101,9 @@ static inline void delayacct_blkio_start(void) {} static inline void delayacct_blkio_end(void) {} +static inline int delayacct_add_tsk(struct taskstats *d, + struct task_struct *tsk) +{ return 0; } #endif /* CONFIG_TASK_DELAY_ACCT */ #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index f751062d89a2..3c5610ca0c92 
100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -990,6 +990,7 @@ struct task_struct { */ struct pipe_inode_info *splice_pipe; #ifdef CONFIG_TASK_DELAY_ACCT + spinlock_t delays_lock; struct task_delay_info *delays; #endif }; diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h index 51f62759bea9..c6aeca32348e 100644 --- a/include/linux/taskstats.h +++ b/include/linux/taskstats.h @@ -34,7 +34,60 @@ struct taskstats { /* Version 1 */ - __u64 version; + __u16 version; + __u16 padding[3]; /* Userspace should not interpret the padding + * field which can be replaced by useful + * fields if struct taskstats is extended. + */ + + /* Delay accounting fields start + * + * All values, until comment "Delay accounting fields end" are + * available only if delay accounting is enabled, even though the last + * few fields are not delays + * + * xxx_count is the number of delay values recorded + * xxx_delay_total is the corresponding cumulative delay in nanoseconds + * + * xxx_delay_total wraps around to zero on overflow + * xxx_count incremented regardless of overflow + */ + + /* Delay waiting for cpu, while runnable + * count, delay_total NOT updated atomically + */ + __u64 cpu_count; + __u64 cpu_delay_total; + + /* Following four fields atomically updated using task->delays->lock */ + + /* Delay waiting for synchronous block I/O to complete + * does not account for delays in I/O submission + */ + __u64 blkio_count; + __u64 blkio_delay_total; + + /* Delay waiting for page fault I/O (swap in only) */ + __u64 swapin_count; + __u64 swapin_delay_total; + + /* cpu "wall-clock" running time + * On some architectures, value will adjust for cpu time stolen + * from the kernel in involuntary waits due to virtualization. + * Value is cumulative, in nanoseconds, without a corresponding count + * and wraps around to zero silently on overflow + */ + __u64 cpu_run_real_total; + + /* cpu "virtual" running time + * Uses time intervals seen by the kernel i.e. no adjustment + * for kernel's involuntary waits due to virtualization. 
+ * Value is cumulative, in nanoseconds, without a corresponding count + * and wraps around to zero silently on overflow + */ + __u64 cpu_run_virtual_total; + /* Delay accounting fields end */ + /* version 1 ends here */ }; diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index bd0ecb969c26..fc9da2e26443 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -17,6 +17,7 @@ enum { #ifdef CONFIG_TASKSTATS extern kmem_cache_t *taskstats_cache; +extern struct mutex taskstats_exit_mutex; static inline void taskstats_exit_alloc(struct taskstats **ptidstats, struct taskstats **ptgidstats) diff --git a/init/Kconfig b/init/Kconfig index 56a7093b4e4c..a099fc6526d9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -173,6 +173,7 @@ config TASKSTATS config TASK_DELAY_ACCT bool "Enable per-task delay accounting (EXPERIMENTAL)" + depends on TASKSTATS help Collect information on time spent by a task waiting for system resources like cpu, synchronous block I/O completion and swapping diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3546b0800f9f..1be274a462ca 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -41,6 +41,10 @@ void delayacct_init(void) void __delayacct_tsk_init(struct task_struct *tsk) { + spin_lock_init(&tsk->delays_lock); + /* No need to acquire tsk->delays_lock for allocation here unless + __delayacct_tsk_init called after tsk is attached to tasklist + */ tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); if (tsk->delays) spin_lock_init(&tsk->delays->lock); @@ -48,8 +52,11 @@ void __delayacct_tsk_init(struct task_struct *tsk) void __delayacct_tsk_exit(struct task_struct *tsk) { - kmem_cache_free(delayacct_cache, tsk->delays); + struct task_delay_info *delays = tsk->delays; + spin_lock(&tsk->delays_lock); tsk->delays = NULL; + spin_unlock(&tsk->delays_lock); + kmem_cache_free(delayacct_cache, delays); } /* @@ -104,3 +111,56 @@ void __delayacct_blkio_end(void) ¤t->delays->blkio_delay, ¤t->delays->blkio_count); } + +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +{ + s64 tmp; + struct timespec ts; + unsigned long t1,t2,t3; + + spin_lock(&tsk->delays_lock); + + /* Though tsk->delays accessed later, early exit avoids + * unnecessary returning of other data + */ + if (!tsk->delays) + goto done; + + tmp = (s64)d->cpu_run_real_total; + cputime_to_timespec(tsk->utime + tsk->stime, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; + + /* + * No locking available for sched_info (and too expensive to add one) + * Mitigate by taking snapshot of values + */ + t1 = tsk->sched_info.pcnt; + t2 = tsk->sched_info.run_delay; + t3 = tsk->sched_info.cpu_time; + + d->cpu_count += t1; + + jiffies_to_timespec(t2, &ts); + tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); + d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; + + tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; + d->cpu_run_virtual_total = + (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ + + spin_lock(&tsk->delays->lock); + tmp = d->blkio_delay_total + tsk->delays->blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; + tmp = d->swapin_delay_total + tsk->delays->swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 
0 : tmp; + d->blkio_count += tsk->delays->blkio_count; + d->swapin_count += tsk->delays->swapin_count; + spin_unlock(&tsk->delays->lock); + +done: + spin_unlock(&tsk->delays_lock); + return 0; +} diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 82ec9137d908..ea9506de3b85 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -18,13 +18,13 @@ #include #include +#include #include #include static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; -static DEFINE_MUTEX(taskstats_exit_mutex); static struct genl_family family = { .id = GENL_ID_GENERATE, @@ -120,7 +120,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * goto err; */ -err: + rc = delayacct_add_tsk(stats, tsk); + stats->version = TASKSTATS_VERSION; + + /* Define err: label here if needed */ put_task_struct(tsk); return rc; @@ -152,8 +155,14 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, * break; */ + rc = delayacct_add_tsk(stats, tsk); + if (rc) + break; + } while_each_thread(first, tsk); read_unlock(&tasklist_lock); + stats->version = TASKSTATS_VERSION; + /* * Accounting subsytems can also add calls here if they don't @@ -233,8 +242,6 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, if (!family_registered || !tidstats) return; - mutex_lock(&taskstats_exit_mutex); - is_thread_group = !thread_group_empty(tsk); rc = 0; @@ -292,7 +299,6 @@ nla_put_failure: err_skb: nlmsg_free(rep_skb); ret: - mutex_unlock(&taskstats_exit_mutex); return; } -- cgit v1.2.3-55-g7522 From 25890454667b3295f67b3372352be90705f8667c Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:43 -0700 Subject: [PATCH] per-task-delay-accounting: /proc export of aggregated block I/O delays Export I/O delays seen by a task through /proc//stats for use in top etc. 
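A hedged userspace sketch of consuming the new field follows (not part of the patch; it assumes the counter lands as the 42nd whitespace-separated field of /proc/<pid>/stat, which is where the do_task_stat() change below appends it, with values in clock ticks as nsec_to_clock_t() implies):

#include <stdio.h>

/*
 * Sketch: read a task's aggregated block I/O delay.  Assumes the
 * new counter is field 42 of /proc/<pid>/stat.  Note that field 2
 * (comm) can contain spaces; a robust parser scans past the closing
 * ')' first.  Kept naive for brevity.
 */
static unsigned long long read_blkio_ticks(int pid)
{
	unsigned long long ticks = 0;
	char path[64];
	FILE *f;
	int i;

	snprintf(path, sizeof(path), "/proc/%d/stat", pid);
	f = fopen(path, "r");
	if (!f)
		return 0;
	for (i = 0; i < 41; i++)		/* skip fields 1..41 */
		if (fscanf(f, "%*s") == EOF)
			break;
	fscanf(f, "%llu", &ticks);		/* delayacct_blkio_ticks */
	fclose(f);
	return ticks;	/* clock ticks; divide by sysconf(_SC_CLK_TCK) */
}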
Note that delays for I/O done for swapping in pages (swapin I/O) is clubbed together with all other I/O here (this is not the case in the netlink interface where the swapin I/O is kept distinct) [akpm@osdl.org: printk warning fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 6 ++++-- include/linux/delayacct.h | 10 ++++++++++ kernel/delayacct.c | 12 ++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index 7495d3e20775..0b615d62a159 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -411,7 +412,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu %llu\n", task->pid, tcomm, state, @@ -455,7 +456,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) task->exit_signal, task_cpu(task), task->rt_priority, - task->policy); + task->policy, + (unsigned long long)delayacct_blkio_ticks(task)); if(mm) mmput(mm); return res; diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index d955078a1441..7e8b6011b8f3 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -37,6 +37,7 @@ extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(void); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); +extern __u64 __delayacct_blkio_ticks(struct task_struct *); static inline void delayacct_set_flag(int flag) { @@ -86,6 +87,13 @@ static inline int delayacct_add_tsk(struct taskstats *d, return __delayacct_add_tsk(d, tsk); } +static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) +{ + if (tsk->delays) + return __delayacct_blkio_ticks(tsk); + return 0; +} + #else static inline void delayacct_set_flag(int flag) {} @@ -104,6 +112,8 @@ static inline void delayacct_blkio_end(void) static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { return 0; } +static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) +{ return 0; } #endif /* CONFIG_TASK_DELAY_ACCT */ #endif diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 1be274a462ca..f05392d64267 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -164,3 +164,15 @@ done: spin_unlock(&tsk->delays_lock); return 0; } + +__u64 __delayacct_blkio_ticks(struct task_struct *tsk) +{ + __u64 ret; + + spin_lock(&tsk->delays->lock); + ret = nsec_to_clock_t(tsk->delays->blkio_delay + + tsk->delays->swapin_delay); + spin_unlock(&tsk->delays->lock); + return ret; +} + -- cgit v1.2.3-55-g7522 From ad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:44 -0700 Subject: [PATCH] delay accounting taskstats interface send tgid once Send per-tgid data only once during exit of a thread group instead of once with each member thread exit. Currently, when a thread exits, besides its per-tid data, the per-tgid data of its thread group is also sent out, if its thread group is non-empty. 
The per-tgid data sent consists of the sum of per-tid stats for all *remaining* threads of the thread group. This patch modifies this sending in two ways:

- the per-tgid data is sent only when the last thread of a thread group exits. This cuts down heavily on the overhead of sending/receiving per-tgid data, especially when other exploiters of the taskstats interface aren't interested in per-tgid stats

- the semantics of the per-tgid data sent are changed. Instead of being the sum of per-tid data for remaining threads, the value now sent is the true total accumulated statistics for all threads that are/were part of the thread group.

The patch also addresses a minor issue where failure of one accounting subsystem to fill in the taskstats structure was causing the taskstats data to not be sent at all.

The patch has been tested for stability and has run cerberus for over 4 hours on an SMP. [akpm@osdl.org: bugfixes] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/delay-accounting.txt | 13 ++-- Documentation/accounting/taskstats.txt | 33 ++++----- MAINTAINERS | 12 ++++ include/linux/sched.h | 4 ++ include/linux/taskstats_kern.h | 71 ++++++++++++++----- kernel/exit.c | 8 +-- kernel/fork.c | 4 ++ kernel/taskstats.c | 98 ++++++++++++++++++--------- 8 files changed, 162 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt index f3dc0ca04fa4..be215e58423b 100644 --- a/Documentation/accounting/delay-accounting.txt +++ b/Documentation/accounting/delay-accounting.txt @@ -48,9 +48,10 @@ counter (say cpu_delay_total) for a task will give the delay experienced by the task waiting for the corresponding resource in that interval. -When a task exits, records containing the per-task and per-process statistics -are sent to userspace without requiring a command. More details are given in -the taskstats interface description. +When a task exits, records containing the per-task statistics +are sent to userspace without requiring a command. If it is the last exiting +task of a thread group, the per-tgid statistics are also sent. More details +are given in the taskstats interface description. The getdelays.c userspace utility in this directory allows simple commands to be run and the corresponding delay statistics to be displayed. It also serves @@ -107,9 +108,3 @@ IO count delay total 0 0 MEM count delay total 0 0 - - - - - - diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt index acc6b4f37fc7..efd8f605bcd5 100644 --- a/Documentation/accounting/taskstats.txt +++ b/Documentation/accounting/taskstats.txt @@ -32,12 +32,11 @@ The response contains statistics for a task (if pid is specified) or the sum of statistics for all tasks of the process (if tgid is specified). To obtain statistics for tasks which are exiting, userspace opens a multicast -netlink socket. Each time a task exits, two records are sent by the kernel to -each listener on the multicast socket. The first is the per-pid task's statistics -and the second is the sum for all tasks of the process to which the task -belongs (the task does not need to be the thread group leader). The need for -per-tgid stats to be sent for each exiting task is explained in the per-tgid -stats section below.
Each time a task exits, its per-pid statistics is always sent +by the kernel to each listener on the multicast socket. In addition, if it is +the last thread exiting its thread group, an additional record containing the +per-tgid stats are also sent. The latter contains the sum of per-pid stats for +all threads in the thread group, both past and present. getdelays.c is a simple utility demonstrating usage of the taskstats interface for reporting delay accounting statistics. @@ -104,20 +103,14 @@ stats in userspace alone is inefficient and potentially inaccurate (due to lack of atomicity). However, maintaining per-process, in addition to per-task stats, within the -kernel has space and time overheads. Hence the taskstats implementation -dynamically sums up the per-task stats for each task belonging to a process -whenever per-process stats are needed. - -Not maintaining per-tgid stats creates a problem when userspace is interested -in getting these stats when the process dies i.e. the last thread of -a process exits. It isn't possible to simply return some aggregated per-process -statistic from the kernel. - -The approach taken by taskstats is to return the per-tgid stats *each* time -a task exits, in addition to the per-pid stats for that task. Userspace can -maintain task<->process mappings and use them to maintain the per-process stats -in userspace, updating the aggregate appropriately as the tasks of a process -exit. +kernel has space and time overheads. To address this, the taskstats code +accumalates each exiting task's statistics into a process-wide data structure. +When the last task of a process exits, the process level data accumalated also +gets sent to userspace (along with the per-task data). + +When a user queries to get per-tgid data, the sum of all other live threads in +the group is added up and added to the accumalated total for previously exited +threads of the same thread group. 
Extending taskstats ------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 0557cfde053d..e99028ca2f7c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2240,6 +2240,12 @@ M: tsbogend@alpha.franken.de L: netdev@vger.kernel.org S: Maintained +PER-TASK DELAY ACCOUNTING +P: Shailabh Nagar +M: nagar@watson.ibm.com +L: linux-kernel@vger.kernel.org +S: Maintained + PERSONALITY HANDLING P: Christoph Hellwig M: hch@infradead.org @@ -2767,6 +2773,12 @@ P: Deepak Saxena M: dsaxena@plexity.net S: Maintained +TASKSTATS STATISTICS INTERFACE +P: Shailabh Nagar +M: nagar@watson.ibm.com +L: linux-kernel@vger.kernel.org +S: Maintained + TI PARALLEL LINK CABLE DRIVER P: Romain Lievin M: roms@lpg.ticalc.org diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c5610ca0c92..6afa72e080cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -463,6 +463,10 @@ struct signal_struct { #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif +#ifdef CONFIG_TASKSTATS + spinlock_t stats_lock; + struct taskstats *stats; +#endif }; /* Context switch must be unlocked if interrupts are to be enabled */ diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index fc9da2e26443..0ae8f67af1fd 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -19,36 +19,75 @@ enum { extern kmem_cache_t *taskstats_cache; extern struct mutex taskstats_exit_mutex; -static inline void taskstats_exit_alloc(struct taskstats **ptidstats, - struct taskstats **ptgidstats) +static inline void taskstats_exit_alloc(struct taskstats **ptidstats) { *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); - *ptgidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); } -static inline void taskstats_exit_free(struct taskstats *tidstats, - struct taskstats *tgidstats) +static inline void taskstats_exit_free(struct taskstats *tidstats) { if (tidstats) kmem_cache_free(taskstats_cache, tidstats); - if (tgidstats) - kmem_cache_free(taskstats_cache, tgidstats); } -extern void taskstats_exit_send(struct task_struct *, struct taskstats *, - struct taskstats *); -extern void taskstats_init_early(void); +static inline void taskstats_tgid_init(struct signal_struct *sig) +{ + spin_lock_init(&sig->stats_lock); + sig->stats = NULL; +} + +static inline void taskstats_tgid_alloc(struct signal_struct *sig) +{ + struct taskstats *stats; + unsigned long flags; + + stats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!stats) + return; + + spin_lock_irqsave(&sig->stats_lock, flags); + if (!sig->stats) { + sig->stats = stats; + stats = NULL; + } + spin_unlock_irqrestore(&sig->stats_lock, flags); + + if (stats) + kmem_cache_free(taskstats_cache, stats); +} +static inline void taskstats_tgid_free(struct signal_struct *sig) +{ + struct taskstats *stats = NULL; + unsigned long flags; + + spin_lock_irqsave(&sig->stats_lock, flags); + if (sig->stats) { + stats = sig->stats; + sig->stats = NULL; + } + spin_unlock_irqrestore(&sig->stats_lock, flags); + if (stats) + kmem_cache_free(taskstats_cache, stats); +} + +extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int); +extern void taskstats_init_early(void); +extern void taskstats_tgid_alloc(struct signal_struct *); #else -static inline void taskstats_exit_alloc(struct taskstats **ptidstats, - struct taskstats **ptgidstats) +static inline void taskstats_exit_alloc(struct taskstats **ptidstats) {} -static inline void taskstats_exit_free(struct taskstats *ptidstats, - 
struct taskstats *ptgidstats) +static inline void taskstats_exit_free(struct taskstats *ptidstats) {} static inline void taskstats_exit_send(struct task_struct *tsk, - struct taskstats *tidstats, - struct taskstats *tgidstats) + struct taskstats *tidstats, + int group_dead) +{} +static inline void taskstats_tgid_init(struct signal_struct *sig) +{} +static inline void taskstats_tgid_alloc(struct signal_struct *sig) +{} +static inline void taskstats_tgid_free(struct signal_struct *sig) {} static inline void taskstats_init_early(void) {} diff --git a/kernel/exit.c b/kernel/exit.c index 9852ed8c2988..67c1e9a4f812 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -845,7 +845,7 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; - struct taskstats *tidstats, *tgidstats; + struct taskstats *tidstats; int group_dead; profile_task_exit(tsk); @@ -884,7 +884,7 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats, &tgidstats); + taskstats_exit_alloc(&tidstats); acct_update_integrals(tsk); if (tsk->mm) { @@ -905,8 +905,8 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, tgidstats); - taskstats_exit_free(tidstats, tgidstats); + taskstats_exit_send(tsk, tidstats, group_dead); + taskstats_exit_free(tidstats); delayacct_tsk_exit(tsk); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 451cfd35bf22..1b0f7b1e0881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -819,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(&current->signal->count); atomic_inc(&current->signal->live); + taskstats_tgid_alloc(current->signal); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); @@ -863,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); + taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -884,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); + taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea9506de3b85..4a0a5022b299 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, struct taskstats *stats) { - int rc; struct task_struct *tsk, *first; + unsigned long flags; + /* + * Add additional stats from live tasks except zombie thread group + * leaders who are already counted with the dead tasks + */ first = tgidtsk; - read_lock(&tasklist_lock); if (!first) { + read_lock(&tasklist_lock); first = find_task_by_pid(tgid); if (!first) { read_unlock(&tasklist_lock); return -ESRCH; } - } + get_task_struct(first); + read_unlock(&tasklist_lock); + } else + get_task_struct(first); + + /* Start with stats from dead tasks */ + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); +
spin_unlock_irqrestore(&first->signal->stats_lock, flags); + tsk = first; + read_lock(&tasklist_lock); do { + if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + continue; /* - * Each accounting subsystem adds calls its functions to + * Accounting subsystem can call its functions here to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * break; + * per-task-foo(stats, tsk); */ - - rc = delayacct_add_tsk(stats, tsk); - if (rc) - break; + delayacct_add_tsk(stats, tsk); } while_each_thread(first, tsk); read_unlock(&tasklist_lock); stats->version = TASKSTATS_VERSION; - /* - * Accounting subsytems can also add calls here if they don't - * wish to aggregate statistics for per-tgid stats + * Accounting subsytems can also add calls here to modify + * fields of taskstats. */ - return rc; + return 0; +} + + +static void fill_tgid_exit(struct task_struct *tsk) +{ + unsigned long flags; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + if (!tsk->signal->stats) + goto ret; + + /* + * Each accounting subsystem calls its functions here to + * accumalate its per-task stats for tsk, into the per-tgid structure + * + * per-task-foo(tsk->signal->stats, tsk); + */ + delayacct_add_tsk(tsk->signal->stats, tsk); +ret: + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + return; } + static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) { int rc = 0; @@ -230,7 +263,7 @@ err: /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - struct taskstats *tgidstats) + int group_dead) { int rc; struct sk_buff *rep_skb; @@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size_t size; int is_thread_group; struct nlattr *na; + unsigned long flags; if (!family_registered || !tidstats) return; - is_thread_group = !thread_group_empty(tsk); - rc = 0; + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + is_thread_group = tsk->signal->stats ? 1 : 0; + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + rc = 0; /* * Size includes space for nested attributes */ @@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, *tidstats); nla_nest_end(rep_skb, na); - if (!is_thread_group || !tgidstats) { - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; - } + if (!is_thread_group) + goto send; - rc = fill_tgid(tsk->pid, tsk, tgidstats); /* - * If fill_tgid() failed then one probable reason could be that the - * thread group leader has exited. fill_tgid() will fail, send out - * the pid statistics collected earlier. 
+ * tsk has/had a thread group so fill the tsk->signal->stats structure + * Doesn't matter if tsk is the leader or the last group member leaving */ - if (rc < 0) { - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; - } + + fill_tgid_exit(tsk); + if (!group_dead) + goto send; na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + /* No locking needed for tsk->signal->stats since group is dead */ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - *tgidstats); + *tsk->signal->stats); nla_nest_end(rep_skb, na); +send: send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; + return; nla_put_failure: genlmsg_cancel(rep_skb, reply); -- cgit v1.2.3-55-g7522 From f9fd8914c1acca0d98b69d831b128d5b52f03c51 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:47 -0700 Subject: [PATCH] per-task delay accounting taskstats interface: control exit data through cpumasks On systems with a large number of cpus, with even a modest rate of tasks exiting per cpu, the volume of taskstats data sent on thread exit can overflow a userspace listener's buffers. One approach to avoiding overflow is to allow listeners to get data for a limited and specific set of cpus. By scaling the number of listeners and/or the cpus they monitor, userspace can handle the statistical data overload more gracefully. In this patch, each listener registers to listen to a specific set of cpus by specifying a cpumask. The interest is recorded per-cpu. When a task exits on a cpu, its taskstats data is unicast to each listener interested in that cpu. Thanks to Andrew Morton for pointing out the various scalability and general concerns of previous attempts and for suggesting this design. [akpm@osdl.org: build fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/taskstats.h | 4 +- include/linux/taskstats_kern.h | 27 +----- kernel/exit.c | 5 +- kernel/taskstats.c | 200 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 198 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h index c6aeca32348e..f1cb6cddd19d 100644 --- a/include/linux/taskstats.h +++ b/include/linux/taskstats.h @@ -91,8 +91,6 @@ struct taskstats { }; -#define TASKSTATS_LISTEN_GROUP 0x1 - /* * Commands sent from userspace * Not versioned. 
New commands should only be inserted at the enum's end @@ -124,6 +122,8 @@ enum { TASKSTATS_CMD_ATTR_UNSPEC = 0, TASKSTATS_CMD_ATTR_PID, TASKSTATS_CMD_ATTR_TGID, + TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, + TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK, __TASKSTATS_CMD_ATTR_MAX, }; diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index 2b6adec3a2e4..16894b7edcc8 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -11,30 +11,10 @@ #include #include -enum { - TASKSTATS_MSG_UNICAST, /* send data only to requester */ - TASKSTATS_MSG_MULTICAST, /* send data to a group */ -}; - #ifdef CONFIG_TASKSTATS extern kmem_cache_t *taskstats_cache; extern struct mutex taskstats_exit_mutex; -static inline int taskstats_has_listeners(void) -{ - if (!genl_sock) - return 0; - return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP); -} - - -static inline void taskstats_exit_alloc(struct taskstats **ptidstats) -{ - *ptidstats = NULL; - if (taskstats_has_listeners()) - *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); -} - static inline void taskstats_exit_free(struct taskstats *tidstats) { if (tidstats) @@ -82,17 +62,18 @@ static inline void taskstats_tgid_free(struct signal_struct *sig) kmem_cache_free(taskstats_cache, stats); } -extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int); +extern void taskstats_exit_alloc(struct taskstats **, unsigned int *); +extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int); extern void taskstats_init_early(void); extern void taskstats_tgid_alloc(struct signal_struct *); #else -static inline void taskstats_exit_alloc(struct taskstats **ptidstats) +static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) {} static inline void taskstats_exit_free(struct taskstats *ptidstats) {} static inline void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead) + int group_dead, unsigned int cpu) {} static inline void taskstats_tgid_init(struct signal_struct *sig) {} diff --git a/kernel/exit.c b/kernel/exit.c index 67c1e9a4f812..dba194a8d416 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -847,6 +847,7 @@ fastcall NORET_TYPE void do_exit(long code) struct task_struct *tsk = current; struct taskstats *tidstats; int group_dead; + unsigned int mycpu; profile_task_exit(tsk); @@ -884,7 +885,7 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats); + taskstats_exit_alloc(&tidstats, &mycpu); acct_update_integrals(tsk); if (tsk->mm) { @@ -905,7 +906,7 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, group_dead); + taskstats_exit_send(tsk, tidstats, group_dead, mycpu); taskstats_exit_free(tidstats); delayacct_tsk_exit(tsk); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4a0a5022b299..abb59e323544 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -19,9 +19,17 @@ #include #include #include +#include +#include #include #include +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; @@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 
__read_mostly = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +struct listener { + struct list_head list; + pid_t pid; }; +struct listener_list { + struct rw_semaphore sem; + struct list_head list; +}; +static DEFINE_PER_CPU(struct listener_list, listener_array); + +enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, void **replyp, size_t size) @@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, return 0; } -static int send_reply(struct sk_buff *skb, pid_t pid, int event) +/* + * Send taskstats data in @skb to listener with nl_pid @pid + */ +static int send_reply(struct sk_buff *skb, pid_t pid) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); - void *reply; + void *reply = genlmsg_data(genlhdr); int rc; - reply = genlmsg_data(genlhdr); - rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); return rc; } - if (event == TASKSTATS_MSG_MULTICAST) - return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); return genlmsg_unicast(skb, pid); } +/* + * Send taskstats data in @skb to listeners registered for @cpu's exit data + */ +static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + struct listener_list *listeners; + struct listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, ret; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + rc = 0; + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) { + nlmsg_free(skb_cur); + rc = -ENOMEM; + break; + } + } + ret = genlmsg_unicast(skb_cur, s->pid); + if (ret == -ECONNREFUSED) { + list_del(&s->list); + kfree(s); + rc = ret; + } + skb_cur = skb_next; + } + up_write(&listeners->sem); + + return rc; +} + static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { @@ -204,8 +272,73 @@ ret: return; } +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +{ + struct listener_list *listeners; + struct listener *s, *tmp; + unsigned int cpu; + cpumask_t mask = *maskp; -static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) + if (!cpus_subset(mask, cpu_possible_map)) + return -EINVAL; + + if (isadd == REGISTER) { + for_each_cpu_mask(cpu, mask) { + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, + cpu_to_node(cpu)); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + } + return 0; + } + + /* Deregister or cleanup */ +cleanup: + for_each_cpu_mask(cpu, mask) { + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return 0; +} + +static int parse(struct nlattr *na, cpumask_t *mask) +{ + char *data; 
+ int len; + int ret; + + if (na == NULL) + return 1; + len = nla_len(na); + if (len > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + if (len < 1) + return -EINVAL; + data = kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + nla_strlcpy(data, na, len); + ret = cpulist_parse(data, *mask); + kfree(data); + return ret; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; struct sk_buff *rep_skb; @@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) void *reply; size_t size; struct nlattr *na; + cpumask_t mask; + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, REGISTER); + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, DEREGISTER); /* * Size includes space for nested attributes @@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) nla_nest_end(rep_skb, na); - return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + return send_reply(rep_skb, info->snd_pid); nla_put_failure: return genlmsg_cancel(rep_skb, reply); @@ -261,9 +407,35 @@ err: return rc; } +void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +{ + struct listener_list *listeners; + struct taskstats *tmp; + /* + * This is the cpu on which the task is exiting currently and will + * be the one for which the exit event is sent, even if the cpu + * on which this function is running changes later. + */ + *mycpu = raw_smp_processor_id(); + + *ptidstats = NULL; + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!tmp) + return; + + listeners = &per_cpu(listener_array, *mycpu); + down_read(&listeners->sem); + if (!list_empty(&listeners->list)) { + *ptidstats = tmp; + tmp = NULL; + } + up_read(&listeners->sem); + kfree(tmp); +} + /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead) + int group_dead, unsigned int mycpu) { int rc; struct sk_buff *rep_skb; @@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, nla_nest_end(rep_skb, na); send: - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + send_cpu_listeners(rep_skb, mycpu); return; nla_put_failure: @@ -338,16 +510,22 @@ ret: static struct genl_ops taskstats_ops = { .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_send_stats, + .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, }; /* Needed early in initialization */ void __init taskstats_init_early(void) { + unsigned int i; + taskstats_cache = kmem_cache_create("taskstats_cache", sizeof(struct taskstats), 0, SLAB_PANIC, NULL, NULL); + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } } static int __init taskstats_init(void) -- cgit v1.2.3-55-g7522 From bb129994c3bff9c5e8df91f05d7e9b6402fbd83f Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:47 -0700 Subject: [PATCH] Remove down_write() from taskstats code invoked on the exit() path In send_cpu_listeners(), which is called on the exit path, a down_write() was protecting operations like skb_clone() and genlmsg_unicast() that do GFP_KERNEL allocations. 
If the oom-killer decides to kill tasks to satisfy the allocations, the exit of those tasks could block on the same semaphore. The down_write() was only needed to allow removal of invalid listeners from the listener list. The patch converts the down_write to a down_read and defers the removal to a separate critical region. This ensures that even if the oom-killer is called, no other task's exit is blocked, as exiting tasks can still acquire the semaphore for reading. Thanks to Andrew Morton & Herbert Xu for pointing out the oom-related pitfalls, and to Chandra Seetharaman for suggesting this fix instead of using something more complex like RCU. Signed-off-by: Chandra Seetharaman Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index abb59e323544..f45179ce028e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -51,6 +51,7 @@ __read_mostly = { struct listener { struct list_head list; pid_t pid; + char valid; }; struct listener_list { @@ -127,7 +128,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, ret; + int rc, ret, delcount = 0; rc = genlmsg_end(skb, reply); if (rc < 0) { @@ -137,7 +138,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) rc = 0; listeners = &per_cpu(listener_array, cpu); - down_write(&listeners->sem); + down_read(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { @@ -150,14 +151,26 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } ret = genlmsg_unicast(skb_cur, s->pid); if (ret == -ECONNREFUSED) { - list_del(&s->list); - kfree(s); + s->valid = 0; + delcount++; rc = ret; } skb_cur = skb_next; } - up_write(&listeners->sem); + up_read(&listeners->sem); + + if (!delcount) + return rc; + /* Delete invalidated entries */ + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (!s->valid) { + list_del(&s->list); + kfree(s); + } + } + up_write(&listeners->sem); return rc; } @@ -290,6 +303,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) goto cleanup; s->pid = pid; INIT_LIST_HEAD(&s->list); + s->valid = 1; listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); -- cgit v1.2.3-55-g7522 From aa95387774039096c11803c04011f1aa42d85758 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Jul 2006 12:12:16 -0700 Subject: cpu hotplug: simplify and hopefully fix locking The CPU hotplug locking was quite messy, with a recursive lock to handle the fact that both the actual up/down sequence wanted to protect itself from being re-entered, but the callbacks that it called also tended to want to protect themselves from CPU events. This splits the lock into two (one to serialize the whole hotplug sequence, the other to protect against the CPU present bitmaps changing). The latter still allows recursive usage because some subsystems (ondemand policy for cpufreq at least) had already gotten too used to the lax locking, but the locking mistakes are hopefully now less fundamental, and we now warn about recursive lock usage when we see it, in the hope that it can be fixed.
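To make the split concrete, here is a minimal sketch of the intended caller pattern (the locking helpers come from this patch; the function itself is hypothetical):

    #include <linux/cpu.h>
    #include <linux/cpumask.h>

    /* Hypothetical helper: walk the online map with hotplug held off. */
    static int count_online_cpus_stable(void)
    {
            int cpu, n = 0;

            lock_cpu_hotplug();     /* takes cpu_bitmask_lock; recursion warns */
            for_each_online_cpu(cpu)
                    n++;
            unlock_cpu_hotplug();

            return n;
    }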
Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 6 ----- kernel/cpu.c | 75 ++++++++++++++++++++++++----------------------------- 2 files changed, 34 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 44a11f1ccaf2..8fb344a9abd8 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -48,7 +48,6 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb) { } #endif -extern int current_in_cpu_hotplug(void); int cpu_up(unsigned int cpu); @@ -61,10 +60,6 @@ static inline int register_cpu_notifier(struct notifier_block *nb) static inline void unregister_cpu_notifier(struct notifier_block *nb) { } -static inline int current_in_cpu_hotplug(void) -{ - return 0; -} #endif /* CONFIG_SMP */ extern struct sysdev_class cpu_sysdev_class; @@ -73,7 +68,6 @@ extern struct sysdev_class cpu_sysdev_class; /* Stop CPUs going up and down. */ extern void lock_cpu_hotplug(void); extern void unlock_cpu_hotplug(void); -extern int lock_cpu_hotplug_interruptible(void); #define hotcpu_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ { .notifier_call = fn, .priority = pri }; \ diff --git a/kernel/cpu.c b/kernel/cpu.c index 70fbf2e83766..f230f9ae01c2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -16,56 +16,48 @@ #include /* This protects CPUs going up and down... */ -static DEFINE_MUTEX(cpucontrol); +static DEFINE_MUTEX(cpu_add_remove_lock); +static DEFINE_MUTEX(cpu_bitmask_lock); static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *lock_cpu_hotplug_owner; -static int lock_cpu_hotplug_depth; -static int __lock_cpu_hotplug(int interruptible) -{ - int ret = 0; - - if (lock_cpu_hotplug_owner != current) { - if (interruptible) - ret = mutex_lock_interruptible(&cpucontrol); - else - mutex_lock(&cpucontrol); - } - - /* - * Set only if we succeed in locking - */ - if (!ret) { - lock_cpu_hotplug_depth++; - lock_cpu_hotplug_owner = current; - } - - return ret; -} +/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ +static struct task_struct *recursive; +static int recursive_depth; void lock_cpu_hotplug(void) { - __lock_cpu_hotplug(0); + struct task_struct *tsk = current; + + if (tsk == recursive) { + static int warnings = 10; + if (warnings) { + printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); + WARN_ON(1); + warnings--; + } + recursive_depth++; + return; + } + mutex_lock(&cpu_bitmask_lock); + recursive = tsk; } EXPORT_SYMBOL_GPL(lock_cpu_hotplug); void unlock_cpu_hotplug(void) { - if (--lock_cpu_hotplug_depth == 0) { - lock_cpu_hotplug_owner = NULL; - mutex_unlock(&cpucontrol); + WARN_ON(recursive != current); + if (recursive_depth) { + recursive_depth--; + return; } + mutex_unlock(&cpu_bitmask_lock); + recursive = NULL; } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); -int lock_cpu_hotplug_interruptible(void) -{ - return __lock_cpu_hotplug(1); -} -EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? 
*/ @@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu) struct task_struct *p; cpumask_t old_allowed, tmp; - if ((err = lock_cpu_hotplug_interruptible()) != 0) - return err; - + mutex_lock(&cpu_add_remove_lock); if (num_online_cpus() == 1) { err = -EBUSY; goto out; @@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu) cpu_clear(cpu, tmp); set_cpus_allowed(current, tmp); + mutex_lock(&cpu_bitmask_lock); p = __stop_machine_run(take_cpu_down, NULL, cpu); + mutex_unlock(&cpu_bitmask_lock); + if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, @@ -187,7 +180,7 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out: - unlock_cpu_hotplug(); + mutex_unlock(&cpu_add_remove_lock); return err; } #endif /*CONFIG_HOTPLUG_CPU*/ @@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu) int ret; void *hcpu = (void *)(long)cpu; - if ((ret = lock_cpu_hotplug_interruptible()) != 0) - return ret; - + mutex_lock(&cpu_add_remove_lock); if (cpu_online(cpu) || !cpu_present(cpu)) { ret = -EINVAL; goto out; @@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu) } /* Arch-specific enabling code. */ + mutex_lock(&cpu_bitmask_lock); ret = __cpu_up(cpu); + mutex_unlock(&cpu_bitmask_lock); if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -227,6 +220,6 @@ out_notify: blocking_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); out: - unlock_cpu_hotplug(); + mutex_unlock(&cpu_add_remove_lock); return ret; } -- cgit v1.2.3-55-g7522 From abb5a5cc6bba1516403146c5b79036fe843beb70 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 23 Jul 2006 11:36:08 -0700 Subject: [PATCH] Cpuset: fix ABBA deadlock with cpu hotplug lock Fix ABBA deadlock between lock_cpu_hotplug() and the cpuset callback_mutex lock. It only happens on cpu_exclusive cpusets, due to the dynamic sched domain code trying to take the cpu hotplug lock inside the cpuset callback_mutex lock. This bug has apparently been here for several months, but didn't get hit until the right customer load on a large system. This fix appears right from inspection, but it will take a few more days running it on that customer's workload to be confident we nailed it. We don't have any other reproducible test case. The cpu hotplug lock tends to cover large runs of code. The other places that hold both that lock and the cpuset callback mutex lock always nest the cpuset lock inside the hotplug lock. This place tries to do the reverse, risking an ABBA deadlock. This is in the cpuset_rmdir() code, where we: * take the callback_mutex lock * mark the cpuset CS_REMOVED * call update_cpu_domains for cpu_exclusive cpusets * in that call, take the cpu_hotplug lock if the cpuset is marked for removal. Thanks to Jack Steiner for identifying this deadlock. The fix is to tear down the dynamic sched domain before we grab the cpuset callback_mutex lock. This way, the two locks are serialized, with the hotplug lock taken and released before trying for the cpuset lock. I suspect that this bug was introduced when I changed the cpuset locking from one lock to two. The dynamic sched domain dependency on cpu_exclusive cpusets and its hotplug hooks were added to this code earlier, when cpusets had only a single lock. It may well have been fine then.
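Reduced to a sketch (the lock names are from this patch; the side-by-side paths are illustrative, not literal kernel code), the two orderings that can deadlock are:

    /*
     * Path A (rest of kernel)           Path B (old cpuset_rmdir())
     *
     * lock_cpu_hotplug();               mutex_lock(&callback_mutex);
     * mutex_lock(&callback_mutex);      lock_cpu_hotplug();
     *   ...waits for B...                 ...waits for A...   => ABBA deadlock
     *
     * The fix performs the hotplug-taking work first, via the simulated
     * update_flag(CS_CPU_EXCLUSIVE, cs, "0"), so both paths now take the
     * hotplug lock before callback_mutex.
     */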
Signed-off-by: Paul Jackson Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c232dc077438..1a649f2bb9bb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * * Call with manage_mutex held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + * Must not be called holding callback_mutex, because we must + * not call lock_cpu_hotplug() while holding callback_mutex. */ static void update_cpu_domains(struct cpuset *cur) @@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur) if (is_cpu_exclusive(c)) cpus_andnot(pspan, pspan, c->cpus_allowed); } - if (is_removed(cur) || !is_cpu_exclusive(cur)) { + if (!is_cpu_exclusive(cur)) { cpus_or(pspan, pspan, cur->cpus_allowed); if (cpus_equal(pspan, cur->cpus_allowed)) return; @@ -1917,6 +1919,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); } +/* + * Locking note on the strange update_flag() call below: + * + * If the cpuset being removed is marked cpu_exclusive, then simulate + * turning cpu_exclusive off, which will call update_cpu_domains(). + * The lock_cpu_hotplug() call in update_cpu_domains() must not be + * made while holding callback_mutex. Elsewhere the kernel nests + * callback_mutex inside lock_cpu_hotplug() calls. So the reverse + * nesting would risk an ABBA deadlock. + */ + static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cpuset *cs = dentry->d_fsdata; @@ -1936,11 +1949,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&manage_mutex); return -EBUSY; } + if (is_cpu_exclusive(cs)) { + int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0"); + if (retval < 0) { + mutex_unlock(&manage_mutex); + return retval; + } + } parent = cs->parent; mutex_lock(&callback_mutex); set_bit(CS_REMOVED, &cs->flags); - if (is_cpu_exclusive(cs)) - update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); -- cgit v1.2.3-55-g7522 From 627371d73cdd04ed23fe098755b4f855138ad9e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:16:20 +0200 Subject: [PATCH] pi-futex: robust-futex exit crash fix Fix pi_state->list handling bugs: list handling mishap, locking error. Plus add more debug checks and fix a few style issues i noticed while debugging this. (reported by Ulrich Drepper and Jakub Jelinek.) 
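The recurring idiom in this fix is drop-and-recheck: to take an outer lock while holding an inner one, drop the inner lock, acquire both in the correct order, then verify that the state snapshotted earlier still holds. A generic sketch, with hypothetical locks and list rather than the actual futex code:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    static LIST_HEAD(work_items);           /* protected by both locks */
    static DEFINE_SPINLOCK(outer);          /* ordering rule: outer first, */
    static DEFINE_SPINLOCK(inner);          /* ... inner second */

    static void drain_work_items(void)
    {
            struct list_head *head = &work_items, *next;

            spin_lock_irq(&inner);
            while (!list_empty(head)) {
                    next = head->next;              /* snapshot under inner */

                    spin_unlock_irq(&inner);        /* cannot take outer here */
                    spin_lock(&outer);
                    spin_lock_irq(&inner);

                    if (head->next != next) {       /* raced while unlocked */
                            spin_unlock(&outer);
                            continue;               /* snapshot again */
                    }
                    list_del_init(next);            /* safe under both locks */
                    spin_unlock(&outer);
            }
            spin_unlock_irq(&inner);
    }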
Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/futex.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index cf0c8e21d1ab..f59003b1d8f9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -415,15 +415,15 @@ out_unlock: */ void exit_pi_state_list(struct task_struct *curr) { - struct futex_hash_bucket *hb; struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; union futex_key key; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselfs + * versus waiters unqueueing themselves: */ spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { @@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; + hb = hash_futex(&key); spin_unlock_irq(&curr->pi_lock); - hb = hash_futex(&key); spin_lock(&hb->lock); spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ if (head->next != next) { spin_unlock(&hb->lock); continue; } - list_del_init(&pi_state->list); - WARN_ON(pi_state->owner != curr); - + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); pi_state->owner = NULL; spin_unlock_irq(&curr->pi_lock); @@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &me->key)) { + if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) if (unlikely(!pi_state)) return -EINVAL; + WARN_ON(!atomic_read(&pi_state->refcount)); + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -510,6 +515,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) pi_state->key = me->key; spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; spin_unlock_irq(&p->pi_lock); @@ -584,9 +590,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (curval != uval) return -EINVAL; - list_del_init(&pi_state->owner->pi_state_list); + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + spin_lock_irq(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1236,6 +1250,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, /* Owner died? 
*/ if (q.pi_state->owner != NULL) { spin_lock_irq(&q.pi_state->owner->pi_lock); + WARN_ON(list_empty(&q.pi_state->list)); list_del_init(&q.pi_state->list); spin_unlock_irq(&q.pi_state->owner->pi_lock); } else @@ -1244,6 +1259,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, q.pi_state->owner = current; spin_lock_irq(&current->pi_lock); + WARN_ON(!list_empty(&q.pi_state->list)); list_add(&q.pi_state->list, &current->pi_state_list); spin_unlock_irq(&current->pi_lock); -- cgit v1.2.3-55-g7522 From e3f2ddeac718c768fdac4b7fe69d465172f788a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:17:57 +0200 Subject: [PATCH] pi-futex: robust-futex exit Fix robust PI-futexes to be properly unlocked on unexpected exit. For this to work the kernel has to know whether a futex is a PI or a non-PI one, because the semantics are different. Since the space in relevant glibc data structures is extremely scarce, the best solution is to encode the 'PI' information in bit 0 of the robust list pointer. Existing (non-PI) glibc robust futexes have this bit always zero, so the ABI is kept. New glibc with PI-robust-futexes will set this bit. Further fixes from Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Ulrich Drepper Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- include/linux/futex.h | 3 +- kernel/futex.c | 91 +++++++++++++++++++++++++++++++++++---------------- kernel/futex_compat.c | 34 ++++++++++++++----- 3 files changed, 89 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/futex.h b/include/linux/futex.h index 34c3a215f2cd..d097b5b72bc6 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -96,7 +96,8 @@ struct robust_list_head { long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, u32 __user *uaddr2, u32 val2, u32 val3); -extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr); +extern int +handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); diff --git a/kernel/futex.c b/kernel/futex.c index f59003b1d8f9..dda2049692a2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -495,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) } /* - * We are the first waiter - try to look up the real owner and - * attach the new pi_state to it: + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when the owner died bit is set + * and TID = 0: */ pid = uval & FUTEX_TID_MASK; + if (!pid && (uval & FUTEX_OWNER_DIED)) + return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; @@ -579,16 +582,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * kept enabled while there is PI state around. We must also * preserve the owner died bit.)
*/ - newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; - - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + newval = FUTEX_WAITERS | new_owner->pid; - if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + } spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1443,9 +1447,11 @@ retry_locked: * again. If it succeeds then we can return without waking * anyone else up: */ - inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + } if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1478,9 +1484,11 @@ retry_locked: /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); @@ -1699,9 +1707,9 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval; + u32 uval, nval, mval; retry: if (get_user(uval, uaddr)) @@ -1718,20 +1726,44 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED); + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); + if (nval == -EFAULT) return -1; if (nval != uval) goto retry; - if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi) { + if (uval & FUTEX_WAITERS) + futex_wake(uaddr, 1); + } } return 0; } +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user **head, int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long *)head)) + return -EFAULT; + + *entry = (void *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. 
@@ -1742,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned long futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(entry, &head->list.next)) + if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: @@ -1760,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(pending, &head->list_op_pending)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; + if (pending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1772,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(entry, &entry->next)) + if (fetch_robust_entry(&entry, &entry->next, &pi)) return; /* * Avoid excessively long or circular lists: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d1d92b441fb7..d1aab1a452cc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -12,6 +12,23 @@ #include + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t *head, int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. 
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi; compat_uptr_t uentry, upending; - unsigned int limit = ROBUST_LIST_LIMIT; compat_long_t futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(uentry, &head->list.next)) + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; - entry = compat_ptr(uentry); /* * Fetch the relative futex offset: */ @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(upending, &head->list_op_pending)) + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pi)) return; - pending = compat_ptr(upending); if (upending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pi); while (compat_ptr(uentry) != &head->list) { /* @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(uentry, (compat_uptr_t *)&entry->next)) + if (fetch_robust_entry(&uentry, &entry, + (compat_uptr_t *)&entry->next, &pi)) return; - entry = compat_ptr(uentry); /* * Avoid excessively long or circular lists: */ -- cgit v1.2.3-55-g7522 From f712c0c7e1796f92e45e4de144e247816d974b8f Mon Sep 17 00:00:00 2001 From: Siddha, Suresh B Date: Sun, 30 Jul 2006 03:02:59 -0700 Subject: [PATCH] sched: build_sched_domains() fix Use the correct groups while initializing sched groups power for allnodes_domain. This fixes the crash observed while creating exclusive cpusets. Signed-off-by: Suresh Siddha Reported-and-tested-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b44b9a43b0fc..41a571806ce0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6494,7 +6494,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - init_numa_sched_groups_power(sched_group_allnodes); + if (sched_group_allnodes) { + int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); + struct sched_group *sg = &sched_group_allnodes[group]; + + init_numa_sched_groups_power(sg); + } #endif /* Attach the domains */ -- cgit v1.2.3-55-g7522 From 15a647eba94c3da27ccc666bea72e7cca06b2d19 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Sun, 30 Jul 2006 03:03:08 -0700 Subject: [PATCH] genirq: {en,dis}able_irq_wake() need refcounting too IRQs need refcounting and a state flag to track whether the IRQ should be enabled or disabled as a "normal IRQ" source after a series of calls to {en,dis}able_irq(). For shared IRQs, the IRQ must be enabled so long as at least one driver needs it active. Likewise, IRQs need the same support to track whether the IRQ should be enabled or disabled as a "wakeup event" source after a series of calls to {en,dis}able_irq_wake(). For shared IRQs, the IRQ must be enabled as a wakeup source during sleep so long as at least one driver needs it. But right now they _don't have_ that refcounting ...
which means sharing a wakeup-capable IRQ can't work correctly in some configurations. This patch adds the refcount and flag mechanisms to set_irq_wake() -- which is what {en,dis}able_irq_wake() call -- and minimal documentation of what the irq wake mechanism does. Drivers relying on the older (broken) "toggle" semantics will trigger a warning; that'll be a handful of drivers on ARM systems. Signed-off-by: David Brownell Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irq.h | 3 +++ kernel/irq/manage.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index b48eae32dc61..6e59ac05ef2a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -58,6 +58,7 @@ #define IRQ_NOREQUEST 0x04000000 /* IRQ cannot be requested */ #define IRQ_NOAUTOEN 0x08000000 /* IRQ will not be enabled on request irq */ #define IRQ_DELAYED_DISABLE 0x10000000 /* IRQ disable (masking) happens delayed. */ +#define IRQ_WAKEUP 0x20000000 /* IRQ triggers system wakeup */ struct proc_dir_entry; @@ -124,6 +125,7 @@ struct irq_chip { * @action: the irq action chain * @status: status information * @depth: disable-depth, for nested irq_disable() calls + * @wake_depth: enable depth, for multiple set_irq_wake() callers * @irq_count: stats field to detect stalled irqs * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP @@ -147,6 +149,7 @@ struct irq_desc { unsigned int status; /* IRQ status */ unsigned int depth; /* nested irq disables */ + unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ unsigned int irqs_unhandled; spinlock_t lock; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4e461438e48b..92be519eff26 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq); * @irq: interrupt to control * @on: enable/disable power management wakeup * - * Enable/disable power management wakeup mode + * Enable/disable power management wakeup mode, which is + * disabled by default. Enables and disables must match, + * just as they match for non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep + * states like "suspend to RAM". */ int set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; unsigned long flags; int ret = -ENXIO; + int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; + /* wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. 
+ */ spin_lock_irqsave(&desc->lock, flags); - if (desc->chip->set_wake) + if (on) { + if (desc->wake_depth++ == 0) + desc->status |= IRQ_WAKEUP; + else + set_wake = NULL; + } else { + if (desc->wake_depth == 0) { + printk(KERN_WARNING "Unbalanced IRQ %d " + "wake disable\n", irq); + WARN_ON(1); + } else if (--desc->wake_depth == 0) + desc->status &= ~IRQ_WAKEUP; + else + set_wake = NULL; + } + if (set_wake) ret = desc->chip->set_wake(irq, on); spin_unlock_irqrestore(&desc->lock, flags); return ret; -- cgit v1.2.3-55-g7522 From 7d94dddd438bcba97db44f120da39bb001b5249f Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:10 -0700 Subject: [PATCH] make taskstats sending completely independent of delay accounting on/off status Complete the separation of delay accounting and taskstats by ignoring the return value of delay accounting functions that fill in parts of taskstats before it is sent out (either in response to a command or as part of a task exit). Also make delayacct_add_tsk return silently when delay accounting is turned off rather than treat it as an error. Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 4 +--- kernel/taskstats.c | 8 +++----- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 7e8b6011b8f3..8a284cc6fd5f 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -80,9 +80,7 @@ static inline void delayacct_blkio_end(void) static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { - if (likely(!delayacct_on)) - return -EINVAL; - if (!tsk->delays) + if (likely(!delayacct_on) || !tsk->delays) return 0; return __delayacct_add_tsk(d, tsk); } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f45179ce028e..b4c737a11408 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -177,7 +177,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { - int rc; + int rc = 0; struct task_struct *tsk = pidtsk; if (!pidtsk) { @@ -196,12 +196,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * Each accounting subsystem adds calls to its functions to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * goto err; + * per-task-foo(stats, tsk); */ - rc = delayacct_add_tsk(stats, tsk); + delayacct_add_tsk(stats, tsk); stats->version = TASKSTATS_VERSION; /* Define err: label here if needed */ -- cgit v1.2.3-55-g7522 From d94a041519f3ab1ac023bf917619cd8c4a7d3c01 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:11 -0700 Subject: [PATCH] taskstats: free skb, avoid returns in send_cpu_listeners Add a missing freeing of skb in the case there are no listeners at all. Also remove the returning of error values by the function as it is unused by the sole caller. 
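The convention these taskstats changes settle on for per-subsystem fill hooks can be sketched with a hypothetical subsystem (fooacct, foo_state and __fooacct_add_tsk are invented for illustration; delayacct_add_tsk now has the same shape):

    /* Fill this subsystem's fields of *d for tsk; never veto the send. */
    static inline int fooacct_add_tsk(struct taskstats *d,
                                      struct task_struct *tsk)
    {
            if (!fooacct_on || !tsk->foo_state)
                    return 0;       /* disabled or no state: silently skip */
            return __fooacct_add_tsk(d, tsk);       /* result ignored upstream */
    }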
Signed-off-by: Shailabh Nagar Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b4c737a11408..e78187657330 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid) /* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ -static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); struct listener_list *listeners; struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, ret, delcount = 0; + int rc, delcount = 0; rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); - return rc; + return; } rc = 0; listeners = &per_cpu(listener_array, cpu); down_read(&listeners->sem); - list_for_each_entry_safe(s, tmp, &listeners->list, list) { + list_for_each_entry(s, &listeners->list, list) { skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { skb_next = skb_clone(skb_cur, GFP_KERNEL); - if (!skb_next) { - nlmsg_free(skb_cur); - rc = -ENOMEM; + if (!skb_next) break; - } } - ret = genlmsg_unicast(skb_cur, s->pid); - if (ret == -ECONNREFUSED) { + rc = genlmsg_unicast(skb_cur, s->pid); + if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; - rc = ret; } skb_cur = skb_next; } up_read(&listeners->sem); + if (skb_cur) + nlmsg_free(skb_cur); + if (!delcount) - return rc; + return; /* Delete invalidated entries */ down_write(&listeners->sem); @@ -171,7 +170,6 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } } up_write(&listeners->sem); - return rc; } static int fill_pid(pid_t pid, struct task_struct *pidtsk, -- cgit v1.2.3-55-g7522 From 163ecdff060f2fa9e8f5238882fd0137493556a6 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:11 -0700 Subject: [PATCH] delay accounting: temporarily enable by default Enable delay accounting by default so that the feature gets coverage testing without requiring special measures. Earlier, it was off by default and had to be enabled via a boot time param. This patch reverses the default behaviour to improve coverage testing. It can be removed late in the kernel development cycle if it's believed users shouldn't have to incur any cost if they don't want delay accounting. Or it can be retained forever if the utility of the stats is deemed common enough to warrant keeping the feature on.
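As a usage note, opting back out is now a boot-line matter: booting with something like "kernel /boot/vmlinuz ro root=/dev/sda1 nodelayacct" (a hypothetical GRUB entry; only the nodelayacct token comes from this patch) turns the accounting off again.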
Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/delay-accounting.txt | 10 ++++++---- Documentation/kernel-parameters.txt | 4 ++-- include/linux/delayacct.h | 4 ++-- kernel/delayacct.c | 8 ++++---- 4 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt index be215e58423b..1443cd71d263 100644 --- a/Documentation/accounting/delay-accounting.txt +++ b/Documentation/accounting/delay-accounting.txt @@ -64,11 +64,13 @@ Compile the kernel with CONFIG_TASK_DELAY_ACCT=y CONFIG_TASKSTATS=y -Enable the accounting at boot time by adding -the following to the kernel boot options - delayacct +Delay accounting is enabled by default at boot up. +To disable, add + nodelayacct +to the kernel boot options. The rest of the instructions +below assume this has not been done. -and after the system has booted up, use a utility +After the system has booted up, use a utility similar to getdelays.c to access the delays seen by a given task or a task group (tgid). The utility also allows a given command to be diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e11f7728ec6f..b50595a0550f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -448,8 +448,6 @@ running once the system is up. Format: [,] See also Documentation/networking/decnet.txt. - delayacct [KNL] Enable per-task delay accounting - dhash_entries= [KNL] Set number of hash buckets for dentry cache. @@ -1031,6 +1029,8 @@ running once the system is up. nocache [ARM] + nodelayacct [KNL] Disable per-task delay accounting + nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects. noexec [IA-64] diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 8a284cc6fd5f..11487b6e7127 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -55,7 +55,7 @@ static inline void delayacct_tsk_init(struct task_struct *tsk) { /* reinitialize in case parent's non-null pointer was dup'ed*/ tsk->delays = NULL; - if (unlikely(delayacct_on)) + if (delayacct_on) __delayacct_tsk_init(tsk); } @@ -80,7 +80,7 @@ static inline void delayacct_blkio_end(void) static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { - if (likely(!delayacct_on) || !tsk->delays) + if (!delayacct_on || !tsk->delays) return 0; return __delayacct_add_tsk(d, tsk); } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index f05392d64267..57ca3730205d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -19,15 +19,15 @@ #include #include -int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ kmem_cache_t *delayacct_cache; -static int __init delayacct_setup_enable(char *str) +static int __init delayacct_setup_disable(char *str) { - delayacct_on = 1; + delayacct_on = 0; return 1; } -__setup("delayacct", delayacct_setup_enable); +__setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { -- cgit v1.2.3-55-g7522 From a9ad965ea9a6d719daf333847a2ceb0e363994bd Mon Sep 17 00:00:00 2001 From: bibo, mao Date: Sun, 30 Jul 2006 03:03:26 -0700 Subject: [PATCH] IA64: kprobe invalidate icache of jump buffer Kprobes inserts a breakpoint instruction at the probe point and then jumps to the instruction slot when the breakpoint is hit; the instruction slot's icache must be consistent with the dcache.
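The invariant being restored can be sketched in a hypothetical helper (the actual fix below is ia64's flush_insn_slot()): any write that reaches an executable slot through the dcache must be followed by an icache flush of that range before the slot may run.

static void install_insn(void *slot, const void *insn, size_t len)
{
	memcpy(slot, insn, len);	/* write goes through the dcache */
	flush_icache_range((unsigned long)slot,
			   (unsigned long)slot + len);	/* resync the icache */
}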
Here is a patch that invalidates the icache area of the instruction slot. Without it, some machines will fault when executing an instruction slot whose icache contents are inconsistent with the dcache. Signed-off-by: bibo,mao Acked-by: "Luck, Tony" Acked-by: Keshavamurthy Anil S Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/kprobes.c | 9 +++++++++ include/asm-i386/kprobes.h | 1 + include/asm-ia64/kprobes.h | 1 + include/asm-powerpc/kprobes.h | 1 + include/asm-sparc64/kprobes.h | 1 + include/asm-x86_64/kprobes.h | 1 + kernel/kprobes.c | 1 + 7 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 00d9c83b8020..781960f80b6f 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -448,11 +448,20 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } +void __kprobes flush_insn_slot(struct kprobe *p) +{ + unsigned long arm_addr; + + arm_addr = ((unsigned long)&p->opcode.bundle) & ~0xFULL; + flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); +} + void __kprobes arch_arm_kprobe(struct kprobe *p) { unsigned long addr = (unsigned long)p->addr; unsigned long arm_addr = addr & ~0xFULL; + flush_insn_slot(p); memcpy((char *)arm_addr, &p->ainsn.insn.bundle, sizeof(bundle_t)); flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); } diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h index 0730a20f6db8..8774d06689da 100644 --- a/include/asm-i386/kprobes.h +++ b/include/asm-i386/kprobes.h @@ -45,6 +45,7 @@ typedef u8 kprobe_opcode_t; #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry #define ARCH_SUPPORTS_KRETPROBES #define ARCH_INACTIVE_KPROBE_COUNT 0 +#define flush_insn_slot(p) do { } while (0) void arch_remove_kprobe(struct kprobe *p); void kretprobe_trampoline(void); diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index 2418a787c405..938904910115 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -125,5 +125,6 @@ static inline void jprobe_return(void) } extern void invalidate_stacked_regs(void); extern void flush_register_stack(void); +extern void flush_insn_slot(struct kprobe *p); #endif /* _ASM_KPROBES_H */ diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h index 2d0af52c823d..34e1f89a5fa0 100644 --- a/include/asm-powerpc/kprobes.h +++ b/include/asm-powerpc/kprobes.h @@ -51,6 +51,7 @@ typedef unsigned int kprobe_opcode_t; #define ARCH_SUPPORTS_KRETPROBES #define ARCH_INACTIVE_KPROBE_COUNT 1 +#define flush_insn_slot(p) do { } while (0) void kretprobe_trampoline(void); extern void arch_remove_kprobe(struct kprobe *p); diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h index 15065af566c2..c9f5c34d318c 100644 --- a/include/asm-sparc64/kprobes.h +++ b/include/asm-sparc64/kprobes.h @@ -13,6 +13,7 @@ typedef u32 kprobe_opcode_t; #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry #define arch_remove_kprobe(p) do {} while (0) #define ARCH_INACTIVE_KPROBE_COUNT 0 +#define flush_insn_slot(p) do { } while (0) /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h index d36febd9bb18..cf5317898fb0 100644 --- a/include/asm-x86_64/kprobes.h +++ b/include/asm-x86_64/kprobes.h @@ -47,6 +47,7 @@ typedef u8 kprobe_opcode_t; void kretprobe_trampoline(void); extern void arch_remove_kprobe(struct kprobe *p); +#define flush_insn_slot(p) do { } while (0)
/* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 64aab081153b..3f57dfdc8f92 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { copy_kprobe(p, ap); + flush_insn_slot(ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; -- cgit v1.2.3-55-g7522 From 8c78f3075dab4be279e283f901f00e33ce44890a Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:35 -0700 Subject: [PATCH] cpu hotplug: replace __devinit* with __cpuinit* for cpu notifications Few of the callback functions and notifier blocks that are associated with cpu notifications incorrectly have __devinit and __devinitdata. They should be __cpuinit and __cpuinitdata instead. It makes no functional difference but wastes text area when CONFIG_HOTPLUG is enabled and CONFIG_HOTPLUG_CPU is not. This patch fixes all those instances. Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/sysfs.c | 4 ++-- kernel/hrtimer.c | 4 ++-- kernel/rcupdate.c | 4 ++-- kernel/softirq.c | 4 ++-- kernel/softlockup.c | 4 ++-- kernel/timer.c | 4 ++-- mm/slab.c | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 010435095550..fec228cd0163 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -278,7 +278,7 @@ static void unregister_cpu_online(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit sysfs_cpu_notify(struct notifier_block *self, +static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; @@ -296,7 +296,7 @@ static int __devinit sysfs_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata sysfs_cpu_nb = { +static struct notifier_block __cpuinitdata sysfs_cpu_nb = { .notifier_call = sysfs_cpu_notify, }; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d17766d40dab..be989efc7856 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -835,7 +835,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit hrtimer_cpu_notify(struct notifier_block *self, +static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -859,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata hrtimers_nb = { +static struct notifier_block __cpuinitdata hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 759805c9859a..436ab35f6fa7 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int __devinit rcu_cpu_notify(struct notifier_block *self, +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct 
notifier_block __devinitdata rcu_nb = { +static struct notifier_block __cpuinitdata rcu_nb = { .notifier_call = rcu_cpu_notify, }; diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f08a84ae307..aab880677ce0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -547,7 +547,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit cpu_callback(struct notifier_block *nfb, +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -587,7 +587,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 6b76caa22981..03e6a2b0b787 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int __devinit +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/timer.c b/kernel/timer.c index 05809c2e2fd6..ee319fc69f40 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1688,7 +1688,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit timer_cpu_notify(struct notifier_block *self, +static int __cpuinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1708,7 +1708,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata timers_nb = { +static struct notifier_block __cpuinitdata timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/mm/slab.c b/mm/slab.c index 0f20843beffd..252972ff6495 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1106,7 +1106,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) #endif -static int __devinit cpuup_callback(struct notifier_block *nfb, +static int __cpuinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; -- cgit v1.2.3-55-g7522 From 6ea24f9ad18c65cc179593b5cc2a88cdadf8cc0c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 30 Jul 2006 03:03:38 -0700 Subject: [PATCH] fix bad macro param in timer.c We have #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK and it's used via list = varray[i + 1]->vec + (INDEX(i + 1)); So, due to underparenthesisation, this INDEX(i+1) is now a ... (TVR_BITS + i + 1 * TVN_BITS)) ... So this bugfix changes behaviour. It worked before by sheer luck: "If i was anything but 0, it was broken. But this was only used by s390 and arm. Since it was for the next interrupt, could that next interrupt be a problem (going into the second cascade)? But it was probably seldom wrong. That is, this would fail if the next interrupt was in the second cascade, and was wrapped. Which may never of happened. Also if it did happen, it would have just missed the interrupt. 
If an interrupt was missed, and no one was there to miss it, was it really missed :-)" Signed-off-by: Steven Rostedt Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index ee319fc69f40..726caf9a0a69 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -408,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * This function cascades all vectors and executes all expired timer * vectors. */ -#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) static inline void __run_timers(tvec_base_t *base) { -- cgit v1.2.3-55-g7522 From 2d7d253548cffdce80f4e03664686e9ccb1b0ed7 Mon Sep 17 00:00:00 2001 From: Jim Houston Date: Sun, 30 Jul 2006 03:03:39 -0700 Subject: [PATCH] fix cond_resched() fix In cond_resched_lock() it calls __resched_legal() before dropping the spin lock. __resched_legal() will always find the preempt_count non-zero and will prevent the call to __cond_resched(). The attached patch adds a parameter to __resched_legal() with the expected preempt_count value. Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 41a571806ce0..de440b220b4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4456,9 +4456,9 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline int __resched_legal(void) +static inline int __resched_legal(int expected_preempt_count) { - if (unlikely(preempt_count())) + if (unlikely(preempt_count() != expected_preempt_count)) return 0; if (unlikely(system_state != SYSTEM_RUNNING)) return 0; @@ -4484,7 +4484,7 @@ static void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { __cond_resched(); return 1; } @@ -4510,7 +4510,7 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(1)) { spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); @@ -4526,7 +4526,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { raw_local_irq_disable(); _local_bh_enable(); raw_local_irq_enable(); -- cgit v1.2.3-55-g7522 From 0fcb78c22f06340cdba884d7381adb3a0148bbb6 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Sun, 30 Jul 2006 03:03:42 -0700 Subject: [PATCH] Add DocBook documentation for workqueue functions kernel/workqueue.c was omitted when generating kernel documentation. This adds a new section "Workqueues and Kevents" and adds documentation for some of the functions. Some functions in this file already had DocBook-style comments; now they finally become visible.
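For readers arriving at these functions through the new DocBook section, here is a minimal usage sketch with made-up names, written against this kernel generation's three-argument work initializers (the callback receives the opaque data pointer):

static void my_handler(void *data)
{
	/* runs later, in process context, from the events/N threads */
}

static DECLARE_WORK(my_work, my_handler, NULL);

static void kick(void)
{
	schedule_work(&my_work);	/* queue on the global workqueue */
	/* schedule_delayed_work(&my_work, HZ) would defer it ~1s instead */
}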
Signed-off-by: Rolf Eike Beer Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DocBook/kernel-api.tmpl | 3 ++ kernel/workqueue.c | 58 ++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index 356e34e6ce33..f8fe882e33dc 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -58,6 +58,9 @@ !Iinclude/linux/ktime.h !Iinclude/linux/hrtimer.h !Ekernel/hrtimer.c + + Workqueues and Kevents +!Ekernel/workqueue.c Internal Functions !Ikernel/exit.c diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eebb1d839235..448e8f7b342d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, spin_unlock_irqrestore(&cwq->lock, flags); } -/* - * Queue work on a workqueue. Return non-zero if it was successfully - * added. +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns non-zero if it was successfully added. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -128,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data) __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -150,6 +161,15 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work); +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -275,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } } -/* +/** * flush_workqueue - ensure that any scheduled work has run to completion. + * @wq: workqueue to flush * * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. @@ -400,6 +421,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) kthread_stop(p); } +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ void destroy_workqueue(struct workqueue_struct *wq) { int cpu; @@ -425,18 +452,41 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); static struct workqueue_struct *keventd_wq; +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * This puts a job in the kernel-global workqueue. 
+ */ int fastcall schedule_work(struct work_struct *work) { return queue_work(keventd_wq, work); } EXPORT_SYMBOL(schedule_work); +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) { return queue_delayed_work(keventd_wq, work, delay); } EXPORT_SYMBOL(schedule_delayed_work); +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay) { -- cgit v1.2.3-55-g7522 From b50f60ceeef2e38e529737c0260d9543939915ad Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 30 Jul 2006 03:03:52 -0700 Subject: [PATCH] pi-futex: missing pi_waiters plist initialization Initialize init task's pi_waiters plist. Otherwise cpu hotplug of cpu 0 might crash, since rt_mutex_getprio() accesses an uninitialized list head. Call chain which led to the crash: take_cpu_down sched_idle_next __setscheduler rt_mutex_getprio Using PLIST_HEAD_INIT in the INIT_TASK macro doesn't work unfortunately, since the pi_waiters member is only conditionally present. Cc: Arjan van de Ven Cc: Thomas Gleixner Acked-by: Ingo Molnar Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index de440b220b4c..a2be2d055299 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6766,6 +6766,11 @@ void __init sched_init(void) } set_load_weight(&init_task); + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); +#endif + /* * The boot idle thread does lazy MMU switching as well: */ -- cgit v1.2.3-55-g7522 From 3c829c367a1a52550378584a657768217971e587 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 30 Jul 2006 03:04:02 -0700 Subject: [PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from the irqtrace feature have added overhead to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. The patch in question is [PATCH]lockdep: irqtrace subsystem, core http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On an IA-64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls the local_bh_enable/disable several times. Other workloads that have heavy usage of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support.
A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. Signed-off-by: Tim Chen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index aab880677ce0..3789ca98197c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -65,6 +65,7 @@ static inline void wakeup_softirqd(void) * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ +#ifdef CONFIG_TRACE_IRQFLAGS static void __local_bh_disable(unsigned long ip) { unsigned long flags; @@ -80,6 +81,13 @@ static void __local_bh_disable(unsigned long ip) trace_softirqs_off(ip); raw_local_irq_restore(flags); } +#else /* !CONFIG_TRACE_IRQFLAGS */ +static inline void __local_bh_disable(unsigned long ip) +{ + add_preempt_count(SOFTIRQ_OFFSET); + barrier(); +} +#endif /* CONFIG_TRACE_IRQFLAGS */ void local_bh_disable(void) { @@ -121,12 +129,16 @@ EXPORT_SYMBOL(_local_bh_enable); void local_bh_enable(void) { +#ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; WARN_ON_ONCE(in_irq()); +#endif WARN_ON_ONCE(irqs_disabled()); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_save(flags); +#endif /* * Are softirqs going to be turned on now: */ @@ -142,18 +154,22 @@ void local_bh_enable(void) do_softirq(); dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_restore(flags); +#endif preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable); void local_bh_enable_ip(unsigned long ip) { +#ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; WARN_ON_ONCE(in_irq()); local_irq_save(flags); +#endif /* * Are softirqs going to be turned on now: */ @@ -169,7 +185,9 @@ void local_bh_enable_ip(unsigned long ip) do_softirq(); dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_restore(flags); +#endif preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable_ip); -- cgit v1.2.3-55-g7522 From d07fe82c24daab2360e2790f488bcffa7db74825 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 30 Jul 2006 03:04:03 -0700 Subject: [PATCH] reference rt-mutex-design in rtmutex.c In order to prevent Doc Rot, this patch adds a reference to the design document for rtmutex.c in rtmutex.c. So when someone needs to update or change the design of that file they will know that a document actually exists that explains the design (helping them change it), and hopefully that they will update the document if they too change the design. Signed-off-by: Steven Rostedt Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index d2ef13b485e7..3e13a1e5856f 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -7,6 +7,8 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * + * See Documentation/rt-mutex-design.txt for details. 
*/ #include #include -- cgit v1.2.3-55-g7522 From 51d8c5edd3b166fcc51aba84d78761d578400a7c Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:04:14 -0700 Subject: [PATCH] timer: Fix tvec_bases initializer kernel/timer.c defines a (per-cpu) pointer to tvec_base_t, but initializes it using { &a_tvec_base_t }, which sparse warns about; change this to just &a_tvec_base_t. Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 726caf9a0a69..b650f04888ed 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t; tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; +static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) -- cgit v1.2.3-55-g7522 From ae74c3b69a08e1de20cb681ec959f3a48af0006a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 2 Aug 2006 20:17:49 -0700 Subject: Fix force_sig_info() semantics after cleanups Suresh points out that commit b0423a0d9cc836b2c3d796623cd19236bfedfe63 broke the semantics of a synchronous signal like SIGSEGV occurring recursively inside its own handler (or, indeed, any other context when the signal was blocked). That was unintentional, and this fixes things up by reinstating the old semantics, but without reverting the cleanups. Cc: Paul E. McKenney Acked-by: Suresh Siddha Signed-off-by: Linus Torvalds --- kernel/signal.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7fe874d12fae..bfdb5686fa3e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -791,22 +791,31 @@ out: /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. + * + * Note: If we unblock the signal, we always reset it to SIG_DFL, + * since we do not want to have a signal handler that was blocked + * be invoked when user space had explicitly blocked it. + * + * We don't want to have recursive SIGSEGV's etc, for example. */ - int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { unsigned long int flags; - int ret; + int ret, blocked, ignored; + struct k_sigaction *action; spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - } - if (sigismember(&t->blocked, sig)) { - sigdelset(&t->blocked, sig); + action = &t->sighand->action[sig-1]; + ignored = action->sa.sa_handler == SIG_IGN; + blocked = sigismember(&t->blocked, sig); + if (blocked || ignored) { + action->sa.sa_handler = SIG_DFL; + if (blocked) { + sigdelset(&t->blocked, sig); + recalc_sigpending_tsk(t); + } } - recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); -- cgit v1.2.3-55-g7522 From 3e2efce067cec0099f99ae59f28feda99b02b498 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 13 Jul 2006 13:16:02 -0400 Subject: [PATCH] fix faulty inode data collection for open() with O_CREAT When the specified path is an existing file or when it is a symlink, audit collects the wrong inode number, which causes it to miss the open() event.
Adding a second hook to the open() path fixes this. Also add audit_copy_inode() to consolidate some code. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- fs/namei.c | 2 ++ include/linux/audit.h | 7 ++++++ kernel/auditsc.c | 63 +++++++++++++++++++++++++++++++++------------------ 3 files changed, 50 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index e01070d7bf58..47a7bad92d2a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1659,6 +1659,7 @@ do_last: * It already exists. */ mutex_unlock(&dir->d_inode->i_mutex); + audit_inode_update(path.dentry->d_inode); error = -EEXIST; if (flag & O_EXCL) @@ -1669,6 +1670,7 @@ do_last: if (flag & O_NOFOLLOW) goto exit_dput; } + error = -ENOENT; if (!path.dentry->d_inode) goto exit_dput; diff --git a/include/linux/audit.h b/include/linux/audit.h index b27d7debc5a1..e7e5e5348987 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -328,6 +328,7 @@ extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct inode *inode); extern void __audit_inode_child(const char *dname, const struct inode *inode, unsigned long pino); +extern void __audit_inode_update(const struct inode *inode); static inline void audit_getname(const char *name) { if (unlikely(current->audit_context)) @@ -343,6 +344,10 @@ static inline void audit_inode_child(const char *dname, if (unlikely(current->audit_context)) __audit_inode_child(dname, inode, pino); } +static inline void audit_inode_update(const struct inode *inode) { + if (unlikely(current->audit_context)) + __audit_inode_update(inode); +} /* Private API (for audit.c only) */ extern unsigned int audit_serial(void); @@ -414,8 +419,10 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) #define audit_putname(n) do { ; } while (0) #define __audit_inode(n,i) do { ; } while (0) #define __audit_inode_child(d,i,p) do { ; } while (0) +#define __audit_inode_update(i) do { ; } while (0) #define audit_inode(n,i) do { ; } while (0) #define audit_inode_child(d,i,p) do { ; } while (0) +#define audit_inode_update(i) do { ; } while (0) #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0) #define audit_get_loginuid(c) ({ -1; }) #define audit_ipc_obj(i) ({ 0; }) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ae40ac8c39e7..b939ed2da3ee 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1199,14 +1199,18 @@ void audit_putname(const char *name) #endif } -static void audit_inode_context(int idx, const struct inode *inode) +/* Copy inode data into an audit_names. 
*/ +static void audit_copy_inode(struct audit_names *name, const struct inode *inode) { - struct audit_context *context = current->audit_context; - - selinux_get_inode_sid(inode, &context->names[idx].osid); + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + selinux_get_inode_sid(inode, &name->osid); } - /** * audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1240,13 +1244,7 @@ void __audit_inode(const char *name, const struct inode *inode) ++context->ino_count; #endif } - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); + audit_copy_inode(&context->names[idx], inode); } /** @@ -1302,16 +1300,37 @@ update_context: context->names[idx].name_len = AUDIT_NAME_FULL; context->names[idx].name_put = 0; /* don't call __putname() */ - if (inode) { - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); - } else - context->names[idx].ino = (unsigned long)-1; + if (!inode) + context->names[idx].ino = (unsigned long)-1; + else + audit_copy_inode(&context->names[idx], inode); +} + +/** + * audit_inode_update - update inode info for last collected name + * @inode: inode being audited + * + * When open() is called on an existing object with the O_CREAT flag, the inode + * data audit initially collects is incorrect. This additional hook ensures + * audit has the inode data for the actual object to be opened. + */ +void __audit_inode_update(const struct inode *inode) +{ + struct audit_context *context = current->audit_context; + int idx; + + if (!context->in_syscall || !inode) + return; + + if (context->name_count == 0) { + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + } + idx = context->name_count - 1; + + audit_copy_inode(&context->names[idx], inode); } /** -- cgit v1.2.3-55-g7522 From 73d3ec5abad3f1730ac8530899d2c14d92f3ad63 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 13 Jul 2006 13:16:39 -0400 Subject: [PATCH] fix missed create event for directory audit When an object is created via a symlink into an audited directory, audit misses the event due to not having collected the inode data for the directory. Modify __audit_inode_child() to copy the parent inode data if a parent wasn't found in audit_names[]. 
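With the signature change below, a creation-path hook hands audit the parent inode itself rather than just its inode number, so __audit_inode_child() can copy the parent's fields even when that parent never went through audit_inode(). A hypothetical caller, mirroring the fsnotify hunks:

static inline void note_create(struct inode *dir, struct dentry *dentry)
{
	/* dir is the parent; audit can now copy its data directly */
	audit_inode_child(dentry->d_name.name, dentry->d_inode, dir);
}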
Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- fs/namei.c | 2 +- include/linux/audit.h | 8 ++++---- include/linux/fsnotify.h | 6 +++--- kernel/auditsc.c | 16 +++++++++++++--- 4 files changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index 47a7bad92d2a..0ab26cbdacc0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1357,7 +1357,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) return -ENOENT; BUG_ON(victim->d_parent->d_inode != dir); - audit_inode_child(victim->d_name.name, victim->d_inode, dir->i_ino); + audit_inode_child(victim->d_name.name, victim->d_inode, dir); error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); if (error) diff --git a/include/linux/audit.h b/include/linux/audit.h index e7e5e5348987..bf196c05826c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -327,7 +327,7 @@ extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct inode *inode); extern void __audit_inode_child(const char *dname, const struct inode *inode, - unsigned long pino); + const struct inode *parent); extern void __audit_inode_update(const struct inode *inode); static inline void audit_getname(const char *name) { @@ -339,10 +339,10 @@ static inline void audit_inode(const char *name, const struct inode *inode) { __audit_inode(name, inode); } static inline void audit_inode_child(const char *dname, - const struct inode *inode, - unsigned long pino) { + const struct inode *inode, + const struct inode *parent) { if (unlikely(current->audit_context)) - __audit_inode_child(dname, inode, pino); + __audit_inode_child(dname, inode, parent); } static inline void audit_inode_update(const struct inode *inode) { if (unlikely(current->audit_context)) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index cc5dec70c32c..d4f219ffaa5d 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -67,7 +67,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, if (source) { inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); } - audit_inode_child(new_name, source, new_dir->i_ino); + audit_inode_child(new_name, source, new_dir); } /* @@ -98,7 +98,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) inode_dir_notify(inode, DN_CREATE); inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name, dentry->d_inode); - audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino); + audit_inode_child(dentry->d_name.name, dentry->d_inode, inode); } /* @@ -109,7 +109,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) inode_dir_notify(inode, DN_CREATE); inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, dentry->d_name.name, dentry->d_inode); - audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino); + audit_inode_child(dentry->d_name.name, dentry->d_inode, inode); } /* diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b939ed2da3ee..b1356fc63b26 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1251,7 +1251,7 @@ void __audit_inode(const char *name, const struct inode *inode) * audit_inode_child - collect inode info for created/removed objects * @dname: inode's dentry name * @inode: inode being audited - * @pino: inode number of dentry parent + * @parent: inode of dentry parent * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information 
for the filesystem object's parent. @@ -1262,7 +1262,7 @@ void __audit_inode(const char *name, const struct inode *inode) * unsuccessful attempts. */ void __audit_inode_child(const char *dname, const struct inode *inode, - unsigned long pino) + const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; @@ -1276,7 +1276,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode, if (!dname) goto update_context; for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].ino == pino) { + if (context->names[idx].ino == parent->i_ino) { const char *name = context->names[idx].name; if (!name) @@ -1304,6 +1304,16 @@ update_context: context->names[idx].ino = (unsigned long)-1; else audit_copy_inode(&context->names[idx], inode); + + /* A parent was not found in audit_names, so copy the inode data for the + * provided parent. */ + if (!found_name) { + idx = context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + audit_copy_inode(&context->names[idx], parent); + } } /** -- cgit v1.2.3-55-g7522 From 6988434ee5f532c71be3131fba23283f5cf43847 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 13 Jul 2006 13:17:12 -0400 Subject: [PATCH] fix oops with CONFIG_AUDIT and !CONFIG_AUDITSYSCALL Always initialize the audit_inode_hash[] so we don't oops on list rules. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/audit.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d417ca1db79b..0a36091ed712 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = { /* Initialize audit support at boot time. */ static int __init audit_init(void) { -#ifdef CONFIG_AUDITSYSCALL int i; -#endif printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); @@ -717,10 +715,10 @@ static int __init audit_init(void) audit_ih = inotify_init(&audit_inotify_ops); if (IS_ERR(audit_ih)) audit_panic("cannot initialize inotify handle"); +#endif for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); -#endif return 0; } -- cgit v1.2.3-55-g7522 From 5422e01ac16df7398b2bad1eccad0ae3be4dee32 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Tue, 1 Aug 2006 17:52:26 -0400 Subject: [PATCH] fix audit oops with invalid operator Michael C Thompson wrote: [Tue Aug 01 2006, 02:36:36PM EDT] > The trigger for this oops is: > # auditctl -a exit,always -S pread64 -F 'inode<1' Setting the err value will fix it. 
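Reduced to a sketch (hypothetical function; the real fix touches audit_rule_to_entry() and audit_data_to_entry()), the bug class is an error path that jumps to shared cleanup without setting err, so the caller sees the stale initial value and mistakes a rejected rule for a valid one:

static int check_operator(u32 op)
{
	int err = 0;

	switch (op) {
	case 0:			/* the only operator accepted here */
		break;
	default:
		err = -EINVAL;	/* the missing assignment */
		goto exit_free;
	}
	/* ... normal construction continues ... */
exit_free:
	/* cleanup runs on both paths; err tells the caller which one */
	return err;
}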
Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditfilter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 5b4e16276ca0..32420f914028 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -442,6 +442,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EQUAL: break; default: + err = -EINVAL; goto exit_free; } } @@ -579,6 +580,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_EQUAL: break; default: + err = -EINVAL; goto exit_free; } } -- cgit v1.2.3-55-g7522 From 471a5c7c839114cc8b55876203aeb2817c33e3c5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 10 Jul 2006 08:29:24 -0400 Subject: [PATCH] introduce audit rules counter Signed-off-by: Al Viro --- include/linux/audit.h | 2 ++ kernel/auditfilter.c | 24 ++++++++++++++++++++++++ kernel/auditsc.c | 3 +++ 3 files changed, 29 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index bf196c05826c..f1bfcff497bf 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -410,6 +410,7 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) return __audit_mq_getsetattr(mqdes, mqstat); return 0; } +extern int audit_n_rules; #else #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) @@ -437,6 +438,7 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) #define audit_mq_timedreceive(d,l,p,t) ({ 0; }) #define audit_mq_notify(d,n) ({ 0; }) #define audit_mq_getsetattr(d,s) ({ 0; }) +#define audit_n_rules 0 #endif #ifdef CONFIG_AUDIT diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 32420f914028..6a9a5c5a4e7d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1136,6 +1136,14 @@ static inline int audit_add_rule(struct audit_entry *entry, struct audit_watch *watch = entry->rule.watch; struct nameidata *ndp, *ndw; int h, err, putnd_needed = 0; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif if (inode_f) { h = audit_hash_ino(inode_f->val); @@ -1176,6 +1184,10 @@ static inline int audit_add_rule(struct audit_entry *entry, } else { list_add_tail_rcu(&entry->list, list); } +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules++; +#endif mutex_unlock(&audit_filter_mutex); if (putnd_needed) @@ -1200,6 +1212,14 @@ static inline int audit_del_rule(struct audit_entry *entry, struct audit_watch *watch, *tmp_watch = entry->rule.watch; LIST_HEAD(inotify_list); int h, ret = 0; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif if (inode_f) { h = audit_hash_ino(inode_f->val); @@ -1237,6 +1257,10 @@ static inline int audit_del_rule(struct audit_entry *entry, list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules--; +#endif mutex_unlock(&audit_filter_mutex); if (!list_empty(&inotify_list)) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b1356fc63b26..3ea836d3d941 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -85,6 +85,9 @@ extern int audit_enabled; /* Indicates that audit should log the full pathname. 
*/ #define AUDIT_NAME_FULL -1 +/* number of audit rules */ +int audit_n_rules; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). -- cgit v1.2.3-55-g7522 From d51374adf5f2f88155a072d3d801104e3c0c3d7f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Aug 2006 10:59:26 -0400 Subject: [PATCH] mark context of syscall entered with no rules as dummy Signed-off-by: Al Viro --- include/linux/audit.h | 6 ++++++ kernel/auditsc.c | 6 ++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index f1bfcff497bf..3f736d658218 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -329,6 +329,11 @@ extern void __audit_inode(const char *name, const struct inode *inode); extern void __audit_inode_child(const char *dname, const struct inode *inode, const struct inode *parent); extern void __audit_inode_update(const struct inode *inode); +static inline int audit_dummy_context(void) +{ + void *p = current->audit_context; + return !p || *(int *)p; +} static inline void audit_getname(const char *name) { if (unlikely(current->audit_context)) @@ -416,6 +421,7 @@ extern int audit_n_rules; #define audit_free(t) do { ; } while (0) #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0) #define audit_syscall_exit(f,r) do { ; } while (0) +#define audit_dummy_context() 1 #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) #define __audit_inode(n,i) do { ; } while (0) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3ea836d3d941..9618d1507251 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -177,6 +177,7 @@ struct audit_aux_data_path { /* The per-task audit context. 
*/ struct audit_context { + int dummy; /* must be the first element */ int in_syscall; /* 1 if task is in a syscall */ enum audit_state state; unsigned int serial; /* serial number for record */ @@ -517,7 +518,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, context->return_valid = return_valid; context->return_code = return_code; - if (context->in_syscall && !context->auditable) { + if (context->in_syscall && !context->dummy && !context->auditable) { enum audit_state state; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); @@ -1069,7 +1070,8 @@ void audit_syscall_entry(int arch, int major, context->argv[3] = a4; state = context->state; - if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) + context->dummy = !audit_n_rules; + if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); if (likely(state == AUDIT_DISABLED)) return; -- cgit v1.2.3-55-g7522 From 5ac3a9c26c1cc4861d9cdd8b293fecbfcdc81afe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Jul 2006 06:38:45 -0400 Subject: [PATCH] don't bother with aux entires for dummy context Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- include/linux/audit.h | 22 +++++++++++----------- kernel/auditsc.c | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index 0ab26cbdacc0..55a131230f94 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -159,7 +159,7 @@ char * getname(const char __user * filename) #ifdef CONFIG_AUDITSYSCALL void putname(const char *name) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) audit_putname(name); else __putname(name); @@ -1125,7 +1125,7 @@ static int fastcall do_path_lookup(int dfd, const char *name, retval = link_path_walk(name, nd); out: if (likely(retval == 0)) { - if (unlikely(current->audit_context && nd && nd->dentry && + if (unlikely(!audit_dummy_context() && nd && nd->dentry && nd->dentry->d_inode)) audit_inode(name, nd->dentry->d_inode); } diff --git a/include/linux/audit.h b/include/linux/audit.h index 3f736d658218..64f9f9e56ac5 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -336,21 +336,21 @@ static inline int audit_dummy_context(void) } static inline void audit_getname(const char *name) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_inode(const char *name, const struct inode *inode) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) __audit_inode(name, inode); } static inline void audit_inode_child(const char *dname, const struct inode *inode, const struct inode *parent) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) __audit_inode_child(dname, inode, parent); } static inline void audit_inode_update(const struct inode *inode) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) __audit_inode_update(inode); } @@ -375,43 +375,43 @@ extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_ipc_obj(ipcp); return 0; } static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { - if (unlikely(current->audit_context)) + if 
(unlikely(!audit_dummy_context())) return __audit_ipc_set_perm(qbytes, uid, gid, mode); return 0; } static inline int audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_open(oflag, mode, u_attr); return 0; } static inline int audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_timedsend(mqdes, msg_len, msg_prio, u_abs_timeout); return 0; } static inline int audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout); return 0; } static inline int audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_notify(mqdes, u_notification); return 0; } static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { - if (unlikely(current->audit_context)) + if (unlikely(!audit_dummy_context())) return __audit_mq_getsetattr(mqdes, mqstat); return 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9618d1507251..f571c7e925e6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1676,7 +1676,7 @@ int audit_bprm(struct linux_binprm *bprm) unsigned long p, next; void *to; - if (likely(!audit_enabled || !context)) + if (likely(!audit_enabled || !context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, @@ -1714,7 +1714,7 @@ int audit_socketcall(int nargs, unsigned long *args) struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); @@ -1742,7 +1742,7 @@ int audit_sockaddr(int len, void *a) struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); -- cgit v1.2.3-55-g7522 From 3f2792ffbd88dc1cd41d226674cc428914981e98 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Jul 2006 06:43:48 -0400 Subject: [PATCH] take filling ->pid, etc. out of audit_get_context() move that stuff downstream and into the only branch where it'll be used. Signed-off-by: Al Viro --- kernel/auditsc.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f571c7e925e6..efc1b74bebf3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -534,17 +534,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, } get_context: - context->pid = tsk->pid; - context->ppid = sys_getppid(); /* sic. 
tsk == current in all cases */ - context->uid = tsk->uid; - context->gid = tsk->gid; - context->euid = tsk->euid; - context->suid = tsk->suid; - context->fsuid = tsk->fsuid; - context->egid = tsk->egid; - context->sgid = tsk->sgid; - context->fsgid = tsk->fsgid; - context->personality = tsk->personality; + tsk->audit_context = NULL; return context; } @@ -753,6 +743,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts const char *tty; /* tsk == current */ + context->pid = tsk->pid; + context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ + context->uid = tsk->uid; + context->gid = tsk->gid; + context->euid = tsk->euid; + context->suid = tsk->suid; + context->fsuid = tsk->fsuid; + context->egid = tsk->egid; + context->sgid = tsk->sgid; + context->fsgid = tsk->fsgid; + context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) -- cgit v1.2.3-55-g7522 From a7ef7878ea7c8bca9b624db3f61223cdadda2a0a Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Sat, 5 Aug 2006 12:13:42 -0700 Subject: [PATCH] Make suspend possible with a traced process at a breakpoint It should be possible to suspend, either to RAM or to disk, if there's a traced process that has just reached a breakpoint. However, this is a special case, because its parent process might have been frozen already and then we are unable to deliver the "freeze" signal to the traced process. If this happens, it's better to cancel the freezing of the traced process. Ref. http://bugzilla.kernel.org/show_bug.cgi?id=6787 Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 8 ++++++++ kernel/power/process.c | 26 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6afa72e080cb..6674fc1e51bf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1557,6 +1557,14 @@ static inline void freeze(struct task_struct *p) p->flags |= PF_FREEZE; } +/* + * Sometimes we may need to cancel the previous 'freeze' request + */ +static inline void do_not_freeze(struct task_struct *p) +{ + p->flags &= ~PF_FREEZE; +} + /* * Wake up a frozen process */ diff --git a/kernel/power/process.c b/kernel/power/process.c index b2a5f671d6cd..72e72d2c61e6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p) } } +static void cancel_freezing(struct task_struct *p) +{ + unsigned long flags; + + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + do_not_freeze(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + /* 0 = success, else # of processes that we failed to stop */ int freeze_processes(void) { int todo, nr_user, user_frozen; unsigned long start_time; struct task_struct *g, *p; - unsigned long flags; printk( "Stopping tasks: " ); start_time = jiffies; @@ -85,6 +97,10 @@ int freeze_processes(void) continue; if (frozen(p)) continue; + if (p->state == TASK_TRACED && frozen(p->parent)) { + cancel_freezing(p); + continue; + } if (p->mm && !(p->flags & PF_BORROWED_MM)) { /* The task is a user-space one. 
Signed-off-by: Rafael J. Wysocki
Acked-by: Pavel Machek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h  |  8 ++++++++
 kernel/power/process.c | 26 ++++++++++++++++++--------
 2 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6afa72e080cb..6674fc1e51bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1557,6 +1557,14 @@ static inline void freeze(struct task_struct *p)
 	p->flags |= PF_FREEZE;
 }
 
+/*
+ * Sometimes we may need to cancel the previous 'freeze' request
+ */
+static inline void do_not_freeze(struct task_struct *p)
+{
+	p->flags &= ~PF_FREEZE;
+}
+
 /*
  * Wake up a frozen process
  */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index b2a5f671d6cd..72e72d2c61e6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p)
 	}
 }
 
+static void cancel_freezing(struct task_struct *p)
+{
+	unsigned long flags;
+
+	if (freezing(p)) {
+		pr_debug("  clean up: %s\n", p->comm);
+		do_not_freeze(p);
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		recalc_sigpending_tsk(p);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+}
+
 /* 0 = success, else # of processes that we failed to stop */
 int freeze_processes(void)
 {
 	int todo, nr_user, user_frozen;
 	unsigned long start_time;
 	struct task_struct *g, *p;
-	unsigned long flags;
 
 	printk( "Stopping tasks: " );
 	start_time = jiffies;
@@ -85,6 +97,10 @@ int freeze_processes(void)
 				continue;
 			if (frozen(p))
 				continue;
+			if (p->state == TASK_TRACED && frozen(p->parent)) {
+				cancel_freezing(p);
+				continue;
+			}
 			if (p->mm && !(p->flags & PF_BORROWED_MM)) {
 				/* The task is a user-space one.
 				 * Freeze it unless there's a vfork completion
@@ -126,13 +142,7 @@ int freeze_processes(void)
 		do_each_thread(g, p) {
 			if (freezeable(p) && !frozen(p))
 				printk(KERN_ERR "  %s\n", p->comm);
-			if (freezing(p)) {
-				pr_debug("  clean up: %s\n", p->comm);
-				p->flags &= ~PF_FREEZE;
-				spin_lock_irqsave(&p->sighand->siglock, flags);
-				recalc_sigpending_tsk(p);
-				spin_unlock_irqrestore(&p->sighand->siglock, flags);
-			}
+			cancel_freezing(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 		return todo;
-- cgit v1.2.3-55-g7522

From e91467ecd1ef381377fd327c0ded922835ec52ab Mon Sep 17 00:00:00 2001
From: Christian Borntraeger
Date: Sat, 5 Aug 2006 12:13:52 -0700
Subject: [PATCH] bug in futex unqueue_me

This patch adds a barrier() in futex unqueue_me to avoid aliasing of
two pointers.

On my s390x system I saw the following oops:

Unable to handle kernel pointer dereference at virtual kernel address
0000000000000000
Oops: 0004 [#1]
CPU:    0    Not tainted
Process mytool (pid: 13613, task: 000000003ecb6ac0, ksp: 00000000366bdbd8)
Krnl PSW : 0704d00180000000 00000000003c9ac2 (_spin_lock+0xe/0x30)
Krnl GPRS: 00000000ffffffff 000000003ecb6ac0 0000000000000000 0700000000000000
           0000000000000000 0000000000000000 000001fe00002028 00000000000c091f
           000001fe00002054 000001fe00002054 0000000000000000 00000000366bddc0
           00000000005ef8c0 00000000003d00e8 0000000000144f91 00000000366bdcb8
Krnl Code: ba 4e 20 00 12 44 b9 16 00 3e a7 84 00 08 e3 e0 f0 88 00 04
Call Trace:
([<0000000000144f90>] unqueue_me+0x40/0xe4)
 [<0000000000145a0c>] do_futex+0x33c/0xc40
 [<000000000014643e>] sys_futex+0x12e/0x144
 [<000000000010bb00>] sysc_noemu+0x10/0x16
 [<000002000003741c>] 0x2000003741c

The code in question is:

static int unqueue_me(struct futex_q *q)
{
	int ret = 0;
	spinlock_t *lock_ptr;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	lock_ptr = q->lock_ptr;
	if (lock_ptr != 0) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock.  This
		 * corrects the race condition.
[...]

and my compiler (gcc 4.1.0) makes the following out of it:

00000000000003c8 <unqueue_me>:
     3c8:	eb bf f0 70 00 24	stmg	%r11,%r15,112(%r15)
     3ce:	c0 d0 00 00 00 00	larl	%r13,3ce
			3d0: R_390_PC32DBL	.rodata+0x2a
     3d4:	a7 f1 1e 00	tml	%r15,7680
     3d8:	a7 84 00 01	je	3da
     3dc:	b9 04 00 ef	lgr	%r14,%r15
     3e0:	a7 fb ff d0	aghi	%r15,-48
     3e4:	b9 04 00 b2	lgr	%r11,%r2
     3e8:	e3 e0 f0 98 00 24	stg	%r14,152(%r15)
     3ee:	e3 c0 b0 28 00 04	lg	%r12,40(%r11)
			/* write q->lock_ptr in r12 */
     3f4:	b9 02 00 cc	ltgr	%r12,%r12
     3f8:	a7 84 00 4b	je	48e
			/* if r12 is zero then jump over the code.... */
     3fc:	e3 20 b0 28 00 04	lg	%r2,40(%r11)
			/* write q->lock_ptr in r2 */
     402:	c0 e5 00 00 00 00	brasl	%r14,402
			404: R_390_PC32DBL	_spin_lock+0x2
			/* use r2 as parameter for spin_lock */

So the code becomes more or less:

if (q->lock_ptr != 0) spin_lock(q->lock_ptr)

instead of

if (lock_ptr != 0) spin_lock(lock_ptr)

Which caused the oops from above.

After adding a barrier gcc creates code without this problem:

[...] (the same)
     3ee:	e3 c0 b0 28 00 04	lg	%r12,40(%r11)
     3f4:	b9 02 00 cc	ltgr	%r12,%r12
     3f8:	b9 04 00 2c	lgr	%r2,%r12
     3fc:	a7 84 00 48	je	48c
     400:	c0 e5 00 00 00 00	brasl	%r14,400
			402: R_390_PC32DBL	_spin_lock+0x2

As a general note, this code of unqueue_me seems a bit fishy.  The retry
logic of unqueue_me only works if we can guarantee that the original
value of q->lock_ptr is always a spinlock (otherwise we overwrite kernel
memory).  We know that q->lock_ptr can change.  I don't know what happens
with the original spinlock, as I am not an expert on the futex code.
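A minimal user-space sketch of the reload hazard described above (hypothetical names; the asm statement is what the kernel's barrier() expands to). Without the compiler barrier, gcc may treat the local snapshot as an alias for q->lock_ptr and re-read the shared field after the NULL test:

#include <stddef.h>

struct queue {
	int *lock_ptr;			/* shared; may change concurrently */
};

extern void take_lock(int *lock);	/* stand-in for spin_lock() */

void unqueue_sketch(struct queue *q)
{
	int *lock_ptr = q->lock_ptr;	/* snapshot the shared pointer */

	/* barrier(): forbid the compiler from re-reading q->lock_ptr */
	__asm__ __volatile__("" : : : "memory");

	if (lock_ptr != NULL)
		take_lock(lock_ptr);	/* must use the snapshot, never q->lock_ptr */
}

With the barrier in place, the test and the call can only ever see the snapshot, which is what the one-line patch below enforces.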
Cc: Martin Schwidefsky
Cc: Rusty Russell
Acked-by: Ingo Molnar
Cc: Thomas Gleixner
Signed-off-by: Christian Borntraeger
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/futex.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index dda2049692a2..c2b2e0b83abf 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -948,6 +948,7 @@ static int unqueue_me(struct futex_q *q)
 	/* In the common case we don't take the spinlock, which is nice. */
 retry:
 	lock_ptr = q->lock_ptr;
+	barrier();
 	if (lock_ptr != 0) {
 		spin_lock(lock_ptr);
 		/*
-- cgit v1.2.3-55-g7522

From 9f59ce5d0e0dd837853385927b150f5cef3a7f52 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert
Date: Sat, 5 Aug 2006 12:14:11 -0700
Subject: [PATCH] ptrace: make pid of child process available for PTRACE_EVENT_VFORK_DONE

When delivering PTRACE_EVENT_VFORK_DONE, provide the pid of the child
process when the tracer calls ptrace(PTRACE_GETEVENTMSG).  This is
already (accidentally) available when the tracer is tracing VFORK in
addition to VFORK_DONE.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Daniel Jacobowitz
Cc: Albert Cahalan
Cc: Roland McGrath
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1b0f7b1e0881..aa36c43783cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1387,8 +1387,10 @@ long do_fork(unsigned long clone_flags,
 		if (clone_flags & CLONE_VFORK) {
 			wait_for_completion(&vfork);
-			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
+			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
+				current->ptrace_message = nr;
 				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
+			}
 		}
 	} else {
 		free_pid(pid);
-- cgit v1.2.3-55-g7522

From 78944e549d36673eb6265a2411574e79c28e23dc Mon Sep 17 00:00:00 2001
From: Antonino A. Daplas
Date: Sat, 5 Aug 2006 12:14:16 -0700
Subject: [PATCH] vt: printk: Fix framebuffer console triggering might_sleep assertion

Reported by: Dave Jones

Whilst printk'ing to both console and serial console, I got this...
(2.6.18rc1)

BUG: sleeping function called from invalid context at kernel/sched.c:4438
in_atomic():0, irqs_disabled():1

Call Trace:
 [] show_trace+0xaa/0x23d
 [] dump_stack+0x15/0x17
 [] __might_sleep+0xb2/0xb4
 [] __cond_resched+0x15/0x55
 [] cond_resched+0x3b/0x42
 [] console_conditional_schedule+0x12/0x14
 [] fbcon_redraw+0xf6/0x160
 [] fbcon_scroll+0x5d9/0xb52
 [] scrup+0x6b/0xd6
 [] lf+0x24/0x44
 [] vt_console_print+0x166/0x23d
 [] __call_console_drivers+0x65/0x76
 [] _call_console_drivers+0x5e/0x62
 [] release_console_sem+0x14b/0x232
 [] fb_flashcursor+0x279/0x2a6
 [] run_workqueue+0xa8/0xfb
 [] worker_thread+0xef/0x122
 [] kthread+0x100/0x136
 [] child_rip+0x8/0x12

This can occur when release_console_sem() is called but the log buffer
still has contents that need to be flushed.  The console drivers are
called while the console_may_schedule flag is still true.  The
might_sleep() is triggered when fbcon calls console_conditional_schedule().

Fix by setting console_may_schedule to zero earlier, before the call to
the console drivers.
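A condensed sketch of that ordering (the variable names follow the kernel's, but the flush loop is reduced to a stub; this is an illustration, not the real function):

#include <stdio.h>

static int console_may_schedule = 1;	/* checked by console_conditional_schedule() */
static int console_locked = 1;

/* stand-in for the loop that hands the log buffer to the console drivers */
static void flush_to_console_drivers(void)
{
	printf("flushing with console_may_schedule == %d\n", console_may_schedule);
}

static void release_console_sem_sketch(void)
{
	console_may_schedule = 0;	/* cleared first: drivers may no longer sleep */
	flush_to_console_drivers();
	console_locked = 0;		/* the flag used to be cleared only here */
}

int main(void)
{
	release_console_sem_sketch();
	return 0;
}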
Signed-off-by: Antonino Daplas
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/printk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 65ca0688f86f..1149365e989e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -799,6 +799,9 @@ void release_console_sem(void)
 		up(&secondary_console_sem);
 		return;
 	}
+
+	console_may_schedule = 0;
+
 	for ( ; ; ) {
 		spin_lock_irqsave(&logbuf_lock, flags);
 		wake_klogd |= log_start - log_end;
@@ -812,7 +815,6 @@ void release_console_sem(void)
 		local_irq_restore(flags);
 	}
 	console_locked = 0;
-	console_may_schedule = 0;
 	up(&console_sem);
 	spin_unlock_irqrestore(&logbuf_lock, flags);
 	if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
-- cgit v1.2.3-55-g7522

From 0f04ab5efbca73ab366a156d96b073d2da35b158 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Sat, 5 Aug 2006 12:14:59 -0700
Subject: [PATCH] memory hotadd fixes: change find_next_system_ram's return value manner

find_next_system_ram() returns a valid memory range that meets the
requested area; it is used only by memory hot-add.

This function always rewrites the requested resource, even if the
returned area does not fit fully within the requested one, and the
returned resource is sometimes larger than the requested area.  This
annoys the caller.  This patch changes the returned value to fit
within the requested area.

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Keith Mannthey
Cc: Yasunori Goto
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/resource.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 0dd3a857579e..63e879379dbd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -261,8 +261,10 @@ int find_next_system_ram(struct resource *res)
 	if (!p)
 		return -1;
 	/* copy data */
-	res->start = p->start;
-	res->end = p->end;
+	if (res->start < p->start)
+		res->start = p->start;
+	if (res->end > p->end)
+		res->end = p->end;
 	return 0;
 }
 #endif
-- cgit v1.2.3-55-g7522
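A small stand-alone sketch of the clipping behaviour this patch introduces (plain types instead of struct resource, illustrative values):

#include <stdio.h>

struct range { unsigned long start, end; };

/* clip the request to the found chunk instead of overwriting it */
static void clip_to(struct range *req, const struct range *found)
{
	if (req->start < found->start)
		req->start = found->start;
	if (req->end > found->end)
		req->end = found->end;
}

int main(void)
{
	struct range req   = { 0x1000, 0x9000 };
	struct range found = { 0x0000, 0x4000 };	/* larger than the request on the left */

	clip_to(&req, &found);
	printf("[%lx-%lx]\n", req.start, req.end);	/* [1000-4000]: inside the request */
	return 0;
}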
From 58c1b5b079071d82b2f924000b7e8fb5585ce7d8 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Sat, 5 Aug 2006 12:15:01 -0700
Subject: [PATCH] memory hotadd fixes: find_next_system_ram catch range fix

find_next_system_ram() is used to find an available memory resource
when onlining newly added memory.  This patch fixes the following
problem: find_next_system_ram() cannot catch this case.

Resource:  (start)-------------(end)
Section :                           (start)-------------(end)

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Keith Mannthey
Cc: Yasunori Goto
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/resource.c   | 3 ++-
 mm/memory_hotplug.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 63e879379dbd..46286434af80 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -244,6 +244,7 @@ int find_next_system_ram(struct resource *res)
 
 	start = res->start;
 	end = res->end;
+	BUG_ON(start >= end);
 
 	read_lock(&resource_lock);
 	for (p = iomem_resource.child; p ; p = p->sibling) {
@@ -254,7 +255,7 @@ int find_next_system_ram(struct resource *res)
 			p = NULL;
 			break;
 		}
-		if (p->start >= start)
+		if ((p->end >= start) && (p->start < end))
 			break;
 	}
 	read_unlock(&resource_lock);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7d25cc12235f..26f1840879d6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -163,7 +163,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	res.flags = IORESOURCE_MEM; /* we just need system ram */
 	section_end = res.end;
 
-	while (find_next_system_ram(&res) >= 0) {
+	while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
 		start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
 		nr_pages = (unsigned long)
 			   ((res.end + 1 - res.start) >> PAGE_SHIFT);
-- cgit v1.2.3-55-g7522

From 3773dc92055f219c2f2403f9de774b8ad41a9214 Mon Sep 17 00:00:00 2001
From: Jan Blunck
Date: Sun, 13 Aug 2006 23:24:19 -0700
Subject: [PATCH] fix hrtimer percpu usage typo

The percpu variable is used incorrectly in switch_hrtimer_base().
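The distinction, illustrated with a plain-C stand-in for the per-cpu accessor (a hypothetical macro; the real __get_cpu_var differs): the subscript must be applied to this CPU's copy of the array, not pasted into the macro argument.

#include <stdio.h>

#define NR_CPUS   2
#define MAX_BASES 4

struct base { long clock; };

static struct base bases_percpu[NR_CPUS][MAX_BASES];	/* one array per CPU */
static int this_cpu;

/* stand-in for __get_cpu_var(var): this CPU's copy of "var" */
#define get_cpu_var_sketch(var)	(var##_percpu[this_cpu])

int main(void)
{
	int index = 3;

	/* correct, matching the fix: take this CPU's array, then index it */
	struct base *new_base = &get_cpu_var_sketch(bases)[index];

	/*
	 * The buggy form, &get_cpu_var_sketch(bases[index]), passes the
	 * subscript into the macro; here it would not even compile, and
	 * with the real __get_cpu_var the result depended on the macro's
	 * expansion rather than on the intended indexing.
	 */
	printf("base %d of cpu %d at %p\n", index, this_cpu, (void *)new_base);
	return 0;
}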
Signed-off-by: Jan Blunck
Acked-by: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 kernel/hrtimer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index be989efc7856..21c38a7e666b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -187,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	struct hrtimer_base *new_base;
 
-	new_base = &__get_cpu_var(hrtimer_bases[base->index]);
+	new_base = &__get_cpu_var(hrtimer_bases)[base->index];
 
 	if (base != new_base) {
 		/*
-- cgit v1.2.3-55-g7522

From 657b3010d8f8a72195dfcbe63040127d596f0b14 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 13 Aug 2006 23:24:19 -0700
Subject: [PATCH] panic.c build fix

kernel/panic.c: In function 'add_taint':
kernel/panic.c:176: warning: implicit declaration of function 'debug_locks_off'

Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 kernel/panic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index d8a0bca21233..9b8dcfd1ca93 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 
 int panic_on_oops;
 int tainted;
-- cgit v1.2.3-55-g7522

From 6997a6faaa129a1c91775f7344c8d371a05178ea Mon Sep 17 00:00:00 2001
From: Kirill Korotaev
Date: Sun, 13 Aug 2006 23:24:23 -0700
Subject: [PATCH] sys_getppid oopses on debug kernel

sys_getppid() optimization can access freed memory.  On kernels with
DEBUG_SLAB turned ON, this results in an Oops.

As Dave Hansen noted, this optimization is also unsafe for memory
hotplug.

So this patch always takes the lock to be safe.

[oleg@tv-sign.ru: simplifications]
Signed-off-by: Kirill Korotaev
Cc:
Cc: Dave Hansen
Signed-off-by: Oleg Nesterov
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 kernel/timer.c | 41 +++++++----------------------------------
 1 file changed, 7 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index b650f04888ed..1d7dd6267c2d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1324,46 +1324,19 @@ asmlinkage long sys_getpid(void)
 }
 
 /*
- * Accessing ->group_leader->real_parent is not SMP-safe, it could
- * change from under us. However, rather than getting any lock
- * we can use an optimistic algorithm: get the parent
- * pid, and go back and check that the parent is still
- * the same. If it has changed (which is extremely unlikely
- * indeed), we just try again..
- *
- * NOTE! This depends on the fact that even if we _do_
- * get an old value of "parent", we can happily dereference
- * the pointer (it was and remains a dereferencable kernel pointer
- * no matter what): we just can't necessarily trust the result
- * until we know that the parent pointer is valid.
- *
- * NOTE2: ->group_leader never changes from under us.
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
  */
 asmlinkage long sys_getppid(void)
 {
 	int pid;
-	struct task_struct *me = current;
-	struct task_struct *parent;
 
-	parent = me->group_leader->real_parent;
-	for (;;) {
-		pid = parent->tgid;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-{
-		struct task_struct *old = parent;
+	rcu_read_lock();
+	pid = rcu_dereference(current->real_parent)->tgid;
+	rcu_read_unlock();
 
-		/*
-		 * Make sure we read the pid before re-reading the
-		 * parent pointer:
-		 */
-		smp_rmb();
-		parent = me->group_leader->real_parent;
-		if (old != parent)
-			continue;
-}
-#endif
-		break;
-	}
 	return pid;
 }
-- cgit v1.2.3-55-g7522

From e579dcbf23604cb33c08b5c3c3ac06ca36e7c683 Mon Sep 17 00:00:00 2001
From: john stultz
Date: Sun, 13 Aug 2006 23:24:24 -0700
Subject: [PATCH] futex_handle_fault always fails

We found this issue last week w/ the -RT kernel, but it seems the same
issue is in mainline as well.

Basically it is possible for futex_unlock_pi to return without actually
freeing the lock.  This is due to buggy logic in the use of
futex_handle_fault() and its attempt argument in a failure case.

Looking at futex.c the logic is as follows:

1) In futex_unlock_pi() we start w/ ret=0 and we go down to the first
   futex_atomic_cmpxchg_inatomic(), where we find uval==-EFAULT.  We
   then jump to the pi_faulted label.

2) From pi_faulted: We increment attempt, unlock the sem and hit the
   retry label.

3) From the retry label, with ret still zero, we again hit EFAULT on
   the first futex_atomic_cmpxchg_inatomic(), and again goto the
   pi_faulted label.

4) Again from pi_faulted: we increment attempt and enter the
   conditional, where we call futex_handle_fault.

5) futex_handle_fault fails, and we goto the out_unlock_release_sem
   label.

6) From out_unlock_release_sem we return, and since ret is still zero,
   we return without error, while never actually unlocking the lock.

Issue #1: at the first futex_atomic_cmpxchg_inatomic() we should
probably be setting ret=-EFAULT before jumping to pi_faulted.  However,
in our case this doesn't really affect anything, as the glibc we're
using ignores the error value from futex_unlock_pi().

Issue #2: Look at futex_handle_fault(); its first conditional will
return -EFAULT if attempt is >= 2.  However, from the
"if(attempt++) futex_handle_fault(attempt)" logic above, we'll *never*
call futex_handle_fault when attempt is less than two.  So we never get
a chance to even try to fault the page in.

The following patch addresses these two issues by 1) always setting ret
to -EFAULT if futex_handle_fault fails, and 2) removing the = in
futex_handle_fault's (attempt >= 2) check.

I'm really not sure this is the right fix, but wanted to bring it up so
folks knew the issue is alive and well in the current -git tree.  From
looking at the git logs the logic was first introduced (then later
copied to other places) in the following commit almost a year ago:

http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=4732efbeb997189d9f9b04708dc26bf8613ed721;hp=5b039e681b8c5f30aac9cc04385cc94be45d0823
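A user-space toy model of the attempt counting in Issue #2 (hypothetical; -1 stands in for -EFAULT), showing why the old ">= 2" check meant the handler never got a real try:

#include <stdio.h>

/* models futex_handle_fault(): fail once attempt exceeds the limit */
static int handle_fault_sketch(int attempt, int limit)
{
	return attempt > limit ? -1 : 0;
}

int main(void)
{
	int attempt = 0;

	if (attempt++)		/* first fault: 0 is false, so just retry */
		;
	if (attempt++) {	/* second fault: handler runs with attempt == 2 */
		/* old check, attempt >= 2: fails before faulting anything in */
		printf("old:   %d\n", handle_fault_sketch(attempt, 1));
		/* fixed check, attempt > 2: gets one real try */
		printf("fixed: %d\n", handle_fault_sketch(attempt, 2));
	}
	return 0;
}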
Cc: Rusty Russell
Cc: Ingo Molnar
Acked-by: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 kernel/futex.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index c2b2e0b83abf..d4633c588f33 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -297,7 +297,7 @@ static int futex_handle_fault(unsigned long address, int attempt)
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
 
-	if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+	if (attempt > 2 || !(vma = find_vma(mm, address)) ||
 	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
 		return -EFAULT;
 
@@ -747,8 +747,10 @@ retry:
 		 */
 		if (attempt++) {
 			if (futex_handle_fault((unsigned long)uaddr2,
-					       attempt))
+					       attempt)) {
+				ret = -EFAULT;
 				goto out;
+			}
 			goto retry;
 		}
 
@@ -1322,9 +1324,10 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 		 * still holding the mmap_sem.
 		 */
 		if (attempt++) {
-			if (futex_handle_fault((unsigned long)uaddr, attempt))
+			if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+				ret = -EFAULT;
 				goto out_unlock_release_sem;
-
+			}
 			goto retry_locked;
 		}
 
@@ -1506,9 +1509,10 @@ pi_faulted:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt))
+		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+			ret = -EFAULT;
 			goto out_unlock;
-
+		}
 		goto retry_locked;
 	}
-- cgit v1.2.3-55-g7522

From 9b41ea7289a589993d3daabc61f999b4147872c4 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Sun, 13 Aug 2006 23:24:26 -0700
Subject: [PATCH] workqueue: remove lock_cpu_hotplug()

Use a private lock instead.  It protects all per-cpu data structures in
workqueue.c, including the workqueues list.

Fix a bug in schedule_on_each_cpu(): it was forgetting to lock down the
per-cpu resources.

Unfixed long-standing bug: if someone unplugs the CPU identified by
`singlethread_cpu' the kernel will get very sick.
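A toy model of the scheme just described (user-space, hypothetical names): one private mutex serialises all per-cpu bookkeeping, and the hotplug callback holds it across the whole DOWN_PREPARE-to-DEAD window, releasing it on DOWN_FAILED if the unplug is aborted.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t workqueue_mutex_sketch = PTHREAD_MUTEX_INITIALIZER;
static int nr_online_sketch = 2;

enum { DOWN_PREPARE, DOWN_FAILED, DEAD };

static void hotplug_callback_sketch(int action)
{
	switch (action) {
	case DOWN_PREPARE:
		pthread_mutex_lock(&workqueue_mutex_sketch);	/* freeze the lists */
		break;
	case DOWN_FAILED:
		pthread_mutex_unlock(&workqueue_mutex_sketch);	/* nothing changed */
		break;
	case DEAD:
		nr_online_sketch--;				/* take over work, etc. */
		pthread_mutex_unlock(&workqueue_mutex_sketch);
		break;
	}
}

int main(void)
{
	hotplug_callback_sketch(DOWN_PREPARE);
	hotplug_callback_sketch(DEAD);
	printf("online: %d\n", nr_online_sketch);
	return 0;
}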
Cc: Dave Jones
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 kernel/workqueue.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 448e8f7b342d..835fe28b87a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,7 +68,7 @@ struct workqueue_struct {
 /* All the per-cpu workqueues on the system, for hotplug cpu to
    add/remove threads to each one as cpus come/go. */
-static DEFINE_SPINLOCK(workqueue_lock);
+static DEFINE_MUTEX(workqueue_mutex);
 static LIST_HEAD(workqueues);
 
 static int singlethread_cpu;
@@ -320,10 +320,10 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 	} else {
 		int cpu;
 
-		lock_cpu_hotplug();
+		mutex_lock(&workqueue_mutex);
 		for_each_online_cpu(cpu)
 			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
-		unlock_cpu_hotplug();
+		mutex_unlock(&workqueue_mutex);
 	}
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -371,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	}
 
 	wq->name = name;
-	/* We don't need the distraction of CPUs appearing and vanishing. */
-	lock_cpu_hotplug();
+	mutex_lock(&workqueue_mutex);
 	if (singlethread) {
 		INIT_LIST_HEAD(&wq->list);
 		p = create_workqueue_thread(wq, singlethread_cpu);
@@ -381,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 		else
 			wake_up_process(p);
 	} else {
-		spin_lock(&workqueue_lock);
 		list_add(&wq->list, &workqueues);
-		spin_unlock(&workqueue_lock);
 		for_each_online_cpu(cpu) {
 			p = create_workqueue_thread(wq, cpu);
 			if (p) {
@@ -393,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 				destroy = 1;
 		}
 	}
-	unlock_cpu_hotplug();
+	mutex_unlock(&workqueue_mutex);
 
 	/*
 	 * Was there any error during startup? If yes then clean up:
@@ -434,17 +431,15 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	flush_workqueue(wq);
 
 	/* We don't need the distraction of CPUs appearing and vanishing. */
-	lock_cpu_hotplug();
+	mutex_lock(&workqueue_mutex);
 	if (is_single_threaded(wq))
 		cleanup_workqueue_thread(wq, singlethread_cpu);
 	else {
 		for_each_online_cpu(cpu)
 			cleanup_workqueue_thread(wq, cpu);
-		spin_lock(&workqueue_lock);
 		list_del(&wq->list);
-		spin_unlock(&workqueue_lock);
 	}
-	unlock_cpu_hotplug();
+	mutex_unlock(&workqueue_mutex);
 	free_percpu(wq->cpu_wq);
 	kfree(wq);
 }
@@ -515,11 +510,13 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
 	if (!works)
 		return -ENOMEM;
 
+	mutex_lock(&workqueue_mutex);
 	for_each_online_cpu(cpu) {
 		INIT_WORK(per_cpu_ptr(works, cpu), func, info);
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
 				per_cpu_ptr(works, cpu));
 	}
+	mutex_unlock(&workqueue_mutex);
 	flush_workqueue(keventd_wq);
 	free_percpu(works);
 	return 0;
@@ -635,6 +632,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
+		mutex_lock(&workqueue_mutex);
 		/* Create a new workqueue thread for it. */
 		list_for_each_entry(wq, &workqueues, list) {
 			if (!create_workqueue_thread(wq, hotcpu)) {
@@ -653,6 +651,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 			kthread_bind(cwq->thread, hotcpu);
 			wake_up_process(cwq->thread);
 		}
+		mutex_unlock(&workqueue_mutex);
 		break;
 
 	case CPU_UP_CANCELED:
@@ -664,6 +663,15 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 						any_online_cpu(cpu_online_map));
 			cleanup_workqueue_thread(wq, hotcpu);
 		}
+		mutex_unlock(&workqueue_mutex);
+		break;
+
+	case CPU_DOWN_PREPARE:
+		mutex_lock(&workqueue_mutex);
+		break;
+
+	case CPU_DOWN_FAILED:
+		mutex_unlock(&workqueue_mutex);
 		break;
 
 	case CPU_DEAD:
@@ -671,6 +679,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 			cleanup_workqueue_thread(wq, hotcpu);
 		list_for_each_entry(wq, &workqueues, list)
 			take_over_work(wq, hotcpu);
+		mutex_unlock(&workqueue_mutex);
 		break;
 	}
-- cgit v1.2.3-55-g7522

From f8986c241dfd54d51c9eff967129a550ae230144 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sun, 27 Aug 2006 01:23:34 -0700
Subject: [PATCH] revert "Drop tasklist lock in do_sched_setscheduler"

sched_setscheduler() looks at ->signal->rlim[].  It is unsafe to
dereference ->signal unless tasklist_lock or ->siglock is held (or
p == current).  We pin the task structure, but this can't prevent
release_task()->__exit_signal(), which sets ->signal = NULL.

Restore tasklist_lock across the setscheduler call.
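A compilable sketch of the rule the revert restores (hypothetical *_sketch names; a pthread rwlock stands in for tasklist_lock): pinning the task structure keeps the task itself alive, but not the object its ->signal field points at, so the read lock must cover the dereference.

#include <pthread.h>
#include <stdio.h>

struct signal_sketch { long rlim_cur; };

struct task_sketch {
	struct signal_sketch *signal;	/* cleared by the teardown path */
};

/* teardown takes the write side before clearing ->signal,
 * so readers holding the read side never see it half-gone */
static pthread_rwlock_t tasklist_lock_sketch = PTHREAD_RWLOCK_INITIALIZER;

static long setscheduler_sketch(struct task_sketch *p)
{
	return p->signal->rlim_cur;	/* the ->signal dereference at issue */
}

int main(void)
{
	struct signal_sketch sig = { 42 };
	struct task_sketch task = { &sig };
	long ret;

	pthread_rwlock_rdlock(&tasklist_lock_sketch);	/* ->signal pinned */
	ret = setscheduler_sketch(&task);
	pthread_rwlock_unlock(&tasklist_lock_sketch);	/* teardown may run now */

	printf("rlim_cur %ld\n", ret);
	return 0;
}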
Signed-off-by: Oleg Nesterov
Cc: Greg KH
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a2be2d055299..a234fbee1238 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4162,10 +4162,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 		read_unlock_irq(&tasklist_lock);
 		return -ESRCH;
 	}
-	get_task_struct(p);
-	read_unlock_irq(&tasklist_lock);
 	retval = sched_setscheduler(p, policy, &lparam);
-	put_task_struct(p);
+	read_unlock_irq(&tasklist_lock);
 
 	return retval;
 }
-- cgit v1.2.3-55-g7522

From d015baebba44613ef59ddffeae2114fa4ede7104 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sun, 27 Aug 2006 01:23:40 -0700
Subject: [PATCH] futex_find_get_task(): remove an obscure EXIT_ZOMBIE check

futex_find_get_task:

	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE)
		return NULL;

I can't understand this.  First, p->state can't be EXIT_ZOMBIE.  The
->exit_state check looks strange too.  Sub-threads or tasks whose
->parent ignores SIGCHLD go directly to the EXIT_DEAD state (I am
ignoring a ptrace case).  Why should EXIT_DEAD tasks be ok?  Yes,
EXIT_ZOMBIE is more important (a task may stay zombie for a long
time), but this doesn't mean we should explicitly ignore other
EXIT_XXX states.

Signed-off-by: Oleg Nesterov
Acked-by: Ingo Molnar
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index d4633c588f33..b9b8aea5389e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -397,7 +397,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 		p = NULL;
 		goto out_unlock;
 	}
-	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+	if (p->exit_state != 0) {
 		p = NULL;
 		goto out_unlock;
 	}
-- cgit v1.2.3-55-g7522

From 4edb9a143e31d2e191c199262226e1a5923ff8f7 Mon Sep 17 00:00:00 2001
From: Yingchao Zhou
Date: Sun, 27 Aug 2006 01:23:46 -0700
Subject: [PATCH] Remove redundant up() in stop_machine()

An up() is called in kernel/stop_machine.c on failure, and also in the
caller (unconditionally).

Signed-off-by: Zhou Yingchao
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/stop_machine.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..51cacd111dbd 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -111,7 +111,6 @@ static int stop_machine(void)
 	/* If some failed, kill them all. */
 	if (ret < 0) {
 		stopmachine_set_state(STOPMACHINE_EXIT);
-		up(&stopmachine_mutex);
 		return ret;
 	}
-- cgit v1.2.3-55-g7522
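A toy model (user-space, hypothetical names) of the double release the one-liner above removes: when the caller releases unconditionally, a release on the callee's failure path pushes the count one too high.

#include <stdio.h>

static int sema_sketch = 1;

static void up_sketch(void)   { sema_sketch++; }
static void down_sketch(void) { sema_sketch--; }

static int stop_machine_sketch(int fail)
{
	down_sketch();
	if (fail)
		return -1;	/* no up_sketch() here: the caller does it */
	return 0;
}

int main(void)
{
	int ret = stop_machine_sketch(1);

	up_sketch();	/* unconditional release in the caller */
	printf("ret %d, sema %d\n", ret, sema_sketch);	/* sema back to 1 */
	return 0;
}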
From 4c4d50f7b39cc58f1064b93a61ad617451ae41df Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Sun, 27 Aug 2006 01:23:51 -0700
Subject: [PATCH] cpuset: top_cpuset tracks hotplug changes to cpu_online_map

Change the list of cpus allowed to tasks in the top (root) cpuset to
dynamically track what cpus are online, using a CPU hotplug notifier.
Make this top cpus file read-only.

On systems that have cpusets configured in their kernel, but that
aren't actively using cpusets (for some distros, this covers the
majority of systems), all tasks end up in the top cpuset.

If that system does support CPU hotplug, then these tasks cannot make
use of CPUs that are added after system boot, because the CPUs are not
allowed in the top cpuset.  This is a surprising regression over
earlier kernels that didn't have cpusets enabled.

In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch
changes the behaviour of the 'cpus' file in the top (root) cpuset,
making it read-only, and making it automatically track the value of
cpu_online_map.  Thus tasks in the top cpuset will have automatic use
of hot-plugged CPUs allowed by their cpuset.

Thanks to Anton Blanchard and Nathan Lynch for reporting this problem,
driving the fix, and earlier versions of this patch.

Signed-off-by: Paul Jackson
Cc: Nathan Lynch
Cc: Anton Blanchard
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/cpusets.txt |  6 ++++++
 kernel/cpuset.c           | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 159e2a0c3e80..76b44290c154 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -217,6 +217,12 @@ exclusive cpuset.
 Also, the use of a Linux virtual file system (vfs) to represent the
 cpuset hierarchy provides for a familiar permission and name space
 for cpusets, with a minimum of additional kernel code.
 
+The cpus file in the root (top_cpuset) cpuset is read-only.
+It automatically tracks the value of cpu_online_map, using a CPU
+hotplug notifier.  If and when memory nodes can be hotplugged,
+we expect to make the mems file in the root cpuset read-only
+as well, and have it track the value of node_online_map.
+
 
 1.4 What are exclusive cpusets ?
 --------------------------------
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1a649f2bb9bb..f1dda98bdffe 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -816,6 +816,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	struct cpuset trialcs;
 	int retval, cpus_unchanged;
 
+	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = cpulist_parse(buf, trialcs.cpus_allowed);
 	if (retval < 0)
@@ -2033,6 +2037,33 @@ out:
 	return err;
 }
 
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period.  This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This handles CPU hotplug (cpuhp) events.  If someday Memory
+ * Nodes can be hotplugged (dynamically changing node_online_map)
+ * then we should handle that too, perhaps in a similar way.
+ */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	top_cpuset.cpus_allowed = cpu_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+
+	return 0;
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
@@ -2043,6 +2074,8 @@ void __init cpuset_init_smp(void)
 {
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_online_map;
+
+	hotcpu_notifier(cpuset_handle_cpuhp, 0);
}
 
 /**
-- cgit v1.2.3-55-g7522

From 0d673a5a4775d3dc565b6668ed75fd2db2ede624 Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Sun, 27 Aug 2006 01:23:54 -0700
Subject: [PATCH] cpuset: oom panic fix

cpuset_excl_nodes_overlap always returns 0 if current is exiting.
This caused customers' systems to panic in the OOM killer when
processes were having trouble getting memory for the final put_user in
mm_release, even though there were lots of processes to kill.

Change to returning 1 in this case.  This achieves parity with the
!CONFIG_CPUSETS case, and was observed to fix the problem.

Signed-off-by: Nick Piggin
Acked-by: Paul Jackson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f1dda98bdffe..4ea6f0dc2fc5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2420,7 +2420,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
 {
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
-	int overlap = 0;		/* do cpusets overlap? */
+	int overlap = 1;		/* do cpusets overlap? */
 
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
-- cgit v1.2.3-55-g7522
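A toy model (user-space, hypothetical names) of why that initial value mattered: the OOM killer skips any candidate whose cpuset is reported as not overlapping, so returning 0 whenever current was exiting hid every candidate at once.

#include <stdio.h>

struct task_sketch {
	const char *comm;
	int exiting;
};

/* models cpuset_excl_nodes_overlap() when current is exiting */
static int excl_nodes_overlap_sketch(const struct task_sketch *current_task)
{
	if (current_task->exiting)
		return 1;	/* was 0: every candidate looked disjoint */
	/* ... real cpuset ancestor comparison elided ... */
	return 1;
}

int main(void)
{
	struct task_sketch cur = { "mytool", 1 };
	struct task_sketch candidates[] = { { "memhog", 0 }, { "daemon", 0 } };
	unsigned i;

	for (i = 0; i < 2; i++)
		if (excl_nodes_overlap_sketch(&cur))
			printf("%s may be considered for OOM kill\n",
			       candidates[i].comm);
	return 0;
}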