From 245282557c49754af3dbcc732316e814340d6bce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Jan 2012 11:58:43 +0800 Subject: cgroup: move struct cgroup_pidlist out from the header file It's internally used only. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..6ca7acad7c55 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3043,6 +3043,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * */ +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* how many files are using the current array */ + int use_count; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* protects the other fields */ + struct rw_semaphore mutex; +}; + /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. -- cgit v1.2.3-55-g7522 From b78949ebfb563c29808a9d0a772e3adb5561bc80 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 3 Jan 2012 21:18:30 -0800 Subject: cgroup: simplify double-check locking in cgroup_attach_proc To keep the complexity of the double-check locking in one place, move the thread_group_leader check up into attach_task_by_pid(). This allows us to use a goto instead of returning -EAGAIN. While at it, convert a couple of returns to gotos and use rcu for the !pid case also in order to simplify the logic. Changes in V2: * https://lkml.org/lkml/2011/12/22/86 (Tejun Heo) * Use a goto instead of returning -EAGAIN Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Frederic Weisbecker Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 79 +++++++++++++++++++++------------------------------------ 1 file changed, 29 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ca7acad7c55..12c07e8fd69c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2104,19 +2104,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* prevent changes to the threadgroup list while we take a snapshot. */ read_lock(&tasklist_lock); - if (!thread_group_leader(leader)) { - /* - * a race with de_thread from another thread's exec() may strip - * us of our leadership, making while_each_thread unsafe to use - * on this task. if this happens, there is no choice but to - * throw this task away and try again (from cgroup_procs_write); - * this is "double-double-toil-and-trouble-check locking". 
- */ - read_unlock(&tasklist_lock); - retval = -EAGAIN; - goto out_free_group_list; - } - tsk = leader; i = 0; do { @@ -2245,22 +2232,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) if (!cgroup_lock_live_group(cgrp)) return -ENODEV; +retry_find_task: + rcu_read_lock(); if (pid) { - rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; - } - if (threadgroup) { - /* - * RCU protects this access, since tsk was found in the - * tid map. a race with de_thread may cause group_leader - * to stop being the leader, but cgroup_attach_proc will - * detect it later. - */ - tsk = tsk->group_leader; + ret= -ESRCH; + goto out_unlock_cgroup; } /* * even if we're attaching all tasks in the thread group, we @@ -2271,29 +2250,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); - cgroup_unlock(); - return -EACCES; + ret = -EACCES; + goto out_unlock_cgroup; } - get_task_struct(tsk); - rcu_read_unlock(); - } else { - if (threadgroup) - tsk = current->group_leader; - else - tsk = current; - get_task_struct(tsk); - } - - threadgroup_lock(tsk); + } else + tsk = current; if (threadgroup) + tsk = tsk->group_leader; + get_task_struct(tsk); + rcu_read_unlock(); + + threadgroup_lock(tsk); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + threadgroup_unlock(tsk); + put_task_struct(tsk); + goto retry_find_task; + } ret = cgroup_attach_proc(cgrp, tsk); - else + } else ret = cgroup_attach_task(cgrp, tsk); - threadgroup_unlock(tsk); put_task_struct(tsk); +out_unlock_cgroup: cgroup_unlock(); return ret; } @@ -2305,16 +2293,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { - int ret; - do { - /* - * attach_proc fails with -EAGAIN if threadgroup leadership - * changes in the middle of the operation, in which case we need - * to find the task_struct for the new leader and start over. - */ - ret = attach_task_by_pid(cgrp, tgid, true); - } while (ret == -EAGAIN); - return ret; + return attach_task_by_pid(cgrp, tgid, true); } /** -- cgit v1.2.3-55-g7522 From fb5d2b4cfc24963d0e8a7df57de1ecffa10a04cf Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 3 Jan 2012 21:18:31 -0800 Subject: cgroup: replace tasklist_lock with rcu_read_lock We can replace the tasklist_lock in cgroup_attach_proc with an rcu_read_lock(). 
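In outline, the conversion swaps the tasklist_lock read side for an RCU read-side critical section around the thread-group walk. A minimal kernel-context sketch of the resulting pattern (a sketch only, not the patch itself; the real loop in the diff below additionally snapshots each thread into the flex array handed to the subsystem callbacks):

        /*
         * task_structs are freed via RCU, so rcu_read_lock() keeps every
         * thread reached by while_each_thread() valid for the duration of
         * the walk, including threads that are already PF_EXITING. Unlike
         * tasklist_lock, it does not exclude concurrent fork or exit, and
         * nothing inside the critical section may sleep.
         */
        rcu_read_lock();
        tsk = leader;
        do {
                /* read or snapshot tsk here */
        } while_each_thread(leader, tsk);
        rcu_read_unlock();

This is sufficient here because the walk only needs the task_structs to stay allocated while the snapshot is taken; the group-leader race that tasklist_lock previously guarded against is already handled by the retry_find_task loop added in the preceding patch.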
Changes in V4: * https://lkml.org/lkml/2011/12/23/284 (Frederic Weisbecker) * Minimize size of rcu_read_lock critical section * Add comment * https://lkml.org/lkml/2011/12/26/136 (Li Zefan) * Split into two patches Changes in V3: * https://lkml.org/lkml/2011/12/22/419 (Frederic Weisbecker) * Add an rcu_read_lock to protect against exit Changes in V2: * https://lkml.org/lkml/2011/12/22/86 (Tejun Heo) * Use a goto instead of returning -EAGAIN Suggested-by: Frederic Weisbecker Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Acked-by: Frederic Weisbecker Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 12c07e8fd69c..1626152dcc1e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2102,10 +2102,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (retval) goto out_free_group_list; - /* prevent changes to the threadgroup list while we take a snapshot. */ - read_lock(&tasklist_lock); tsk = leader; i = 0; + /* + * Prevent freeing of tasks while we take a snapshot. Tasks that are + * already PF_EXITING could be freed from underneath us unless we + * take an rcu_read_lock. + */ + rcu_read_lock(); do { struct task_and_cgroup ent; @@ -2128,11 +2132,11 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); + rcu_read_unlock(); /* remember the number of threads in the array for later. */ group_size = i; tset.tc_array = group; tset.tc_array_len = group_size; - read_unlock(&tasklist_lock); /* methods shouldn't be called if no task is actually migrating */ retval = 0; -- cgit v1.2.3-55-g7522 From 0ce8974d504913a0f0ae2d97b20a5ac665431a41 Mon Sep 17 00:00:00 2001 From: Eric W. Biederman Date: Fri, 6 Jan 2012 03:13:27 -0800 Subject: sysctl: Consolidate !CONFIG_SYSCTL handling - In sysctl.h move functions only available if CONFIG_SYSCL is defined inside of #ifdef CONFIG_SYSCTL - Move the stub function definitions for !CONFIG_SYSCTL into sysctl.h and make them static inlines. Signed-off-by: Eric W. 
Biederman --- include/linux/sysctl.h | 95 ++++++++++++++++++++++++++++++++------------------ kernel/sysctl.c | 26 -------------- 2 files changed, 62 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bb9127dd814b..cf3ee7f246d6 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -937,30 +937,8 @@ enum struct ctl_table; struct nsproxy; struct ctl_table_root; - -struct ctl_table_set { - struct list_head list; - struct ctl_table_set *parent; - int (*is_seen)(struct ctl_table_set *); -}; - -extern void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)); - struct ctl_table_header; -extern void sysctl_head_get(struct ctl_table_header *); -extern void sysctl_head_put(struct ctl_table_header *); -extern int sysctl_is_seen(struct ctl_table_header *); -extern struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *); -extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); -extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev); -extern void sysctl_head_finish(struct ctl_table_header *prev); -extern int sysctl_perm(struct ctl_table_root *root, - struct ctl_table *table, int op); - typedef struct ctl_table ctl_table; typedef int proc_handler (struct ctl_table *ctl, int write, @@ -1023,8 +1001,6 @@ static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) return (void *)(unsigned long)atomic_read(&poll->event); } -void proc_sys_poll_notify(struct ctl_table_poll *poll); - #define __CTL_TABLE_POLL_INITIALIZER(name) { \ .event = ATOMIC_INIT(0), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } @@ -1047,15 +1023,6 @@ struct ctl_table void *extra2; }; -struct ctl_table_root { - struct list_head root_list; - struct ctl_table_set default_set; - struct ctl_table_set *(*lookup)(struct ctl_table_root *root, - struct nsproxy *namespaces); - int (*permissions)(struct ctl_table_root *root, - struct nsproxy *namespaces, struct ctl_table *table); -}; - /* struct ctl_table_header is used to maintain dynamic lists of struct ctl_table trees. 
*/ struct ctl_table_header @@ -1078,11 +1045,45 @@ struct ctl_table_header struct ctl_table_header *parent; }; +struct ctl_table_set { + struct list_head list; + struct ctl_table_set *parent; + int (*is_seen)(struct ctl_table_set *); +}; + +struct ctl_table_root { + struct list_head root_list; + struct ctl_table_set default_set; + struct ctl_table_set *(*lookup)(struct ctl_table_root *root, + struct nsproxy *namespaces); + int (*permissions)(struct ctl_table_root *root, + struct nsproxy *namespaces, struct ctl_table *table); +}; + /* struct ctl_path describes where in the hierarchy a table is added */ struct ctl_path { const char *procname; }; +#ifdef CONFIG_SYSCTL + +void proc_sys_poll_notify(struct ctl_table_poll *poll); + +extern void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)); + +extern void sysctl_head_get(struct ctl_table_header *); +extern void sysctl_head_put(struct ctl_table_header *); +extern int sysctl_is_seen(struct ctl_table_header *); +extern struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *); +extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); +extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, + struct ctl_table_header *prev); +extern void sysctl_head_finish(struct ctl_table_header *prev); +extern int sysctl_perm(struct ctl_table_root *root, + struct ctl_table *table, int op); + void register_sysctl_root(struct ctl_table_root *root); struct ctl_table_header *__register_sysctl_paths( struct ctl_table_root *root, struct nsproxy *namespaces, @@ -1094,6 +1095,34 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); +#else /* CONFIG_SYSCTL */ +static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) +{ + return NULL; +} + +static inline struct ctl_table_header *register_sysctl_paths( + const struct ctl_path *path, struct ctl_table *table) +{ + return NULL; +} + +static inline void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +static inline void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ +} + +static inline void sysctl_head_put(struct ctl_table_header *head) +{ +} + +#endif /* CONFIG_SYSCTL */ + #endif /* __KERNEL__ */ #endif /* _LINUX_SYSCTL_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..d5bbddd0de24 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2017,32 +2017,6 @@ void setup_sysctl_set(struct ctl_table_set *p, p->is_seen = is_seen; } -#else /* !CONFIG_SYSCTL */ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -{ - return NULL; -} - -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ -} - -void sysctl_head_put(struct ctl_table_header *head) -{ -} - #endif /* CONFIG_SYSCTL */ /* -- cgit v1.2.3-55-g7522 From de4e83bd6b5e16d491ec068cd22801d5d063b07a Mon Sep 17 00:00:00 2001 From: Eric W. Biederman Date: Fri, 6 Jan 2012 03:34:20 -0800 Subject: sysctl: Register the base sysctl table like any other sysctl table. 
Simplify the code by treating the base sysctl table like any other sysctl table and register it with register_sysctl_table. To ensure this table is registered early enough to avoid problems call sysctl_init from proc_sys_init. Rename sysctl_net.c:sysctl_init() to net_sysctl_init() to avoid name conflicts now that kernel/sysctl.c:sysctl_init() is no longer static. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 3 ++- include/linux/sysctl.h | 1 + kernel/sysctl.c | 13 ++++--------- net/sysctl_net.c | 4 ++-- 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d82f4a8b4b80..9d29d28af577 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -468,5 +468,6 @@ int __init proc_sys_init(void) proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return 0; + + return sysctl_init(); } diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index cf3ee7f246d6..5e3532e9599f 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -1095,6 +1095,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); +extern int sysctl_init(void); #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d5bbddd0de24..ad460248acc7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,7 +192,7 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[]; +static struct ctl_table root_table[1]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { {{.count = 1, @@ -222,7 +222,7 @@ int sysctl_legacy_va_layout; /* The default sysctl tables: */ -static struct ctl_table root_table[] = { +static struct ctl_table sysctl_base_table[] = { { .procname = "kernel", .mode = 0555, @@ -1747,17 +1747,12 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) } } -static __init int sysctl_init(void) +int __init sysctl_init(void) { - sysctl_set_parent(NULL, root_table); -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - sysctl_check_table(current->nsproxy, root_table); -#endif + register_sysctl_table(sysctl_base_table); return 0; } -core_initcall(sysctl_init); - static struct ctl_table *is_branch_in(struct ctl_table *branch, struct ctl_table *table) { diff --git a/net/sysctl_net.c b/net/sysctl_net.c index e75813904f26..a6bbee2bc710 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -90,7 +90,7 @@ static struct pernet_operations sysctl_pernet_ops = { .exit = sysctl_net_exit, }; -static __init int sysctl_init(void) +static __init int net_sysctl_init(void) { int ret; ret = register_pernet_subsys(&sysctl_pernet_ops); @@ -102,7 +102,7 @@ static __init int sysctl_init(void) out: return ret; } -subsys_initcall(sysctl_init); +subsys_initcall(net_sysctl_init); struct ctl_table_header *register_net_sysctl_table(struct net *net, const struct ctl_path *path, struct ctl_table *table) -- cgit v1.2.3-55-g7522 From 1f87f0b52b1d6581168cb80f86746bc4df918d01 Mon Sep 17 00:00:00 2001 From: Eric W. 
Biederman Date: Fri, 6 Jan 2012 04:07:15 -0800 Subject: sysctl: Move the implementation into fs/proc/proc_sysctl.c Move the core sysctl code from kernel/sysctl.c and kernel/sysctl_check.c into fs/proc/proc_sysctl.c. Currently sysctl maintenance is hampered by the sysctl implementation being split across 3 files with artificial layering between them. Consolidate the entire sysctl implementation into 1 file so that it is easier to see what is going on and hopefully allowing for simpler maintenance. For functions that are now only used in fs/proc/proc_sysctl.c remove their declarations from sysctl.h and make them static in fs/proc/proc_sysctl.c Signed-off-by: Eric W. Biederman --- fs/proc/internal.h | 3 + fs/proc/proc_sysctl.c | 622 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 16 -- kernel/Makefile | 1 - kernel/sysctl.c | 464 ------------------------------------ kernel/sysctl_check.c | 160 ------------- 6 files changed, 625 insertions(+), 641 deletions(-) delete mode 100644 kernel/sysctl_check.c (limited to 'kernel') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 292577531ad1..3b5ecd960d6a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -10,12 +10,15 @@ */ #include +struct ctl_table_header; extern struct proc_dir_entry proc_root; #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } #endif #ifdef CONFIG_NET extern int proc_net_init(void); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 9d29d28af577..06e6f10ee8ec 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -24,6 +25,209 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) wake_up_interruptible(&poll->wait); } +static struct ctl_table root_table[1]; +static struct ctl_table_root sysctl_table_root; +static struct ctl_table_header root_table_header = { + {{.count = 1, + .ctl_table = root_table, + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, +}; +static struct ctl_table_root sysctl_table_root = { + .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), + .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in 
do_sysctl() relies on that. + */ + list_del_init(&p->ctl_entry); +} + +static void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + +static struct list_head * +lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = lookup_header_set(root, namespaces); + return &set->list; +} + +static struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, + struct ctl_table_header *prev) +{ + struct ctl_table_root *root; + struct list_head *header_list; + struct ctl_table_header *head; + struct list_head *tmp; + + spin_lock(&sysctl_lock); + if (prev) { + head = prev; + tmp = &prev->ctl_entry; + unuse_table(prev); + goto next; + } + tmp = &root_table_header.ctl_entry; + for (;;) { + head = list_entry(tmp, struct ctl_table_header, ctl_entry); + + if (!use_table(head)) + goto next; + spin_unlock(&sysctl_lock); + return head; + next: + root = head->root; + tmp = tmp->next; + header_list = lookup_header_list(root, namespaces); + if (tmp != header_list) + continue; + + do { + root = list_entry(root->root_list.next, + struct ctl_table_root, root_list); + if (root == &sysctl_table_root) + goto out; + header_list = lookup_header_list(root, namespaces); + } while (list_empty(header_list)); + tmp = header_list->next; + } +out: + spin_unlock(&sysctl_lock); + return NULL; +} + +static struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +{ + return __sysctl_head_next(current->nsproxy, prev); +} + +void register_sysctl_root(struct ctl_table_root *root) +{ + spin_lock(&sysctl_lock); + list_add_tail(&root->root_list, &sysctl_table_root.root_list); + spin_unlock(&sysctl_lock); +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. 
+ */ + +static int test_perm(int mode, int op) +{ + if (!current_euid()) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +{ + int mode; + + if (root->permissions) + mode = root->permissions(root, current->nsproxy, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + +static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) +{ + for (; table->procname; table++) { + table->parent = parent; + if (table->child) + sysctl_set_parent(table, table->child); + } +} + + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -435,6 +639,21 @@ static int proc_sys_delete(const struct dentry *dentry) return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } +static int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + static int proc_sys_compare(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -460,6 +679,409 @@ static const struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; +static struct ctl_table *is_branch_in(struct ctl_table *branch, + struct ctl_table *table) +{ + struct ctl_table *p; + const char *s = branch->procname; + + /* branch should have named subdirectory as its first element */ + if (!s || !branch->child) + return NULL; + + /* ... 
and nothing else */ + if (branch[1].procname) + return NULL; + + /* table should contain subdirectory with the same name */ + for (p = table; p->procname; p++) { + if (!p->child) + continue; + if (p->procname && strcmp(p->procname, s) == 0) + return p; + } + return NULL; +} + +/* see if attaching q to p would be an improvement */ +static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) +{ + struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + struct ctl_table *next; + int is_better = 0; + int not_in_parent = !p->attached_by; + + while ((next = is_branch_in(by, to)) != NULL) { + if (by == q->attached_by) + is_better = 1; + if (to == p->attached_by) + not_in_parent = 1; + by = by->child; + to = next->child; + } + + if (is_better && not_in_parent) { + q->attached_by = by; + q->attached_to = to; + q->parent = p; + } +} + +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK +static int sysctl_depth(struct ctl_table *table) +{ + struct ctl_table *tmp; + int depth; + + depth = 0; + for (tmp = table; tmp->parent; tmp = tmp->parent) + depth++; + + return depth; +} + +static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) +{ + int i; + + for (i = 0; table && i < n; i++) + table = table->parent; + + return table; +} + + +static void sysctl_print_path(struct ctl_table *table) +{ + struct ctl_table *tmp; + int depth, i; + depth = sysctl_depth(table); + if (table->procname) { + for (i = depth; i >= 0; i--) { + tmp = sysctl_parent(table, i); + printk("/%s", tmp->procname?tmp->procname:""); + } + } + printk(" "); +} + +static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, + struct ctl_table *table) +{ + struct ctl_table_header *head; + struct ctl_table *ref, *test; + int depth, cur_depth; + + depth = sysctl_depth(table); + + for (head = __sysctl_head_next(namespaces, NULL); head; + head = __sysctl_head_next(namespaces, head)) { + cur_depth = depth; + ref = head->ctl_table; +repeat: + test = sysctl_parent(table, cur_depth); + for (; ref->procname; ref++) { + int match = 0; + if (cur_depth && !ref->child) + continue; + + if (test->procname && ref->procname && + (strcmp(test->procname, ref->procname) == 0)) + match++; + + if (match) { + if (cur_depth != 0) { + cur_depth--; + ref = ref->child; + goto repeat; + } + goto out; + } + } + } + ref = NULL; +out: + sysctl_head_finish(head); + return ref; +} + +static void set_fail(const char **fail, struct ctl_table *table, const char *str) +{ + if (*fail) { + printk(KERN_ERR "sysctl table check failed: "); + sysctl_print_path(table); + printk(" %s\n", *fail); + dump_stack(); + } + *fail = str; +} + +static void sysctl_check_leaf(struct nsproxy *namespaces, + struct ctl_table *table, const char **fail) +{ + struct ctl_table *ref; + + ref = sysctl_check_lookup(namespaces, table); + if (ref && (ref != table)) + set_fail(fail, table, "Sysctl already exists"); +} + +static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) +{ + int error = 0; + for (; table->procname; table++) { + const char *fail = NULL; + + if (table->parent) { + if (!table->parent->procname) + set_fail(&fail, table, "Parent without procname"); + } + if (table->child) { + if (table->data) + set_fail(&fail, table, "Directory with data?"); + if (table->maxlen) + set_fail(&fail, table, "Directory with maxlen?"); + if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) + set_fail(&fail, table, "Writable sysctl directory"); + if (table->proc_handler) + set_fail(&fail, table, "Directory with proc_handler"); + if (table->extra1) + 
set_fail(&fail, table, "Directory with extra1"); + if (table->extra2) + set_fail(&fail, table, "Directory with extra2"); + } else { + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + set_fail(&fail, table, "No data"); + if (!table->maxlen) + set_fail(&fail, table, "No maxlen"); + } +#ifdef CONFIG_PROC_SYSCTL + if (!table->proc_handler) + set_fail(&fail, table, "No proc_handler"); +#endif + sysctl_check_leaf(namespaces, table, &fail); + } + if (table->mode > 0777) + set_fail(&fail, table, "bogus .mode"); + if (fail) { + set_fail(&fail, table, NULL); + error = -EINVAL; + } + if (table->child) + error |= sysctl_check_table(namespaces, table->child); + } + return error; +} +#endif /* CONFIG_SYSCTL_SYSCALL_CHECK */ + +/** + * __register_sysctl_paths - register a sysctl hierarchy + * @root: List of sysctl headers to register on + * @namespaces: Data to compute which lists of sysctl entries are visible + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * + * child - a pointer to the child sysctl table if this entry is a directory, or + * %NULL. + * + * proc_handler - the text handler routine (described below) + * + * de - for internal use by the sysctl routines + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. 
+ */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_root *root, + struct nsproxy *namespaces, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table_header *header; + struct ctl_table *new, **prevp; + unsigned int n, npath; + struct ctl_table_set *set; + + /* Count the path components */ + for (npath = 0; path[npath].procname; ++npath) + ; + + /* + * For each path component, allocate a 2-element ctl_table array. + * The first array element will be filled with the sysctl entry + * for this, the second will be the sentinel (procname == 0). + * + * We allocate everything in one go so that we don't have to + * worry about freeing additional memory in unregister_sysctl_table. + */ + header = kzalloc(sizeof(struct ctl_table_header) + + (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); + if (!header) + return NULL; + + new = (struct ctl_table *) (header + 1); + + /* Now connect the dots */ + prevp = &header->ctl_table; + for (n = 0; n < npath; ++n, ++path) { + /* Copy the procname */ + new->procname = path->procname; + new->mode = 0555; + + *prevp = new; + prevp = &new->child; + + new += 2; + } + *prevp = table; + header->ctl_table_arg = table; + + INIT_LIST_HEAD(&header->ctl_entry); + header->used = 0; + header->unregistering = NULL; + header->root = root; + sysctl_set_parent(NULL, header->ctl_table); + header->count = 1; +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK + if (sysctl_check_table(namespaces, header->ctl_table)) { + kfree(header); + return NULL; + } +#endif + spin_lock(&sysctl_lock); + header->set = lookup_header_set(root, namespaces); + header->attached_by = header->ctl_table; + header->attached_to = root_table; + header->parent = &root_table_header; + for (set = header->set; set; set = set->parent) { + struct ctl_table_header *p; + list_for_each_entry(p, &set->list, ctl_entry) { + if (p->unregistering) + continue; + try_attach(p, header); + } + } + header->parent->count++; + list_add_tail(&header->ctl_entry, &header->set->list); + spin_unlock(&sysctl_lock); + + return header; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, + path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. 
+ */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + might_sleep(); + + if (header == NULL) + return; + + spin_lock(&sysctl_lock); + start_unregistering(header); + if (!--header->parent->count) { + WARN_ON(1); + kfree_rcu(header->parent, rcu); + } + if (!--header->count) + kfree_rcu(header, rcu); + spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ + INIT_LIST_HEAD(&p->list); + p->parent = parent ? parent : &sysctl_table_root.default_set; + p->is_seen = is_seen; +} + + int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 5e3532e9599f..08cabbfddacb 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -1073,17 +1073,6 @@ extern void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_set *parent, int (*is_seen)(struct ctl_table_set *)); -extern void sysctl_head_get(struct ctl_table_header *); -extern void sysctl_head_put(struct ctl_table_header *); -extern int sysctl_is_seen(struct ctl_table_header *); -extern struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *); -extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); -extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev); -extern void sysctl_head_finish(struct ctl_table_header *prev); -extern int sysctl_perm(struct ctl_table_root *root, - struct ctl_table *table, int op); - void register_sysctl_root(struct ctl_table_root *root); struct ctl_table_header *__register_sysctl_paths( struct ctl_table_root *root, struct nsproxy *namespaces, @@ -1093,7 +1082,6 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table); void unregister_sysctl_table(struct ctl_table_header * table); -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); extern int sysctl_init(void); #else /* CONFIG_SYSCTL */ @@ -1118,10 +1106,6 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, { } -static inline void sysctl_head_put(struct ctl_table_header *head) -{ -} - #endif /* CONFIG_SYSCTL */ #endif /* __KERNEL__ */ diff --git a/kernel/Makefile b/kernel/Makefile index 2d9de86b7e76..cb41b9547c9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,7 +27,6 @@ obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad460248acc7..b774909ed46c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,20 +192,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[1]; -static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { - {{.count = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, - .root = &sysctl_table_root, - .set = &sysctl_table_root.default_set, -}; -static struct ctl_table_root sysctl_table_root = { - .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), -}; - static struct ctl_table kern_table[]; static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; 
@@ -1559,459 +1545,12 @@ static struct ctl_table dev_table[] = { { } }; -static DEFINE_SPINLOCK(sysctl_lock); - -/* called under sysctl_lock */ -static int use_table(struct ctl_table_header *p) -{ - if (unlikely(p->unregistering)) - return 0; - p->used++; - return 1; -} - -/* called under sysctl_lock */ -static void unuse_table(struct ctl_table_header *p) -{ - if (!--p->used) - if (unlikely(p->unregistering)) - complete(p->unregistering); -} - -/* called under sysctl_lock, will reacquire if has to wait */ -static void start_unregistering(struct ctl_table_header *p) -{ - /* - * if p->used is 0, nobody will ever touch that entry again; - * we'll eliminate all paths to it before dropping sysctl_lock - */ - if (unlikely(p->used)) { - struct completion wait; - init_completion(&wait); - p->unregistering = &wait; - spin_unlock(&sysctl_lock); - wait_for_completion(&wait); - spin_lock(&sysctl_lock); - } else { - /* anything non-NULL; we'll never dereference it */ - p->unregistering = ERR_PTR(-EINVAL); - } - /* - * do not remove from the list until nobody holds it; walking the - * list in do_sysctl() relies on that. - */ - list_del_init(&p->ctl_entry); -} - -void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); -} - -struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) -{ - if (!head) - BUG(); - spin_lock(&sysctl_lock); - if (!use_table(head)) - head = ERR_PTR(-ENOENT); - spin_unlock(&sysctl_lock); - return head; -} - -void sysctl_head_finish(struct ctl_table_header *head) -{ - if (!head) - return; - spin_lock(&sysctl_lock); - unuse_table(head); - spin_unlock(&sysctl_lock); -} - -static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = &root->default_set; - if (root->lookup) - set = root->lookup(root, namespaces); - return set; -} - -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = lookup_header_set(root, namespaces); - return &set->list; -} - -struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev) -{ - struct ctl_table_root *root; - struct list_head *header_list; - struct ctl_table_header *head; - struct list_head *tmp; - - spin_lock(&sysctl_lock); - if (prev) { - head = prev; - tmp = &prev->ctl_entry; - unuse_table(prev); - goto next; - } - tmp = &root_table_header.ctl_entry; - for (;;) { - head = list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - goto next; - spin_unlock(&sysctl_lock); - return head; - next: - root = head->root; - tmp = tmp->next; - header_list = lookup_header_list(root, namespaces); - if (tmp != header_list) - continue; - - do { - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - if (root == &sysctl_table_root) - goto out; - header_list = lookup_header_list(root, namespaces); - } while (list_empty(header_list)); - tmp = header_list->next; - } -out: - spin_unlock(&sysctl_lock); - return NULL; -} - -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) -{ - return __sysctl_head_next(current->nsproxy, prev); -} - -void register_sysctl_root(struct ctl_table_root *root) -{ - spin_lock(&sysctl_lock); - list_add_tail(&root->root_list, 
&sysctl_table_root.root_list); - spin_unlock(&sysctl_lock); -} - -/* - * sysctl_perm does NOT grant the superuser all rights automatically, because - * some sysctl variables are readonly even to root. - */ - -static int test_perm(int mode, int op) -{ - if (!current_euid()) - mode >>= 6; - else if (in_egroup_p(0)) - mode >>= 3; - if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) - return 0; - return -EACCES; -} - -int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) -{ - int mode; - - if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); - else - mode = table->mode; - - return test_perm(mode, op); -} - -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ - for (; table->procname; table++) { - table->parent = parent; - if (table->child) - sysctl_set_parent(table, table->child); - } -} - int __init sysctl_init(void) { register_sysctl_table(sysctl_base_table); return 0; } -static struct ctl_table *is_branch_in(struct ctl_table *branch, - struct ctl_table *table) -{ - struct ctl_table *p; - const char *s = branch->procname; - - /* branch should have named subdirectory as its first element */ - if (!s || !branch->child) - return NULL; - - /* ... and nothing else */ - if (branch[1].procname) - return NULL; - - /* table should contain subdirectory with the same name */ - for (p = table; p->procname; p++) { - if (!p->child) - continue; - if (p->procname && strcmp(p->procname, s) == 0) - return p; - } - return NULL; -} - -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) -{ - struct ctl_table *to = p->ctl_table, *by = q->ctl_table; - struct ctl_table *next; - int is_better = 0; - int not_in_parent = !p->attached_by; - - while ((next = is_branch_in(by, to)) != NULL) { - if (by == q->attached_by) - is_better = 1; - if (to == p->attached_by) - not_in_parent = 1; - by = by->child; - to = next->child; - } - - if (is_better && not_in_parent) { - q->attached_by = by; - q->attached_to = to; - q->parent = p; - } -} - -/** - * __register_sysctl_paths - register a sysctl hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * The members of the &struct ctl_table structure are used as follows: - * - * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not - * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) - * - * child - a pointer to the child sysctl table if this entry is a directory, or - * %NULL. - * - * proc_handler - the text handler routine (described below) - * - * de - for internal use by the sysctl routines - * - * extra1, extra2 - extra pointers usable by the proc handler routines - * - * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. - * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. 
The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. - * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler). Several default handlers are available to - * cover common cases - - * - * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), - * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), - * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() - * - * It is the handler's job to read the input buffer from user memory - * and process it. The handler should return 0 on success. - * - * This routine returns %NULL on a failure to register, and a pointer - * to the table header on success. - */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_root *root, - struct nsproxy *namespaces, - const struct ctl_path *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - struct ctl_table *new, **prevp; - unsigned int n, npath; - struct ctl_table_set *set; - - /* Count the path components */ - for (npath = 0; path[npath].procname; ++npath) - ; - - /* - * For each path component, allocate a 2-element ctl_table array. - * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (procname == 0). - * - * We allocate everything in one go so that we don't have to - * worry about freeing additional memory in unregister_sysctl_table. - */ - header = kzalloc(sizeof(struct ctl_table_header) + - (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); - if (!header) - return NULL; - - new = (struct ctl_table *) (header + 1); - - /* Now connect the dots */ - prevp = &header->ctl_table; - for (n = 0; n < npath; ++n, ++path) { - /* Copy the procname */ - new->procname = path->procname; - new->mode = 0555; - - *prevp = new; - prevp = &new->child; - - new += 2; - } - *prevp = table; - header->ctl_table_arg = table; - - INIT_LIST_HEAD(&header->ctl_entry); - header->used = 0; - header->unregistering = NULL; - header->root = root; - sysctl_set_parent(NULL, header->ctl_table); - header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - if (sysctl_check_table(namespaces, header->ctl_table)) { - kfree(header); - return NULL; - } -#endif - spin_lock(&sysctl_lock); - header->set = lookup_header_set(root, namespaces); - header->attached_by = header->ctl_table; - header->attached_to = root_table; - header->parent = &root_table_header; - for (set = header->set; set; set = set->parent) { - struct ctl_table_header *p; - list_for_each_entry(p, &set->list, ctl_entry) { - if (p->unregistering) - continue; - try_attach(p, header); - } - } - header->parent->count++; - list_add_tail(&header->ctl_entry, &header->set->list); - spin_unlock(&sysctl_lock); - - return header; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. 
- */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, - path, table); -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} - -/** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table - * - * Unregisters the sysctl table and all children. proc entries may not - * actually be removed until they are no longer used by anyone. - */ -void unregister_sysctl_table(struct ctl_table_header * header) -{ - might_sleep(); - - if (header == NULL) - return; - - spin_lock(&sysctl_lock); - start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree_rcu(header->parent, rcu); - } - if (!--header->count) - kfree_rcu(header, rcu); - spin_unlock(&sysctl_lock); -} - -int sysctl_is_seen(struct ctl_table_header *p) -{ - struct ctl_table_set *set = p->set; - int res; - spin_lock(&sysctl_lock); - if (p->unregistering) - res = 0; - else if (!set->is_seen) - res = 1; - else - res = set->is_seen(set); - spin_unlock(&sysctl_lock); - return res; -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ - INIT_LIST_HEAD(&p->list); - p->parent = parent ? 
parent : &sysctl_table_root.default_set; - p->is_seen = is_seen; -} - #endif /* CONFIG_SYSCTL */ /* @@ -2977,6 +2516,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); -EXPORT_SYMBOL(register_sysctl_table); -EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c deleted file mode 100644 index 362da653813d..000000000000 --- a/kernel/sysctl_check.c +++ /dev/null @@ -1,160 +0,0 @@ -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include - - -static int sysctl_depth(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth; - - depth = 0; - for (tmp = table; tmp->parent; tmp = tmp->parent) - depth++; - - return depth; -} - -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) -{ - int i; - - for (i = 0; table && i < n; i++) - table = table->parent; - - return table; -} - - -static void sysctl_print_path(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth, i; - depth = sysctl_depth(table); - if (table->procname) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk("/%s", tmp->procname?tmp->procname:""); - } - } - printk(" "); -} - -static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table_header *head; - struct ctl_table *ref, *test; - int depth, cur_depth; - - depth = sysctl_depth(table); - - for (head = __sysctl_head_next(namespaces, NULL); head; - head = __sysctl_head_next(namespaces, head)) { - cur_depth = depth; - ref = head->ctl_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->procname; ref++) { - int match = 0; - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - } - ref = NULL; -out: - sysctl_head_finish(head); - return ref; -} - -static void set_fail(const char **fail, struct ctl_table *table, const char *str) -{ - if (*fail) { - printk(KERN_ERR "sysctl table check failed: "); - sysctl_print_path(table); - printk(" %s\n", *fail); - dump_stack(); - } - *fail = str; -} - -static void sysctl_check_leaf(struct nsproxy *namespaces, - struct ctl_table *table, const char **fail) -{ - struct ctl_table *ref; - - ref = sysctl_check_lookup(namespaces, table); - if (ref && (ref != table)) - set_fail(fail, table, "Sysctl already exists"); -} - -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) -{ - int error = 0; - for (; table->procname; table++) { - const char *fail = NULL; - - if (table->parent) { - if (!table->parent->procname) - set_fail(&fail, table, "Parent without procname"); - } - if (table->child) { - if (table->data) - set_fail(&fail, table, "Directory with data?"); - if (table->maxlen) - set_fail(&fail, table, "Directory with maxlen?"); - if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) - set_fail(&fail, table, "Writable sysctl directory"); - if (table->proc_handler) - set_fail(&fail, table, "Directory with proc_handler"); - if (table->extra1) - set_fail(&fail, table, "Directory with extra1"); - if (table->extra2) - set_fail(&fail, table, "Directory with extra2"); - } else { - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - 
(table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - set_fail(&fail, table, "No data"); - if (!table->maxlen) - set_fail(&fail, table, "No maxlen"); - } -#ifdef CONFIG_PROC_SYSCTL - if (!table->proc_handler) - set_fail(&fail, table, "No proc_handler"); -#endif - sysctl_check_leaf(namespaces, table, &fail); - } - if (table->mode > 0777) - set_fail(&fail, table, "bogus .mode"); - if (fail) { - set_fail(&fail, table, NULL); - error = -EINVAL; - } - if (table->child) - error |= sysctl_check_table(namespaces, table->child); - } - return error; -} -- cgit v1.2.3-55-g7522 From cf579dfb82550e34de7ccf3ef090d8b834ccd3a9 Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Sun, 29 Jan 2012 20:38:29 +0100 Subject: PM / Sleep: Introduce "late suspend" and "early resume" of devices The current device suspend/resume phases during system-wide power transitions appear to be insufficient for some platforms that want to use the same callback routines for saving device states and related operations during runtime suspend/resume as well as during system suspend/resume. In principle, they could point their .suspend_noirq() and .resume_noirq() to the same callback routines as their .runtime_suspend() and .runtime_resume(), respectively, but at least some of them require device interrupts to be enabled while the code in those routines is running. It also makes sense to have device suspend-resume callbacks that will be executed with runtime PM disabled and with device interrupts enabled in case someone needs to run some special code in that context during system-wide power transitions. Apart from this, .suspend_noirq() and .resume_noirq() were introduced as a workaround for drivers using shared interrupts and failing to prevent their interrupt handlers from accessing suspended hardware. It appears to be better not to use them for other porposes, or we may have to deal with some serious confusion (which seems to be happening already). For the above reasons, introduce new device suspend/resume phases, "late suspend" and "early resume" (and analogously for hibernation) whose callback will be executed with runtime PM disabled and with device interrupts enabled and whose callback pointers generally may point to runtime suspend/resume routines. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Mark Brown Reviewed-by: Kevin Hilman --- Documentation/power/devices.txt | 93 +++++++++------ arch/x86/kernel/apm_32.c | 11 +- drivers/base/power/main.c | 247 ++++++++++++++++++++++++++++++++++++---- drivers/xen/manage.c | 6 +- include/linux/pm.h | 43 +++++-- include/linux/suspend.h | 4 + kernel/kexec.c | 8 +- kernel/power/hibernate.c | 24 ++-- kernel/power/main.c | 8 +- kernel/power/suspend.c | 4 +- 10 files changed, 357 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 20af7def23c8..872815cd41d3 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -96,6 +96,12 @@ struct dev_pm_ops { int (*thaw)(struct device *dev); int (*poweroff)(struct device *dev); int (*restore)(struct device *dev); + int (*suspend_late)(struct device *dev); + int (*resume_early)(struct device *dev); + int (*freeze_late)(struct device *dev); + int (*thaw_early)(struct device *dev); + int (*poweroff_late)(struct device *dev); + int (*restore_early)(struct device *dev); int (*suspend_noirq)(struct device *dev); int (*resume_noirq)(struct device *dev); int (*freeze_noirq)(struct device *dev); @@ -305,7 +311,7 @@ Entering System Suspend ----------------------- When the system goes into the standby or memory sleep state, the phases are: - prepare, suspend, suspend_noirq. + prepare, suspend, suspend_late, suspend_noirq. 1. The prepare phase is meant to prevent races by preventing new devices from being registered; the PM core would never know that all the @@ -324,7 +330,12 @@ When the system goes into the standby or memory sleep state, the phases are: appropriate low-power state, depending on the bus type the device is on, and they may enable wakeup events. - 3. The suspend_noirq phase occurs after IRQ handlers have been disabled, + 3. For a number of devices it is convenient to split suspend into the + "quiesce device" and "save device state" phases, in which cases + suspend_late is meant to do the latter. It is always executed after + runtime power management has been disabled for all devices. + + 4. The suspend_noirq phase occurs after IRQ handlers have been disabled, which means that the driver's interrupt handler will not be called while the callback method is running. The methods should save the values of the device's registers that weren't saved previously and finally put the @@ -359,7 +370,7 @@ Leaving System Suspend ---------------------- When resuming from standby or memory sleep, the phases are: - resume_noirq, resume, complete. + resume_noirq, resume_early, resume, complete. 1. The resume_noirq callback methods should perform any actions needed before the driver's interrupt handlers are invoked. This generally @@ -375,14 +386,18 @@ When resuming from standby or memory sleep, the phases are: device driver's ->pm.resume_noirq() method to perform device-specific actions. - 2. The resume methods should bring the the device back + 2. The resume_early methods should prepare devices for the execution of + the resume methods. This generally involves undoing the actions of the + preceding suspend_late phase. + + 3. The resume methods should bring the device back to its operating state, so that it can perform normal I/O. This generally involves undoing the actions of the suspend phase. - 3. The complete phase uses only a bus callback. The method should undo the - actions of the prepare phase.
Note, however, that new children may be - registered below the device as soon as the resume callbacks occur; it's - not necessary to wait until the complete phase. + 4. The complete phase should undo the actions of the prepare phase. Note, + however, that new children may be registered below the device as soon as + the resume callbacks occur; it's not necessary to wait until the + complete phase. At the end of these phases, drivers should be as functional as they were before suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are @@ -429,8 +444,8 @@ an image of the system memory while everything is stable, reactivate all devices (thaw), write the image to permanent storage, and finally shut down the system (poweroff). The phases used to accomplish this are: - prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete, - prepare, poweroff, poweroff_noirq + prepare, freeze, freeze_late, freeze_noirq, thaw_noirq, thaw_early, + thaw, complete, prepare, poweroff, poweroff_late, poweroff_noirq 1. The prepare phase is discussed in the "Entering System Suspend" section above. @@ -441,7 +456,11 @@ system (poweroff). The phases used to accomplish this are: save time it's best not to do so. Also, the device should not be prepared to generate wakeup events. - 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed + 3. The freeze_late phase is analogous to the suspend_late phase described + above, except that the device should not be put in a low-power state and + should not be allowed to generate wakeup events by it. + + 4. The freeze_noirq phase is analogous to the suspend_noirq phase discussed above, except again that the device should not be put in a low-power state and should not be allowed to generate wakeup events. @@ -449,15 +468,19 @@ At this point the system image is created. All devices should be inactive and the contents of memory should remain undisturbed while this happens, so that the image forms an atomic snapshot of the system state. - 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed + 5. The thaw_noirq phase is analogous to the resume_noirq phase discussed above. The main difference is that its methods can assume the device is in the same state as at the end of the freeze_noirq phase. - 5. The thaw phase is analogous to the resume phase discussed above. Its + 6. The thaw_early phase is analogous to the resume_early phase described + above. Its methods should undo the actions of the preceding + freeze_late, if necessary. + + 7. The thaw phase is analogous to the resume phase discussed above. Its methods should bring the device back to an operating state, so that it can be used for saving the image if necessary. - 6. The complete phase is discussed in the "Leaving System Suspend" section + 8. The complete phase is discussed in the "Leaving System Suspend" section above. At this point the system image is saved, and the devices then need to be @@ -465,16 +488,19 @@ prepared for the upcoming system shutdown. This is much like suspending them before putting the system into the standby or memory sleep state, and the phases are similar. - 7. The prepare phase is discussed above. + 9. The prepare phase is discussed above. + + 10. The poweroff phase is analogous to the suspend phase. - 8. The poweroff phase is analogous to the suspend phase. + 11. The poweroff_late phase is analogous to the suspend_late phase. - 9. The poweroff_noirq phase is analogous to the suspend_noirq phase. + 12. 
The poweroff_noirq phase is analogous to the suspend_noirq phase. -The poweroff and poweroff_noirq callbacks should do essentially the same things -as the suspend and suspend_noirq callbacks. The only notable difference is that -they need not store the device register values, because the registers should -already have been stored during the freeze or freeze_noirq phases. +The poweroff, poweroff_late and poweroff_noirq callbacks should do essentially +the same things as the suspend, suspend_late and suspend_noirq callbacks, +respectively. The only notable difference is that they need not store the +device register values, because the registers should already have been stored +during the freeze, freeze_late or freeze_noirq phases. Leaving Hibernation @@ -518,22 +544,25 @@ To achieve this, the image kernel must restore the devices' pre-hibernation functionality. The operation is much like waking up from the memory sleep state, although it involves different phases: - restore_noirq, restore, complete + restore_noirq, restore_early, restore, complete 1. The restore_noirq phase is analogous to the resume_noirq phase. - 2. The restore phase is analogous to the resume phase. + 2. The restore_early phase is analogous to the resume_early phase. + + 3. The restore phase is analogous to the resume phase. - 3. The complete phase is discussed above. + 4. The complete phase is discussed above. -The main difference from resume[_noirq] is that restore[_noirq] must assume the -device has been accessed and reconfigured by the boot loader or the boot kernel. -Consequently the state of the device may be different from the state remembered -from the freeze and freeze_noirq phases. The device may even need to be reset -and completely re-initialized. In many cases this difference doesn't matter, so -the resume[_noirq] and restore[_norq] method pointers can be set to the same -routines. Nevertheless, different callback pointers are used in case there is a -situation where it actually matters. +The main difference from resume[_early|_noirq] is that restore[_early|_noirq] +must assume the device has been accessed and reconfigured by the boot loader or +the boot kernel. Consequently the state of the device may be different from the +state remembered from the freeze, freeze_late and freeze_noirq phases. The +device may even need to be reset and completely re-initialized. In many cases +this difference doesn't matter, so the resume[_early|_noirq] and +restore[_early|_noirq] method pointers can be set to the same routines. +Nevertheless, different callback pointers are used in case there is a situation +where it actually does matter.
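To make the new phases concrete, here is a minimal sketch of how a driver might wire them up, reusing its runtime PM routines for the late/early system sleep phases as the changelog suggests. The foo_* names are hypothetical; only the dev_pm_ops fields come from this patch.

	#include <linux/pm.h>

	/*
	 * Hypothetical driver callbacks. During the new "late"/"early"
	 * phases runtime PM is disabled for the device but its interrupts
	 * are still enabled, so routines that rely on IRQs keep working.
	 */
	static int foo_runtime_suspend(struct device *dev)
	{
		/* save register state and gate clocks; may use interrupts */
		return 0;
	}

	static int foo_runtime_resume(struct device *dev)
	{
		/* ungate clocks and restore the state saved above */
		return 0;
	}

	static const struct dev_pm_ops foo_pm_ops = {
		/* the new phase pointers may target the runtime PM routines */
		.suspend_late	 = foo_runtime_suspend,
		.resume_early	 = foo_runtime_resume,
		.runtime_suspend = foo_runtime_suspend,
		.runtime_resume	 = foo_runtime_resume,
	};

As the main.c hunks below show, dpm_suspend_end() runs the late phase for every device and then the noirq phase, while dpm_resume_start() undoes them in reverse order: noirq first, then early.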
Device Power Management Domains diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f76623cbe263..5d56931a15b3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1234,8 +1234,7 @@ static int suspend(int vetoable) struct apm_user *as; dpm_suspend_start(PMSG_SUSPEND); - - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1259,9 +1258,9 @@ static int suspend(int vetoable) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); - + dpm_resume_start(PMSG_RESUME); dpm_resume_end(PMSG_RESUME); + queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1276,7 @@ static void standby(void) { int err; - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1291,7 +1290,7 @@ static void standby(void) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); } static apm_event_t get_event(void) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e2cc3d2e0ecc..b462c0e341cb 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -47,6 +47,7 @@ typedef int (*pm_callback_t)(struct device *); LIST_HEAD(dpm_list); LIST_HEAD(dpm_prepared_list); LIST_HEAD(dpm_suspended_list); +LIST_HEAD(dpm_late_early_list); LIST_HEAD(dpm_noirq_list); struct suspend_stats suspend_stats; @@ -245,6 +246,40 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) return NULL; } +/** + * pm_late_early_op - Return the PM operation appropriate for given PM event. + * @ops: PM operations to choose from. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, + pm_message_t state) +{ + switch (state.event) { +#ifdef CONFIG_SUSPEND + case PM_EVENT_SUSPEND: + return ops->suspend_late; + case PM_EVENT_RESUME: + return ops->resume_early; +#endif /* CONFIG_SUSPEND */ +#ifdef CONFIG_HIBERNATE_CALLBACKS + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze_late; + case PM_EVENT_HIBERNATE: + return ops->poweroff_late; + case PM_EVENT_THAW: + case PM_EVENT_RECOVER: + return ops->thaw_early; + case PM_EVENT_RESTORE: + return ops->restore_early; +#endif /* CONFIG_HIBERNATE_CALLBACKS */ + } + + return NULL; +} + /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. 
@@ -374,21 +409,21 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) TRACE_RESUME(0); if (dev->pm_domain) { - info = "EARLY power domain "; + info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - info = "EARLY type "; + info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - info = "EARLY class "; + info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "EARLY bus "; + info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (!callback && dev->driver && dev->driver->pm) { - info = "EARLY driver "; + info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } @@ -399,13 +434,13 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) } /** - * dpm_resume_noirq - Execute "early resume" callbacks for non-sysdev devices. + * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * - * Call the "noirq" resume handlers for all devices marked as DPM_OFF_IRQ and + * Call the "noirq" resume handlers for all devices in dpm_noirq_list and * enable device drivers to receive interrupts. */ -void dpm_resume_noirq(pm_message_t state) +static void dpm_resume_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); @@ -415,7 +450,7 @@ void dpm_resume_noirq(pm_message_t state) int error; get_device(dev); - list_move_tail(&dev->power.entry, &dpm_suspended_list); + list_move_tail(&dev->power.entry, &dpm_late_early_list); mutex_unlock(&dpm_list_mtx); error = device_resume_noirq(dev, state); @@ -423,6 +458,80 @@ void dpm_resume_noirq(pm_message_t state) suspend_stats.failed_resume_noirq++; dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, " noirq", error); + } + + mutex_lock(&dpm_list_mtx); + put_device(dev); + } + mutex_unlock(&dpm_list_mtx); + dpm_show_time(starttime, state, "noirq"); + resume_device_irqs(); +} + +/** + * device_resume_early - Execute an "early resume" callback for given device. + * @dev: Device to handle. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static int device_resume_early(struct device *dev, pm_message_t state) +{ + pm_callback_t callback = NULL; + char *info = NULL; + int error = 0; + + TRACE_DEVICE(dev); + TRACE_RESUME(0); + + if (dev->pm_domain) { + info = "early power domain "; + callback = pm_late_early_op(&dev->pm_domain->ops, state); + } else if (dev->type && dev->type->pm) { + info = "early type "; + callback = pm_late_early_op(dev->type->pm, state); + } else if (dev->class && dev->class->pm) { + info = "early class "; + callback = pm_late_early_op(dev->class->pm, state); + } else if (dev->bus && dev->bus->pm) { + info = "early bus "; + callback = pm_late_early_op(dev->bus->pm, state); + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "early driver "; + callback = pm_late_early_op(dev->driver->pm, state); + } + + error = dpm_run_callback(callback, dev, state, info); + + TRACE_RESUME(error); + return error; +} + +/** + * dpm_resume_early - Execute "early resume" callbacks for all devices. + * @state: PM transition of the system being carried out. 
+ */ +static void dpm_resume_early(pm_message_t state) +{ + ktime_t starttime = ktime_get(); + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_late_early_list)) { + struct device *dev = to_device(dpm_late_early_list.next); + int error; + + get_device(dev); + list_move_tail(&dev->power.entry, &dpm_suspended_list); + mutex_unlock(&dpm_list_mtx); + + error = device_resume_early(dev, state); + if (error) { + suspend_stats.failed_resume_early++; + dpm_save_failed_step(SUSPEND_RESUME_EARLY); + dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, " early", error); } @@ -431,9 +540,18 @@ void dpm_resume_noirq(pm_message_t state) } mutex_unlock(&dpm_list_mtx); dpm_show_time(starttime, state, "early"); - resume_device_irqs(); } -EXPORT_SYMBOL_GPL(dpm_resume_noirq); + +/** + * dpm_resume_start - Execute "noirq" and "early" device callbacks. + * @state: PM transition of the system being carried out. + */ +void dpm_resume_start(pm_message_t state) +{ + dpm_resume_noirq(state); + dpm_resume_early(state); +} +EXPORT_SYMBOL_GPL(dpm_resume_start); /** * device_resume - Execute "resume" callbacks for given device. @@ -716,21 +834,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) char *info = NULL; if (dev->pm_domain) { - info = "LATE power domain "; + info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - info = "LATE type "; + info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - info = "LATE class "; + info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "LATE bus "; + info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (!callback && dev->driver && dev->driver->pm) { - info = "LATE driver "; + info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } @@ -738,21 +856,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) } /** - * dpm_suspend_noirq - Execute "late suspend" callbacks for non-sysdev devices. + * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers from receiving interrupts and call the "noirq" suspend * handlers for all non-sysdev devices. */ -int dpm_suspend_noirq(pm_message_t state) +static int dpm_suspend_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; suspend_device_irqs(); mutex_lock(&dpm_list_mtx); - while (!list_empty(&dpm_suspended_list)) { - struct device *dev = to_device(dpm_suspended_list.prev); + while (!list_empty(&dpm_late_early_list)) { + struct device *dev = to_device(dpm_late_early_list.prev); get_device(dev); mutex_unlock(&dpm_list_mtx); @@ -761,7 +879,7 @@ int dpm_suspend_noirq(pm_message_t state) mutex_lock(&dpm_list_mtx); if (error) { - pm_dev_err(dev, state, " late", error); + pm_dev_err(dev, state, " noirq", error); suspend_stats.failed_suspend_noirq++; dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_save_failed_dev(dev_name(dev)); @@ -775,11 +893,96 @@ int dpm_suspend_noirq(pm_message_t state) mutex_unlock(&dpm_list_mtx); if (error) dpm_resume_noirq(resume_event(state)); + else + dpm_show_time(starttime, state, "noirq"); + return error; +} + +/** + * device_suspend_late - Execute a "late suspend" callback for given device. + * @dev: Device to handle. + * @state: PM transition of the system being carried out. 
+ * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static int device_suspend_late(struct device *dev, pm_message_t state) +{ + pm_callback_t callback = NULL; + char *info = NULL; + + if (dev->pm_domain) { + info = "late power domain "; + callback = pm_late_early_op(&dev->pm_domain->ops, state); + } else if (dev->type && dev->type->pm) { + info = "late type "; + callback = pm_late_early_op(dev->type->pm, state); + } else if (dev->class && dev->class->pm) { + info = "late class "; + callback = pm_late_early_op(dev->class->pm, state); + } else if (dev->bus && dev->bus->pm) { + info = "late bus "; + callback = pm_late_early_op(dev->bus->pm, state); + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "late driver "; + callback = pm_late_early_op(dev->driver->pm, state); + } + + return dpm_run_callback(callback, dev, state, info); +} + +/** + * dpm_suspend_late - Execute "late suspend" callbacks for all devices. + * @state: PM transition of the system being carried out. + */ +static int dpm_suspend_late(pm_message_t state) +{ + ktime_t starttime = ktime_get(); + int error = 0; + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_suspended_list)) { + struct device *dev = to_device(dpm_suspended_list.prev); + + get_device(dev); + mutex_unlock(&dpm_list_mtx); + + error = device_suspend_late(dev, state); + + mutex_lock(&dpm_list_mtx); + if (error) { + pm_dev_err(dev, state, " late", error); + suspend_stats.failed_suspend_late++; + dpm_save_failed_step(SUSPEND_SUSPEND_LATE); + dpm_save_failed_dev(dev_name(dev)); + put_device(dev); + break; + } + if (!list_empty(&dev->power.entry)) + list_move(&dev->power.entry, &dpm_late_early_list); + put_device(dev); + } + mutex_unlock(&dpm_list_mtx); + if (error) + dpm_resume_early(resume_event(state)); else dpm_show_time(starttime, state, "late"); + return error; } -EXPORT_SYMBOL_GPL(dpm_suspend_noirq); + +/** + * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. + * @state: PM transition of the system being carried out. + */ +int dpm_suspend_end(pm_message_t state) +{ + int error = dpm_suspend_late(state); + + return error ? : dpm_suspend_noirq(state); +} +EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index ce4fa0831860..9e14ae6cd49c 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -129,9 +129,9 @@ static void do_suspend(void) printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = dpm_suspend_noirq(PMSG_FREEZE); + err = dpm_suspend_end(PMSG_FREEZE); if (err) { - printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); + printk(KERN_ERR "dpm_suspend_end failed: %d\n", err); goto out_resume; } @@ -149,7 +149,7 @@ static void do_suspend(void) err = stop_machine(xen_suspend, &si, cpumask_of(0)); - dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); diff --git a/include/linux/pm.h b/include/linux/pm.h index e4982ac3fbbc..c68e1f22ac95 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -110,6 +110,10 @@ typedef struct pm_message { * Subsystem-level @suspend() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @suspend_late: Continue operations started by @suspend(). 
For a number of + * devices @suspend_late() may point to the same callback routine as the + * runtime suspend callback. + * * @resume: Executed after waking the system up from a sleep state in which the * contents of main memory were preserved. The exact action to perform * depends on the device's subsystem, but generally the driver is expected @@ -122,6 +126,10 @@ typedef struct pm_message { * Subsystem-level @resume() is executed for all devices after invoking * subsystem-level @resume_noirq() for all of them. * + * @resume_early: Prepare to execute @resume(). For a number of devices + * @resume_early() may point to the same callback routine as the runtime + * resume callback. + * * @freeze: Hibernation-specific, executed before creating a hibernation image. * Analogous to @suspend(), but it should not enable the device to signal * wakeup events or change its power state. The majority of subsystems @@ -131,6 +139,10 @@ typedef struct pm_message { * Subsystem-level @freeze() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @freeze_late: Continue operations started by @freeze(). Analogous to + * @suspend_late(), but it should not enable the device to signal wakeup + * events or change its power state. + * * @thaw: Hibernation-specific, executed after creating a hibernation image OR * if the creation of an image has failed. Also executed after a failing * attempt to restore the contents of main memory from such an image. @@ -140,15 +152,23 @@ typedef struct pm_message { * subsystem-level @thaw_noirq() for all of them. It also may be executed * directly after @freeze() in case of a transition error. * + * @thaw_early: Prepare to execute @thaw(). Undo the changes made by the + * preceding @freeze_late(). + * * @poweroff: Hibernation-specific, executed after saving a hibernation image. * Analogous to @suspend(), but it need not save the device's settings in * memory. * Subsystem-level @poweroff() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @poweroff_late: Continue operations started by @poweroff(). Analogous to + * @suspend_late(), but it need not save the device's settings in memory. + * * @restore: Hibernation-specific, executed after restoring the contents of main * memory from a hibernation image, analogous to @resume(). * + * @restore_early: Prepare to execute @restore(), analogous to @resume_early(). + * * @suspend_noirq: Complete the actions started by @suspend(). Carry out any * additional operations required for suspending the device that might be * racing with its driver's interrupt handler, which is guaranteed not to @@ -158,9 +178,10 @@ typedef struct pm_message { * @suspend_noirq() has returned successfully. If the device can generate * system wakeup signals and is enabled to wake up the system, it should be * configured to do so at that time. However, depending on the platform - * and device's subsystem, @suspend() may be allowed to put the device into - * the low-power state and configure it to generate wakeup signals, in - * which case it generally is not necessary to define @suspend_noirq(). + * and device's subsystem, @suspend() or @suspend_late() may be allowed to + * put the device into the low-power state and configure it to generate + * wakeup signals, in which case it generally is not necessary to define + * @suspend_noirq(). 
* * @resume_noirq: Prepare for the execution of @resume() by carrying out any * operations required for resuming the device that might be racing with @@ -171,9 +192,9 @@ typedef struct pm_message { * additional operations required for freezing the device that might be * racing with its driver's interrupt handler, which is guaranteed not to * run while @freeze_noirq() is being executed. - * The power state of the device should not be changed by either @freeze() - * or @freeze_noirq() and it should not be configured to signal system - * wakeup by any of these callbacks. + * The power state of the device should not be changed by either @freeze(), + * or @freeze_late(), or @freeze_noirq() and it should not be configured to + * signal system wakeup by any of these callbacks. * * @thaw_noirq: Prepare for the execution of @thaw() by carrying out any * operations required for thawing the device that might be racing with its @@ -249,6 +270,12 @@ struct dev_pm_ops { int (*thaw)(struct device *dev); int (*poweroff)(struct device *dev); int (*restore)(struct device *dev); + int (*suspend_late)(struct device *dev); + int (*resume_early)(struct device *dev); + int (*freeze_late)(struct device *dev); + int (*thaw_early)(struct device *dev); + int (*poweroff_late)(struct device *dev); + int (*restore_early)(struct device *dev); int (*suspend_noirq)(struct device *dev); int (*resume_noirq)(struct device *dev); int (*freeze_noirq)(struct device *dev); @@ -584,13 +611,13 @@ struct dev_pm_domain { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); -extern void dpm_resume_noirq(pm_message_t state); +extern void dpm_resume_start(pm_message_t state); extern void dpm_resume_end(pm_message_t state); extern void dpm_resume(pm_message_t state); extern void dpm_complete(pm_message_t state); extern void device_pm_unlock(void); -extern int dpm_suspend_noirq(pm_message_t state); +extern int dpm_suspend_end(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); extern int dpm_suspend(pm_message_t state); extern int dpm_prepare(pm_message_t state); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 91784a4f8608..ac1c114c499d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -42,8 +42,10 @@ enum suspend_stat_step { SUSPEND_FREEZE = 1, SUSPEND_PREPARE, SUSPEND_SUSPEND, + SUSPEND_SUSPEND_LATE, SUSPEND_SUSPEND_NOIRQ, SUSPEND_RESUME_NOIRQ, + SUSPEND_RESUME_EARLY, SUSPEND_RESUME }; @@ -53,8 +55,10 @@ struct suspend_stats { int failed_freeze; int failed_prepare; int failed_suspend; + int failed_suspend_late; int failed_suspend_noirq; int failed_resume; + int failed_resume_early; int failed_resume_noirq; #define REC_FAILED_NUM 2 int last_failed_dev; diff --git a/kernel/kexec.c b/kernel/kexec.c index 7b0886786701..a6a675cb9818 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1546,13 +1546,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_noirq(). We *must* call - * dpm_suspend_noirq() now. Otherwise, drivers for + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. 
*/ - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1579,7 +1579,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6d6d28870335..a5d4cf0aa03e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image - * and execute the drivers' .thaw_noirq() callbacks. + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore. */ @@ -254,7 +254,7 @@ static int create_image(int platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -306,7 +306,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - dpm_resume_noirq(in_suspend ? + dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -394,16 +394,16 @@ int hibernation_snapshot(int platform_mode) * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, restore the contents of - * highmem that have not been restored yet from the image and run the low-level - * code that will restore the remaining contents of memory and switch to the - * just restored target kernel. + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. 
*/ static int resume_target_kernel(bool platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_end(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -460,7 +460,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - dpm_resume_noirq(PMSG_RECOVER); + dpm_resume_start(PMSG_RECOVER); return error; } @@ -518,7 +518,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = dpm_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -549,7 +549,7 @@ int hibernation_platform_enter(void) Platform_finish: hibernation_ops->finish(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..8c5014a4e052 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..560a639614a1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -147,7 +147,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - error = dpm_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; @@ -189,7 +189,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_ops->wake) suspend_ops->wake(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); Platform_finish: if (suspend_ops->finish) -- cgit v1.2.3-55-g7522 From d031e1de2c5ba91e67ed83f6adf624543ab2b03d Mon Sep 17 00:00:00 2001 From: Alex Frid Date: Sun, 29 Jan 2012 20:39:25 +0100 Subject: PM / QoS: Simplify PM QoS expansion/merge - Replace class ID #define with enumeration - Loop through PM QoS objects during initialization (rather than initializing them one-by-one) Signed-off-by: Alex Frid Reviewed-by: Antti Miettinen Reviewed-by: Diwakar Tundlam Reviewed-by: Scott Williams Reviewed-by: Yu-Huan Hsu Acked-by: markgross Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm_qos.h | 14 +++++++++----- kernel/power/qos.c | 23 ++++++++++------------- 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index e5bbcbaa6f57..5ac91d8e69de 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -9,12 +9,16 @@ #include #include -#define PM_QOS_RESERVED 0 -#define PM_QOS_CPU_DMA_LATENCY 1 -#define PM_QOS_NETWORK_LATENCY 2 -#define PM_QOS_NETWORK_THROUGHPUT 3 +enum { + PM_QOS_RESERVED = 0, + PM_QOS_CPU_DMA_LATENCY, + PM_QOS_NETWORK_LATENCY, + PM_QOS_NETWORK_THROUGHPUT, + + /* insert new class ID */ + PM_QOS_NUM_CLASSES, +}; -#define PM_QOS_NUM_CLASSES 4 #define PM_QOS_DEFAULT_VALUE -1 #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 995e3bd3417b..d6d6dbd1ecc0 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, static int __init pm_qos_power_init(void) { int ret = 0; + int i; - ret = register_pm_qos_misc(&cpu_dma_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); - return ret; - } - ret = register_pm_qos_misc(&network_lat_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); - return ret; + BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + + for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { + ret = register_pm_qos_misc(pm_qos_array[i]); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: %s setup failed\n", + pm_qos_array[i]->name); + return ret; + } } - ret = register_pm_qos_misc(&network_throughput_pm_qos); - if (ret < 0) - printk(KERN_ERR - "pm_qos_param: network_throughput setup failed\n"); return ret; } -- cgit v1.2.3-55-g7522 From 61d1d219c4c0761059236a46867bc49943c4d29d Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Mon, 30 Jan 2012 12:51:56 -0800 Subject: cgroup: remove extra calls to find_existing_css_set In cgroup_attach_proc, we indirectly call find_existing_css_set 3 times. It is an expensive call, so we want to call it as few times as possible. This patch only calls it once and stores the result so that it can be used later on when we call cgroup_task_migrate. This required modifying cgroup_task_migrate to take the new css_set (which we obtained from find_css_set) as a parameter. The nice side effect of this is that cgroup_task_migrate is now identical for cgroup_attach_task and cgroup_attach_proc. It also now returns void, since it can never fail.
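In outline, the attach path after this change does one lookup per task followed by a migrate that cannot fail, roughly as in this sketch (distilled from the hunks below, error handling trimmed):

	/* sketch: look up/allocate the destination css_set up front */
	newcg = find_css_set(tsk->cgroups, cgrp);	/* may sleep, may return NULL */
	if (!newcg)
		return -ENOMEM;
	cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);	/* void: cannot fail */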
Changes in V5: * https://lkml.org/lkml/2012/1/20/344 (Tejun Heo) * Remove css_set_refs Changes in V4: * https://lkml.org/lkml/2011/12/22/421 (Li Zefan) * Avoid GFP_KERNEL (sleep) in rcu_read_lock by getting css_set in a separate loop not under an rcu_read_lock Changes in V3: * https://lkml.org/lkml/2011/12/22/13 (Li Zefan) * Fixed earlier bug by creating a separate patch to remove tasklist_lock Changes in V2: * https://lkml.org/lkml/2011/12/20/372 (Tejun Heo) * Move find_css_set call into loop which creates the flex array * Kill css_set_refs and use group_size instead * Fix an off-by-one error in counting css_set refs * Add a retval check in out_list_teardown Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 140 +++++++++++--------------------------------------------- 1 file changed, 27 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1626152dcc1e..43a224f167b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1763,6 +1763,7 @@ EXPORT_SYMBOL_GPL(cgroup_path); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; + struct css_set *cg; }; struct cgroup_taskset { @@ -1843,11 +1844,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * will already exist. If not set, this function might sleep, and can fail with * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. */ -static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, - struct task_struct *tsk, bool guarantee) +static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; - struct css_set *newcg; /* * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1857,23 +1857,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - /* locate or allocate a new css_set for this task. */ - if (guarantee) { - /* we know the css_set we want already exists. */ - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - read_lock(&css_set_lock); - newcg = find_existing_css_set(oldcg, cgrp, template); - BUG_ON(!newcg); - get_css_set(newcg); - read_unlock(&css_set_lock); - } else { - might_sleep(); - /* find_css_set will give us newcg already referenced.
*/ - newcg = find_css_set(oldcg, cgrp); - if (!newcg) - return -ENOMEM; - } - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1892,7 +1875,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, put_css_set(oldcg); set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - return 0; } /** @@ -1910,6 +1892,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; struct cgroup_taskset tset = { }; + struct css_set *newcg; /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) @@ -1939,9 +1922,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); - if (retval) + newcg = find_css_set(tsk->cgroups, cgrp); + if (!newcg) { + retval = -ENOMEM; goto out; + } + + cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); for_each_subsys(root, ss) { if (ss->attach) @@ -1997,66 +1984,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -/* - * cgroup_attach_proc works in two stages, the first of which prefetches all - * new css_sets needed (to make sure we have enough memory before committing - * to the move) and stores them in a list of entries of the following type. - * TODO: possible optimization: use css_set->rcu_head for chaining instead - */ -struct cg_list_entry { - struct css_set *cg; - struct list_head links; -}; - -static bool css_set_check_fetched(struct cgroup *cgrp, - struct task_struct *tsk, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - read_lock(&css_set_lock); - newcg = find_existing_css_set(cg, cgrp, template); - read_unlock(&css_set_lock); - - /* doesn't exist at all? */ - if (!newcg) - return false; - /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) - if (cg_entry->cg == newcg) - return true; - - /* not found */ - return false; -} - -/* - * Find the new css_set and store it in the list in preparation for moving the - * given task to the given cgroup. Returns 0 or -ENOMEM. 
- */ -static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - - /* ensure a new css_set will exist for this thread */ - newcg = find_css_set(cg, cgrp); - if (!newcg) - return -ENOMEM; - /* add it to the list */ - cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); - if (!cg_entry) { - put_css_set(newcg); - return -ENOMEM; - } - cg_entry->cg = newcg; - list_add(&cg_entry->links, newcg_list); - return 0; -} - /** * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup * @cgrp: the cgroup to attach to @@ -2070,20 +1997,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; /* guaranteed to be initialized later, but the compiler needs this */ - struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; - /* - * we need to make sure we have css_sets for all the tasks we're - * going to move -before- we actually start moving them, so that in - * case we get an ENOMEM we can bail out before making any changes. - */ - struct list_head newcg_list; - struct cg_list_entry *cg_entry, *temp_nobe; /* * step 0: in order to do expensive, possibly blocking operations for @@ -2119,15 +2038,15 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ ent.task = tsk; ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) continue; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; @@ -2160,17 +2079,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. */ - INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - oldcg = tc->task->cgroups; - - /* if we don't already have it in the list get a new one */ - if (!css_set_check_fetched(cgrp, tc->task, oldcg, - &newcg_list)) { - retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - if (retval) - goto out_list_teardown; + tc->cg = find_css_set(tc->task->cgroups, cgrp); + if (!tc->cg) { + retval = -ENOMEM; + goto out_put_css_set_refs; } } @@ -2181,8 +2095,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); - BUG_ON(retval); + cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. */ @@ -2200,15 +2113,16 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) synchronize_rcu(); cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; -out_list_teardown: - /* clean up the list of prefetched css_sets. 
*/ - list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { - list_del(&cg_entry->links); - put_css_set(cg_entry->cg); - kfree(cg_entry); +out_put_css_set_refs: + if (retval) { + for (i = 0; i < group_size; i++) { + tc = flex_array_get(group, i); + if (!tc->cg) + break; + put_css_set(tc->cg); + } } out_cancel_attach: - /* same deal as in cgroup_attach_task */ if (retval) { for_each_subsys(root, ss) { if (ss == failed_ss) -- cgit v1.2.3-55-g7522 From 761b3ef50e1c2649cffbfa67a4dcb2dcdb7982ed Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 31 Jan 2012 13:47:36 +0800 Subject: cgroup: remove cgroup_subsys argument from callbacks The argument is not used at all, and it's not necessary, because a specific callback handler of course knows which subsys it belongs to. Now only ->populate() takes this argument, because the handlers of this callback always call cgroup_add_file()/cgroup_add_files(). So we reduce a few lines of code, though the shrinking of object size is minimal. 16 files changed, 113 insertions(+), 162 deletions(-) text data bss dec hex filename 5486240 656987 7039960 13183187 c928d3 vmlinux.o.orig 5486170 656987 7039960 13183117 c9288d vmlinux.o Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- Documentation/cgroups/cgroups.txt | 26 +++++++++------------ block/blk-cgroup.c | 22 +++++++----------- include/linux/cgroup.h | 29 ++++++++++------------- include/net/sock.h | 7 +++--- include/net/tcp_memcontrol.h | 2 +- kernel/cgroup.c | 43 +++++++++++++++++------------------ kernel/cgroup_freezer.c | 11 ++++----- kernel/cpuset.c | 16 ++++--------- kernel/events/core.c | 13 ++++------- kernel/sched/core.c | 20 +++++++--------- mm/memcontrol.c | 48 ++++++++++++++++----------------------- net/core/netprio_cgroup.c | 10 ++++---- net/core/sock.c | 6 ++--- net/ipv4/tcp_memcontrol.c | 2 +- net/sched/cls_cgroup.c | 10 ++++---- security/device_cgroup.c | 10 ++++---- 16 files changed, 113 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index a7c96ae5557c..8e74980ab385 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -558,8 +558,7 @@ Each subsystem may export the following methods. The only mandatory methods are create/destroy. Any others that are null are presumed to be successful no-ops. -struct cgroup_subsys_state *create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +struct cgroup_subsys_state *create(struct cgroup *cgrp) (cgroup_mutex held by caller) Called to create a subsystem state object for a cgroup. The @@ -574,7 +573,7 @@ identified by the passed cgroup object having a NULL parent (since it's the root of the hierarchy) and may be an appropriate place for initialization code. -void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +void destroy(struct cgroup *cgrp) (cgroup_mutex held by caller) The cgroup system is about to destroy the passed cgroup; the subsystem @@ -585,7 +584,7 @@ cgroup->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +int pre_destroy(struct cgroup *cgrp); Called before checking the reference count on each subsystem. This may be useful for subsystems which have some extra references even if @@ -593,8 +592,7 @@ there are not tasks in the cgroup. If pre_destroy() returns error code, rmdir() will fail with it.
From this behavior, pre_destroy() can be called multiple times against a cgroup. -int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called prior to moving one or more tasks into a cgroup; if the @@ -615,8 +613,7 @@ fork. If this method returns 0 (success) then this should remain valid while the caller holds cgroup_mutex and it is ensured that either attach() or cancel_attach() will be called in future. -void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called when a task attach operation has failed after can_attach() has succeeded. @@ -625,23 +622,22 @@ function, so that the subsystem can implement a rollback. If not, not necessary. This will be called only about subsystems whose can_attach() operation have succeeded. The parameters are identical to can_attach(). -void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +void attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called after the task has been attached to the cgroup, to allow any post-attachment activity that requires memory allocations or blocking. The parameters are identical to can_attach(). -void fork(struct cgroup_subsy *ss, struct task_struct *task) +void fork(struct task_struct *task) Called when a task is forked into a cgroup. -void exit(struct cgroup_subsys *ss, struct task_struct *task) +void exit(struct task_struct *task) Called during task exit. -int populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +int populate(struct cgroup *cgrp) (cgroup_mutex held by caller) Called after creation of a cgroup to allow a subsystem to populate @@ -651,7 +647,7 @@ include/linux/cgroup.h for details). Note that although this method can return an error code, the error code is currently not always handled well. -void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) +void post_clone(struct cgroup *cgrp) (cgroup_mutex held by caller) Called during cgroup_create() to do any parameter @@ -659,7 +655,7 @@ initialization which might be required before a task could attach. For example in cpusets, no task may attach before 'cpus' and 'mems' are set up. 
-void bind(struct cgroup_subsys *ss, struct cgroup *root) +void bind(struct cgroup *root) (cgroup_mutex and ss->hierarchy_mutex held by caller) Called when a cgroup subsystem is rebound to a different hierarchy diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa8f26309444..1359d637831f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -28,13 +28,10 @@ static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); -static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, - struct cgroup *); -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup_taskset *); -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup_taskset *); -static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); +static struct cgroup_subsys_state *blkiocg_create(struct cgroup *); +static int blkiocg_can_attach(struct cgroup *, struct cgroup_taskset *); +static void blkiocg_attach(struct cgroup *, struct cgroup_taskset *); +static void blkiocg_destroy(struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); /* for encoding cft->private value on file */ @@ -1548,7 +1545,7 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) ARRAY_SIZE(blkio_files)); } -static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +static void blkiocg_destroy(struct cgroup *cgroup) { struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); unsigned long flags; @@ -1598,8 +1595,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) kfree(blkcg); } -static struct cgroup_subsys_state * -blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) { struct blkio_cgroup *blkcg; struct cgroup *parent = cgroup->parent; @@ -1628,8 +1624,7 @@ done: * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. 
*/ -static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; @@ -1648,8 +1643,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return ret; } -static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7da3e745b74c..501adb1b2f43 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -452,23 +452,18 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); */ struct cgroup_subsys { - struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, - struct cgroup *cgrp); - int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); - int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); - void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task); - int (*populate)(struct cgroup_subsys *ss, - struct cgroup *cgrp); - void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); + struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup *cgrp); + void (*destroy)(struct cgroup *cgrp); + int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*fork)(struct task_struct *task); + void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task); + int (*populate)(struct cgroup_subsys *ss, struct cgroup *cgrp); + void (*post_clone)(struct cgroup *cgrp); + void (*bind)(struct cgroup *root); int subsys_id; int active; diff --git a/include/net/sock.h b/include/net/sock.h index bb972d254dff..705d1add19a1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -68,7 +68,7 @@ struct cgroup; struct cgroup_subsys; #ifdef CONFIG_NET int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss); -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss); +void mem_cgroup_sockets_destroy(struct cgroup *cgrp); #else static inline int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) @@ -76,7 +76,7 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; } static inline -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +void mem_cgroup_sockets_destroy(struct cgroup *cgrp) { } #endif @@ -869,8 +869,7 @@ struct proto { */ int (*init_cgroup)(struct cgroup *cgrp, struct cgroup_subsys *ss); - void (*destroy_cgroup)(struct cgroup *cgrp, - struct cgroup_subsys *ss); + void (*destroy_cgroup)(struct cgroup *cgrp); struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg); #endif }; diff --git a/include/net/tcp_memcontrol.h 
b/include/net/tcp_memcontrol.h index 3512082fa909..48410ff25c9e 100644 --- a/include/net/tcp_memcontrol.h +++ b/include/net/tcp_memcontrol.h @@ -13,7 +13,7 @@ struct tcp_memcontrol { struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg); int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss); -void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss); +void tcp_destroy_cgroup(struct cgroup *cgrp); unsigned long long tcp_max_memory(const struct mem_cgroup *memcg); void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx); #endif /* _TCP_MEMCG_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 43a224f167b5..865d89a580c7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { - ret = ss->pre_destroy(ss, cgrp); + ret = ss->pre_destroy(cgrp); if (ret) break; } @@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(ss, cgrp); + ss->bind(cgrp); mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { @@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); if (ss->bind) - ss->bind(ss, dummytop); + ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; @@ -1908,7 +1908,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1932,7 +1932,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } synchronize_rcu(); @@ -1954,7 +1954,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } return retval; @@ -2067,7 +2067,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2104,7 +2104,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } /* @@ -2128,7 +2128,7 @@ out_cancel_attach: if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } out_free_group_list: @@ -3756,7 +3756,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); + struct cgroup_subsys_state *css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); @@ -3770,7 +3770,7 @@ static long cgroup_create(struct 
cgroup *parent, struct dentry *dentry, } /* At error, ->destroy() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); + ss->post_clone(cgrp); } cgroup_lock_hierarchy(root); @@ -3804,7 +3804,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); } mutex_unlock(&cgroup_mutex); @@ -4028,7 +4028,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(ss, dummytop); + css = ss->create(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4117,7 +4117,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * no ss->create seems to need anything important in the ss struct, so * this can happen first (i.e. before the rootnode attachment). */ - css = ss->create(ss, dummytop); + css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[i] = NULL; @@ -4135,7 +4135,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) int ret = cgroup_init_idr(ss, css); if (ret) { dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(ss, dummytop); + ss->destroy(dummytop); subsys[i] = NULL; mutex_unlock(&cgroup_mutex); return ret; @@ -4233,7 +4233,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * pointer to find their state. note that this also takes care of * freeing the css_id. */ - ss->destroy(ss, dummytop); + ss->destroy(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4509,7 +4509,7 @@ void cgroup_fork_callbacks(struct task_struct *child) for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) - ss->fork(ss, child); + ss->fork(child); } } } @@ -4611,7 +4611,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(ss, cgrp, old_cgrp, tsk); + ss->exit(cgrp, old_cgrp, tsk); } } } @@ -5066,8 +5066,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *debug_create(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5077,7 +5076,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, return css; } -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void debug_destroy(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fc0646b78a64..f86e93920b62 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys; * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ -static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 
return &freezer->css; } -static void freezer_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void freezer_destroy(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); @@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task) * a write to that file racing against an attach, and hence the * can_attach() result will remain valid until the attach completes. */ -static int freezer_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, +static int freezer_can_attach(struct cgroup *new_cgroup, struct cgroup_taskset *tset) { struct freezer *freezer; @@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..5d575836dba6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1399,8 +1399,7 @@ static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; @@ -1436,8 +1435,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct mm_struct *mm; struct task_struct *task; @@ -1833,8 +1831,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) * (and likewise for mems) to the new cgroup. Called with cgroup_mutex * held. */ -static void cpuset_post_clone(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void cpuset_post_clone(struct cgroup *cgroup) { struct cgroup *parent, *child; struct cpuset *cs, *parent_cs; @@ -1857,13 +1854,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, /* * cpuset_create - create a cpuset - * ss: cpuset cgroup subsystem * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create( - struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) { struct cpuset *cs; struct cpuset *parent; @@ -1902,7 +1896,7 @@ static struct cgroup_subsys_state *cpuset_create( * will call async_rebuild_sched_domains(). 
*/ -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void cpuset_destroy(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a00..a5d1ee92b0d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6906,8 +6906,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) { struct perf_cgroup *jc; @@ -6924,8 +6923,7 @@ static struct cgroup_subsys_state *perf_cgroup_create( return &jc->css; } -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void perf_cgroup_destroy(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -6941,8 +6939,7 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -6950,8 +6947,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df00cb09263e..ff12f7216062 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7530,8 +7530,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7548,15 +7547,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &tg->css; } -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpu_cgroup_destroy(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); sched_destroy_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7574,7 +7572,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7584,8 +7582,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. 
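[editor's note: to make the new callback shape concrete, here is a minimal, hypothetical subsystem skeleton written against the post-patch API. It mirrors the debug_create()/debug_destroy() pair converted earlier in this patch; the example_* names (and example_subsys_id) are invented for illustration:]

	static struct cgroup_subsys_state *example_create(struct cgroup *cgrp)
	{
		/* allocate this subsystem's per-cgroup state; no ss argument needed */
		struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

		if (!css)
			return ERR_PTR(-ENOMEM);
		return css;
	}

	static void example_destroy(struct cgroup *cgrp)
	{
		/* example_subsys_id is hypothetical, by analogy with debug_subsys_id */
		kfree(cgrp->subsys[example_subsys_id]);
	}

	struct cgroup_subsys example_subsys = {
		.name = "example",
		.create = example_create,
		.destroy = example_destroy,
		/* registration details (subsys_id etc.) omitted from this sketch */
	};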
@@ -7935,8 +7933,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { */ /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { struct cpuacct *ca; @@ -7966,8 +7963,7 @@ out: } /* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpuacct_destroy(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dbff4dcde35..ae2f0a8ab761 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4580,10 +4580,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(cont, ss); }; -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { - mem_cgroup_sockets_destroy(cont, ss); + mem_cgroup_sockets_destroy(cont); } #else static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) @@ -4591,8 +4590,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { } #endif @@ -4884,7 +4882,7 @@ err_cleanup: } static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +mem_cgroup_create(struct cgroup *cont) { struct mem_cgroup *memcg, *parent; long error = -ENOMEM; @@ -4946,20 +4944,18 @@ free_out: return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static int mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); return mem_cgroup_force_empty(memcg, false); } -static void mem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void mem_cgroup_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(ss, cont); + kmem_cgroup_destroy(cont); mem_cgroup_put(memcg); } @@ -5296,9 +5292,8 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; @@ -5336,9 +5331,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); } @@ -5453,9 +5447,8 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); struct mm_struct *mm = get_task_mm(p); @@ -5470,20 +5463,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, 
+ struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { } #endif diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3a9fd4826b75..22036ab732cf 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -23,9 +23,8 @@ #include #include -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup *cgrp); static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); struct cgroup_subsys net_prio_subsys = { @@ -120,8 +119,7 @@ static void update_netdev_tables(void) rtnl_unlock(); } -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; int ret; @@ -145,7 +143,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, return &cs->css; } -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cgrp_destroy(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; struct net_device *dev; diff --git a/net/core/sock.c b/net/core/sock.c index 5c5af9988f94..688037cb3b6e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -160,19 +160,19 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) out: list_for_each_entry_continue_reverse(proto, &proto_list, node) if (proto->destroy_cgroup) - proto->destroy_cgroup(cgrp, ss); + proto->destroy_cgroup(cgrp); mutex_unlock(&proto_list_mutex); return ret; } -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +void mem_cgroup_sockets_destroy(struct cgroup *cgrp) { struct proto *proto; mutex_lock(&proto_list_mutex); list_for_each_entry_reverse(proto, &proto_list, node) if (proto->destroy_cgroup) - proto->destroy_cgroup(cgrp, ss); + proto->destroy_cgroup(cgrp); mutex_unlock(&proto_list_mutex); } #endif diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9dc..e714c6834c90 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -94,7 +94,7 @@ create_files: } EXPORT_SYMBOL(tcp_init_cgroup); -void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) +void tcp_destroy_cgroup(struct cgroup *cgrp) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct cg_proto *cg_proto; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index f84fdc3a7f27..1afaa284fcd7 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -22,9 +22,8 @@ #include #include -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup *cgrp); static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); struct cgroup_subsys net_cls_subsys = { @@ -51,8 +50,7 @@ static inline struct 
cgroup_cls_state *task_cls_state(struct task_struct *p) struct cgroup_cls_state, css); } -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_cls_state *cs; @@ -66,7 +64,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, return &cs->css; } -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cgrp_destroy(struct cgroup *cgrp) { kfree(cgrp_cls_state(cgrp)); } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 8b5b5d8612c6..c43a3323feea 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -61,8 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; -static int devcgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgrp, struct cgroup_taskset *set) +static int devcgroup_can_attach(struct cgroup *new_cgrp, + struct cgroup_taskset *set) { struct task_struct *task = cgroup_taskset_first(set); @@ -156,8 +156,7 @@ remove: /* * called from kernel/cgroup.c with cgroup_lock() held. */ -static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup) { struct dev_cgroup *dev_cgroup, *parent_dev_cgroup; struct cgroup *parent_cgroup; @@ -195,8 +194,7 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss, return &dev_cgroup->css; } -static void devcgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void devcgroup_destroy(struct cgroup *cgroup) { struct dev_cgroup *dev_cgroup; struct dev_whitelist_item *wh, *tmp; -- cgit v1.2.3-55-g7522 From a80b83b7b8456e9b475346c2e01d7e210883208c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 3 Feb 2012 00:19:07 -0800 Subject: Input: add infrastructure for selecting clockid for event time stamps As noted by Arve and others, since wall time can jump backwards, it is difficult to use for input because one cannot determine if one event occurred before another or for how long a key was pressed. However, the timestamp field is part of the kernel ABI, and cannot be changed without possibly breaking existing users. This patch adds a new IOCTL that allows a clockid to be set in the evdev_client struct that will specify which time base to use for event timestamps (ie: CLOCK_MONOTONIC instead of CLOCK_REALTIME). For now we only support CLOCK_MONOTONIC and CLOCK_REALTIME, but in the future we could support other clockids if appropriate. The default remains CLOCK_REALTIME, so we don't change the ABI. 
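[editor's note: with the final patch applied, userspace opts into monotonic timestamps roughly as follows. This is a sketch against the ioctl added below; the event-node path is illustrative and error handling is minimal:]

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <time.h>
	#include <linux/input.h>	/* provides EVIOCSCLOCKID after this patch */

	int main(void)
	{
		int clk = CLOCK_MONOTONIC;
		int fd = open("/dev/input/event0", O_RDONLY);	/* path is illustrative */

		if (fd < 0 || ioctl(fd, EVIOCSCLOCKID, &clk) < 0)
			return 1;
		/* struct input_event timestamps read from fd now use CLOCK_MONOTONIC */
		return 0;
	}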
Signed-off-by: John Stultz Reviewed-by: Daniel Kurtz Signed-off-by: Dmitry Torokhov --- drivers/input/evdev.c | 25 +++++++++++++++++++++---- include/linux/input.h | 2 ++ kernel/time/timekeeping.c | 2 ++ 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 76457d50bc34..c17409742999 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; + int clkid; unsigned int bufsize; struct input_event buffer[]; }; @@ -54,8 +55,12 @@ static struct evdev *evdev_table[EVDEV_MINORS]; static DEFINE_MUTEX(evdev_table_mutex); static void evdev_pass_event(struct evdev_client *client, - struct input_event *event) + struct input_event *event, + ktime_t mono, ktime_t real) { + event->time = ktime_to_timeval(client->clkid == CLOCK_MONOTONIC ? + mono : real); + /* Interrupts are disabled, just acquire the lock. */ spin_lock(&client->buffer_lock); @@ -94,8 +99,11 @@ static void evdev_event(struct input_handle *handle, struct evdev *evdev = handle->private; struct evdev_client *client; struct input_event event; + ktime_t time_mono, time_real; + + time_mono = ktime_get(); + time_real = ktime_sub(time_mono, ktime_get_monotonic_offset()); - do_gettimeofday(&event.time); event.type = type; event.code = code; event.value = value; @@ -103,11 +111,12 @@ static void evdev_event(struct input_handle *handle, rcu_read_lock(); client = rcu_dereference(evdev->grab); + if (client) - evdev_pass_event(client, &event); + evdev_pass_event(client, &event, time_mono, time_real); else list_for_each_entry_rcu(client, &evdev->client_list, node) - evdev_pass_event(client, &event); + evdev_pass_event(client, &event, time_mono, time_real); rcu_read_unlock(); @@ -685,6 +694,14 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, else return evdev_ungrab(evdev, client); + case EVIOCSCLOCKID: + if (copy_from_user(&i, p, sizeof(unsigned int))) + return -EFAULT; + if (i != CLOCK_MONOTONIC && i != CLOCK_REALTIME) + return -EINVAL; + client->clkid = i; + return 0; + case EVIOCGKEYCODE: return evdev_handle_get_keycode(dev, p); diff --git a/include/linux/input.h b/include/linux/input.h index 3862e32c4eeb..177261ea6f52 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -129,6 +129,8 @@ struct input_keymap_entry { #define EVIOCGRAB _IOW('E', 0x90, int) /* Grab/Release device */ +#define EVIOCSCLOCKID _IOW('E', 0xa0, int) /* Set clockid to be used for timestamps */ + /* * Device properties and quirks */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e8507..169479994755 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1140,6 +1140,8 @@ ktime_t ktime_get_monotonic_offset(void) } while (read_seqretry(&xtime_lock, seq)); return timespec_to_ktime(wtom); } +EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); + /** * xtime_update() - advances the timekeeping infrastructure -- cgit v1.2.3-55-g7522 From 241057486646dd42278538218376c79aae2c359f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 3 Feb 2012 21:42:39 +0800 Subject: kernel/resource.c: move EXPORT_SYMBOL right after definition EXPORT_SYMBOL(adjust_resource) should be right after adjust_resource(). 
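[editor's note: the convention being restored is simply that the export marker sits directly under the function it exports, e.g. (parameter names assumed from the upstream definition, which this capture truncates):]

	int adjust_resource(struct resource *res, resource_size_t start,
			    resource_size_t size)
	{
		/* ... */
	}
	EXPORT_SYMBOL(adjust_resource);	/* immediately after the definition */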
Signed-off-by: WANG Cong Signed-off-by: Jiri Kosina --- kernel/resource.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 7640b3a947d0..7e8ea66a8c01 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t write_unlock(&resource_lock); return result; } +EXPORT_SYMBOL(adjust_resource); static void __init __reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, @@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root, write_unlock(&resource_lock); } -EXPORT_SYMBOL(adjust_resource); - /** * resource_alignment - calculate resource's alignment * @res: resource pointer -- cgit v1.2.3-55-g7522 From 1a2a4d06e1e95260c470ebe3a945f61bbe8c1fd8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 Dec 2011 12:17:03 -0800 Subject: security: create task_free security callback The current LSM interface to cred_free is not sufficient for allowing an LSM to track the life and death of a task. This patch adds the task_free hook so that an LSM can clean up resources on task death. Signed-off-by: Kees Cook Signed-off-by: James Morris --- include/linux/security.h | 9 +++++++++ kernel/fork.c | 1 + security/capability.c | 5 +++++ security/security.c | 5 +++++ 4 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 83c18e8c846d..8325eddd9ee4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -651,6 +651,10 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. + * @task_free: + * @task task being freed + * Handle release of task-related resources. (Note that this can be called + * from interrupt context.) * @cred_alloc_blank: * @cred points to the credentials. * @gfp indicates the atomicity of any memory allocations. 
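[editor's note: as the hook documentation above warns, task_free may be called from interrupt context, so an implementation must not sleep. A hypothetical LSM might wire the hook up as below; the example_* names are invented and only sketch the idea:]

	static void example_task_free(struct task_struct *task)
	{
		/* interrupt context is possible: no sleeping locks or allocations */
		example_drop_task_state(task);	/* hypothetical non-blocking cleanup */
	}

	static struct security_operations example_ops = {
		.name = "example",
		.task_free = example_task_free,
		/* unset hooks are filled in by security_fixup_ops(), see below */
	};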
@@ -1493,6 +1497,7 @@ struct security_operations { int (*dentry_open) (struct file *file, const struct cred *cred); int (*task_create) (unsigned long clone_flags); + void (*task_free) (struct task_struct *task); int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp); void (*cred_free) (struct cred *cred); int (*cred_prepare)(struct cred *new, const struct cred *old, @@ -1752,6 +1757,7 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); +void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); @@ -2245,6 +2251,9 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } +static inline void security_task_free(struct task_struct *task) +{ } + static inline int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 1b2ef3c23ae4..f0e7781ba9b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -192,6 +192,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); diff --git a/security/capability.c b/security/capability.c index 2f680eb02b59..5bb21b1c448c 100644 --- a/security/capability.c +++ b/security/capability.c @@ -358,6 +358,10 @@ static int cap_task_create(unsigned long clone_flags) return 0; } +static void cap_task_free(struct task_struct *task) +{ +} + static int cap_cred_alloc_blank(struct cred *cred, gfp_t gfp) { return 0; @@ -954,6 +958,7 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_receive); set_to_cap_if_null(ops, dentry_open); set_to_cap_if_null(ops, task_create); + set_to_cap_if_null(ops, task_free); set_to_cap_if_null(ops, cred_alloc_blank); set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, cred_prepare); diff --git a/security/security.c b/security/security.c index d7542493454d..7d9426bb7442 100644 --- a/security/security.c +++ b/security/security.c @@ -729,6 +729,11 @@ int security_task_create(unsigned long clone_flags) return security_ops->task_create(clone_flags); } +void security_task_free(struct task_struct *task) +{ + security_ops->task_free(task); +} + int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { return security_ops->cred_alloc_blank(cred, gfp); -- cgit v1.2.3-55-g7522 From 8916e3702ec422b57cc549fbae3986106292100f Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Sat, 4 Feb 2012 22:26:13 +0100 Subject: PM / Suspend: Avoid code duplication in suspend statistics update The code if (error) { suspend_stats.fail++; dpm_save_failed_errno(error); } else suspend_stats.success++; appears in both kernel/power/main.c and kernel/power/suspend.c. This patch just creates a new function to avoid the duplicated code. Suggested-by: Srivatsa S. Bhat Signed-off-by: Marcos Paulo de Souza Acked-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki --- include/linux/suspend.h | 16 ++++++++++++++++ kernel/power/main.c | 6 +----- kernel/power/suspend.c | 6 +----- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index ac1c114c499d..b90191894441 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -94,6 +94,22 @@ static inline void dpm_save_failed_step(enum suspend_stat_step step) suspend_stats.last_failed_step %= REC_FAILED_NUM; } +/** + * suspend_stats_update - Update success/failure statistics of suspend-to-ram + * + * @error: Value returned by enter_state() function + */ +static inline void suspend_stats_update(int error) +{ + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; + } +} + + /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. diff --git a/kernel/power/main.c b/kernel/power/main.c index 8c5014a4e052..b1e324878d5f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,11 +296,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } if (state < PM_SUSPEND_MAX && *s) { error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else - suspend_stats.success++; + suspend_stats_update(error); } #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 560a639614a1..03bc92b42750 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -321,11 +321,7 @@ int pm_suspend(suspend_state_t state) int ret; if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { ret = enter_state(state); - if (ret) { - suspend_stats.fail++; - dpm_save_failed_errno(ret); - } else - suspend_stats.success++; + suspend_stats_update(ret); return ret; } return -EINVAL; -- cgit v1.2.3-55-g7522 From 51d6ff7acd920379f54d0be4dbe844a46178a65f Mon Sep 17 00:00:00 2001 From: Srivatsa S. Bhat Date: Sat, 4 Feb 2012 22:26:38 +0100 Subject: PM / Hibernate: Thaw kernel threads in hibernation_snapshot() in error/test path In the hibernation call path, the kernel threads are frozen inside hibernation_snapshot(). If we happen to encounter an error further down the road or if we are exiting early due to a successful freezer test, then thaw kernel threads before returning to the caller. Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 ++++-- kernel/power/user.c | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5d4cf0aa03e..c6dee739080c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode) * successful freezer test. 
*/ freezer_test_done = true; - goto Cleanup; + goto Thaw; } error = dpm_prepare(PMSG_FREEZE); if (error) { dpm_complete(PMSG_RECOVER); - goto Cleanup; + goto Thaw; } suspend_console(); @@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; + Thaw: + thaw_kernel_threads(); Cleanup: swsusp_free(); goto Close; diff --git a/kernel/power/user.c b/kernel/power/user.c index 3e100075b13c..7bee91f9af51 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,16 +249,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (error) { - thaw_kernel_threads(); - } else { + if (!error) { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; - if (freezer_test_done) { + if (freezer_test_done) freezer_test_done = false; - thaw_kernel_threads(); - } } break; -- cgit v1.2.3-55-g7522 From a556d5b58345ccf51826b9ceac078072f830738b Mon Sep 17 00:00:00 2001 From: Srivatsa S. Bhat Date: Sat, 4 Feb 2012 23:39:56 +0100 Subject: PM / Hibernate: Refactor and simplify freezer_test_done The code related to 'freezer_test_done' is needlessly convoluted. Refactor the code and simplify the implementation. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 10 +++++----- kernel/power/user.c | 6 ++---- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c6dee739080c..72baaf011fb7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -629,12 +629,8 @@ int hibernate(void) goto Finish; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (error) - goto Thaw; - if (freezer_test_done) { - freezer_test_done = false; + if (error || freezer_test_done) goto Thaw; - } if (in_suspend) { unsigned int flags = 0; @@ -659,6 +655,10 @@ int hibernate(void) Thaw: thaw_processes(); + + /* Don't bother checking whether freezer_test_done is true */ + freezer_test_done = false; + Finish: free_basic_memory_bitmaps(); usermodehelper_enable(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 7bee91f9af51..33c4329205af 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -251,10 +251,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = hibernation_snapshot(data->platform_support); if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error && !freezer_test_done) - data->ready = 1; - if (freezer_test_done) - freezer_test_done = false; + data->ready = !freezer_test_done && !error; + freezer_test_done = false; } break; -- cgit v1.2.3-55-g7522 From a9b542ee607a8afafa9447292394959fc84ea650 Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Mon, 13 Feb 2012 16:23:42 +0100 Subject: PM / QoS: unconditionally build the feature The PM QoS feature originally didn't depend on CONFIG_PM, which was mistakenly changed by commit e8db0be1245de16a6cc6365506abc392c3c212d4 ("PM QoS: Move and rename the implementation files"). Later, commit d020283dc694c9ec31b410f522252f7a8397e67d ("PM / QoS: CPU C-state breakage with PM Qos change") partially fixed that by introducing a static inline definition of pm_qos_request(), but that still didn't allow user space to use the PM QoS interface if CONFIG_PM was unset (which had been possible before). For this reason, remove the dependency of PM QoS on CONFIG_PM to make it work (as intended) with CONFIG_PM unset. 
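[editor's note: the user-space interface that this change keeps working with CONFIG_PM unset is the PM QoS misc-device one (e.g. PM_QOS_CPU_DMA_LATENCY, visible in the hunk below). A minimal sketch of a latency request, which holds only while the file descriptor stays open:]

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int latency_us = 0;	/* illustrative target; written as a binary s32 */
		int fd = open("/dev/cpu_dma_latency", O_WRONLY);

		if (fd < 0 || write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us))
			return 1;
		pause();	/* the request is dropped when fd is closed */
		return 0;
	}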
[rjw: Replaced the original changelog with a new one.] Signed-off-by: Jean Pihet Reported-by: Venkatesh Pallipadi Signed-off-by: Rafael J. Wysocki --- include/linux/pm_qos.h | 41 +---------------------------------------- kernel/power/Makefile | 3 ++- 2 files changed, 3 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 67c521731f41..c8a541e13380 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -67,7 +67,6 @@ static inline int dev_pm_qos_request_active(struct dev_pm_qos_request *req) return req->dev != 0; } -#ifdef CONFIG_PM int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value); void pm_qos_add_request(struct pm_qos_request *req, int pm_qos_class, @@ -82,6 +81,7 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier); int pm_qos_request_active(struct pm_qos_request *req); s32 pm_qos_read_value(struct pm_qos_constraints *c); +#ifdef CONFIG_PM s32 __dev_pm_qos_read_value(struct device *dev); s32 dev_pm_qos_read_value(struct device *dev); int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, @@ -99,45 +99,6 @@ void dev_pm_qos_constraints_destroy(struct device *dev); int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, s32 value); #else -static inline int pm_qos_update_target(struct pm_qos_constraints *c, - struct plist_node *node, - enum pm_qos_req_action action, - int value) - { return 0; } -static inline void pm_qos_add_request(struct pm_qos_request *req, - int pm_qos_class, s32 value) - { return; } -static inline void pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) - { return; } -static inline void pm_qos_remove_request(struct pm_qos_request *req) - { return; } - -static inline int pm_qos_request(int pm_qos_class) -{ - switch (pm_qos_class) { - case PM_QOS_CPU_DMA_LATENCY: - return PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE; - case PM_QOS_NETWORK_LATENCY: - return PM_QOS_NETWORK_LAT_DEFAULT_VALUE; - case PM_QOS_NETWORK_THROUGHPUT: - return PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE; - default: - return PM_QOS_DEFAULT_VALUE; - } -} - -static inline int pm_qos_add_notifier(int pm_qos_class, - struct notifier_block *notifier) - { return 0; } -static inline int pm_qos_remove_notifier(int pm_qos_class, - struct notifier_block *notifier) - { return 0; } -static inline int pm_qos_request_active(struct pm_qos_request *req) - { return 0; } -static inline s32 pm_qos_read_value(struct pm_qos_constraints *c) - { return 0; } - static inline s32 __dev_pm_qos_read_value(struct device *dev) { return 0; } static inline s32 dev_pm_qos_read_value(struct device *dev) diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 07e0e28ffba7..66d808ec5252 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-$(CONFIG_PM) += main.o qos.o +obj-y += qos.o +obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o -- cgit v1.2.3-55-g7522 From 6c83b4818dd65eb17e633b6b629a81da7bed90b3 Mon Sep 17 00:00:00 2001 From: Rafael J. 
Wysocki Date: Sat, 11 Feb 2012 00:00:34 +0100 Subject: PM / Sleep: Do not check wakeup too often in try_to_freeze_tasks() Use the observation that it is more efficient to check the wakeup variable once before the loop reporting tasks that were not frozen in try_to_freeze_tasks() than to do that in every step of that loop. Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 7e426459e60a..6aeb5efe00eb 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -98,13 +98,15 @@ static int try_to_freeze_tasks(bool user_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!wakeup && !freezer_should_skip(p) && - p != current && freezing(p) && !frozen(p)) - sched_show_task(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (!wakeup) { + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p != current && !freezer_should_skip(p) + && freezing(p) && !frozen(p)) + sched_show_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } } else { printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, elapsed_csecs % 100); -- cgit v1.2.3-55-g7522 From 6f585f750d792652f33b6e85b1ee205be4b5e572 Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Sat, 11 Feb 2012 22:40:23 +0100 Subject: PM / Sleep: Remove unnecessary label from suspend_freeze_processes() The Finish label in suspend_freeze_processes() is in fact unnecessary and makes the function look more complicated than it really is, so remove that label (along with a few empty lines). Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- kernel/power/power.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 21724eee5206..398d42b48e9e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -234,16 +234,14 @@ static inline int suspend_freeze_processes(void) int error; error = freeze_processes(); - /* * freeze_processes() automatically thaws every task if freezing * fails. So we need not do anything extra upon error. */ if (error) - goto Finish; + return error; error = freeze_kernel_threads(); - /* * freeze_kernel_threads() thaws only kernel threads upon freezing * failure. So we have to thaw the userspace tasks ourselves. @@ -251,7 +249,6 @@ static inline int suspend_freeze_processes(void) if (error) thaw_processes(); - Finish: return error; } -- cgit v1.2.3-55-g7522 From 191c542442fdf53cc3c496c00be13367fd9cd42d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Feb 2012 03:58:52 +0000 Subject: mm: collapse security_vm_enough_memory() variants into a single function Collapse security_vm_enough_memory() variants into a single function. 
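[editor's note: the invariant at each converted call site is unchanged; only the mm is now explicit. Schematically (a fragment, not a complete function): a zero return from security_vm_enough_memory_mm() commits a charge, and any later failure path must give it back with vm_unacct_memory(), as the mmap.c and swapfile.c hunks below show:]

	charged = len >> PAGE_SHIFT;
	if (security_vm_enough_memory_mm(mm, charged))	/* non-zero: refused */
		return -ENOMEM;
	/* ... later work that can fail ... */
	if (err) {
		vm_unacct_memory(charged);	/* balance the committed charge */
		return err;
	}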
Signed-off-by: Al Viro Signed-off-by: James Morris --- include/linux/security.h | 16 ---------------- kernel/fork.c | 2 +- mm/mmap.c | 4 ++-- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/shmem.c | 4 ++-- mm/swapfile.c | 4 +++- security/security.c | 14 -------------- 8 files changed, 10 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 8325eddd9ee4..2fefad6d27a0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1679,9 +1679,7 @@ int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); int security_settime(const struct timespec *ts, const struct timezone *tz); -int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); -int security_vm_enough_memory_kern(long pages); int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); @@ -1902,25 +1900,11 @@ static inline int security_settime(const struct timespec *ts, return cap_settime(ts, tz); } -static inline int security_vm_enough_memory(long pages) -{ - WARN_ON(current->mm == NULL); - return cap_vm_enough_memory(current->mm, pages); -} - static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - WARN_ON(mm == NULL); return cap_vm_enough_memory(mm, pages); } -static inline int security_vm_enough_memory_kern(long pages) -{ - /* If current->mm is a kernel thread then we will pass NULL, - for this specific case that is fine */ - return cap_vm_enough_memory(current->mm, pages); -} - static inline int security_bprm_set_creds(struct linux_binprm *bprm) { return cap_bprm_set_creds(bprm); diff --git a/kernel/fork.c b/kernel/fork.c index f0e7781ba9b4..d5ebddf317a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -355,7 +355,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - if (security_vm_enough_memory(len)) + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } diff --git a/mm/mmap.c b/mm/mmap.c index 3f758c7f4c81..db05495d6d0a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1235,7 +1235,7 @@ munmap_back: */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT; } @@ -2169,7 +2169,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (mm->map_count > sysctl_max_map_count) return -ENOMEM; - if (security_vm_enough_memory(len >> PAGE_SHIFT)) + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* Can we just expand an old private anonymous mapping? 
*/ diff --git a/mm/mprotect.c b/mm/mprotect.c index 5a688a2756be..9599fa2d0e92 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -168,7 +168,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; newflags |= VM_ACCOUNT; } diff --git a/mm/mremap.c b/mm/mremap.c index 87bb8393e7d2..db8d983b5a7d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -329,7 +329,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & VM_ACCOUNT) { unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) goto Efault; *p = charged; } diff --git a/mm/shmem.c b/mm/shmem.c index 269d049294ab..d9c293952755 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -127,7 +127,7 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) static inline int shmem_acct_size(unsigned long flags, loff_t size) { return (flags & VM_NORESERVE) ? - 0 : security_vm_enough_memory_kern(VM_ACCT(size)); + 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) @@ -145,7 +145,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) static inline int shmem_acct_block(unsigned long flags) { return (flags & VM_NORESERVE) ? - security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; + security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; } static inline void shmem_unacct_blocks(unsigned long flags, long pages) diff --git a/mm/swapfile.c b/mm/swapfile.c index d999f090dfda..f0d79296dd55 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1563,6 +1563,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + BUG_ON(!current->mm); + pathname = getname(specialfile); err = PTR_ERR(pathname); if (IS_ERR(pathname)) @@ -1590,7 +1592,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (!security_vm_enough_memory(p->pages)) + if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; diff --git a/security/security.c b/security/security.c index 7d9426bb7442..44177add4713 100644 --- a/security/security.c +++ b/security/security.c @@ -187,25 +187,11 @@ int security_settime(const struct timespec *ts, const struct timezone *tz) return security_ops->settime(ts, tz); } -int security_vm_enough_memory(long pages) -{ - WARN_ON(current->mm == NULL); - return security_ops->vm_enough_memory(current->mm, pages); -} - int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - WARN_ON(mm == NULL); return security_ops->vm_enough_memory(mm, pages); } -int security_vm_enough_memory_kern(long pages) -{ - /* If current->mm is a kernel thread then we will pass NULL, - for this specific case that is fine */ - return security_ops->vm_enough_memory(current->mm, pages); -} - int security_bprm_set_creds(struct linux_binprm *bprm) { return security_ops->bprm_set_creds(bprm); -- cgit v1.2.3-55-g7522 From 4040153087478993cbf0809f444400a3c808074c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Feb 2012 03:58:52 +0000 Subject: security: trim security.h Trim security.h Signed-off-by: Al Viro Signed-off-by: James Morris --- drivers/net/macvtap.c | 1 + 
drivers/target/iscsi/iscsi_target.c | 1 + drivers/target/iscsi/iscsi_target_login.c | 1 + fs/nfs/client.c | 1 + fs/proc/proc_sysctl.c | 2 ++ fs/quota/dquot.c | 1 + fs/super.c | 1 + include/linux/security.h | 55 ++++++++++++++++--------------- include/net/sock.h | 2 ++ ipc/msgutil.c | 2 ++ kernel/cred.c | 1 + kernel/exit.c | 1 + kernel/sched/core.c | 1 + kernel/sysctl.c | 1 + mm/mmap.c | 13 ++++++++ security/commoncap.c | 1 + security/security.c | 2 ++ security/selinux/hooks.c | 2 ++ security/smack/smack_lsm.c | 3 ++ 19 files changed, 66 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 58dc117a8d78..0427c6561c84 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 44262908def5..33df66d91aad 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 38cb7ce8469e..1ee33a8c3fab 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 31778f74357d..d4f772ebd1ef 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a6b62173d4c3..67bbf6e4e197 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -6,7 +6,9 @@ #include #include #include +#include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 46741970371b..8b4f12b33f57 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/super.c b/fs/super.c index 6015c02296b7..18660532909e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" diff --git a/include/linux/security.h b/include/linux/security.h index 2fefad6d27a0..339b3b120f6c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -22,22 +22,36 @@ #ifndef __LINUX_SECURITY_H #define __LINUX_SECURITY_H -#include -#include -#include -#include -#include -#include -#include -#include -#include /* PAGE_ALIGN */ -#include -#include #include -#include +#include #include -#include -#include +#include + +struct linux_binprm; +struct cred; +struct rlimit; +struct siginfo; +struct sem_array; +struct sembuf; +struct kern_ipc_perm; +struct audit_context; +struct super_block; +struct inode; +struct dentry; +struct file; +struct vfsmount; +struct path; +struct qstr; +struct nameidata; +struct iattr; +struct fown_struct; +struct file_operations; +struct shmid_kernel; +struct msg_msg; +struct msg_queue; +struct xattr; +struct xfrm_sec_ctx; +struct mm_struct; /* Maximum number of letters for an LSM name string */ #define SECURITY_NAME_MAX 10 @@ -49,6 +63,7 @@ struct ctl_table; struct audit_krule; struct user_namespace; +struct timezone; /* * These functions are in security/capability.c and are used @@ -131,18 +146,6 @@ struct request_sock; #define 
LSM_UNSAFE_PTRACE_CAP 4 #ifdef CONFIG_MMU -/* - * If a hint addr is less than mmap_min_addr change hint to be as - * low as possible but still greater than mmap_min_addr - */ -static inline unsigned long round_hint_to_min(unsigned long hint) -{ - hint &= PAGE_MASK; - if (((void *)hint != NULL) && - (hint < mmap_min_addr)) - return PAGE_ALIGN(mmap_min_addr); - return hint; -} extern int mmap_min_addr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif diff --git a/include/net/sock.h b/include/net/sock.h index 91c1c8baf020..27508f07eada 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -56,6 +56,8 @@ #include #include #include +#include +#include #include #include diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 5652101cdac0..26143d377c95 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -13,7 +13,9 @@ #include #include #include +#include #include +#include #include #include "util.h" diff --git a/kernel/cred.c b/kernel/cred.c index 5791612a4045..97b36eeca4c9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #if 0 diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..5ad867a3685e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..78682bfb3405 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..11d53046b905 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index db05495d6d0a..694a8625ab0d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -935,6 +935,19 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, } #endif /* CONFIG_PROC_FS */ +/* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} + /* * The caller must hold down_write(¤t->mm->mmap_sem). 
*/ diff --git a/security/commoncap.c b/security/commoncap.c index 7ce191ea29a0..0cf4b53480a7 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * If a non-root user executes a setuid-root binary in diff --git a/security/security.c b/security/security.c index 44177add4713..bf619ffc9a4d 100644 --- a/security/security.c +++ b/security/security.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #define MAX_LSM_EVM_XATTR 2 diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6a3683e28426..304929909375 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -81,6 +81,8 @@ #include #include #include +#include +#include #include "avc.h" #include "objsec.h" diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e8af5b0ba80f..cd667b4089a5 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -36,6 +36,9 @@ #include #include #include +#include +#include +#include #include "smack.h" #define task_security(task) (task_cred_xxx((task), security)) -- cgit v1.2.3-55-g7522 From e1964c50a83d1ce53731c88271d12ac92292a880 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:48 -0700 Subject: irq_domain: Be less verbose irq_domain printk's too much. Drop some output. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Thomas Gleixner Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f9e26526b69..cc2cd43ec740 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -170,13 +170,11 @@ void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { struct device_node *node; - pr_info("looking for phys_base=%llx, irq_start=%i\n", + pr_debug("looking for phys_base=%llx, irq_start=%i\n", (unsigned long long) phys_base, (int) irq_start); node = of_find_matching_node_by_address(NULL, match, phys_base); if (node) irq_domain_add_simple(node, irq_start); - else - pr_info("no node found\n"); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif /* CONFIG_OF_IRQ */ -- cgit v1.2.3-55-g7522 From 7bb69bade0d41715bdf1b24f5ef0b8f798769fe9 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:48 -0700 Subject: irq_domain: Make irq_domain structure match powerpc's irq_host Part of the series to unify the irq remapping mechanisms in the kernel. A follow-up patch will copy the powerpc implementation into kernel/irq/irqdomain.c, which will be a lot easier if the structures are identical. Where they differ, I've chosen to use the powerpc names since there is a lot more code using those names. 
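[editor's note: to see the renamed and extended ops in use, a hypothetical controller written against the structure below might provide the following; the example_* names and example_chip are invented, and the xlate() body is a minimal one-cell decoding rather than the three-cell GIC scheme shown in the diff:]

	static int example_map(struct irq_domain *d, unsigned int virq,
			       irq_hw_number_t hw)
	{
		/* set up the irq_desc for this mapping, as the map() comment requires */
		irq_set_chip_and_handler(virq, &example_chip, handle_level_irq);
		return 0;
	}

	static int example_xlate(struct irq_domain *d, struct device_node *node,
				 const u32 *intspec, unsigned int intsize,
				 unsigned long *out_hwirq, unsigned int *out_type)
	{
		if (intsize < 1)
			return -EINVAL;
		*out_hwirq = intspec[0];
		*out_type = IRQ_TYPE_NONE;
		return 0;
	}

	static struct irq_domain_ops example_ops = {
		.map = example_map,
		.xlate = example_xlate,
	};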
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/arm/common/gic.c | 14 ++++---- include/linux/irqdomain.h | 84 ++++++++++++++++++++++++++++++++++++----------- kernel/irq/irqdomain.c | 14 ++++---- 3 files changed, 78 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index c47d6199b784..dc19862be0a8 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -619,10 +619,10 @@ static void __init gic_pm_init(struct gic_chip_data *gic) #endif #ifdef CONFIG_OF -static int gic_irq_domain_dt_translate(struct irq_domain *d, - struct device_node *controller, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type) +static int gic_irq_domain_xlate(struct irq_domain *d, + struct device_node *controller, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) { if (d->of_node != controller) return -EINVAL; @@ -641,9 +641,9 @@ static int gic_irq_domain_dt_translate(struct irq_domain *d, } #endif -const struct irq_domain_ops gic_irq_domain_ops = { +struct irq_domain_ops gic_irq_domain_ops = { #ifdef CONFIG_OF - .dt_translate = gic_irq_domain_dt_translate, + .xlate = gic_irq_domain_xlate, #endif }; @@ -721,7 +721,7 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start, irq_start); domain->irq_base = irq_start; } - domain->priv = gic; + domain->host_data = gic; domain->ops = &gic_irq_domain_ops; irq_domain_add(domain); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bd4272b61a14..35b9ff382e45 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -9,61 +9,105 @@ * representation into a hardware irq number that can be mapped back to a * Linux irq number without any extra platform support code. * - * irq_domain is expected to be embedded in an interrupt controller's private - * data structure. + * Interrupt controller "domain" data structure. This could be defined as an + * irq domain controller. That is, it handles the mapping between hardware + * and virtual interrupt numbers for a given interrupt domain. The domain + * structure is generally created by the PIC code for a given PIC instance + * (though a domain can cover more than one PIC if they have a flat number + * model). It's the domain callbacks that are responsible for setting the + * irq_chip on a given irq_desc after it's been mapped. */ + #ifndef _LINUX_IRQDOMAIN_H #define _LINUX_IRQDOMAIN_H -#include -#include +#include +#include -#ifdef CONFIG_IRQ_DOMAIN struct device_node; struct irq_domain; +struct of_device_id; + +/* This type is the placeholder for a hardware interrupt number. It has to + * be big enough to enclose whatever representation is used by a given + * platform. + */ +typedef unsigned long irq_hw_number_t; /** * struct irq_domain_ops - Methods for irq_domain objects + * @match: Match an interrupt controller device node to a host, returns + * 1 on a match + * @map: Create or update a mapping between a virtual irq number and a hw + * irq number. This is called only once for a given mapping. + * @unmap: Dispose of such a mapping * @to_irq: (optional) given a local hardware irq number, return the linux * irq number.
If to_irq is not implemented, then the irq_domain * will use this translation: irq = (domain->irq_base + hwirq) - * @dt_translate: Given a device tree node and interrupt specifier, decode - * the hardware irq number and linux irq type value. + * @xlate: Given a device tree node and interrupt specifier, decode + * the hardware irq number and linux irq type value. + * + * Functions below are provided by the driver and called whenever a new mapping + * is created or an old mapping is disposed. The driver can then proceed to + * whatever internal data structures management is required. It also needs + * to setup the irq_desc when returning from map(). */ struct irq_domain_ops { + int (*match)(struct irq_domain *d, struct device_node *node); + int (*map)(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw); + void (*unmap)(struct irq_domain *d, unsigned int virq); unsigned int (*to_irq)(struct irq_domain *d, unsigned long hwirq); - -#ifdef CONFIG_OF - int (*dt_translate)(struct irq_domain *d, struct device_node *node, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type); -#endif /* CONFIG_OF */ + int (*xlate)(struct irq_domain *d, struct device_node *node, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type); }; /** * struct irq_domain - Hardware interrupt number translation object - * @list: Element in global irq_domain list. + * @link: Element in global irq_domain list. + * @revmap_type: Method used for reverse mapping hwirq numbers to linux irq. This + * will be one of the IRQ_DOMAIN_MAP_* values. + * @revmap_data: Revmap method specific data. + * @ops: pointer to irq_domain methods + * @host_data: private data pointer for use by owner. Not touched by irq_domain + * core code. * @irq_base: Start of irq_desc range assigned to the irq_domain. The creator * of the irq_domain is responsible for allocating the array of * irq_desc structures. * @nr_irq: Number of irqs managed by the irq domain * @hwirq_base: Starting number for hwirqs managed by the irq domain - * @ops: pointer to irq_domain methods - * @priv: private data pointer for use by owner. Not touched by irq_domain - * core code. * @of_node: (optional) Pointer to device tree nodes associated with the * irq_domain. Used when decoding device tree interrupt specifiers. 
*/ struct irq_domain { - struct list_head list; + struct list_head link; + + /* type of reverse mapping_technique */ + unsigned int revmap_type; +#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ + union { + struct { + unsigned int size; + unsigned int *revmap; + } linear; + struct radix_tree_root tree; + } revmap_data; + struct irq_domain_ops *ops; + void *host_data; + irq_hw_number_t inval_irq; + unsigned int irq_base; unsigned int nr_irq; unsigned int hwirq_base; - const struct irq_domain_ops *ops; - void *priv; + + /* Optional device node pointer */ struct device_node *of_node; }; +#ifdef CONFIG_IRQ_DOMAIN /** * irq_domain_to_irq() - Translate from a hardware irq to a linux irq number * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cc2cd43ec740..509adb8762d7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -43,7 +43,7 @@ void irq_domain_add(struct irq_domain *domain) } mutex_lock(&irq_domain_mutex); - list_add(&domain->list, &irq_domain_list); + list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); } @@ -57,7 +57,7 @@ void irq_domain_del(struct irq_domain *domain) int hwirq, irq; mutex_lock(&irq_domain_mutex); - list_del(&domain->list); + list_del(&domain->link); mutex_unlock(&irq_domain_mutex); /* Clear the irq_domain assignments */ @@ -88,10 +88,10 @@ unsigned int irq_create_of_mapping(struct device_node *controller, /* Find a domain which can translate the irq spec */ mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, list) { - if (!domain->ops->dt_translate) + list_for_each_entry(domain, &irq_domain_list, link) { + if (!domain->ops->xlate) continue; - rc = domain->ops->dt_translate(domain, controller, + rc = domain->ops->xlate(domain, controller, intspec, intsize, &hwirq, &type); if (rc == 0) break; @@ -126,7 +126,7 @@ void irq_dispose_mapping(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -int irq_domain_simple_dt_translate(struct irq_domain *d, +int irq_domain_simple_xlate(struct irq_domain *d, struct device_node *controller, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(irq_domain_generate_simple); struct irq_domain_ops irq_domain_simple_ops = { #ifdef CONFIG_OF_IRQ - .dt_translate = irq_domain_simple_dt_translate, + .xlate = irq_domain_simple_xlate, #endif /* CONFIG_OF_IRQ */ }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -- cgit v1.2.3-55-g7522 From cc79ca691c292e9fd44f589c7940b9654e22f2f6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 16 Feb 2012 01:37:49 -0700 Subject: irq_domain: Move irq_domain code from powerpc to kernel/irq This patch only moves the code. It doesn't make any changes, and the code is still only compiled for powerpc. Follow-on patches will generalize the code for other architectures. 
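Until the follow-on patches generalize it, the moved API keeps its powerpc shape. A hedged sketch of typical controller setup against that API; every "foo" identifier is hypothetical, while irq_alloc_host(), IRQ_DOMAIN_MAP_LINEAR and the int-returning .map callback are as in the code moved here.

#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/kernel.h>
#include <linux/of.h>

static struct irq_chip foo_irq_chip = {
	.name = "FOO",		/* a real driver also fills mask/unmask/ack */
};

/* Called once per hwirq when a mapping is first created. */
static int foo_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw)
{
	irq_set_chip_and_handler(virq, &foo_irq_chip, handle_level_irq);
	return 0;
}

static struct irq_domain_ops foo_host_ops = {
	.map = foo_map,
};

static void __init foo_pic_init(struct device_node *np)
{
	struct irq_domain *host;

	/* 32-entry linear revmap; hw irq 0 is the invalid number here. */
	host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, 32, &foo_host_ops, 0);
	if (!host)
		panic("foo: failed to allocate irq host");
}

The linear revmap trades a small per-domain array for O(1) hwirq-to-virq lookup, which is why fixed-size controllers tend to prefer it over the radix tree.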
Signed-off-by: Grant Likely Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/irq.h | 144 ---------- arch/powerpc/kernel/irq.c | 502 ---------------------------------- include/linux/irqdomain.h | 46 +++- kernel/irq/irqdomain.c | 600 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 643 insertions(+), 650 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1919634a9b32..303703d716fe 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,6 +135,7 @@ config PPC select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ select IRQ_PER_CPU + select IRQ_DOMAIN select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select IRQ_FORCED_THREADING diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index f80f262e0597..728cc30d04ea 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -42,154 +42,10 @@ extern atomic_t ppc_n_lost_interrupts; /* Same thing, used by the generic IRQ code */ #define NR_IRQS_LEGACY NUM_ISA_INTERRUPTS -/* - * The host code and data structures are fairly agnostic to the fact that - * we use an open firmware device-tree. We do have references to struct - * device_node in two places: in irq_find_host() to find the host matching - * a given interrupt controller node, and of course as an argument to its - * counterpart host->ops->match() callback. However, those are treated as - * generic pointers by the core and the fact that it's actually a device-node - * pointer is purely a convention between callers and implementation. This - * code could thus be used on other architectures by replacing those two - * by some sort of arch-specific void * "token" used to identify interrupt - * controllers. - */ - struct irq_data; extern irq_hw_number_t irqd_to_hwirq(struct irq_data *d); extern irq_hw_number_t virq_to_hw(unsigned int virq); -/** - * irq_alloc_host - Allocate a new irq_domain data structure - * @of_node: optional device-tree node of the interrupt controller - * @revmap_type: type of reverse mapping to use - * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map - * @ops: map/unmap host callbacks - * @inval_irq: provide a hw number in that host space that is always invalid - * - * Allocates and initialize and irq_domain structure. Note that in the case of - * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns - * for all legacy interrupts except 0 (which is always the invalid irq for - * a legacy controller). For a IRQ_DOMAIN_MAP_LINEAR, the map is allocated by - * this call as well. For a IRQ_DOMAIN_MAP_TREE, the radix tree will be allocated - * later during boot automatically (the reverse mapping will use the slow path - * until that happens). - */ -extern struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq); - - -/** - * irq_find_host - Locates a host for a given device node - * @node: device-tree node of the interrupt controller - */ -extern struct irq_domain *irq_find_host(struct device_node *node); - - -/** - * irq_set_default_host - Set a "default" host - * @host: default host pointer - * - * For convenience, it's possible to set a "default" host that will be used - * whenever NULL is passed to irq_create_mapping(). 
It makes life easier for - * platforms that want to manipulate a few hard coded interrupt numbers that - * aren't properly represented in the device-tree. - */ -extern void irq_set_default_host(struct irq_domain *host); - - -/** - * irq_set_virq_count - Set the maximum number of virt irqs - * @count: number of linux virtual irqs, capped with NR_IRQS - * - * This is mainly for use by platforms like iSeries who want to program - * the virtual irq number in the controller to avoid the reverse mapping - */ -extern void irq_set_virq_count(unsigned int count); - - -/** - * irq_create_mapping - Map a hardware interrupt into linux virq space - * @host: host owning this hardware interrupt or NULL for default host - * @hwirq: hardware irq number in that host space - * - * Only one mapping per hardware interrupt is permitted. Returns a linux - * virq number. - * If the sense/trigger is to be specified, set_irq_type() should be called - * on the number returned from that call. - */ -extern unsigned int irq_create_mapping(struct irq_domain *host, - irq_hw_number_t hwirq); - - -/** - * irq_dispose_mapping - Unmap an interrupt - * @virq: linux virq number of the interrupt to unmap - */ -extern void irq_dispose_mapping(unsigned int virq); - -/** - * irq_find_mapping - Find a linux virq from an hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space - * - * This is a slow path, for use by generic code. It's expected that an - * irq controller implementation directly calls the appropriate low level - * mapping function. - */ -extern unsigned int irq_find_mapping(struct irq_domain *host, - irq_hw_number_t hwirq); - -/** - * irq_create_direct_mapping - Allocate a virq for direct mapping - * @host: host to allocate the virq for or NULL for default host - * - * This routine is used for irq controllers which can choose the hardware - * interrupt numbers they generate. In such a case it's simplest to use - * the linux virq as the hardware interrupt number. - */ -extern unsigned int irq_create_direct_mapping(struct irq_domain *host); - -/** - * irq_radix_revmap_insert - Insert a hw irq to linux virq number mapping. - * @host: host owning this hardware interrupt - * @virq: linux irq number - * @hwirq: hardware irq number in that host space - * - * This is for use by irq controllers that use a radix tree reverse - * mapping for fast lookup. - */ -extern void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, - irq_hw_number_t hwirq); - -/** - * irq_radix_revmap_lookup - Find a linux virq from a hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space - * - * This is a fast path, for use by irq controller code that uses radix tree - * revmaps - */ -extern unsigned int irq_radix_revmap_lookup(struct irq_domain *host, - irq_hw_number_t hwirq); - -/** - * irq_linear_revmap - Find a linux virq from a hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space - * - * This is a fast path, for use by irq controller code that uses linear - * revmaps. 
It does fallback to the slow path if the revmap doesn't exist - * yet and will create the revmap entry with appropriate locking - */ - -extern unsigned int irq_linear_revmap(struct irq_domain *host, - irq_hw_number_t hwirq); - - /** * irq_early_init - Init irq remapping subsystem */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 269fbd5ac62f..e3673ff6b7a0 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -486,17 +486,6 @@ void do_softirq(void) local_irq_restore(flags); } - -/* - * IRQ controller and virtual interrupts - */ - -static LIST_HEAD(irq_domain_list); -static DEFINE_MUTEX(irq_domain_mutex); -static DEFINE_MUTEX(revmap_trees_mutex); -static unsigned int irq_virq_count = NR_IRQS; -static struct irq_domain *irq_default_host; - irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; @@ -510,362 +499,6 @@ irq_hw_number_t virq_to_hw(unsigned int virq) } EXPORT_SYMBOL_GPL(virq_to_hw); -static int default_irq_host_match(struct irq_domain *h, struct device_node *np) -{ - return h->of_node != NULL && h->of_node == np; -} - -struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq) -{ - struct irq_domain *host, *h; - unsigned int size = sizeof(struct irq_domain); - unsigned int i; - unsigned int *rmap; - - /* Allocate structure and revmap table if using linear mapping */ - if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) - size += revmap_arg * sizeof(unsigned int); - host = kzalloc(size, GFP_KERNEL); - if (host == NULL) - return NULL; - - /* Fill structure */ - host->revmap_type = revmap_type; - host->inval_irq = inval_irq; - host->ops = ops; - host->of_node = of_node_get(of_node); - - if (host->ops->match == NULL) - host->ops->match = default_irq_host_match; - - mutex_lock(&irq_domain_mutex); - /* Make sure only one legacy controller can be created */ - if (revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - list_for_each_entry(h, &irq_domain_list, link) { - if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { - mutex_unlock(&irq_domain_mutex); - of_node_put(host->of_node); - kfree(host); - return NULL; - } - } - } - list_add(&host->link, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); - - /* Additional setups per revmap type */ - switch(revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* 0 is always the invalid number for legacy */ - host->inval_irq = 0; - /* setup us as the host for all legacy interrupts */ - for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { - struct irq_data *irq_data = irq_get_irq_data(i); - irq_data->hwirq = i; - irq_data->domain = host; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - ops->map(host, i, i); - - /* Clear norequest flags */ - irq_clear_status_flags(i, IRQ_NOREQUEST); - } - break; - case IRQ_DOMAIN_MAP_LINEAR: - rmap = (unsigned int *)(host + 1); - for (i = 0; i < revmap_arg; i++) - rmap[i] = NO_IRQ; - host->revmap_data.linear.size = revmap_arg; - host->revmap_data.linear.revmap = rmap; - break; - case IRQ_DOMAIN_MAP_TREE: - INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL); - break; - default: - break; - } - - pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); - - return host; -} - -struct irq_domain *irq_find_host(struct device_node *node) -{ - struct irq_domain *h, *found = NULL; - - /* We might want to match the legacy controller last since - * it might potentially be set to match 
all interrupts in - * the absence of a device node. This isn't a problem so far - * yet though... - */ - mutex_lock(&irq_domain_mutex); - list_for_each_entry(h, &irq_domain_list, link) - if (h->ops->match(h, node)) { - found = h; - break; - } - mutex_unlock(&irq_domain_mutex); - return found; -} -EXPORT_SYMBOL_GPL(irq_find_host); - -void irq_set_default_host(struct irq_domain *host) -{ - pr_debug("irq: Default host set to @0x%p\n", host); - - irq_default_host = host; -} - -void irq_set_virq_count(unsigned int count) -{ - pr_debug("irq: Trying to set virq count to %d\n", count); - - BUG_ON(count < NUM_ISA_INTERRUPTS); - if (count < NR_IRQS) - irq_virq_count = count; -} - -static int irq_setup_virq(struct irq_domain *host, unsigned int virq, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data = irq_get_irq_data(virq); - - irq_data->hwirq = hwirq; - irq_data->domain = host; - if (host->ops->map(host, virq, hwirq)) { - pr_debug("irq: -> mapping failed, freeing\n"); - irq_data->domain = NULL; - irq_data->hwirq = 0; - return -1; - } - - irq_clear_status_flags(virq, IRQ_NOREQUEST); - - return 0; -} - -unsigned int irq_create_direct_mapping(struct irq_domain *host) -{ - unsigned int virq; - - if (host == NULL) - host = irq_default_host; - - BUG_ON(host == NULL); - WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); - - virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { - pr_debug("irq: create_direct virq allocation failed\n"); - return NO_IRQ; - } - if (virq >= irq_virq_count) { - pr_err("ERROR: no free irqs available below %i maximum\n", - irq_virq_count); - irq_free_desc(virq); - return 0; - } - - pr_debug("irq: create_direct obtained virq %d\n", virq); - - if (irq_setup_virq(host, virq, virq)) { - irq_free_desc(virq); - return NO_IRQ; - } - - return virq; -} - -unsigned int irq_create_mapping(struct irq_domain *host, - irq_hw_number_t hwirq) -{ - unsigned int virq, hint; - - pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq); - - /* Look for default host if nececssary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) { - printk(KERN_WARNING "irq_create_mapping called for" - " NULL host, hwirq=%lx\n", hwirq); - WARN_ON(1); - return NO_IRQ; - } - pr_debug("irq: -> using host @%p\n", host); - - /* Check if mapping already exists */ - virq = irq_find_mapping(host, hwirq); - if (virq != NO_IRQ) { - pr_debug("irq: -> existing mapping on virq %d\n", virq); - return virq; - } - - /* Get a virtual interrupt number */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - /* Handle legacy */ - virq = (unsigned int)hwirq; - if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) - return NO_IRQ; - return virq; - } else { - /* Allocate a virtual interrupt number */ - hint = hwirq % irq_virq_count; - if (hint == 0) - hint = 1; - virq = irq_alloc_desc_from(hint, 0); - if (!virq) - virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { - pr_debug("irq: -> virq allocation failed\n"); - return NO_IRQ; - } - } - - if (irq_setup_virq(host, virq, hwirq)) { - if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) - irq_free_desc(virq); - return NO_IRQ; - } - - pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", - hwirq, host->of_node ? 
host->of_node->full_name : "null", virq); - - return virq; -} -EXPORT_SYMBOL_GPL(irq_create_mapping); - -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *host; - irq_hw_number_t hwirq; - unsigned int type = IRQ_TYPE_NONE; - unsigned int virq; - - if (controller == NULL) - host = irq_default_host; - else - host = irq_find_host(controller); - if (host == NULL) { - printk(KERN_WARNING "irq: no irq host found for %s !\n", - controller->full_name); - return NO_IRQ; - } - - /* If host has no translation, then we assume interrupt line */ - if (host->ops->xlate == NULL) - hwirq = intspec[0]; - else { - if (host->ops->xlate(host, controller, intspec, intsize, - &hwirq, &type)) - return NO_IRQ; - } - - /* Create mapping */ - virq = irq_create_mapping(host, hwirq); - if (virq == NO_IRQ) - return virq; - - /* Set type if specified and different than the current one */ - if (type != IRQ_TYPE_NONE && - type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) - irq_set_irq_type(virq, type); - return virq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - -void irq_dispose_mapping(unsigned int virq) -{ - struct irq_data *irq_data = irq_get_irq_data(virq); - struct irq_domain *host; - irq_hw_number_t hwirq; - - if (virq == NO_IRQ || !irq_data) - return; - - host = irq_data->domain; - if (WARN_ON(host == NULL)) - return; - - /* Never unmap legacy interrupts */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return; - - irq_set_status_flags(virq, IRQ_NOREQUEST); - - /* remove chip and handler */ - irq_set_chip_and_handler(virq, NULL, NULL); - - /* Make sure it's completed */ - synchronize_irq(virq); - - /* Tell the PIC about it */ - if (host->ops->unmap) - host->ops->unmap(host, virq); - smp_mb(); - - /* Clear reverse map */ - hwirq = irq_data->hwirq; - switch(host->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < host->revmap_data.linear.size) - host->revmap_data.linear.revmap[hwirq] = NO_IRQ; - break; - case IRQ_DOMAIN_MAP_TREE: - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&host->revmap_data.tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - break; - } - - /* Destroy map */ - irq_data->hwirq = host->inval_irq; - - irq_free_desc(virq); -} -EXPORT_SYMBOL_GPL(irq_dispose_mapping); - -unsigned int irq_find_mapping(struct irq_domain *host, - irq_hw_number_t hwirq) -{ - unsigned int i; - unsigned int hint = hwirq % irq_virq_count; - - /* Look for default host if nececssary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) - return NO_IRQ; - - /* legacy -> bail early */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return hwirq; - - /* Slow path does a linear search of the map */ - if (hint == 0) - hint = 1; - i = hint; - do { - struct irq_data *data = irq_get_irq_data(i); - if (data && (data->domain == host) && (data->hwirq == hwirq)) - return i; - i++; - if (i >= irq_virq_count) - i = 1; - } while(i != hint); - return NO_IRQ; -} -EXPORT_SYMBOL_GPL(irq_find_mapping); - #ifdef CONFIG_SMP int irq_choose_cpu(const struct cpumask *mask) { @@ -902,146 +535,11 @@ int irq_choose_cpu(const struct cpumask *mask) } #endif -unsigned int irq_radix_revmap_lookup(struct irq_domain *host, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data; - - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return irq_find_mapping(host, hwirq); - - /* - * Freeing an irq can delete nodes along the path to - * do the lookup via call_rcu. 
- */ - rcu_read_lock(); - irq_data = radix_tree_lookup(&host->revmap_data.tree, hwirq); - rcu_read_unlock(); - - /* - * If found in radix tree, then fine. - * Else fallback to linear lookup - this should not happen in practice - * as it means that we failed to insert the node in the radix tree. - */ - return irq_data ? irq_data->irq : irq_find_mapping(host, hwirq); -} - -void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data = irq_get_irq_data(virq); - - if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return; - - if (virq != NO_IRQ) { - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); - } -} - -unsigned int irq_linear_revmap(struct irq_domain *host, - irq_hw_number_t hwirq) -{ - unsigned int *revmap; - - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) - return irq_find_mapping(host, hwirq); - - /* Check revmap bounds */ - if (unlikely(hwirq >= host->revmap_data.linear.size)) - return irq_find_mapping(host, hwirq); - - /* Check if revmap was allocated */ - revmap = host->revmap_data.linear.revmap; - if (unlikely(revmap == NULL)) - return irq_find_mapping(host, hwirq); - - /* Fill up revmap with slow path if no mapping found */ - if (unlikely(revmap[hwirq] == NO_IRQ)) - revmap[hwirq] = irq_find_mapping(host, hwirq); - - return revmap[hwirq]; -} - int arch_early_irq_init(void) { return 0; } -#ifdef CONFIG_VIRQ_DEBUG -static int virq_debug_show(struct seq_file *m, void *private) -{ - unsigned long flags; - struct irq_desc *desc; - const char *p; - static const char none[] = "none"; - void *data; - int i; - - seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", - "chip name", "chip data", "host name"); - - for (i = 1; i < nr_irqs; i++) { - desc = irq_to_desc(i); - if (!desc) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - - if (desc->action && desc->action->handler) { - struct irq_chip *chip; - - seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); - - chip = irq_desc_get_chip(desc); - if (chip && chip->name) - p = chip->name; - else - p = none; - seq_printf(m, "%-15s ", p); - - data = irq_desc_get_chip_data(desc); - seq_printf(m, "0x%16p ", data); - - if (desc->irq_data.domain->of_node) - p = desc->irq_data.domain->of_node->full_name; - else - p = none; - seq_printf(m, "%s\n", p); - } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - } - - return 0; -} - -static int virq_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, virq_debug_show, inode->i_private); -} - -static const struct file_operations virq_debug_fops = { - .open = virq_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init irq_debugfs_init(void) -{ - if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, - NULL, &virq_debug_fops) == NULL) - return -ENOMEM; - - return 0; -} -__initcall(irq_debugfs_init); -#endif /* CONFIG_VIRQ_DEBUG */ - #ifdef CONFIG_PPC64 static int __init setup_noirqdistrib(char *str) { diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 35b9ff382e45..18f4ab002d2e 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -16,6 +16,17 @@ * (though a domain can cover more than one PIC if they have a flat number * model). It's the domain callbacks that are responsible for setting the * irq_chip on a given irq_desc after it's been mapped. 
+ * + * The host code and data structures are agnostic to whether or not + * we use an open firmware device-tree. We do have references to struct + * device_node in two places: in irq_find_host() to find the host matching + * a given interrupt controller node, and of course as an argument to its + * counterpart domain->ops->match() callback. However, those are treated as + * generic pointers by the core and the fact that it's actually a device-node + * pointer is purely a convention between callers and implementation. This + * code could thus be used on other architectures by replacing those two + * by some sort of arch-specific void * "token" used to identify interrupt + * controllers. */ #ifndef _LINUX_IRQDOMAIN_H @@ -108,6 +119,32 @@ struct irq_domain { }; #ifdef CONFIG_IRQ_DOMAIN +#ifdef CONFIG_PPC +extern struct irq_domain *irq_alloc_host(struct device_node *of_node, + unsigned int revmap_type, + unsigned int revmap_arg, + struct irq_domain_ops *ops, + irq_hw_number_t inval_irq); +extern struct irq_domain *irq_find_host(struct device_node *node); +extern void irq_set_default_host(struct irq_domain *host); +extern void irq_set_virq_count(unsigned int count); + + +extern unsigned int irq_create_mapping(struct irq_domain *host, + irq_hw_number_t hwirq); +extern void irq_dispose_mapping(unsigned int virq); +extern unsigned int irq_find_mapping(struct irq_domain *host, + irq_hw_number_t hwirq); +extern unsigned int irq_create_direct_mapping(struct irq_domain *host); +extern void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, + irq_hw_number_t hwirq); +extern unsigned int irq_radix_revmap_lookup(struct irq_domain *host, + irq_hw_number_t hwirq); +extern unsigned int irq_linear_revmap(struct irq_domain *host, + irq_hw_number_t hwirq); + +#else /* CONFIG_PPC */ + /** * irq_domain_to_irq() - Translate from a hardware irq to a linux irq number * @@ -137,15 +174,16 @@ extern void irq_domain_add(struct irq_domain *domain); extern void irq_domain_del(struct irq_domain *domain); extern struct irq_domain_ops irq_domain_simple_ops; -#endif /* CONFIG_IRQ_DOMAIN */ -#if defined(CONFIG_IRQ_DOMAIN) && defined(CONFIG_OF_IRQ) +#if defined(CONFIG_OF_IRQ) extern void irq_domain_add_simple(struct device_node *controller, int irq_base); extern void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start); -#else /* CONFIG_IRQ_DOMAIN && CONFIG_OF_IRQ */ +#else /* CONFIG_OF_IRQ */ static inline void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { } -#endif /* CONFIG_IRQ_DOMAIN && CONFIG_OF_IRQ */ +#endif /* !CONFIG_OF_IRQ */ +#endif /* !CONFIG_PPC */ +#endif /* CONFIG_IRQ_DOMAIN */ #endif /* _LINUX_IRQDOMAIN_H */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 509adb8762d7..f551bc1d3167 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,14 +1,612 @@ +#include +#include +#include #include +#include #include #include #include #include #include +#include #include +#include +#include static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); +#ifdef CONFIG_PPC +static DEFINE_MUTEX(revmap_trees_mutex); +static unsigned int irq_virq_count = NR_IRQS; +static struct irq_domain *irq_default_host; + +static int default_irq_host_match(struct irq_domain *h, struct device_node *np) +{ + return h->of_node != NULL && h->of_node == np; +} + +/** + * irq_alloc_host() - Allocate a new irq_domain data structure + * @of_node: optional device-tree node of the 
interrupt controller + * @revmap_type: type of reverse mapping to use + * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map + * @ops: map/unmap host callbacks + * @inval_irq: provide a hw number in that host space that is always invalid + * + * Allocates and initialize and irq_domain structure. Note that in the case of + * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). For a IRQ_DOMAIN_MAP_LINEAR, the map is allocated by + * this call as well. For a IRQ_DOMAIN_MAP_TREE, the radix tree will be + * allocated later during boot automatically (the reverse mapping will use the + * slow path until that happens). + */ +struct irq_domain *irq_alloc_host(struct device_node *of_node, + unsigned int revmap_type, + unsigned int revmap_arg, + struct irq_domain_ops *ops, + irq_hw_number_t inval_irq) +{ + struct irq_domain *host, *h; + unsigned int size = sizeof(struct irq_domain); + unsigned int i; + unsigned int *rmap; + + /* Allocate structure and revmap table if using linear mapping */ + if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) + size += revmap_arg * sizeof(unsigned int); + host = kzalloc(size, GFP_KERNEL); + if (host == NULL) + return NULL; + + /* Fill structure */ + host->revmap_type = revmap_type; + host->inval_irq = inval_irq; + host->ops = ops; + host->of_node = of_node_get(of_node); + + if (host->ops->match == NULL) + host->ops->match = default_irq_host_match; + + mutex_lock(&irq_domain_mutex); + /* Make sure only one legacy controller can be created */ + if (revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + list_for_each_entry(h, &irq_domain_list, link) { + if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + mutex_unlock(&irq_domain_mutex); + of_node_put(host->of_node); + kfree(host); + return NULL; + } + } + } + list_add(&host->link, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); + + /* Additional setups per revmap type */ + switch(revmap_type) { + case IRQ_DOMAIN_MAP_LEGACY: + /* 0 is always the invalid number for legacy */ + host->inval_irq = 0; + /* setup us as the host for all legacy interrupts */ + for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { + struct irq_data *irq_data = irq_get_irq_data(i); + irq_data->hwirq = i; + irq_data->domain = host; + + /* Legacy flags are left to default at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(host, i, i); + + /* Clear norequest flags */ + irq_clear_status_flags(i, IRQ_NOREQUEST); + } + break; + case IRQ_DOMAIN_MAP_LINEAR: + rmap = (unsigned int *)(host + 1); + for (i = 0; i < revmap_arg; i++) + rmap[i] = NO_IRQ; + host->revmap_data.linear.size = revmap_arg; + host->revmap_data.linear.revmap = rmap; + break; + case IRQ_DOMAIN_MAP_TREE: + INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL); + break; + default: + break; + } + + pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); + + return host; +} + +/** + * irq_find_host() - Locates a domain for a given device node + * @node: device-tree node of the interrupt controller + */ +struct irq_domain *irq_find_host(struct device_node *node) +{ + struct irq_domain *h, *found = NULL; + + /* We might want to match the legacy controller last since + * it might potentially be set to match all interrupts in + * the absence of a device node. This isn't a problem so far + * yet though... 
+ */ + mutex_lock(&irq_domain_mutex); + list_for_each_entry(h, &irq_domain_list, link) + if (h->ops->match(h, node)) { + found = h; + break; + } + mutex_unlock(&irq_domain_mutex); + return found; +} +EXPORT_SYMBOL_GPL(irq_find_host); + +/** + * irq_set_default_host() - Set a "default" irq domain + * @host: default host pointer + * + * For convenience, it's possible to set a "default" domain that will be used + * whenever NULL is passed to irq_create_mapping(). It makes life easier for + * platforms that want to manipulate a few hard coded interrupt numbers that + * aren't properly represented in the device-tree. + */ +void irq_set_default_host(struct irq_domain *host) +{ + pr_debug("irq: Default host set to @0x%p\n", host); + + irq_default_host = host; +} + +/** + * irq_set_virq_count() - Set the maximum number of linux irqs + * @count: number of linux irqs, capped with NR_IRQS + * + * This is mainly for use by platforms like iSeries who want to program + * the virtual irq number in the controller to avoid the reverse mapping + */ +void irq_set_virq_count(unsigned int count) +{ + pr_debug("irq: Trying to set virq count to %d\n", count); + + BUG_ON(count < NUM_ISA_INTERRUPTS); + if (count < NR_IRQS) + irq_virq_count = count; +} + +static int irq_setup_virq(struct irq_domain *host, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + irq_data->hwirq = hwirq; + irq_data->domain = host; + if (host->ops->map(host, virq, hwirq)) { + pr_debug("irq: -> mapping failed, freeing\n"); + irq_data->domain = NULL; + irq_data->hwirq = 0; + return -1; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); + + return 0; +} + +/** + * irq_create_direct_mapping() - Allocate an irq for direct mapping + * @host: domain to allocate the irq for or NULL for default host + * + * This routine is used for irq controllers which can choose the hardware + * interrupt numbers they generate. In such a case it's simplest to use + * the linux irq as the hardware interrupt number. + */ +unsigned int irq_create_direct_mapping(struct irq_domain *host) +{ + unsigned int virq; + + if (host == NULL) + host = irq_default_host; + + BUG_ON(host == NULL); + WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + + virq = irq_alloc_desc_from(1, 0); + if (virq == NO_IRQ) { + pr_debug("irq: create_direct virq allocation failed\n"); + return NO_IRQ; + } + if (virq >= irq_virq_count) { + pr_err("ERROR: no free irqs available below %i maximum\n", + irq_virq_count); + irq_free_desc(virq); + return 0; + } + + pr_debug("irq: create_direct obtained virq %d\n", virq); + + if (irq_setup_virq(host, virq, virq)) { + irq_free_desc(virq); + return NO_IRQ; + } + + return virq; +} + +/** + * irq_create_mapping() - Map a hardware interrupt into linux irq space + * @host: host owning this hardware interrupt or NULL for default host + * @hwirq: hardware irq number in that host space + * + * Only one mapping per hardware interrupt is permitted. Returns a linux + * irq number. + * If the sense/trigger is to be specified, set_irq_type() should be called + * on the number returned from that call. 
+ */ +unsigned int irq_create_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int virq, hint; + + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq); + + /* Look for default host if nececssary */ + if (host == NULL) + host = irq_default_host; + if (host == NULL) { + printk(KERN_WARNING "irq_create_mapping called for" + " NULL host, hwirq=%lx\n", hwirq); + WARN_ON(1); + return NO_IRQ; + } + pr_debug("irq: -> using host @%p\n", host); + + /* Check if mapping already exists */ + virq = irq_find_mapping(host, hwirq); + if (virq != NO_IRQ) { + pr_debug("irq: -> existing mapping on virq %d\n", virq); + return virq; + } + + /* Get a virtual interrupt number */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + /* Handle legacy */ + virq = (unsigned int)hwirq; + if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) + return NO_IRQ; + return virq; + } else { + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + if (hint == 0) + hint++; + virq = irq_alloc_desc_from(hint, 0); + if (!virq) + virq = irq_alloc_desc_from(1, 0); + if (virq == NO_IRQ) { + pr_debug("irq: -> virq allocation failed\n"); + return NO_IRQ; + } + } + + if (irq_setup_virq(host, virq, hwirq)) { + if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) + irq_free_desc(virq); + return NO_IRQ; + } + + pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", + hwirq, host->of_node ? host->of_node->full_name : "null", virq); + + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_mapping); + +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + struct irq_domain *host; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + unsigned int virq; + + if (controller == NULL) + host = irq_default_host; + else + host = irq_find_host(controller); + if (host == NULL) { + printk(KERN_WARNING "irq: no irq host found for %s !\n", + controller->full_name); + return NO_IRQ; + } + + /* If host has no translation, then we assume interrupt line */ + if (host->ops->xlate == NULL) + hwirq = intspec[0]; + else { + if (host->ops->xlate(host, controller, intspec, intsize, + &hwirq, &type)) + return NO_IRQ; + } + + /* Create mapping */ + virq = irq_create_mapping(host, hwirq); + if (virq == NO_IRQ) + return virq; + + /* Set type if specified and different than the current one */ + if (type != IRQ_TYPE_NONE && + type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) + irq_set_irq_type(virq, type); + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +/** + * irq_dispose_mapping() - Unmap an interrupt + * @virq: linux irq number of the interrupt to unmap + */ +void irq_dispose_mapping(unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_domain *host; + irq_hw_number_t hwirq; + + if (virq == NO_IRQ || !irq_data) + return; + + host = irq_data->domain; + if (WARN_ON(host == NULL)) + return; + + /* Never unmap legacy interrupts */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + + /* remove chip and handler */ + irq_set_chip_and_handler(virq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(virq); + + /* Tell the PIC about it */ + if (host->ops->unmap) + host->ops->unmap(host, virq); + smp_mb(); + + /* Clear reverse map */ + hwirq = irq_data->hwirq; + switch(host->revmap_type) { + case IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < host->revmap_data.linear.size) + host->revmap_data.linear.revmap[hwirq] = NO_IRQ; + break; + case 
IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&host->revmap_data.tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + break; + } + + /* Destroy map */ + irq_data->hwirq = host->inval_irq; + + irq_free_desc(virq); +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +/** + * irq_find_mapping() - Find a linux irq from an hw irq number. + * @host: domain owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a slow path, for use by generic code. It's expected that an + * irq controller implementation directly calls the appropriate low level + * mapping function. + */ +unsigned int irq_find_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int i; + unsigned int hint = hwirq % irq_virq_count; + + /* Look for default host if nececssary */ + if (host == NULL) + host = irq_default_host; + if (host == NULL) + return NO_IRQ; + + /* legacy -> bail early */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return hwirq; + + /* Slow path does a linear search of the map */ + if (hint == 0) + hint = 1; + i = hint; + do { + struct irq_data *data = irq_get_irq_data(i); + if (data && (data->domain == host) && (data->hwirq == hwirq)) + return i; + i++; + if (i >= irq_virq_count) + i = 1; + } while(i != hint); + return NO_IRQ; +} +EXPORT_SYMBOL_GPL(irq_find_mapping); + +/** + * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a fast path, for use by irq controller code that uses radix tree + * revmaps + */ +unsigned int irq_radix_revmap_lookup(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data; + + if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return irq_find_mapping(host, hwirq); + + /* + * Freeing an irq can delete nodes along the path to + * do the lookup via call_rcu. + */ + rcu_read_lock(); + irq_data = radix_tree_lookup(&host->revmap_data.tree, hwirq); + rcu_read_unlock(); + + /* + * If found in radix tree, then fine. + * Else fallback to linear lookup - this should not happen in practice + * as it means that we failed to insert the node in the radix tree. + */ + return irq_data ? irq_data->irq : irq_find_mapping(host, hwirq); +} + +/** + * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. + * @host: host owning this hardware interrupt + * @virq: linux irq number + * @hwirq: hardware irq number in that host space + * + * This is for use by irq controllers that use a radix tree reverse + * mapping for fast lookup. + */ +void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return; + + if (virq != NO_IRQ) { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); + } +} + +/** + * irq_linear_revmap() - Find a linux irq from a hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a fast path, for use by irq controller code that uses linear + * revmaps. 
It does fallback to the slow path if the revmap doesn't exist + * yet and will create the revmap entry with appropriate locking + */ +unsigned int irq_linear_revmap(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int *revmap; + + if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) + return irq_find_mapping(host, hwirq); + + /* Check revmap bounds */ + if (unlikely(hwirq >= host->revmap_data.linear.size)) + return irq_find_mapping(host, hwirq); + + /* Check if revmap was allocated */ + revmap = host->revmap_data.linear.revmap; + if (unlikely(revmap == NULL)) + return irq_find_mapping(host, hwirq); + + /* Fill up revmap with slow path if no mapping found */ + if (unlikely(revmap[hwirq] == NO_IRQ)) + revmap[hwirq] = irq_find_mapping(host, hwirq); + + return revmap[hwirq]; +} + +#ifdef CONFIG_VIRQ_DEBUG +static int virq_debug_show(struct seq_file *m, void *private) +{ + unsigned long flags; + struct irq_desc *desc; + const char *p; + static const char none[] = "none"; + void *data; + int i; + + seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", + "chip name", "chip data", "host name"); + + for (i = 1; i < nr_irqs; i++) { + desc = irq_to_desc(i); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + if (desc->action && desc->action->handler) { + struct irq_chip *chip; + + seq_printf(m, "%5d ", i); + seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + + chip = irq_desc_get_chip(desc); + if (chip && chip->name) + p = chip->name; + else + p = none; + seq_printf(m, "%-15s ", p); + + data = irq_desc_get_chip_data(desc); + seq_printf(m, "0x%16p ", data); + + if (desc->irq_data.domain->of_node) + p = desc->irq_data.domain->of_node->full_name; + else + p = none; + seq_printf(m, "%s\n", p); + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } + + return 0; +} + +static int virq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, virq_debug_show, inode->i_private); +} + +static const struct file_operations virq_debug_fops = { + .open = virq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init irq_debugfs_init(void) +{ + if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, + NULL, &virq_debug_fops) == NULL) + return -ENOMEM; + + return 0; +} +__initcall(irq_debugfs_init); +#endif /* CONFIG_VIRQ_DEBUG */ + +#else /* CONFIG_PPC */ + /** * irq_domain_add() - Register an irq_domain * @domain: ptr to initialized irq_domain structure @@ -185,3 +783,5 @@ struct irq_domain_ops irq_domain_simple_ops = { #endif /* CONFIG_OF_IRQ */ }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#endif /* !CONFIG_PPC */ -- cgit v1.2.3-55-g7522 From 03848373ea741caafab952fb62405ed7fc0c279c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:52 -0700 Subject: irq_domain: remove NO_IRQ from irq domain code zero always means no irq when using irq domains. Get rid of the NO_IRQ references. 
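For callers the change reduces to the usual kernel convention of testing for a zero return. A small illustrative sketch; the "foo" wrapper and its -ENXIO error handling are hypothetical, irq_create_mapping() is as in this file.

#include <linux/errno.h>
#include <linux/irqdomain.h>
#include <linux/kernel.h>

static int foo_setup_irq(struct irq_domain *foo_domain, irq_hw_number_t hwirq)
{
	unsigned int virq;

	virq = irq_create_mapping(foo_domain, hwirq);
	if (!virq) {		/* was: if (virq == NO_IRQ) */
		pr_err("foo: failed to map hwirq %lu\n", hwirq);
		return -ENXIO;
	}
	return virq;		/* positive virq doubles as success */
}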
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f551bc1d3167..8f7b91ce53c4 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -108,7 +108,7 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, case IRQ_DOMAIN_MAP_LINEAR: rmap = (unsigned int *)(host + 1); for (i = 0; i < revmap_arg; i++) - rmap[i] = NO_IRQ; + rmap[i] = 0; host->revmap_data.linear.size = revmap_arg; host->revmap_data.linear.revmap = rmap; break; @@ -218,9 +218,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { + if (!virq) { pr_debug("irq: create_direct virq allocation failed\n"); - return NO_IRQ; + return 0; } if (virq >= irq_virq_count) { pr_err("ERROR: no free irqs available below %i maximum\n", @@ -233,7 +233,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) if (irq_setup_virq(host, virq, virq)) { irq_free_desc(virq); - return NO_IRQ; + return 0; } return virq; @@ -263,13 +263,13 @@ unsigned int irq_create_mapping(struct irq_domain *host, printk(KERN_WARNING "irq_create_mapping called for" " NULL host, hwirq=%lx\n", hwirq); WARN_ON(1); - return NO_IRQ; + return 0; } pr_debug("irq: -> using host @%p\n", host); /* Check if mapping already exists */ virq = irq_find_mapping(host, hwirq); - if (virq != NO_IRQ) { + if (virq) { pr_debug("irq: -> existing mapping on virq %d\n", virq); return virq; } @@ -279,7 +279,7 @@ unsigned int irq_create_mapping(struct irq_domain *host, /* Handle legacy */ virq = (unsigned int)hwirq; if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) - return NO_IRQ; + return 0; return virq; } else { /* Allocate a virtual interrupt number */ @@ -289,16 +289,16 @@ unsigned int irq_create_mapping(struct irq_domain *host, virq = irq_alloc_desc_from(hint, 0); if (!virq) virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { + if (!virq) { pr_debug("irq: -> virq allocation failed\n"); - return NO_IRQ; + return 0; } } if (irq_setup_virq(host, virq, hwirq)) { if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) irq_free_desc(virq); - return NO_IRQ; + return 0; } pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", @@ -323,7 +323,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, if (host == NULL) { printk(KERN_WARNING "irq: no irq host found for %s !\n", controller->full_name); - return NO_IRQ; + return 0; } /* If host has no translation, then we assume interrupt line */ @@ -332,12 +332,12 @@ unsigned int irq_create_of_mapping(struct device_node *controller, else { if (host->ops->xlate(host, controller, intspec, intsize, &hwirq, &type)) - return NO_IRQ; + return 0; } /* Create mapping */ virq = irq_create_mapping(host, hwirq); - if (virq == NO_IRQ) + if (!virq) return virq; /* Set type if specified and different than the current one */ @@ -358,7 +358,7 @@ void irq_dispose_mapping(unsigned int virq) struct irq_domain *host; irq_hw_number_t hwirq; - if (virq == NO_IRQ || !irq_data) + if (!virq || !irq_data) return; host = irq_data->domain; @@ -387,7 +387,7 @@ void irq_dispose_mapping(unsigned int virq) switch(host->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: if (hwirq < host->revmap_data.linear.size) - 
host->revmap_data.linear.revmap[hwirq] = NO_IRQ; + host->revmap_data.linear.revmap[hwirq] = 0; break; case IRQ_DOMAIN_MAP_TREE: mutex_lock(&revmap_trees_mutex); @@ -422,7 +422,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, if (host == NULL) host = irq_default_host; if (host == NULL) - return NO_IRQ; + return 0; /* legacy -> bail early */ if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) @@ -440,7 +440,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, if (i >= irq_virq_count) i = 1; } while(i != hint); - return NO_IRQ; + return 0; } EXPORT_SYMBOL_GPL(irq_find_mapping); @@ -493,7 +493,7 @@ void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) return; - if (virq != NO_IRQ) { + if (virq) { mutex_lock(&revmap_trees_mutex); radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); @@ -527,7 +527,7 @@ unsigned int irq_linear_revmap(struct irq_domain *host, return irq_find_mapping(host, hwirq); /* Fill up revmap with slow path if no mapping found */ - if (unlikely(revmap[hwirq] == NO_IRQ)) + if (unlikely(!revmap[hwirq])) revmap[hwirq] = irq_find_mapping(host, hwirq); return revmap[hwirq]; -- cgit v1.2.3-55-g7522 From 68700650e71b6bb6636673f4f9c8ec807353d8d6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:53 -0700 Subject: irq_domain: Remove references to old irq_host names No functional changes. Replaces non-exported references to 'host' with domain. Does not change any symbol names referenced by other .c files. Signed-off-by: Grant Likely Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 219 ++++++++++++++++++++++++------------------------- 1 file changed, 108 insertions(+), 111 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8f7b91ce53c4..432d292b33f8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -19,11 +19,11 @@ static DEFINE_MUTEX(irq_domain_mutex); #ifdef CONFIG_PPC static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; -static struct irq_domain *irq_default_host; +static struct irq_domain *irq_default_domain; -static int default_irq_host_match(struct irq_domain *h, struct device_node *np) +static int default_irq_domain_match(struct irq_domain *d, struct device_node *np) { - return h->of_node != NULL && h->of_node == np; + return d->of_node != NULL && d->of_node == np; } /** @@ -31,8 +31,8 @@ static int default_irq_host_match(struct irq_domain *h, struct device_node *np) * @of_node: optional device-tree node of the interrupt controller * @revmap_type: type of reverse mapping to use * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map - * @ops: map/unmap host callbacks - * @inval_irq: provide a hw number in that host space that is always invalid + * @ops: map/unmap domain callbacks + * @inval_irq: provide a hw number in that domain space that is always invalid * * Allocates and initialize and irq_domain structure. 
Note that in the case of * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns @@ -48,7 +48,7 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, struct irq_domain_ops *ops, irq_hw_number_t inval_irq) { - struct irq_domain *host, *h; + struct irq_domain *domain, *h; unsigned int size = sizeof(struct irq_domain); unsigned int i; unsigned int *rmap; @@ -56,18 +56,18 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, /* Allocate structure and revmap table if using linear mapping */ if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) size += revmap_arg * sizeof(unsigned int); - host = kzalloc(size, GFP_KERNEL); - if (host == NULL) + domain = kzalloc(size, GFP_KERNEL); + if (domain == NULL) return NULL; /* Fill structure */ - host->revmap_type = revmap_type; - host->inval_irq = inval_irq; - host->ops = ops; - host->of_node = of_node_get(of_node); + domain->revmap_type = revmap_type; + domain->inval_irq = inval_irq; + domain->ops = ops; + domain->of_node = of_node_get(of_node); - if (host->ops->match == NULL) - host->ops->match = default_irq_host_match; + if (domain->ops->match == NULL) + domain->ops->match = default_irq_domain_match; mutex_lock(&irq_domain_mutex); /* Make sure only one legacy controller can be created */ @@ -75,53 +75,53 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, list_for_each_entry(h, &irq_domain_list, link) { if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { mutex_unlock(&irq_domain_mutex); - of_node_put(host->of_node); - kfree(host); + of_node_put(domain->of_node); + kfree(domain); return NULL; } } } - list_add(&host->link, &irq_domain_list); + list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); /* Additional setups per revmap type */ switch(revmap_type) { case IRQ_DOMAIN_MAP_LEGACY: /* 0 is always the invalid number for legacy */ - host->inval_irq = 0; - /* setup us as the host for all legacy interrupts */ + domain->inval_irq = 0; + /* setup us as the domain for all legacy interrupts */ for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { struct irq_data *irq_data = irq_get_irq_data(i); irq_data->hwirq = i; - irq_data->domain = host; + irq_data->domain = domain; /* Legacy flags are left to default at this point, * one can then use irq_create_mapping() to * explicitly change them */ - ops->map(host, i, i); + ops->map(domain, i, i); /* Clear norequest flags */ irq_clear_status_flags(i, IRQ_NOREQUEST); } break; case IRQ_DOMAIN_MAP_LINEAR: - rmap = (unsigned int *)(host + 1); + rmap = (unsigned int *)(domain + 1); for (i = 0; i < revmap_arg; i++) rmap[i] = 0; - host->revmap_data.linear.size = revmap_arg; - host->revmap_data.linear.revmap = rmap; + domain->revmap_data.linear.size = revmap_arg; + domain->revmap_data.linear.revmap = rmap; break; case IRQ_DOMAIN_MAP_TREE: - INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL); + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); break; default: break; } - pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); + pr_debug("irq: Allocated domain of type %d @0x%p\n", revmap_type, domain); - return host; + return domain; } /** @@ -150,18 +150,18 @@ EXPORT_SYMBOL_GPL(irq_find_host); /** * irq_set_default_host() - Set a "default" irq domain - * @host: default host pointer + * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used * whenever NULL is passed to irq_create_mapping(). 
It makes life easier for * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ -void irq_set_default_host(struct irq_domain *host) +void irq_set_default_host(struct irq_domain *domain) { - pr_debug("irq: Default host set to @0x%p\n", host); + pr_debug("irq: Default domain set to @0x%p\n", domain); - irq_default_host = host; + irq_default_domain = domain; } /** @@ -180,14 +180,14 @@ void irq_set_virq_count(unsigned int count) irq_virq_count = count; } -static int irq_setup_virq(struct irq_domain *host, unsigned int virq, +static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); irq_data->hwirq = hwirq; - irq_data->domain = host; - if (host->ops->map(host, virq, hwirq)) { + irq_data->domain = domain; + if (domain->ops->map(domain, virq, hwirq)) { pr_debug("irq: -> mapping failed, freeing\n"); irq_data->domain = NULL; irq_data->hwirq = 0; @@ -201,21 +201,21 @@ static int irq_setup_virq(struct irq_domain *host, unsigned int virq, /** * irq_create_direct_mapping() - Allocate an irq for direct mapping - * @host: domain to allocate the irq for or NULL for default host + * @domain: domain to allocate the irq for or NULL for default domain * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use * the linux irq as the hardware interrupt number. */ -unsigned int irq_create_direct_mapping(struct irq_domain *host) +unsigned int irq_create_direct_mapping(struct irq_domain *domain) { unsigned int virq; - if (host == NULL) - host = irq_default_host; + if (domain == NULL) + domain = irq_default_domain; - BUG_ON(host == NULL); - WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + BUG_ON(domain == NULL); + WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); virq = irq_alloc_desc_from(1, 0); if (!virq) { @@ -231,7 +231,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) pr_debug("irq: create_direct obtained virq %d\n", virq); - if (irq_setup_virq(host, virq, virq)) { + if (irq_setup_virq(domain, virq, virq)) { irq_free_desc(virq); return 0; } @@ -241,41 +241,41 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) /** * irq_create_mapping() - Map a hardware interrupt into linux irq space - * @host: host owning this hardware interrupt or NULL for default host - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. 
*/ -unsigned int irq_create_mapping(struct irq_domain *host, +unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int virq, hint; - pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq); + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); - /* Look for default host if necessary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) { + /* Look for default domain if necessary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) { printk(KERN_WARNING "irq_create_mapping called for" - " NULL host, hwirq=%lx\n", hwirq); + " NULL domain, hwirq=%lx\n", hwirq); WARN_ON(1); return 0; } - pr_debug("irq: -> using host @%p\n", host); + pr_debug("irq: -> using domain @%p\n", domain); /* Check if mapping already exists */ - virq = irq_find_mapping(host, hwirq); + virq = irq_find_mapping(domain, hwirq); if (virq) { pr_debug("irq: -> existing mapping on virq %d\n", virq); return virq; } /* Get a virtual interrupt number */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { /* Handle legacy */ virq = (unsigned int)hwirq; if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) @@ -295,14 +295,14 @@ unsigned int irq_create_mapping(struct irq_domain *host, } } - if (irq_setup_virq(host, virq, hwirq)) { - if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) + if (irq_setup_virq(domain, virq, hwirq)) { + if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) irq_free_desc(virq); return 0; } - pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", - hwirq, host->of_node ? host->of_node->full_name : "null", virq); + pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); return virq; } @@ -311,32 +311,29 @@ EXPORT_SYMBOL_GPL(irq_create_mapping); unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize) { - struct irq_domain *host; + struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; unsigned int virq; - if (controller == NULL) - host = irq_default_host; - else - host = irq_find_host(controller); - if (host == NULL) { - printk(KERN_WARNING "irq: no irq host found for %s !\n", + domain = controller ? 
irq_find_host(controller) : irq_default_domain; + if (!domain) { + printk(KERN_WARNING "irq: no irq domain found for %s !\n", controller->full_name); return 0; } - /* If host has no translation, then we assume interrupt line */ - if (host->ops->xlate == NULL) + /* If domain has no translation, then we assume interrupt line */ + if (domain->ops->xlate == NULL) hwirq = intspec[0]; else { - if (host->ops->xlate(host, controller, intspec, intsize, + if (domain->ops->xlate(domain, controller, intspec, intsize, &hwirq, &type)) return 0; } /* Create mapping */ - virq = irq_create_mapping(host, hwirq); + virq = irq_create_mapping(domain, hwirq); if (!virq) return virq; @@ -355,18 +352,18 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping); void irq_dispose_mapping(unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); - struct irq_domain *host; + struct irq_domain *domain; irq_hw_number_t hwirq; if (!virq || !irq_data) return; - host = irq_data->domain; - if (WARN_ON(host == NULL)) + domain = irq_data->domain; + if (WARN_ON(domain == NULL)) return; /* Never unmap legacy interrupts */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) return; irq_set_status_flags(virq, IRQ_NOREQUEST); @@ -378,26 +375,26 @@ void irq_dispose_mapping(unsigned int virq) synchronize_irq(virq); /* Tell the PIC about it */ - if (host->ops->unmap) - host->ops->unmap(host, virq); + if (domain->ops->unmap) + domain->ops->unmap(domain, virq); smp_mb(); /* Clear reverse map */ hwirq = irq_data->hwirq; - switch(host->revmap_type) { + switch(domain->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < host->revmap_data.linear.size) - host->revmap_data.linear.revmap[hwirq] = 0; + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = 0; break; case IRQ_DOMAIN_MAP_TREE: mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&host->revmap_data.tree, hwirq); + radix_tree_delete(&domain->revmap_data.tree, hwirq); mutex_unlock(&revmap_trees_mutex); break; } /* Destroy map */ - irq_data->hwirq = host->inval_irq; + irq_data->hwirq = domain->inval_irq; irq_free_desc(virq); } @@ -405,27 +402,27 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** * irq_find_mapping() - Find a linux irq from an hw irq number. - * @host: domain owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a slow path, for use by generic code. It's expected that an * irq controller implementation directly calls the appropriate low level * mapping function. 
*/ -unsigned int irq_find_mapping(struct irq_domain *host, +unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int i; unsigned int hint = hwirq % irq_virq_count; - /* Look for default host if necessary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) + /* Look for default domain if necessary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) return 0; /* legacy -> bail early */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) return hwirq; /* Slow path does a linear search of the map */ @@ -434,7 +431,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, i = hint; do { struct irq_data *data = irq_get_irq_data(i); - if (data && (data->domain == host) && (data->hwirq == hwirq)) + if (data && (data->domain == domain) && (data->hwirq == hwirq)) return i; i++; if (i >= irq_virq_count) @@ -446,26 +443,26 @@ EXPORT_SYMBOL_GPL(irq_find_mapping); /** * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a fast path, for use by irq controller code that uses radix tree * revmaps */ -unsigned int irq_radix_revmap_lookup(struct irq_domain *host, +unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, irq_hw_number_t hwirq) { struct irq_data *irq_data; - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return irq_find_mapping(host, hwirq); + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return irq_find_mapping(domain, hwirq); /* * Freeing an irq can delete nodes along the path to * do the lookup via call_rcu. */ rcu_read_lock(); - irq_data = radix_tree_lookup(&host->revmap_data.tree, hwirq); + irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); rcu_read_unlock(); /* @@ -473,62 +470,62 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *host, * Else fallback to linear lookup - this should not happen in practice * as it means that we failed to insert the node in the radix tree. */ - return irq_data ? irq_data->irq : irq_find_mapping(host, hwirq); + return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); } /** * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. - * @host: host owning this hardware interrupt + * @domain: domain owning this hardware interrupt * @virq: linux irq number - * @hwirq: hardware irq number in that host space + * @hwirq: hardware irq number in that domain space * * This is for use by irq controllers that use a radix tree reverse * mapping for fast lookup. */ -void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, +void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); - if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) return; if (virq) { mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); + radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); } } /** * irq_linear_revmap() - Find a linux irq from a hw irq number. 
- * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a fast path, for use by irq controller code that uses linear * revmaps. It does fallback to the slow path if the revmap doesn't exist * yet and will create the revmap entry with appropriate locking */ -unsigned int irq_linear_revmap(struct irq_domain *host, +unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int *revmap; - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) - return irq_find_mapping(host, hwirq); + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) + return irq_find_mapping(domain, hwirq); /* Check revmap bounds */ - if (unlikely(hwirq >= host->revmap_data.linear.size)) - return irq_find_mapping(host, hwirq); + if (unlikely(hwirq >= domain->revmap_data.linear.size)) + return irq_find_mapping(domain, hwirq); /* Check if revmap was allocated */ - revmap = host->revmap_data.linear.revmap; + revmap = domain->revmap_data.linear.revmap; if (unlikely(revmap == NULL)) - return irq_find_mapping(host, hwirq); + return irq_find_mapping(domain, hwirq); /* Fill up revmap with slow path if no mapping found */ if (unlikely(!revmap[hwirq])) - revmap[hwirq] = irq_find_mapping(host, hwirq); + revmap[hwirq] = irq_find_mapping(domain, hwirq); return revmap[hwirq]; } @@ -544,7 +541,7 @@ static int virq_debug_show(struct seq_file *m, void *private) int i; seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", - "chip name", "chip data", "host name"); + "chip name", "chip data", "domain name"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); -- cgit v1.2.3-55-g7522 From a8db8cf0d894df5f1dcfd4bce9894e0dbcc01c96 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:54 -0700 Subject: irq_domain: Replace irq_alloc_host() with revmap-specific initializers Each revmap type has different arguments for setting up the revmap. This patch splits up the generator functions so that each revmap type can do its own setup and the user doesn't need to keep track of how each revmap type handles the arguments. This patch also adds a host_data argument to the generators. There are cases where the host_data pointer will be needed before the function returns. ie. the legacy map calls the .map callback for each irq before returning. 
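To make the new entry points concrete, here is a minimal before/after sketch of a controller driver's init path. It assumes a hypothetical driver named foo_pic (its name, fields, and the irq count of 32 are illustrative only, not from any in-tree driver); the irq_domain_add_linear() call matches the signature introduced by this patch.

/* Hedged sketch only: 'foo_pic' is a made-up controller driver. */
static struct foo_pic {
	void __iomem *regs;
	struct irq_domain *domain;
} foo_pic;

static int foo_pic_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hw)
{
	/* host_data is already valid here, even when .map() runs
	 * before the creator function returns (the legacy case). */
	struct foo_pic *pic = d->host_data;

	irq_set_chip_data(virq, pic);
	return 0;
}

static struct irq_domain_ops foo_pic_ops = {
	.map = foo_pic_map,
};

static void __init foo_pic_init(struct device_node *np)
{
	/* Before this series:
	 *	domain = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, 32,
	 *				&foo_pic_ops, -1);
	 *	domain->host_data = &foo_pic;
	 * After it, host_data is a constructor argument, so it can
	 * never be observed half-initialized by an early .map() call:
	 */
	foo_pic.domain = irq_domain_add_linear(np, 32, &foo_pic_ops,
					       &foo_pic);
}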
v2: - Add void *host_data argument to irq_domain_add_*() functions - fixed failure to compile - Moved IRQ_DOMAIN_MAP_* defines into irqdomain.c Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/powerpc/platforms/512x/mpc5121_ads_cpld.c | 3 +- arch/powerpc/platforms/52xx/media5200.c | 7 +- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 6 +- arch/powerpc/platforms/52xx/mpc52xx_pic.c | 4 +- arch/powerpc/platforms/82xx/pq2ads-pci-pic.c | 6 +- arch/powerpc/platforms/85xx/socrates_fpga_pic.c | 5 +- arch/powerpc/platforms/86xx/gef_pic.c | 5 +- arch/powerpc/platforms/cell/axon_msi.c | 5 +- arch/powerpc/platforms/cell/beat_interrupt.c | 4 +- arch/powerpc/platforms/cell/interrupt.c | 4 +- arch/powerpc/platforms/cell/spider-pic.c | 6 +- arch/powerpc/platforms/embedded6xx/flipper-pic.c | 6 +- arch/powerpc/platforms/embedded6xx/hlwd-pic.c | 5 +- arch/powerpc/platforms/iseries/irq.c | 3 +- arch/powerpc/platforms/powermac/pic.c | 5 +- arch/powerpc/platforms/powermac/smp.c | 3 +- arch/powerpc/platforms/ps3/interrupt.c | 3 +- arch/powerpc/platforms/wsp/opb_pic.c | 7 +- arch/powerpc/sysdev/cpm1.c | 3 +- arch/powerpc/sysdev/cpm2_pic.c | 3 +- arch/powerpc/sysdev/ehv_pic.c | 6 +- arch/powerpc/sysdev/fsl_msi.c | 6 +- arch/powerpc/sysdev/i8259.c | 3 +- arch/powerpc/sysdev/ipic.c | 7 +- arch/powerpc/sysdev/mpc8xx_pic.c | 3 +- arch/powerpc/sysdev/mpic.c | 7 +- arch/powerpc/sysdev/mv64x60_pic.c | 5 +- arch/powerpc/sysdev/qe_lib/qe_ic.c | 5 +- arch/powerpc/sysdev/tsi108_pci.c | 3 +- arch/powerpc/sysdev/uic.c | 6 +- arch/powerpc/sysdev/xics/xics-common.c | 3 +- arch/powerpc/sysdev/xilinx_intc.c | 5 +- drivers/gpio/gpio-mpc8xxx.c | 7 +- include/linux/irqdomain.h | 24 ++- kernel/irq/irqdomain.c | 200 +++++++++++++++-------- 35 files changed, 198 insertions(+), 185 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c index fefa7977fa9f..291d61c94718 100644 --- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c +++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c @@ -190,8 +190,7 @@ mpc5121_ads_cpld_pic_init(void) cpld_pic_node = of_node_get(np); - cpld_pic_host = - irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, 16, &cpld_pic_host_ops, 16); + cpld_pic_host = irq_domain_add_linear(np, 16, &cpld_pic_host_ops, NULL); if (!cpld_pic_host) { printk(KERN_ERR "CPLD PIC: failed to allocate irq host!\n"); goto end; diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index a746415c4242..5db5cfb6a4ff 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -173,15 +173,12 @@ static void __init media5200_init_irq(void) spin_lock_init(&media5200_irq.lock); - media5200_irq.irqhost = irq_alloc_host(fpga_np, IRQ_DOMAIN_MAP_LINEAR, - MEDIA5200_NUM_IRQS, - &media5200_irq_ops, -1); + media5200_irq.irqhost = irq_domain_add_linear(fpga_np, + MEDIA5200_NUM_IRQS, &media5200_irq_ops, &media5200_irq); if (!media5200_irq.irqhost) goto out; pr_debug("%s: allocated irqhost\n", __func__); - media5200_irq.irqhost->host_data = &media5200_irq; - irq_set_handler_data(cascade_virq, &media5200_irq); irq_set_chained_handler(cascade_virq, media5200_irq_cascade); diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index e90af8fd8413..b53275d12727 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ 
-252,14 +252,12 @@ mpc52xx_gpt_irq_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) if (!cascade_virq) return; - gpt->irqhost = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, 1, - &mpc52xx_gpt_irq_ops, -1); + gpt->irqhost = irq_domain_add_linear(node, 1, &mpc52xx_gpt_irq_ops, gpt); if (!gpt->irqhost) { - dev_err(gpt->dev, "irq_alloc_host() failed\n"); + dev_err(gpt->dev, "irq_domain_add_linear() failed\n"); return; } - gpt->irqhost->host_data = gpt; irq_set_handler_data(cascade_virq, gpt); irq_set_chained_handler(cascade_virq, mpc52xx_gpt_irq_cascade); diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pic.c b/arch/powerpc/platforms/52xx/mpc52xx_pic.c index 8c997f1a9122..41fa67126c44 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pic.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pic.c @@ -444,9 +444,9 @@ void __init mpc52xx_init_irq(void) * As last step, add an irq host to translate the real * hw irq information provided by the ofw to linux virq */ - mpc52xx_irqhost = irq_alloc_host(picnode, IRQ_DOMAIN_MAP_LINEAR, + mpc52xx_irqhost = irq_domain_add_linear(picnode, MPC52xx_IRQ_HIGHTESTHWIRQ, - &mpc52xx_irqhost_ops, -1); + &mpc52xx_irqhost_ops, NULL); if (!mpc52xx_irqhost) panic(__FILE__ ": Cannot allocate the IRQ host\n"); diff --git a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c index bdba174e7b3a..4ef9d6918e23 100644 --- a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c +++ b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c @@ -156,17 +156,13 @@ int __init pq2ads_pci_init_irq(void) out_be32(&priv->regs->mask, ~0); mb(); - host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, NUM_IRQS, - &pci_pic_host_ops, NUM_IRQS); + host = irq_domain_add_linear(np, NUM_IRQS, &pci_pic_host_ops, priv); if (!host) { ret = -ENOMEM; goto out_unmap_regs; } - host->host_data = priv; - priv->host = host; - host->host_data = priv; irq_set_handler_data(irq, priv); irq_set_chained_handler(irq, pq2ads_pci_irq_demux); diff --git a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c index e3ef7c9ed7b1..1092c121adf3 100644 --- a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c +++ b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c @@ -280,9 +280,8 @@ void socrates_fpga_pic_init(struct device_node *pic) int i; /* Setup an irq_domain structure */ - socrates_fpga_pic_irq_host = irq_alloc_host(pic, IRQ_DOMAIN_MAP_LINEAR, - SOCRATES_FPGA_NUM_IRQS, &socrates_fpga_pic_host_ops, - SOCRATES_FPGA_NUM_IRQS); + socrates_fpga_pic_irq_host = irq_domain_add_linear(pic, + SOCRATES_FPGA_NUM_IRQS, &socrates_fpga_pic_host_ops, NULL); if (socrates_fpga_pic_irq_host == NULL) { pr_err("FPGA PIC: Unable to allocate host\n"); return; diff --git a/arch/powerpc/platforms/86xx/gef_pic.c b/arch/powerpc/platforms/86xx/gef_pic.c index 0cf8af230bcf..126a94b530e4 100644 --- a/arch/powerpc/platforms/86xx/gef_pic.c +++ b/arch/powerpc/platforms/86xx/gef_pic.c @@ -212,9 +212,8 @@ void __init gef_pic_init(struct device_node *np) } /* Setup an irq_domain structure */ - gef_pic_irq_host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - GEF_PIC_NUM_IRQS, - &gef_pic_host_ops, NO_IRQ); + gef_pic_irq_host = irq_domain_add_linear(np, GEF_PIC_NUM_IRQS, + &gef_pic_host_ops, NULL); if (gef_pic_irq_host == NULL) return; diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 1bfd18a48a7f..cf9fd3c8a9b9 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -392,16 +392,13 @@ static int 
axon_msi_probe(struct platform_device *device) } memset(msic->fifo_virt, 0xff, MSIC_FIFO_SIZE_BYTES); - msic->irq_domain = irq_alloc_host(dn, IRQ_DOMAIN_MAP_NOMAP, - NR_IRQS, &msic_host_ops, 0); + msic->irq_domain = irq_domain_add_nomap(dn, &msic_host_ops, msic); if (!msic->irq_domain) { printk(KERN_ERR "axon_msi: couldn't allocate irq_domain for %s\n", dn->full_name); goto out_free_fifo; } - msic->irq_domain->host_data = msic; - irq_set_handler_data(virq, msic); irq_set_chained_handler(virq, axon_msi_cascade); pr_devel("axon_msi: irq 0x%x setup for axon_msi\n", virq); diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c index 21b64cfef5e5..bbdf57440cd5 100644 --- a/arch/powerpc/platforms/cell/beat_interrupt.c +++ b/arch/powerpc/platforms/cell/beat_interrupt.c @@ -239,9 +239,7 @@ void __init beatic_init_IRQ(void) ppc_md.get_irq = beatic_get_irq; /* Allocate an irq host */ - beatic_host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_NOMAP, 0, - &beatic_pic_host_ops, - 0); + beatic_host = irq_domain_add_nomap(NULL, &beatic_pic_host_ops, NULL); BUG_ON(beatic_host == NULL); irq_set_default_host(beatic_host); } diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index 6888475e7c62..c844797a6898 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -378,8 +378,8 @@ static int __init setup_iic(void) void __init iic_init_IRQ(void) { /* Setup an irq host data structure */ - iic_host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_LINEAR, IIC_SOURCE_COUNT, - &iic_host_ops, IIC_IRQ_INVALID); + iic_host = irq_domain_add_linear(NULL, IIC_SOURCE_COUNT, &iic_host_ops, + NULL); BUG_ON(iic_host == NULL); irq_set_default_host(iic_host); diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index 1f935a772ef8..6521d202284a 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -299,12 +299,10 @@ static void __init spider_init_one(struct device_node *of_node, int chip, panic("spider_pic: can't map registers !"); /* Allocate a host */ - pic->host = irq_alloc_host(of_node, IRQ_DOMAIN_MAP_LINEAR, - SPIDER_SRC_COUNT, &spider_host_ops, - SPIDER_IRQ_INVALID); + pic->host = irq_domain_add_linear(of_node, SPIDER_SRC_COUNT, + &spider_host_ops, pic); if (pic->host == NULL) panic("spider_pic: can't allocate irq host !"); - pic->host->host_data = pic; /* Go through all sources and disable them */ for (i = 0; i < SPIDER_SRC_COUNT; i++) { diff --git a/arch/powerpc/platforms/embedded6xx/flipper-pic.c b/arch/powerpc/platforms/embedded6xx/flipper-pic.c index f862361730fb..434597166ca4 100644 --- a/arch/powerpc/platforms/embedded6xx/flipper-pic.c +++ b/arch/powerpc/platforms/embedded6xx/flipper-pic.c @@ -159,15 +159,13 @@ struct irq_domain * __init flipper_pic_init(struct device_node *np) __flipper_quiesce(io_base); - irq_domain = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, FLIPPER_NR_IRQS, - &flipper_irq_domain_ops, -1); + irq_domain = irq_domain_add_linear(np, FLIPPER_NR_IRQS, + &flipper_irq_domain_ops, io_base); if (!irq_domain) { pr_err("failed to allocate irq_domain\n"); return NULL; } - irq_domain->host_data = io_base; - out: return irq_domain; } diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c index 2d4a5d48fbbd..499d410b95fa 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c @@ -177,13 +177,12 
@@ struct irq_domain *hlwd_pic_init(struct device_node *np) __hlwd_quiesce(io_base); - irq_domain = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, HLWD_NR_IRQS, - &hlwd_irq_domain_ops, -1); + irq_domain = irq_domain_add_linear(np, HLWD_NR_IRQS, + &hlwd_irq_domain_ops, io_base); if (!irq_domain) { pr_err("failed to allocate irq_domain\n"); return NULL; } - irq_domain->host_data = io_base; return irq_domain; } diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c index b07d4f2e0155..5538b593079d 100644 --- a/arch/powerpc/platforms/iseries/irq.c +++ b/arch/powerpc/platforms/iseries/irq.c @@ -380,8 +380,7 @@ void __init iSeries_init_IRQ(void) /* Create irq host. No need for a revmap since HV will give us * back our virtual irq number */ - host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_NOMAP, 0, - &iseries_irq_domain_ops, 0); + host = irq_domain_add_nomap(NULL, &iseries_irq_domain_ops, NULL); BUG_ON(host == NULL); irq_set_default_host(host); diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index cff326ab8ef2..646fdf3b9c0c 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -352,9 +352,8 @@ static void __init pmac_pic_probe_oldstyle(void) /* * Allocate an irq host */ - pmac_pic_host = irq_alloc_host(master, IRQ_DOMAIN_MAP_LINEAR, max_irqs, - &pmac_pic_host_ops, - max_irqs); + pmac_pic_host = irq_domain_add_linear(master, max_irqs, + &pmac_pic_host_ops, NULL); BUG_ON(pmac_pic_host == NULL); irq_set_default_host(pmac_pic_host); diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 6b1ef2d4dea0..09afd704f07a 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -192,8 +192,7 @@ static int psurge_secondary_ipi_init(void) { int rc = -ENOMEM; - psurge_host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_NOMAP, 0, - &psurge_host_ops, 0); + psurge_host = irq_domain_add_nomap(NULL, &psurge_host_ops, NULL); if (psurge_host) psurge_secondary_virq = irq_create_direct_mapping(psurge_host); diff --git a/arch/powerpc/platforms/ps3/interrupt.c b/arch/powerpc/platforms/ps3/interrupt.c index c5980e422dc6..c05808f21015 100644 --- a/arch/powerpc/platforms/ps3/interrupt.c +++ b/arch/powerpc/platforms/ps3/interrupt.c @@ -753,8 +753,7 @@ void __init ps3_init_IRQ(void) unsigned cpu; struct irq_domain *host; - host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_NOMAP, 0, &ps3_host_ops, - PS3_INVALID_OUTLET); + host = irq_domain_add_nomap(NULL, &ps3_host_ops, NULL); irq_set_default_host(host); irq_set_virq_count(PS3_PLUG_MAX + 1); diff --git a/arch/powerpc/platforms/wsp/opb_pic.c b/arch/powerpc/platforms/wsp/opb_pic.c index 76b33bc36116..4837515c2826 100644 --- a/arch/powerpc/platforms/wsp/opb_pic.c +++ b/arch/powerpc/platforms/wsp/opb_pic.c @@ -263,13 +263,11 @@ struct opb_pic *opb_pic_init_one(struct device_node *dn) goto free_opb; } - /* Allocate an irq host so that Linux knows that despite only + /* Allocate an irq domain so that Linux knows that despite only * having one interrupt to issue, we're the controller for multiple * hardware IRQs, so later we can lookup their virtual IRQs. 
*/ - opb->host = irq_alloc_host(dn, IRQ_DOMAIN_MAP_LINEAR, - OPB_NR_IRQS, &opb_host_ops, -1); - + opb->host = irq_domain_add_linear(dn, OPB_NR_IRQS, &opb_host_ops, opb); if (!opb->host) { printk(KERN_ERR "opb: Failed to allocate IRQ host!\n"); goto free_regs; @@ -277,7 +275,6 @@ struct opb_pic *opb_pic_init_one(struct device_node *dn) opb->index = opb_index++; spin_lock_init(&opb->lock); - opb->host->host_data = opb; /* Disable all interrupts by default */ opb_out(opb, OPB_MLSASIER, 0); diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index 0877a757c209..53f39dbbfb97 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -164,8 +164,7 @@ unsigned int cpm_pic_init(void) out_be32(&cpic_reg->cpic_cimr, 0); - cpm_pic_host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - 64, &cpm_pic_host_ops, 64); + cpm_pic_host = irq_domain_add_linear(np, 64, &cpm_pic_host_ops, NULL); if (cpm_pic_host == NULL) { printk(KERN_ERR "CPM2 PIC: failed to allocate irq host!\n"); sirq = NO_IRQ; diff --git a/arch/powerpc/sysdev/cpm2_pic.c b/arch/powerpc/sysdev/cpm2_pic.c index b149baae5d33..b3643322c528 100644 --- a/arch/powerpc/sysdev/cpm2_pic.c +++ b/arch/powerpc/sysdev/cpm2_pic.c @@ -275,8 +275,7 @@ void cpm2_pic_init(struct device_node *node) out_be32(&cpm2_intctl->ic_scprrl, 0x05309770); /* create a legacy host */ - cpm2_pic_host = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, - 64, &cpm2_pic_host_ops, 64); + cpm2_pic_host = irq_domain_add_linear(node, 64, &cpm2_pic_host_ops, NULL); if (cpm2_pic_host == NULL) { printk(KERN_ERR "CPM2 PIC: failed to allocate irq host!\n"); return; diff --git a/arch/powerpc/sysdev/ehv_pic.c b/arch/powerpc/sysdev/ehv_pic.c index 48d3ba1220d3..adea322ea18f 100644 --- a/arch/powerpc/sysdev/ehv_pic.c +++ b/arch/powerpc/sysdev/ehv_pic.c @@ -275,9 +275,8 @@ void __init ehv_pic_init(void) return; } - ehv_pic->irqhost = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - NR_EHV_PIC_INTS, &ehv_pic_host_ops, 0); - + ehv_pic->irqhost = irq_domain_add_linear(np, NR_EHV_PIC_INTS, + &ehv_pic_host_ops, ehv_pic); if (!ehv_pic->irqhost) { of_node_put(np); kfree(ehv_pic); @@ -293,7 +292,6 @@ void __init ehv_pic_init(void) of_node_put(np2); } - ehv_pic->irqhost->host_data = ehv_pic; ehv_pic->hc_irq = ehv_pic_irq_chip; ehv_pic->hc_irq.irq_set_affinity = ehv_pic_set_affinity; ehv_pic->coreint_flag = coreint_flag; diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 3f9a301c4550..f4fd95bc1278 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -387,8 +387,8 @@ static int __devinit fsl_of_msi_probe(struct platform_device *dev) } platform_set_drvdata(dev, msi); - msi->irqhost = irq_alloc_host(dev->dev.of_node, IRQ_DOMAIN_MAP_LINEAR, - NR_MSI_IRQS, &fsl_msi_host_ops, 0); + msi->irqhost = irq_domain_add_linear(dev->dev.of_node, + NR_MSI_IRQS, &fsl_msi_host_ops, msi); if (msi->irqhost == NULL) { dev_err(&dev->dev, "No memory for MSI irqhost\n"); @@ -420,8 +420,6 @@ static int __devinit fsl_of_msi_probe(struct platform_device *dev) msi->feature = features->fsl_pic_ip; - msi->irqhost->host_data = msi; - /* * Remember the phandle, so that we can match with any PCI nodes * that have an "fsl,msi" property. 
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index 7e67890b8fc2..573a73bd954a 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -263,8 +263,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr) raw_spin_unlock_irqrestore(&i8259_lock, flags); /* create a legacy host */ - i8259_host = irq_alloc_host(node, IRQ_DOMAIN_MAP_LEGACY, - 0, &i8259_host_ops, 0); + i8259_host = irq_domain_add_legacy(node, &i8259_host_ops, NULL); if (i8259_host == NULL) { printk(KERN_ERR "i8259: failed to allocate irq host !\n"); return; diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 9abed3760545..0eaaa01c11b3 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -728,9 +728,8 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) if (ipic == NULL) return NULL; - ipic->irqhost = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, - NR_IPIC_INTS, - &ipic_host_ops, 0); + ipic->irqhost = irq_domain_add_linear(node, NR_IPIC_INTS, + &ipic_host_ops, ipic); if (ipic->irqhost == NULL) { kfree(ipic); return NULL; @@ -738,8 +737,6 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) ipic->regs = ioremap(res.start, resource_size(&res)); - ipic->irqhost->host_data = ipic; - /* init hw */ ipic_write(ipic->regs, IPIC_SICNR, 0x0); diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c index 978dfc4c3120..d5f5416be310 100644 --- a/arch/powerpc/sysdev/mpc8xx_pic.c +++ b/arch/powerpc/sysdev/mpc8xx_pic.c @@ -171,8 +171,7 @@ int mpc8xx_pic_init(void) goto out; } - mpc8xx_pic_host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - 64, &mpc8xx_pic_host_ops, 64); + mpc8xx_pic_host = irq_domain_add_linear(np, 64, &mpc8xx_pic_host_ops, NULL); if (mpc8xx_pic_host == NULL) { printk(KERN_ERR "MPC8xx PIC: failed to allocate irq host!\n"); ret = -ENOMEM; diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index c844d343bf32..c83a512fa175 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1345,10 +1345,9 @@ struct mpic * __init mpic_alloc(struct device_node *node, mpic->isu_shift = 1 + __ilog2(mpic->isu_size - 1); mpic->isu_mask = (1 << mpic->isu_shift) - 1; - mpic->irqhost = irq_alloc_host(mpic->node, IRQ_DOMAIN_MAP_LINEAR, + mpic->irqhost = irq_domain_add_linear(mpic->node, isu_size ? isu_size : mpic->num_sources, - &mpic_host_ops, - flags & MPIC_LARGE_VECTORS ? 
2048 : 256); + &mpic_host_ops, mpic); /* * FIXME: The code leaks the MPIC object and mappings here; this @@ -1357,8 +1356,6 @@ struct mpic * __init mpic_alloc(struct device_node *node, if (mpic->irqhost == NULL) return NULL; - mpic->irqhost->host_data = mpic; - /* Display version */ switch (greg_feature & MPIC_GREG_FEATURE_VERSION_MASK) { case 1: diff --git a/arch/powerpc/sysdev/mv64x60_pic.c b/arch/powerpc/sysdev/mv64x60_pic.c index 45124a1959d0..8848e99a83f2 100644 --- a/arch/powerpc/sysdev/mv64x60_pic.c +++ b/arch/powerpc/sysdev/mv64x60_pic.c @@ -250,9 +250,8 @@ void __init mv64x60_init_irq(void) paddr = of_translate_address(np, reg); mv64x60_irq_reg_base = ioremap(paddr, reg[1]); - mv64x60_irq_host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - MV64x60_NUM_IRQS, - &mv64x60_host_ops, MV64x60_NUM_IRQS); + mv64x60_irq_host = irq_domain_add_linear(np, MV64x60_NUM_IRQS, + &mv64x60_host_ops, NULL); spin_lock_irqsave(&mv64x60_lock, flags); out_le32(mv64x60_gpp_reg_base + MV64x60_GPP_INTR_MASK, diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c index 78e90192c343..e9b3d5cc65d3 100644 --- a/arch/powerpc/sysdev/qe_lib/qe_ic.c +++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c @@ -339,8 +339,8 @@ void __init qe_ic_init(struct device_node *node, unsigned int flags, if (qe_ic == NULL) return; - qe_ic->irqhost = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, - NR_QE_IC_INTS, &qe_ic_host_ops, 0); + qe_ic->irqhost = irq_domain_add_linear(node, NR_QE_IC_INTS, + &qe_ic_host_ops, qe_ic); if (qe_ic->irqhost == NULL) { kfree(qe_ic); return; @@ -348,7 +348,6 @@ void __init qe_ic_init(struct device_node *node, unsigned int flags, qe_ic->regs = ioremap(res.start, resource_size(&res)); - qe_ic->irqhost->host_data = qe_ic; qe_ic->hc_irq = qe_ic_irq_chip; qe_ic->virq_high = irq_of_parse_and_map(node, 0); diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index f3757236e666..1be26f4b9c96 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -419,8 +419,7 @@ void __init tsi108_pci_int_init(struct device_node *node) { DBG("Tsi108_pci_int_init: initializing PCI interrupts\n"); - pci_irq_host = irq_alloc_host(node, IRQ_DOMAIN_MAP_LEGACY, - 0, &pci_irq_domain_ops, 0); + pci_irq_host = irq_domain_add_legacy(node, &pci_irq_domain_ops, NULL); if (pci_irq_host == NULL) { printk(KERN_ERR "pci_irq_host: failed to allocate irq domain!\n"); return; diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c index 7eea3a64bdf7..84e59c97391f 100644 --- a/arch/powerpc/sysdev/uic.c +++ b/arch/powerpc/sysdev/uic.c @@ -270,13 +270,11 @@ static struct uic * __init uic_init_one(struct device_node *node) } uic->dcrbase = *dcrreg; - uic->irqhost = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, - NR_UIC_INTS, &uic_host_ops, -1); + uic->irqhost = irq_domain_add_linear(node, NR_UIC_INTS, &uic_host_ops, + uic); if (! uic->irqhost) return NULL; /* FIXME: panic? 
*/ - uic->irqhost->host_data = uic; - /* Start with all interrupts disabled, level and non-critical */ mtdcr(uic->dcrbase + UIC_ER, 0); mtdcr(uic->dcrbase + UIC_CR, 0); diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index fb2e303d25fb..ea5e204e3450 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -374,8 +374,7 @@ static struct irq_domain_ops xics_host_ops = { static void __init xics_init_host(void) { - xics_host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_TREE, 0, &xics_host_ops, - XICS_IRQ_SPURIOUS); + xics_host = irq_domain_add_tree(NULL, &xics_host_ops, NULL); BUG_ON(xics_host == NULL); irq_set_default_host(xics_host); } diff --git a/arch/powerpc/sysdev/xilinx_intc.c b/arch/powerpc/sysdev/xilinx_intc.c index 92e7d4db9fde..8d73c3c0bee6 100644 --- a/arch/powerpc/sysdev/xilinx_intc.c +++ b/arch/powerpc/sysdev/xilinx_intc.c @@ -201,11 +201,10 @@ xilinx_intc_init(struct device_node *np) out_be32(regs + XINTC_MER, 0x3UL); /* Turn on the Master Enable. */ /* Allocate and initialize an irq_domain structure. */ - irq = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, XILINX_INTC_MAXIRQS, - &xilinx_intc_ops, -1); + irq = irq_domain_add_linear(np, XILINX_INTC_MAXIRQS, &xilinx_intc_ops, + regs); if (!irq) panic(__FILE__ ": Cannot allocate IRQ host\n"); - irq->host_data = regs; return irq; } diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 9efd59732eeb..149d9876fca8 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -364,9 +364,8 @@ static void __init mpc8xxx_add_controller(struct device_node *np) if (hwirq == NO_IRQ) goto skip_irq; - mpc8xxx_gc->irq = - irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, MPC8XXX_GPIO_PINS, - &mpc8xxx_gpio_irq_ops, MPC8XXX_GPIO_PINS); + mpc8xxx_gc->irq = irq_domain_add_linear(np, MPC8XXX_GPIO_PINS, + &mpc8xxx_gpio_irq_ops, mpc8xxx_gc); if (!mpc8xxx_gc->irq) goto skip_irq; @@ -374,8 +373,6 @@ static void __init mpc8xxx_add_controller(struct device_node *np) if (id) mpc8xxx_gc->of_dev_id_data = id->data; - mpc8xxx_gc->irq->host_data = mpc8xxx_gc; - /* ack and mask all irqs */ out_be32(mm_gc->regs + GPIO_IER, 0xffffffff); out_be32(mm_gc->regs + GPIO_IMR, 0); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 18f4ab002d2e..f95553fa6872 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -95,10 +95,6 @@ struct irq_domain { /* type of reverse mapping_technique */ unsigned int revmap_type; -#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ union { struct { unsigned int size; @@ -120,11 +116,21 @@ struct irq_domain { #ifdef CONFIG_IRQ_DOMAIN #ifdef CONFIG_PPC -extern struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq); +struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data); +struct irq_domain *irq_domain_add_linear(struct device_node *of_node, + unsigned int size, + struct irq_domain_ops *ops, + void *host_data); +struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data); +struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + struct irq_domain_ops *ops, + 
void *host_data); + + extern struct irq_domain *irq_find_host(struct device_node *node); extern void irq_set_default_host(struct irq_domain *host); extern void irq_set_virq_count(unsigned int count); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 432d292b33f8..acedba1a2651 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -13,6 +13,11 @@ #include #include +#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ + static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); @@ -27,100 +32,158 @@ static int default_irq_domain_match(struct irq_domain *d, struct device_node *np) } /** - * irq_alloc_host() - Allocate a new irq_domain data structure + * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller * @revmap_type: type of reverse mapping to use - * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map * @ops: map/unmap domain callbacks - * @inval_irq: provide a hw number in that domain space that is always invalid + * @host_data: Controller private data pointer * - * Allocates and initializes an irq_domain structure. Note that in the case of - * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns - * for all legacy interrupts except 0 (which is always the invalid irq for - * a legacy controller). For a IRQ_DOMAIN_MAP_LINEAR, the map is allocated by - * this call as well. For a IRQ_DOMAIN_MAP_TREE, the radix tree will be - * allocated later during boot automatically (the reverse mapping will use the - * slow path until that happens). + * Allocates and initializes an irq_domain structure. The caller is expected to + * register the allocated irq_domain with irq_domain_add(). Returns a pointer + * to the IRQ domain, or NULL on failure. */ -struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq) +static struct irq_domain *irq_domain_alloc(struct device_node *of_node, + unsigned int revmap_type, + struct irq_domain_ops *ops, + void *host_data) { - struct irq_domain *domain, *h; - unsigned int size = sizeof(struct irq_domain); - unsigned int i; - unsigned int *rmap; + struct irq_domain *domain; - /* Allocate structure and revmap table if using linear mapping */ - if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) - size += revmap_arg * sizeof(unsigned int); - domain = kzalloc(size, GFP_KERNEL); - if (domain == NULL) + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (WARN_ON(!domain)) return NULL; /* Fill structure */ domain->revmap_type = revmap_type; - domain->inval_irq = inval_irq; domain->ops = ops; + domain->host_data = host_data; domain->of_node = of_node_get(of_node); if (domain->ops->match == NULL) domain->ops->match = default_irq_domain_match; + return domain; +} + +static void irq_domain_add(struct irq_domain *domain) +{ + mutex_lock(&irq_domain_mutex); + list_add(&domain->link, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); + pr_debug("irq: Allocated domain of type %d @0x%p\n", + domain->revmap_type, domain); +} + +/** + * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. 
+ * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Note: the map() callback will be called before this function returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). + */ +struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain, *h; + unsigned int i; + + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + if (!domain) + return NULL; + mutex_lock(&irq_domain_mutex); /* Make sure only one legacy controller can be created */ - if (revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - list_for_each_entry(h, &irq_domain_list, link) { - if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { - mutex_unlock(&irq_domain_mutex); - of_node_put(domain->of_node); - kfree(domain); - return NULL; - } + list_for_each_entry(h, &irq_domain_list, link) { + if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + mutex_unlock(&irq_domain_mutex); + of_node_put(domain->of_node); + kfree(domain); + return NULL; } } list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - /* Additional setups per revmap type */ - switch(revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* 0 is always the invalid number for legacy */ - domain->inval_irq = 0; - /* setup us as the domain for all legacy interrupts */ - for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { - struct irq_data *irq_data = irq_get_irq_data(i); - irq_data->hwirq = i; - irq_data->domain = domain; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - ops->map(domain, i, i); - - /* Clear norequest flags */ - irq_clear_status_flags(i, IRQ_NOREQUEST); - } - break; - case IRQ_DOMAIN_MAP_LINEAR: - rmap = (unsigned int *)(domain + 1); - for (i = 0; i < revmap_arg; i++) - rmap[i] = 0; - domain->revmap_data.linear.size = revmap_arg; - domain->revmap_data.linear.revmap = rmap; - break; - case IRQ_DOMAIN_MAP_TREE: - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - break; - default: - break; + /* setup us as the domain for all legacy interrupts */ + for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { + struct irq_data *irq_data = irq_get_irq_data(i); + irq_data->hwirq = i; + irq_data->domain = domain; + + /* Legacy flags are left to default at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(domain, i, i); + + /* Clear norequest flags */ + irq_clear_status_flags(i, IRQ_NOREQUEST); } + return domain; +} - pr_debug("irq: Allocated domain of type %d @0x%p\n", revmap_type, domain); +/** + * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. 
+ * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + */ +struct irq_domain *irq_domain_add_linear(struct device_node *of_node, + unsigned int size, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + unsigned int *revmap; + + revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); + if (WARN_ON(!revmap)) + return NULL; + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); + if (!domain) { + kfree(revmap); + return NULL; + } + domain->revmap_data.linear.size = size; + domain->revmap_data.linear.revmap = revmap; + irq_domain_add(domain); + return domain; +} + +struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + if (domain) + irq_domain_add(domain); + return domain; +} + +/** + * irq_domain_add_tree() - Allocate and register a radix tree revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * + * Note: The radix tree will be allocated later during boot automatically + * (the reverse mapping will use the slow path until that happens). + */ +struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_TREE, ops, host_data); + if (domain) { + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); + irq_domain_add(domain); + } return domain; } @@ -393,9 +456,6 @@ void irq_dispose_mapping(unsigned int virq) break; } - /* Destroy map */ - irq_data->hwirq = domain->inval_irq; - irq_free_desc(virq); } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -- cgit v1.2.3-55-g7522 From 1bc04f2cf8c2a1feadbd994f50c40bb145bf2989 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:55 -0700 Subject: irq_domain: Add support for base irq and hwirq in legacy mappings Add support for a legacy mapping where irq = (hwirq - first_hwirq + first_irq) so that a controller driver can allocate a fixed range of irq_descs and use a simple calculation to translate back and forth between linux and hw irq numbers. This is needed to use an irq_domain with many of the ARM interrupt controller drivers that manage their own irq_desc allocations. Ultimately the goal is to migrate those drivers to use the linear revmap, but doing it this way allows each driver to be converted separately which makes the migration path easier. This patch generalizes the IRQ_DOMAIN_MAP_LEGACY method to use (first_irq-first_hwirq) as the offset between hwirq and linux irq number, and adds checks to make sure that the hwirq number does not exceed the range assigned to the controller. 
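A worked example of the generalized mapping may help here. The numbers below are illustrative only; the field names and the irq_domain_add_legacy()/irq_domain_add_legacy_isa() signatures are the ones added by this patch.

/* Sketch of the reverse map introduced here:
 *	linux irq = hwirq - first_hwirq + first_irq
 * plus a bounds check (mirrors irq_domain_legacy_revmap() in the
 * diff below).
 */
static unsigned int legacy_revmap_sketch(struct irq_domain *d,
					 irq_hw_number_t hwirq)
{
	irq_hw_number_t first_hwirq = d->revmap_data.legacy.first_hwirq;
	unsigned int size = d->revmap_data.legacy.size;

	if (hwirq < first_hwirq || hwirq >= first_hwirq + size)
		return 0;	/* hwirq not owned by this controller */
	return hwirq - first_hwirq + d->revmap_data.legacy.first_irq;
}

/* Example with made-up numbers: a controller owning hwirqs 32..47 whose
 * driver pre-allocated irq_descs 16..31 would register with
 *
 *	domain = irq_domain_add_legacy(np, 16, 16, 32, &ops, NULL);
 *
 * so hwirq 35 translates to linux irq 35 - 32 + 16 = 19. The ISA
 * wrapper irq_domain_add_legacy_isa() is simply the fixed range
 * size = NUM_ISA_INTERRUPTS, first_irq = 0, first_hwirq = 0.
 */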
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/powerpc/include/asm/irq.h | 3 -- arch/powerpc/sysdev/i8259.c | 2 +- arch/powerpc/sysdev/tsi108_pci.c | 2 +- include/linux/irqdomain.h | 20 ++++++++- kernel/irq/irqdomain.c | 96 ++++++++++++++++++++++++++-------------- 5 files changed, 85 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 728cc30d04ea..fe0b09dceb7d 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -36,9 +36,6 @@ extern atomic_t ppc_n_lost_interrupts; /* Total number of virq in the platform */ #define NR_IRQS CONFIG_NR_IRQS -/* Number of irqs reserved for the legacy controller */ -#define NUM_ISA_INTERRUPTS 16 - /* Same thing, used by the generic IRQ code */ #define NR_IRQS_LEGACY NUM_ISA_INTERRUPTS diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index 573a73bd954a..997df6a7ab5d 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -263,7 +263,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr) raw_spin_unlock_irqrestore(&i8259_lock, flags); /* create a legacy host */ - i8259_host = irq_domain_add_legacy(node, &i8259_host_ops, NULL); + i8259_host = irq_domain_add_legacy_isa(node, &i8259_host_ops, NULL); if (i8259_host == NULL) { printk(KERN_ERR "i8259: failed to allocate irq host !\n"); return; diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index 1be26f4b9c96..188012c58f7f 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -419,7 +419,7 @@ void __init tsi108_pci_int_init(struct device_node *node) { DBG("Tsi108_pci_int_init: initializing PCI interrupts\n"); - pci_irq_host = irq_domain_add_legacy(node, &pci_irq_domain_ops, NULL); + pci_irq_host = irq_domain_add_legacy_isa(node, &pci_irq_domain_ops, NULL); if (pci_irq_host == NULL) { printk(KERN_ERR "pci_irq_host: failed to allocate irq domain!\n"); return; diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f95553fa6872..7fef39ed5523 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -39,6 +39,9 @@ struct device_node; struct irq_domain; struct of_device_id; +/* Number of irqs reserved for a legacy isa controller */ +#define NUM_ISA_INTERRUPTS 16 + /* This type is the placeholder for a hardware interrupt number. It has to * be big enough to enclose whatever representation is used by a given * platform. 
@@ -96,6 +99,11 @@ struct irq_domain { /* type of reverse mapping_technique */ unsigned int revmap_type; union { + struct { + unsigned int size; + unsigned int first_irq; + irq_hw_number_t first_hwirq; + } legacy; struct { unsigned int size; unsigned int *revmap; @@ -117,6 +125,9 @@ struct irq_domain { #ifdef CONFIG_IRQ_DOMAIN #ifdef CONFIG_PPC struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_domain_add_linear(struct device_node *of_node, @@ -130,11 +141,18 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, struct irq_domain_ops *ops, void *host_data); - extern struct irq_domain *irq_find_host(struct device_node *node); extern void irq_set_default_host(struct irq_domain *host); extern void irq_set_virq_count(unsigned int count); +static inline struct irq_domain *irq_domain_add_legacy_isa( + struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops, + host_data); +} extern unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index acedba1a2651..c6740d72073e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -13,7 +13,8 @@ #include #include -#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. + * ie. legacy 8259, gets irqs 1..15 */ #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ #define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ @@ -74,9 +75,25 @@ static void irq_domain_add(struct irq_domain *domain) domain->revmap_type, domain); } +static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; + int size = domain->revmap_data.legacy.size; + + if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) + return 0; + return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; +} + /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. + * @size: total number of irqs in legacy mapping + * @first_irq: first number of irq block assigned to the domain + * @first_hwirq: first hwirq number to use for the translation. Should normally + * be '0', but a positive integer can be used if the effective + * hwirqs numbering does not begin at zero. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -85,44 +102,64 @@ static void irq_domain_add(struct irq_domain *domain) * a legacy controller). 
*/ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, struct irq_domain_ops *ops, void *host_data) { - struct irq_domain *domain, *h; + struct irq_domain *domain; unsigned int i; domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); if (!domain) return NULL; + domain->revmap_data.legacy.first_irq = first_irq; + domain->revmap_data.legacy.first_hwirq = first_hwirq; + domain->revmap_data.legacy.size = size; + mutex_lock(&irq_domain_mutex); - /* Make sure only one legacy controller can be created */ - list_for_each_entry(h, &irq_domain_list, link) { - if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + /* Verify that all the irqs are available */ + for (i = 0; i < size; i++) { + int irq = first_irq + i; + struct irq_data *irq_data = irq_get_irq_data(irq); + + if (WARN_ON(!irq_data || irq_data->domain)) { mutex_unlock(&irq_domain_mutex); of_node_put(domain->of_node); kfree(domain); return NULL; } } - list_add(&domain->link, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); - /* setup us as the domain for all legacy interrupts */ - for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { - struct irq_data *irq_data = irq_get_irq_data(i); - irq_data->hwirq = i; + /* Claim all of the irqs before registering a legacy domain */ + for (i = 0; i < size; i++) { + struct irq_data *irq_data = irq_get_irq_data(first_irq + i); + irq_data->hwirq = first_hwirq + i; irq_data->domain = domain; + } + mutex_unlock(&irq_domain_mutex); + + for (i = 0; i < size; i++) { + int irq = first_irq + i; + int hwirq = first_hwirq + i; + + /* IRQ0 gets ignored */ + if (!irq) + continue; /* Legacy flags are left to default at this point, * one can then use irq_create_mapping() to * explicitly change them */ - ops->map(domain, i, i); + ops->map(domain, irq, hwirq); /* Clear norequest flags */ - irq_clear_status_flags(i, IRQ_NOREQUEST); + irq_clear_status_flags(irq, IRQ_NOREQUEST); } + + irq_domain_add(domain); return domain; } @@ -338,24 +375,19 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Get a virtual interrupt number */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - /* Handle legacy */ - virq = (unsigned int)hwirq; - if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) - return 0; - return virq; - } else { - /* Allocate a virtual interrupt number */ - hint = hwirq % irq_virq_count; - if (hint == 0) - hint++; - virq = irq_alloc_desc_from(hint, 0); - if (!virq) - virq = irq_alloc_desc_from(1, 0); - if (!virq) { - pr_debug("irq: -> virq allocation failed\n"); - return 0; - } + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return irq_domain_legacy_revmap(domain, hwirq); + + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + if (hint == 0) + hint++; + virq = irq_alloc_desc_from(hint, 0); + if (!virq) + virq = irq_alloc_desc_from(1, 0); + if (!virq) { + pr_debug("irq: -> virq allocation failed\n"); + return 0; } if (irq_setup_virq(domain, virq, hwirq)) { @@ -483,7 +515,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, /* legacy -> bail early */ if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return hwirq; + return irq_domain_legacy_revmap(domain, hwirq); /* Slow path does a linear search of the map */ if (hint == 0) -- cgit v1.2.3-55-g7522 From 75294957be1dee7d22dd7d90bd31334ba410e836 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:57 -0700 Subject: irq_domain: Remove 'new' irq_domain in favour of the ppc one This patch 
removes the simplistic implementation of irq_domains and enables the powerpc infrastructure for all irq_domain users. The powerpc infrastructure includes support for complex mappings between Linux and hardware irq numbers, and can manage allocation of irq_descs. This patch also converts the few users of irq_domain_add()/irq_domain_del() to call irq_domain_add_legacy() instead. v3: Fix bug that set up too many irqs in translation range. v2: Fix removal of irq_alloc_descs() call in gic driver Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/arm/common/gic.c | 85 +++++++++---------- arch/arm/common/vic.c | 16 ++-- arch/arm/include/asm/hardware/gic.h | 4 +- arch/arm/include/asm/hardware/vic.h | 2 + arch/arm/mach-exynos/common.c | 2 +- arch/arm/mach-versatile/core.c | 7 +- drivers/mfd/twl-core.c | 14 +--- include/linux/irqdomain.h | 45 +--------- kernel/irq/irqdomain.c | 159 +++--------------------------------- 9 files changed, 71 insertions(+), 263 deletions(-) (limited to 'kernel') diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index dc19862be0a8..7275d808f76d 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -51,7 +51,6 @@ union gic_base { }; struct gic_chip_data { - unsigned int irq_offset; union gic_base dist_base; union gic_base cpu_base; #ifdef CONFIG_CPU_PM @@ -61,9 +60,7 @@ struct gic_chip_data { u32 __percpu *saved_ppi_enable; u32 __percpu *saved_ppi_conf; #endif -#ifdef CONFIG_IRQ_DOMAIN - struct irq_domain domain; -#endif + struct irq_domain *domain; unsigned int gic_irqs; #ifdef CONFIG_GIC_NON_BANKED void __iomem *(*get_base)(union gic_base *); @@ -282,7 +279,7 @@ asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs) irqnr = irqstat & ~0x1c00; if (likely(irqnr > 15 && irqnr < 1021)) { - irqnr = irq_domain_to_irq(&gic->domain, irqnr); + irqnr = irq_find_mapping(gic->domain, irqnr); handle_IRQ(irqnr, regs); continue; } @@ -314,8 +311,8 @@ static void gic_handle_cascade_irq(unsigned int irq, struct irq_desc *desc) if (gic_irq == 1023) goto out; - cascade_irq = irq_domain_to_irq(&chip_data->domain, gic_irq); - if (unlikely(gic_irq < 32 || gic_irq > 1020 || cascade_irq >= NR_IRQS)) + cascade_irq = irq_find_mapping(chip_data->domain, gic_irq); + if (unlikely(gic_irq < 32 || gic_irq > 1020)) do_bad_IRQ(cascade_irq, desc); else generic_handle_irq(cascade_irq); @@ -348,10 +345,9 @@ void __init gic_cascade_irq(unsigned int gic_nr, unsigned int irq) static void __init gic_dist_init(struct gic_chip_data *gic) { - unsigned int i, irq; + unsigned int i; u32 cpumask; unsigned int gic_irqs = gic->gic_irqs; - struct irq_domain *domain = &gic->domain; void __iomem *base = gic_data_dist_base(gic); u32 cpu = cpu_logical_map(smp_processor_id()); @@ -386,23 +382,6 @@ static void __init gic_dist_init(struct gic_chip_data *gic) for (i = 32; i < gic_irqs; i += 32) writel_relaxed(0xffffffff, base + GIC_DIST_ENABLE_CLEAR + i * 4 / 32); - /* - * Setup the Linux IRQ subsystem. 
- */ - irq_domain_for_each_irq(domain, i, irq) { - if (i < 32) { - irq_set_percpu_devid(irq); - irq_set_chip_and_handler(irq, &gic_chip, - handle_percpu_devid_irq); - set_irq_flags(irq, IRQF_VALID | IRQF_NOAUTOEN); - } else { - irq_set_chip_and_handler(irq, &gic_chip, - handle_fasteoi_irq); - set_irq_flags(irq, IRQF_VALID | IRQF_PROBE); - } - irq_set_chip_data(irq, gic); - } - writel_relaxed(1, base + GIC_DIST_CTRL); } @@ -618,7 +597,23 @@ static void __init gic_pm_init(struct gic_chip_data *gic) } #endif -#ifdef CONFIG_OF +static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hw) +{ + if (hw < 32) { + irq_set_percpu_devid(irq); + irq_set_chip_and_handler(irq, &gic_chip, + handle_percpu_devid_irq); + set_irq_flags(irq, IRQF_VALID | IRQF_NOAUTOEN); + } else { + irq_set_chip_and_handler(irq, &gic_chip, + handle_fasteoi_irq); + set_irq_flags(irq, IRQF_VALID | IRQF_PROBE); + } + irq_set_chip_data(irq, d->host_data); + return 0; +} + static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *controller, const u32 *intspec, unsigned int intsize, @@ -639,26 +634,23 @@ static int gic_irq_domain_xlate(struct irq_domain *d, *out_type = intspec[2] & IRQ_TYPE_SENSE_MASK; return 0; } -#endif struct irq_domain_ops gic_irq_domain_ops = { -#ifdef CONFIG_OF + .map = gic_irq_domain_map, .xlate = gic_irq_domain_xlate, -#endif }; void __init gic_init_bases(unsigned int gic_nr, int irq_start, void __iomem *dist_base, void __iomem *cpu_base, - u32 percpu_offset) + u32 percpu_offset, struct device_node *node) { + irq_hw_number_t hwirq_base; struct gic_chip_data *gic; - struct irq_domain *domain; - int gic_irqs; + int gic_irqs, irq_base; BUG_ON(gic_nr >= MAX_GIC_NR); gic = &gic_data[gic_nr]; - domain = &gic->domain; #ifdef CONFIG_GIC_NON_BANKED if (percpu_offset) { /* Frankein-GIC without banked registers... */ unsigned int cpu; @@ -694,10 +686,10 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start, * For primary GICs, skip over SGIs. * For secondary GICs, skip over PPIs, too. 
*/ - domain->hwirq_base = 32; + hwirq_base = 32; if (gic_nr == 0) { if ((irq_start & 31) > 0) { - domain->hwirq_base = 16; + hwirq_base = 16; if (irq_start != -1) irq_start = (irq_start & ~31) + 16; } @@ -713,17 +705,17 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start, gic_irqs = 1020; gic->gic_irqs = gic_irqs; - domain->nr_irq = gic_irqs - domain->hwirq_base; - domain->irq_base = irq_alloc_descs(irq_start, 16, domain->nr_irq, - numa_node_id()); - if (IS_ERR_VALUE(domain->irq_base)) { + gic_irqs -= hwirq_base; /* calculate # of irqs to allocate */ + irq_base = irq_alloc_descs(irq_start, 16, gic_irqs, numa_node_id()); + if (IS_ERR_VALUE(irq_base)) { WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", irq_start); - domain->irq_base = irq_start; + irq_base = irq_start; } - domain->host_data = gic; - domain->ops = &gic_irq_domain_ops; - irq_domain_add(domain); + gic->domain = irq_domain_add_legacy(node, gic_irqs, irq_base, + hwirq_base, &gic_irq_domain_ops, gic); + if (WARN_ON(!gic->domain)) + return; gic_chip.flags |= gic_arch_extn.flags; gic_dist_init(gic); @@ -768,7 +760,6 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent) void __iomem *dist_base; u32 percpu_offset; int irq; - struct irq_domain *domain = &gic_data[gic_cnt].domain; if (WARN_ON(!node)) return -ENODEV; @@ -782,9 +773,7 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent) if (of_property_read_u32(node, "cpu-offset", &percpu_offset)) percpu_offset = 0; - domain->of_node = of_node_get(node); - - gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset); + gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node); if (parent) { irq = irq_of_parse_and_map(node, 0); diff --git a/arch/arm/common/vic.c b/arch/arm/common/vic.c index dcb004a804c7..7a66311f3066 100644 --- a/arch/arm/common/vic.c +++ b/arch/arm/common/vic.c @@ -56,7 +56,7 @@ struct vic_device { u32 int_enable; u32 soft_int; u32 protect; - struct irq_domain domain; + struct irq_domain *domain; }; /* we cannot allocate memory when VICs are initially registered */ @@ -192,14 +192,8 @@ static void __init vic_register(void __iomem *base, unsigned int irq, v->resume_sources = resume_sources; v->irq = irq; vic_id++; - - v->domain.irq_base = irq; - v->domain.nr_irq = 32; -#ifdef CONFIG_OF_IRQ - v->domain.of_node = of_node_get(node); -#endif /* CONFIG_OF */ - v->domain.ops = &irq_domain_simple_ops; - irq_domain_add(&v->domain); + v->domain = irq_domain_add_legacy(node, 32, irq, 0, + &irq_domain_simple_ops, v); } static void vic_ack_irq(struct irq_data *d) @@ -348,7 +342,7 @@ static void __init vic_init_st(void __iomem *base, unsigned int irq_start, vic_register(base, irq_start, 0, node); } -static void __init __vic_init(void __iomem *base, unsigned int irq_start, +void __init __vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources, u32 resume_sources, struct device_node *node) { @@ -444,7 +438,7 @@ static int handle_one_vic(struct vic_device *vic, struct pt_regs *regs) stat = readl_relaxed(vic->base + VIC_IRQ_STATUS); while (stat) { irq = ffs(stat) - 1; - handle_IRQ(irq_domain_to_irq(&vic->domain, irq), regs); + handle_IRQ(irq_find_mapping(vic->domain, irq), regs); stat &= ~(1 << irq); handled = 1; } diff --git a/arch/arm/include/asm/hardware/gic.h b/arch/arm/include/asm/hardware/gic.h index 4bdfe0018696..4b1ce6cd477f 100644 --- a/arch/arm/include/asm/hardware/gic.h +++ b/arch/arm/include/asm/hardware/gic.h @@ -39,7 +39,7 @@ struct device_node; extern struct 
irq_chip gic_arch_extn; void gic_init_bases(unsigned int, int, void __iomem *, void __iomem *, - u32 offset); + u32 offset, struct device_node *); int gic_of_init(struct device_node *node, struct device_node *parent); void gic_secondary_init(unsigned int); void gic_handle_irq(struct pt_regs *regs); @@ -49,7 +49,7 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq); static inline void gic_init(unsigned int nr, int start, void __iomem *dist , void __iomem *cpu) { - gic_init_bases(nr, start, dist, cpu, 0); + gic_init_bases(nr, start, dist, cpu, 0, NULL); } #endif diff --git a/arch/arm/include/asm/hardware/vic.h b/arch/arm/include/asm/hardware/vic.h index f42ebd619590..e14af1a1a320 100644 --- a/arch/arm/include/asm/hardware/vic.h +++ b/arch/arm/include/asm/hardware/vic.h @@ -47,6 +47,8 @@ struct device_node; struct pt_regs; +void __vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources, + u32 resume_sources, struct device_node *node); void vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources, u32 resume_sources); int vic_of_init(struct device_node *node, struct device_node *parent); void vic_handle_irq(struct pt_regs *regs); diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c index c59e18871006..6de298c5d2d3 100644 --- a/arch/arm/mach-exynos/common.c +++ b/arch/arm/mach-exynos/common.c @@ -402,7 +402,7 @@ void __init exynos4_init_irq(void) gic_bank_offset = soc_is_exynos4412() ? 0x4000 : 0x8000; if (!of_have_populated_dt()) - gic_init_bases(0, IRQ_PPI(0), S5P_VA_GIC_DIST, S5P_VA_GIC_CPU, gic_bank_offset); + gic_init_bases(0, IRQ_PPI(0), S5P_VA_GIC_DIST, S5P_VA_GIC_CPU, gic_bank_offset, NULL); #ifdef CONFIG_OF else of_irq_init(exynos4_dt_irq_match); diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c index 02b7b9303f3b..008ce22b9a06 100644 --- a/arch/arm/mach-versatile/core.c +++ b/arch/arm/mach-versatile/core.c @@ -98,8 +98,11 @@ static const struct of_device_id sic_of_match[] __initconst = { void __init versatile_init_irq(void) { - vic_init(VA_VIC_BASE, IRQ_VIC_START, ~0, 0); - irq_domain_generate_simple(vic_of_match, VERSATILE_VIC_BASE, IRQ_VIC_START); + struct device_node *np; + + np = of_find_matching_node_by_address(NULL, vic_of_match, + VERSATILE_VIC_BASE); + __vic_init(VA_VIC_BASE, IRQ_VIC_START, ~0, 0, np); writel(~0, VA_SIC_BASE + SIC_IRQ_ENABLE_CLEAR); diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c index 49677339ab5f..66f9bffc50f0 100644 --- a/drivers/mfd/twl-core.c +++ b/drivers/mfd/twl-core.c @@ -263,10 +263,6 @@ struct twl_client { static struct twl_client twl_modules[TWL_NUM_SLAVES]; -#ifdef CONFIG_IRQ_DOMAIN -static struct irq_domain domain; -#endif - /* mapping the module id to slave id and base address */ struct twl_mapping { unsigned char sid; /* Slave ID */ @@ -1227,14 +1223,8 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id) pdata->irq_base = status; pdata->irq_end = pdata->irq_base + nr_irqs; - -#ifdef CONFIG_IRQ_DOMAIN - domain.irq_base = pdata->irq_base; - domain.nr_irq = nr_irqs; - domain.of_node = of_node_get(node); - domain.ops = &irq_domain_simple_ops; - irq_domain_add(&domain); -#endif + irq_domain_add_legacy(node, nr_irqs, pdata->irq_base, 0, + &irq_domain_simple_ops, NULL); if (i2c_check_functionality(client->adapter, I2C_FUNC_I2C) == 0) { dev_dbg(&client->dev, "can't talk I2C?\n"); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 7fef39ed5523..624e9ac89e79 100644 --- a/include/linux/irqdomain.h +++ 
b/include/linux/irqdomain.h @@ -55,9 +55,6 @@ typedef unsigned long irq_hw_number_t; * @map: Create or update a mapping between a virtual irq number and a hw * irq number. This is called only once for a given mapping. * @unmap: Dispose of such a mapping - * @to_irq: (optional) given a local hardware irq number, return the linux - * irq number. If to_irq is not implemented, then the irq_domain - * will use this translation: irq = (domain->irq_base + hwirq) * @xlate: Given a device tree node and interrupt specifier, decode * the hardware irq number and linux irq type value. * @@ -70,7 +67,6 @@ struct irq_domain_ops { int (*match)(struct irq_domain *d, struct device_node *node); int (*map)(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw); void (*unmap)(struct irq_domain *d, unsigned int virq); - unsigned int (*to_irq)(struct irq_domain *d, unsigned long hwirq); int (*xlate)(struct irq_domain *d, struct device_node *node, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type); @@ -114,16 +110,11 @@ struct irq_domain { void *host_data; irq_hw_number_t inval_irq; - unsigned int irq_base; - unsigned int nr_irq; - unsigned int hwirq_base; - /* Optional device node pointer */ struct device_node *of_node; }; #ifdef CONFIG_IRQ_DOMAIN -#ifdef CONFIG_PPC struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, @@ -153,6 +144,10 @@ static inline struct irq_domain *irq_domain_add_legacy_isa( return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops, host_data); } +extern struct irq_domain *irq_find_host(struct device_node *node); +extern void irq_set_default_host(struct irq_domain *host); +extern void irq_set_virq_count(unsigned int count); + extern unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq); @@ -167,38 +162,7 @@ extern unsigned int irq_radix_revmap_lookup(struct irq_domain *host, extern unsigned int irq_linear_revmap(struct irq_domain *host, irq_hw_number_t hwirq); -#else /* CONFIG_PPC */ - -/** - * irq_domain_to_irq() - Translate from a hardware irq to a linux irq number - * - * Returns the linux irq number associated with a hardware irq. By default, - * the mapping is irq == domain->irq_base + hwirq, but this mapping can - * be overridden if the irq_domain implements a .to_irq() hook. 
- */ -static inline unsigned int irq_domain_to_irq(struct irq_domain *d, - unsigned long hwirq) -{ - if (d->ops->to_irq) - return d->ops->to_irq(d, hwirq); - if (WARN_ON(hwirq < d->hwirq_base)) - return 0; - return d->irq_base + hwirq - d->hwirq_base; -} - -#define irq_domain_for_each_hwirq(d, hw) \ - for (hw = d->hwirq_base; hw < d->hwirq_base + d->nr_irq; hw++) - -#define irq_domain_for_each_irq(d, hw, irq) \ - for (hw = d->hwirq_base, irq = irq_domain_to_irq(d, hw); \ - hw < d->hwirq_base + d->nr_irq; \ - hw++, irq = irq_domain_to_irq(d, hw)) - -extern void irq_domain_add(struct irq_domain *domain); -extern void irq_domain_del(struct irq_domain *domain); - extern struct irq_domain_ops irq_domain_simple_ops; - #if defined(CONFIG_OF_IRQ) extern void irq_domain_add_simple(struct device_node *controller, int irq_base); extern void irq_domain_generate_simple(const struct of_device_id *match, @@ -207,7 +171,6 @@ extern void irq_domain_generate_simple(const struct of_device_id *match, static inline void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { } #endif /* !CONFIG_OF_IRQ */ -#endif /* !CONFIG_PPC */ #endif /* CONFIG_IRQ_DOMAIN */ #endif /* _LINUX_IRQDOMAIN_H */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c6740d72073e..2981ebfeb40c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -22,7 +22,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -#ifdef CONFIG_PPC static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; @@ -694,124 +693,11 @@ static int __init irq_debugfs_init(void) __initcall(irq_debugfs_init); #endif /* CONFIG_VIRQ_DEBUG */ -#else /* CONFIG_PPC */ - -/** - * irq_domain_add() - Register an irq_domain - * @domain: ptr to initialized irq_domain structure - * - * Registers an irq_domain structure. The irq_domain must at a minimum be - * initialized with an ops structure pointer, and either a ->to_irq hook or - * a valid irq_base value. Everything else is optional. - */ -void irq_domain_add(struct irq_domain *domain) -{ - struct irq_data *d; - int hwirq, irq; - - /* - * This assumes that the irq_domain owner has already allocated - * the irq_descs. This block will be removed when support for dynamic - * allocation of irq_descs is added to irq_domain. - */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - if (!d) { - WARN(1, "error: assigning domain to non existant irq_desc"); - return; - } - if (d->domain) { - /* things are broken; just report, don't clean up */ - WARN(1, "error: irq_desc already assigned to a domain"); - return; - } - d->domain = domain; - d->hwirq = hwirq; - } - - mutex_lock(&irq_domain_mutex); - list_add(&domain->link, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); -} - -/** - * irq_domain_del() - Unregister an irq_domain - * @domain: ptr to registered irq_domain. 
- */ -void irq_domain_del(struct irq_domain *domain) -{ - struct irq_data *d; - int hwirq, irq; - - mutex_lock(&irq_domain_mutex); - list_del(&domain->link); - mutex_unlock(&irq_domain_mutex); - - /* Clear the irq_domain assignments */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - d->domain = NULL; - } -} - -#if defined(CONFIG_OF_IRQ) -/** - * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec - * - * Used by the device tree interrupt mapping code to translate a device tree - * interrupt specifier to a valid linux irq number. Returns either a valid - * linux IRQ number or 0. - * - * When the caller no longer need the irq number returned by this function it - * should arrange to call irq_dispose_mapping(). - */ -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *domain; - unsigned long hwirq; - unsigned int irq, type; - int rc = -EINVAL; - - /* Find a domain which can translate the irq spec */ - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, link) { - if (!domain->ops->xlate) - continue; - rc = domain->ops->xlate(domain, controller, - intspec, intsize, &hwirq, &type); - if (rc == 0) - break; - } - mutex_unlock(&irq_domain_mutex); - - if (rc != 0) - return 0; - - irq = irq_domain_to_irq(domain, hwirq); - if (type != IRQ_TYPE_NONE) - irq_set_irq_type(irq, type); - pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", - controller->full_name, (int)hwirq, irq, type); - return irq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - -/** - * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() - * @irq: linux irq number to be discarded - * - * Calling this function indicates the caller no longer needs a reference to - * the linux irq number returned by a prior call to irq_create_of_mapping(). 
- */ -void irq_dispose_mapping(unsigned int irq) +int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) { - /* - * nothing yet; will be filled when support for dynamic allocation of - * irq_descs is added to irq_domain - */ + return 0; } -EXPORT_SYMBOL_GPL(irq_dispose_mapping); int irq_domain_simple_xlate(struct irq_domain *d, struct device_node *controller, @@ -822,10 +708,6 @@ int irq_domain_simple_xlate(struct irq_domain *d, return -EINVAL; if (intsize < 1) return -EINVAL; - if (d->nr_irq && ((intspec[0] < d->hwirq_base) || - (intspec[0] >= d->hwirq_base + d->nr_irq))) - return -EINVAL; - *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; if (intsize > 1) @@ -833,23 +715,17 @@ int irq_domain_simple_xlate(struct irq_domain *d, return 0; } -/** - * irq_domain_create_simple() - Set up a 'simple' translation range - */ +struct irq_domain_ops irq_domain_simple_ops = { + .map = irq_domain_simple_map, + .xlate = irq_domain_simple_xlate, +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#ifdef CONFIG_OF_IRQ void irq_domain_add_simple(struct device_node *controller, int irq_base) { - struct irq_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) { - WARN_ON(1); - return; - } - - domain->irq_base = irq_base; - domain->of_node = of_node_get(controller); - domain->ops = &irq_domain_simple_ops; - irq_domain_add(domain); + irq_domain_add_legacy(controller, 32, irq_base, 0, + &irq_domain_simple_ops, NULL); } EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -864,13 +740,4 @@ void irq_domain_generate_simple(const struct of_device_id *match, irq_domain_add_simple(node, irq_start); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif /* CONFIG_OF_IRQ */ - -struct irq_domain_ops irq_domain_simple_ops = { -#ifdef CONFIG_OF_IRQ - .xlate = irq_domain_simple_xlate, -#endif /* CONFIG_OF_IRQ */ -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - -#endif /* !CONFIG_PPC */ +#endif -- cgit v1.2.3-55-g7522 From 6b783f7c5dde2648fa0bbe7fc8ac80d78699e67f Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 10 Jan 2012 17:09:30 -0700 Subject: irq_domain: Remove irq_domain_add_simple() irq_domain_add_simple() was a stop-gap measure until full irq_domain support was complete. This patch removes the irq_domain_add_simple() interface. This patch also drops the explicit irq_domain initialization performed by the mach-versatile code because the versatile interrupt controller already has irq_domain support built into it. This was a bug that had been hanging around quietly for a while, but with the full irq_domain code, which actually verifies that irq_domain ranges are available, the registration would fail and the system wouldn't boot. v4: Fixed number of irqs in mx5 gpio code v2: Updated to pass in host_data pointer on irq_domain allocation. 
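For illustration, the conversion applied to each caller below looks roughly like the following sketch; "my_intc" and MY_IRQ_BASE are placeholder names for this note, not identifiers from any in-tree driver:

#include <linux/irqdomain.h>
#include <linux/of.h>

#define MY_IRQ_BASE 64	/* hypothetical pre-allocated linux irq range */

static int __init my_intc_add_irq_domain(struct device_node *np,
					 struct device_node *parent)
{
	/* before: irq_domain_add_simple(np, MY_IRQ_BASE); */
	irq_domain_add_legacy(np, 32,		/* size of the fixed range */
			      MY_IRQ_BASE,	/* first linux irq number */
			      0,		/* first hwirq number */
			      &irq_domain_simple_ops, NULL);
	return 0;
}

The 32-irq size mirrors what most of the converted controllers register; the TZIC conversion below passes 128 for its larger fixed range.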
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Thomas Gleixner Cc: Milton Miller Cc: Russell King Tested-by: Olof Johansson --- arch/arm/mach-imx/imx51-dt.c | 4 ++-- arch/arm/mach-imx/imx53-dt.c | 4 ++-- arch/arm/mach-imx/mach-imx6q.c | 3 ++- arch/arm/mach-msm/board-msm8x60.c | 8 ++------ arch/arm/mach-omap2/board-generic.c | 2 +- arch/arm/mach-prima2/irq.c | 2 +- include/linux/irqdomain.h | 1 - kernel/irq/irqdomain.c | 10 ++-------- 8 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-imx/imx51-dt.c b/arch/arm/mach-imx/imx51-dt.c index e6bad17b908c..1e03ef42faa0 100644 --- a/arch/arm/mach-imx/imx51-dt.c +++ b/arch/arm/mach-imx/imx51-dt.c @@ -47,7 +47,7 @@ static const struct of_dev_auxdata imx51_auxdata_lookup[] __initconst = { static int __init imx51_tzic_add_irq_domain(struct device_node *np, struct device_node *interrupt_parent) { - irq_domain_add_simple(np, 0); + irq_domain_add_legacy(np, 128, 0, 0, &irq_domain_simple_ops, NULL); return 0; } @@ -57,7 +57,7 @@ static int __init imx51_gpio_add_irq_domain(struct device_node *np, static int gpio_irq_base = MXC_GPIO_IRQ_START + ARCH_NR_GPIOS; gpio_irq_base -= 32; - irq_domain_add_simple(np, gpio_irq_base); + irq_domain_add_legacy(np, 32, gpio_irq_base, 0, &irq_domain_simple_ops, NULL); return 0; } diff --git a/arch/arm/mach-imx/imx53-dt.c b/arch/arm/mach-imx/imx53-dt.c index 05ebb3e68679..fd5be0f20fbb 100644 --- a/arch/arm/mach-imx/imx53-dt.c +++ b/arch/arm/mach-imx/imx53-dt.c @@ -51,7 +51,7 @@ static const struct of_dev_auxdata imx53_auxdata_lookup[] __initconst = { static int __init imx53_tzic_add_irq_domain(struct device_node *np, struct device_node *interrupt_parent) { - irq_domain_add_simple(np, 0); + irq_domain_add_legacy(np, 128, 0, 0, &irq_domain_simple_ops, NULL); return 0; } @@ -61,7 +61,7 @@ static int __init imx53_gpio_add_irq_domain(struct device_node *np, static int gpio_irq_base = MXC_GPIO_IRQ_START + ARCH_NR_GPIOS; gpio_irq_base -= 32; - irq_domain_add_simple(np, gpio_irq_base); + irq_domain_add_legacy(np, 32, gpio_irq_base, 0, &irq_domain_simple_ops, NULL); return 0; } diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c index c25728106917..6075d4d62dd6 100644 --- a/arch/arm/mach-imx/mach-imx6q.c +++ b/arch/arm/mach-imx/mach-imx6q.c @@ -97,7 +97,8 @@ static int __init imx6q_gpio_add_irq_domain(struct device_node *np, static int gpio_irq_base = MXC_GPIO_IRQ_START + ARCH_NR_GPIOS; gpio_irq_base -= 32; - irq_domain_add_simple(np, gpio_irq_base); + irq_domain_add_legacy(np, 32, gpio_irq_base, 0, &irq_domain_simple_ops, + NULL); return 0; } diff --git a/arch/arm/mach-msm/board-msm8x60.c b/arch/arm/mach-msm/board-msm8x60.c index 0a113424632c..962e71169750 100644 --- a/arch/arm/mach-msm/board-msm8x60.c +++ b/arch/arm/mach-msm/board-msm8x60.c @@ -80,12 +80,8 @@ static struct of_device_id msm_dt_gic_match[] __initdata = { static void __init msm8x60_dt_init(void) { - struct device_node *node; - - node = of_find_matching_node_by_address(NULL, msm_dt_gic_match, - MSM8X60_QGIC_DIST_PHYS); - if (node) - irq_domain_add_simple(node, GIC_SPI_START); + irq_domain_generate_simple(msm_dt_gic_match, MSM8X60_QGIC_DIST_PHYS, + GIC_SPI_START); if (of_machine_is_compatible("qcom,msm8660-surf")) { printk(KERN_INFO "Init surf UART registers\n"); diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c index d58756060483..00b1d024fa87 100644 --- a/arch/arm/mach-omap2/board-generic.c +++ b/arch/arm/mach-omap2/board-generic.c @@ -67,7 +67,7 @@ static 
void __init omap_generic_init(void) { struct device_node *node = of_find_matching_node(NULL, intc_match); if (node) - irq_domain_add_simple(node, 0); + irq_domain_add_legacy(node, 32, 0, 0, &irq_domain_simple_ops, NULL); omap_sdrc_init(NULL, NULL); diff --git a/arch/arm/mach-prima2/irq.c b/arch/arm/mach-prima2/irq.c index d93ceef4a50a..37c2de9b6f26 100644 --- a/arch/arm/mach-prima2/irq.c +++ b/arch/arm/mach-prima2/irq.c @@ -68,7 +68,7 @@ void __init sirfsoc_of_irq_init(void) if (!sirfsoc_intc_base) panic("unable to map intc cpu registers\n"); - irq_domain_add_simple(np, 0); + irq_domain_add_legacy(np, 32, 0, 0, &irq_domain_simple_ops, NULL); of_node_put(np); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 624e9ac89e79..e7379a3c4d7d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -164,7 +164,6 @@ extern unsigned int irq_linear_revmap(struct irq_domain *host, extern struct irq_domain_ops irq_domain_simple_ops; #if defined(CONFIG_OF_IRQ) -extern void irq_domain_add_simple(struct device_node *controller, int irq_base); extern void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start); #else /* CONFIG_OF_IRQ */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 2981ebfeb40c..6328d9350f04 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -722,13 +722,6 @@ struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); #ifdef CONFIG_OF_IRQ -void irq_domain_add_simple(struct device_node *controller, int irq_base) -{ - irq_domain_add_legacy(controller, 32, irq_base, 0, - &irq_domain_simple_ops, NULL); -} -EXPORT_SYMBOL_GPL(irq_domain_add_simple); - void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { @@ -737,7 +730,8 @@ void irq_domain_generate_simple(const struct of_device_id *match, (unsigned long long) phys_base, (int) irq_start); node = of_find_matching_node_by_address(NULL, match, phys_base); if (node) - irq_domain_add_simple(node, irq_start); + irq_domain_add_legacy(node, 32, irq_start, 0, + &irq_domain_simple_ops, NULL); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif -- cgit v1.2.3-55-g7522 From 16b2e6e2f31dda41f114aa0acade04f7e10f67c9 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 26 Jan 2012 11:26:52 -0700 Subject: irq_domain: Create common xlate functions that device drivers can use Rather than having each interrupt controller driver creating its own barely unique .xlate function for irq_domain, create a library of translators which any driver can use directly. v5: - Remove irq_domain_xlate_pci(). It was incorrect. 
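For illustration, a controller with a standard two-cell interrupt binding can now drop its hand-rolled translator and point at the stock helper. The sketch below assumes a hypothetical my_intc driver with its own .map callback; drivers with unusual specifiers still provide their own .xlate:

#include <linux/irqdomain.h>

static int my_intc_map(struct irq_domain *d, unsigned int irq,
		       irq_hw_number_t hw)
{
	/* per-irq chip/handler setup for the hypothetical driver */
	return 0;
}

static struct irq_domain_ops my_intc_domain_ops = {
	.map	= my_intc_map,
	.xlate	= irq_domain_xlate_twocell,	/* cell 0 = hwirq, cell 1 = flags */
};

(The ops structure is still non-const at this point in the series; it becomes const in the constify patch that follows.)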
Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Mark Salter Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- include/linux/irqdomain.h | 12 +++++++++ kernel/irq/irqdomain.c | 65 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 67 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e7379a3c4d7d..ea58f36688a0 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -163,6 +163,18 @@ extern unsigned int irq_linear_revmap(struct irq_domain *host, irq_hw_number_t hwirq); extern struct irq_domain_ops irq_domain_simple_ops; + +/* stock xlate functions */ +int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); +int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); +int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); + #if defined(CONFIG_OF_IRQ) extern void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6328d9350f04..456e3fc8387f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -699,25 +699,70 @@ int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, return 0; } -int irq_domain_simple_xlate(struct irq_domain *d, - struct device_node *controller, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type) +/** + * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings + * + * Device Tree IRQ specifier translation function which works with one cell + * bindings where the cell value maps directly to the hwirq number. + */ +int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) { - if (d->of_node != controller) - return -EINVAL; - if (intsize < 1) + if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; - if (intsize > 1) - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; return 0; } +EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); + +/** + * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. + */ +int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 2)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); + +/** + * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings + * + * Device Tree IRQ specifier translation function which works with either one + * or two cell bindings where the cell values map directly to the hwirq number + * and linux irq flags. + * + * Note: don't use this function unless your interrupt controller explicitly + * supports both one and two cell bindings. 
For the majority of controllers + * the _onecell() or _twocell() variants above should be used. + */ +int irq_domain_xlate_onetwocell(struct irq_domain *d, + struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 1)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); struct irq_domain_ops irq_domain_simple_ops = { .map = irq_domain_simple_map, - .xlate = irq_domain_simple_xlate, + .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -- cgit v1.2.3-55-g7522 From a18dc81bf58258ac0920bec26b91656cb0140d2a Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 26 Jan 2012 12:12:14 -0700 Subject: irq_domain: constify irq_domain_ops Make irq_domain_ops pointer a constant to make it safer for multiple instances to share the same ops pointer and change the irq_domain code so that it does not modify the ops. v4: Fix mismatched type reference in powerpc code Signed-off-by: Grant Likely Cc: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Milton Miller Tested-by: Olof Johansson --- arch/powerpc/sysdev/mpic_msi.c | 2 +- include/linux/irqdomain.h | 14 +++++++------- kernel/irq/irqdomain.c | 31 +++++++++++++++---------------- 3 files changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/sysdev/mpic_msi.c b/arch/powerpc/sysdev/mpic_msi.c index 00395f40fb5d..0622aa91b18a 100644 --- a/arch/powerpc/sysdev/mpic_msi.c +++ b/arch/powerpc/sysdev/mpic_msi.c @@ -32,7 +32,7 @@ void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq) static int mpic_msi_reserve_u3_hwirqs(struct mpic *mpic) { irq_hw_number_t hwirq; - struct irq_domain_ops *ops = mpic->irqhost->ops; + const struct irq_domain_ops *ops = mpic->irqhost->ops; struct device_node *np; int flags, index, i; struct of_irq oirq; diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index ea58f36688a0..52454881938a 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -106,7 +106,7 @@ struct irq_domain { } linear; struct radix_tree_root tree; } revmap_data; - struct irq_domain_ops *ops; + const struct irq_domain_ops *ops; void *host_data; irq_hw_number_t inval_irq; @@ -119,17 +119,17 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, irq_hw_number_t first_hwirq, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_domain_add_linear(struct device_node *of_node, unsigned int size, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data); extern struct irq_domain *irq_find_host(struct device_node *node); @@ -138,7 +138,7 @@ extern void irq_set_virq_count(unsigned int count); static inline struct irq_domain *irq_domain_add_legacy_isa( struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops, @@ -162,7 +162,7 @@ extern unsigned int irq_radix_revmap_lookup(struct irq_domain *host, extern 
unsigned int irq_linear_revmap(struct irq_domain *host, irq_hw_number_t hwirq); -extern struct irq_domain_ops irq_domain_simple_ops; +extern const struct irq_domain_ops irq_domain_simple_ops; /* stock xlate functions */ int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 456e3fc8387f..25a498eb98a3 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -26,11 +26,6 @@ static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; -static int default_irq_domain_match(struct irq_domain *d, struct device_node *np) -{ - return d->of_node != NULL && d->of_node == np; -} - /** * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -44,7 +39,7 @@ static int default_irq_domain_match(struct irq_domain *d, struct device_node *np */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, unsigned int revmap_type, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -59,9 +54,6 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, domain->host_data = host_data; domain->of_node = of_node_get(of_node); - if (domain->ops->match == NULL) - domain->ops->match = default_irq_domain_match; - return domain; } @@ -104,7 +96,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, irq_hw_number_t first_hwirq, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -170,7 +162,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, */ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, unsigned int size, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -192,7 +184,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, } struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, @@ -211,7 +203,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, * (the reverse mapping will use the slow path until that happens). */ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, @@ -230,6 +222,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, struct irq_domain *irq_find_host(struct device_node *node) { struct irq_domain *h, *found = NULL; + int rc; /* We might want to match the legacy controller last since * it might potentially be set to match all interrupts in @@ -237,11 +230,17 @@ struct irq_domain *irq_find_host(struct device_node *node) * yet though... 
*/ mutex_lock(&irq_domain_mutex); - list_for_each_entry(h, &irq_domain_list, link) - if (h->ops->match(h, node)) { + list_for_each_entry(h, &irq_domain_list, link) { + if (h->ops->match) + rc = h->ops->match(h, node); + else + rc = (h->of_node != NULL) && (h->of_node == node); + + if (rc) { found = h; break; } + } mutex_unlock(&irq_domain_mutex); return found; } @@ -760,7 +759,7 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); -struct irq_domain_ops irq_domain_simple_ops = { +const struct irq_domain_ops irq_domain_simple_ops = { .map = irq_domain_simple_map, .xlate = irq_domain_xlate_onetwocell, }; -- cgit v1.2.3-55-g7522 From 55ae451918ec62e553f11b6118fec157f90c31c3 Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Mon, 13 Feb 2012 16:29:14 +0100 Subject: PM / Sleep: Unify kerneldoc comments in kernel/power/suspend.c The kerneldoc comments in kernel/power/suspend.c are not formatted in the same way and the quality of some of them is questionable. Unify the formatting and improve the contents. Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- kernel/power/suspend.c | 56 ++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 03bc92b42750..e6b5ef958603 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = { static const struct platform_suspend_ops *suspend_ops; /** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use. */ void suspend_set_ops(const struct platform_suspend_ops *ops) { @@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state) } /** - * suspend_valid_only_mem - generic memory-only valid callback + * suspend_valid_only_mem - Generic memory-only valid callback. * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback. */ int suspend_valid_only_mem(suspend_state_t state) { @@ -83,10 +83,11 @@ static int suspend_test(int level) } /** - * suspend_prepare - Do prep work before entering low-power state. + * suspend_prepare - Prepare for entering system sleep state. * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. + * Common code run for every system sleep state that can be entered (except for + * hibernation). Run suspend notifiers, allocate the "suspend" console and + * freeze processes. */ static int suspend_prepare(void) { @@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) } /** - * suspend_enter - enter the desired system sleep state. - * @state: State to enter - * @wakeup: Returns information that suspend should not be entered again. + * suspend_enter - Make the system enter the given sleep state. + * @state: System sleep state to enter. + * @wakeup: Returns information that the sleep state should not be re-entered. * * This function should be called after devices have been suspended. 
*/ @@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } /** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter. */ int suspend_devices_and_enter(suspend_state_t state) { @@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state) } /** - * suspend_finish - Do final work before exiting suspend sequence. + * suspend_finish - Clean up before finishing the suspend sequence. * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation. */ static void suspend_finish(void) { @@ -265,14 +265,12 @@ static void suspend_finish(void) } /** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter. * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case. Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup. */ int enter_state(suspend_state_t state) { @@ -310,11 +308,11 @@ int enter_state(suspend_state_t state) } /** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. + * pm_suspend - Externally visible function for suspending the system. + * @state: System sleep state to enter. * - * Determine whether or not value is within range, get state - * structure, and enter (above). + * Check if the value of @state represents one of the supported states, + * execute enter_state() and update system suspend statistics. */ int pm_suspend(suspend_state_t state) { -- cgit v1.2.3-55-g7522 From 93e1ee43a72b11e1b50aab87046c131a836a4456 Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Mon, 13 Feb 2012 16:29:24 +0100 Subject: PM / Sleep: Make enter_state() in kernel/power/suspend.c static The enter_state() function in kernel/power/suspend.c should be static and state_store() in kernel/power/suspend.c should call pm_suspend() instead of it, so make that happen (which also reduces code duplication related to suspend statistics). Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. 
Bhat --- kernel/power/main.c | 8 +++----- kernel/power/power.h | 2 -- kernel/power/suspend.c | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index b1e324878d5f..1c12581f1c62 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -291,12 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { + error = pm_suspend(state); break; - } - if (state < PM_SUSPEND_MAX && *s) { - error = enter_state(state); - suspend_stats_update(error); + } } #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 398d42b48e9e..98f3622d7407 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -177,13 +177,11 @@ extern const char *const pm_states[]; extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); -extern int enter_state(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline int enter_state(suspend_state_t state) { return -ENOSYS; } static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index e6b5ef958603..4914358a0543 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -272,7 +272,7 @@ static void suspend_finish(void) * Fail if that's not the case. Otherwise, prepare for system suspend, make the * system enter the given sleep state and clean up after wakeup. */ -int enter_state(suspend_state_t state) +static int enter_state(suspend_state_t state) { int error; -- cgit v1.2.3-55-g7522 From bc25cf508942c56810d4fb623ef27b56ccef7783 Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Mon, 13 Feb 2012 16:29:33 +0100 Subject: PM / Sleep: Drop suspend_stats_update() Since suspend_stats_update() is only called from pm_suspend(), move its code directly into that function and remove the static inline definition from include/linux/suspend.h. Clean up pm_suspend() in the process. Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- include/linux/suspend.h | 16 ---------------- kernel/power/suspend.c | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index b90191894441..ac1c114c499d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -94,22 +94,6 @@ static inline void dpm_save_failed_step(enum suspend_stat_step step) suspend_stats.last_failed_step %= REC_FAILED_NUM; } -/** - * suspend_stats_update - Update success/failure statistics of suspend-to-ram - * - * @error: Value returned by enter_state() function - */ -static inline void suspend_stats_update(int error) -{ - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else { - suspend_stats.success++; - } -} - - /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. 
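For illustration, with the inline helper gone callers get the statistics accounting for free from pm_suspend() itself; a minimal caller-side sketch, assuming a hypothetical trigger function:

#include <linux/printk.h>
#include <linux/suspend.h>

static int my_mem_sleep_trigger(void)	/* hypothetical caller */
{
	int error = pm_suspend(PM_SUSPEND_MEM);

	/* suspend_stats.success/fail are already updated by pm_suspend() */
	if (error)
		pr_err("PM: suspend to RAM failed: %d\n", error);
	return error;
}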
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4914358a0543..88e5c967370d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -316,12 +316,18 @@ static int enter_state(suspend_state_t state) */ int pm_suspend(suspend_state_t state) { - int ret; - if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { - ret = enter_state(state); - suspend_stats_update(ret); - return ret; + int error; + + if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) + return -EINVAL; + + error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; } - return -EINVAL; + return error; } EXPORT_SYMBOL(pm_suspend); -- cgit v1.2.3-55-g7522 From 69f1d475cc80c55121852b3030873cdd407fd31c Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 14 Feb 2012 22:20:52 +0100 Subject: PM / Hibernate: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. Signed-off-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..8e2e7461375f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", - region->start_pfn << PAGE_SHIFT, - region->end_pfn << PAGE_SHIFT); + pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) if (pfn_valid(pfn)) { -- cgit v1.2.3-55-g7522 From 9a4b430451bb6d8d6b7dcdfbee0e1330b7c475a6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Feb 2012 03:37:26 +0100 Subject: cgroup: Remove wrong comment on cgroup_enable_task_cg_list() Remove the stale comment about RCU protection. Many callers (all of them?) of cgroup_enable_task_cg_list() don't seem to be in an RCU read side critical section. Besides, RCU is not helpful to protect against while_each_thread(). Signed-off-by: Frederic Weisbecker Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Mandeep Singh Baines Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul E. McKenney --- kernel/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 865d89a580c7..6e4eb4312571 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2701,9 +2701,6 @@ static void cgroup_advance_iter(struct cgroup *cgrp, * using their cgroups capability, we don't maintain the lists running * through each css_set to its tasks until we see the list actually * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU. 
*/ static void cgroup_enable_task_cg_lists(void) { -- cgit v1.2.3-55-g7522 From 3ce3230a0cff484e5130153f244d4fb8a56b3a8b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Feb 2012 03:37:27 +0100 Subject: cgroup: Walk task list under tasklist_lock in cgroup_enable_task_cg_list Walking through the tasklist in cgroup_enable_task_cg_list() inside an RCU read side critical section is not enough because: - RCU is not (yet) safe against while_each_thread() - If we use only RCU, a forking task that has passed cgroup_post_fork() without seeing use_task_css_set_links == 1 is not guaranteed to have its child immediately visible in the tasklist if we walk through it remotely with RCU. In this case it will be missing from its css_set's task list. Thus we need to traverse the list (unfortunately) under the tasklist_lock. It makes us safe against while_each_thread() and also makes sure we see all forked tasks that have been added to the tasklist. As a secondary effect, reading and writing use_task_css_set_links are now well ordered against tasklist traversing and modification. The new layout is:

	CPU 0                                CPU 1

	use_task_css_set_links = 1           write_lock(tasklist_lock)
	read_lock(tasklist_lock)             add task to tasklist
	do_each_thread() {                   write_unlock(tasklist_lock)
	    add thread to css set links      if (use_task_css_set_links)
	} while_each_thread()                    add thread to css set links
	read_unlock(tasklist_lock)

If CPU 0 traverses the list after the task has been added to the tasklist then it is correctly added to the css set links. OTOH if CPU 0 traverses the tasklist before the new task had the opportunity to be added to the tasklist because it was too early in the fork process, then CPU 1 catches up and adds the task to the css set links after it added the task to the tasklist. The right value of use_task_css_set_links is guaranteed to be visible from CPU 1 due to the LOCK/UNLOCK implicit barrier properties: the read_unlock on CPU 0 makes the write to use_task_css_set_links happen, and the write_lock on CPU 1 makes the read of use_task_css_set_links that comes afterward return the correct value. Signed-off-by: Frederic Weisbecker Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Mandeep Singh Baines Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul E. McKenney --- kernel/cgroup.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6e4eb4312571..c6877fe9a831 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2707,6 +2707,14 @@ static void cgroup_enable_task_cg_lists(void) struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. 
+ */ + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); /* @@ -2718,6 +2726,7 @@ static void cgroup_enable_task_cg_lists(void) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); + read_unlock(&tasklist_lock); write_unlock(&css_set_lock); } @@ -4522,6 +4531,17 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_post_fork(struct task_struct *child) { + /* + * use_task_css_set_links is set to 1 before we walk the tasklist + * under the tasklist_lock and we read it here after we added the child + * to the tasklist under the tasklist_lock as well. If the child wasn't + * yet in the tasklist when we walked through it from + * cgroup_enable_task_cg_lists(), then use_task_css_set_links value + * should be visible now due to the paired locking and barriers implied + * by LOCK/UNLOCK: it is written before the tasklist_lock unlock + * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock + * lock on fork. + */ if (use_task_css_set_links) { write_lock(&css_set_lock); if (list_empty(&child->cg_list)) { -- cgit v1.2.3-55-g7522 From abd2363f6a5f1030b935e0bdc15cf917313b3b10 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 24 Feb 2012 08:07:06 -0700 Subject: irq_domain/mips: Allow irq_domain on MIPS This patch makes IRQ_DOMAIN usable on MIPS. It uses an ugly workaround to preserve current behaviour so that MIPS has time to add irq_domain registration to the irq controller drivers. The workaround will be removed in Linux v3.6 Signed-off-by: Grant Likely Cc: Ralf Baechle Cc: Rob Herring Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org --- arch/mips/Kconfig | 1 + arch/mips/include/asm/irq.h | 5 +---- arch/mips/kernel/prom.c | 14 -------------- kernel/irq/irqdomain.c | 12 ++++++++++++ 4 files changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5ab6e89603c5..edbbae17e820 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2327,6 +2327,7 @@ config USE_OF bool "Flattened Device Tree support" select OF select OF_EARLY_FLATTREE + select IRQ_DOMAIN help Include support for flattened device tree machine descriptions. diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index 2354c870a63a..fb698dc09bc9 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -11,15 +11,12 @@ #include #include +#include #include #include -static inline void irq_dispose_mapping(unsigned int virq) -{ -} - #ifdef CONFIG_I8259 static inline int irq_canonicalize(int irq) { diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index 6b8b4208481e..558b5395795d 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -60,20 +60,6 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start, } #endif -/* - * irq_create_of_mapping - Hook to resolve OF irq specifier into a Linux irq# - * - * Currently the mapping mechanism is trivial; simple flat hwirq numbers are - * mapped 1:1 onto Linux irq numbers. Cascaded irq controllers are not - * supported. 
- */ -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - return intspec[0]; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - void __init early_init_devtree(void *params) { /* Setup flat device-tree pointer */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 25a498eb98a3..af48e59bc2ff 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -411,6 +411,18 @@ unsigned int irq_create_of_mapping(struct device_node *controller, domain = controller ? irq_find_host(controller) : irq_default_domain; if (!domain) { +#ifdef CONFIG_MIPS + /* + * Workaround to avoid breaking interrupt controller drivers + * that don't yet register an irq_domain. This is temporary + * code. ~~~gcl, Feb 24, 2012 + * + * Scheduled for removal in Linux v3.6. That should be enough + * time. + */ + if (intsize > 0) + return intspec[0]; +#endif printk(KERN_WARNING "irq: no irq domain found for %s !\n", controller->full_name); return 0; -- cgit v1.2.3-55-g7522 From 13ae246db4a02971ef4f557af1f6d3e21d64b710 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 29 Jan 2012 15:44:45 -0500 Subject: includecheck: delete any duplicate instances of module.h Different tree maintainers picked up independently generated trivial compile fixes based on linux-next testing, resulting in some cases where a file would have got more than one addition of module.h once everything was all merged together. Delete any duplicates so includecheck isn't complaining about anything related to module.h/export.h changes. Signed-off-by: Paul Gortmaker --- arch/blackfin/mach-bf537/boards/pnav10.c | 1 - drivers/dma/imx-dma.c | 1 - drivers/dma/imx-sdma.c | 1 - drivers/media/video/adp1653.c | 1 - drivers/mmc/host/sdhci-tegra.c | 1 - drivers/power/max8998_charger.c | 1 - drivers/staging/iio/dac/ad5686.c | 1 - drivers/staging/iio/gyro/adis16060_core.c | 1 - drivers/staging/sm7xx/smtcfb.c | 1 - drivers/usb/dwc3/core.c | 1 - drivers/usb/dwc3/dwc3-omap.c | 1 - kernel/params.c | 1 - 12 files changed, 12 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/mach-bf537/boards/pnav10.c b/arch/blackfin/mach-bf537/boards/pnav10.c index 6fd84709fc68..7d15a3024e48 100644 --- a/arch/blackfin/mach-bf537/boards/pnav10.c +++ b/arch/blackfin/mach-bf537/boards/pnav10.c @@ -101,7 +101,6 @@ static struct platform_device smc91x_device = { #if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE) #include -#include static const unsigned short bfin_mac_peripherals[] = P_RMII0; static struct bfin_phydev_platform_data bfin_phydev_data[] = { diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c index e4383ee2c9ac..38586ba8da91 100644 --- a/drivers/dma/imx-dma.c +++ b/drivers/dma/imx-dma.c @@ -14,7 +14,6 @@ * http://www.gnu.org/copyleft/gpl.html */ #include -#include #include #include #include diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 8bc5acf36ee5..63540d3e2153 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include diff --git a/drivers/media/video/adp1653.c b/drivers/media/video/adp1653.c index 12eedf4d515a..6e7d094fa2b6 100644 --- a/drivers/media/video/adp1653.c +++ b/drivers/media/video/adp1653.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 78a36eba4df0..cb348569454b 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ 
-23,7 +23,6 @@ #include #include #include -#include #include diff --git a/drivers/power/max8998_charger.c b/drivers/power/max8998_charger.c index 9b3f2bf56e70..6dc01c255592 100644 --- a/drivers/power/max8998_charger.c +++ b/drivers/power/max8998_charger.c @@ -19,7 +19,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include diff --git a/drivers/staging/iio/dac/ad5686.c b/drivers/staging/iio/dac/ad5686.c index ce2d6193dd89..2415a6e60c77 100644 --- a/drivers/staging/iio/dac/ad5686.c +++ b/drivers/staging/iio/dac/ad5686.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "../iio.h" #include "../sysfs.h" diff --git a/drivers/staging/iio/gyro/adis16060_core.c b/drivers/staging/iio/gyro/adis16060_core.c index c0ca7093e0ed..02cc23420b90 100644 --- a/drivers/staging/iio/gyro/adis16060_core.c +++ b/drivers/staging/iio/gyro/adis16060_core.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "../iio.h" #include "../sysfs.h" diff --git a/drivers/staging/sm7xx/smtcfb.c b/drivers/staging/sm7xx/smtcfb.c index ae0035f327e7..1b3e2d0c6993 100644 --- a/drivers/staging/sm7xx/smtcfb.c +++ b/drivers/staging/sm7xx/smtcfb.c @@ -41,7 +41,6 @@ #ifdef CONFIG_PM #include -#include #endif #include "smtcfb.h" diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 7c9df630dbe4..eed4a5b6d394 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -51,7 +51,6 @@ #include #include -#include #include "core.h" #include "gadget.h" diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index 3274ac8f1200..92cc7b8bc091 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -46,7 +46,6 @@ #include #include #include -#include #include "core.h" #include "io.h" diff --git a/kernel/params.c b/kernel/params.c index 4bc965d8a1fe..47f5bf12434a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -15,7 +15,6 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include -- cgit v1.2.3-55-g7522 From 05b4877f6a4f1ba4952d1222213d262bf8c132b7 Mon Sep 17 00:00:00 2001 From: Srivatsa S. Bhat Date: Fri, 17 Feb 2012 23:39:51 +0100 Subject: PM / Hibernate: Enable usermodehelpers in hibernate() error path If create_basic_memory_bitmaps() fails, usermodehelpers are not re-enabled before returning. Fix this. And while at it, reword the goto labels so that they look more meaningful. Signed-off-by: Srivatsa S. Bhat Cc: stable@vger.kernel.org Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 72baaf011fb7..0a186cfde788 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -618,7 +618,7 @@ int hibernate(void) /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) - goto Exit; + goto Enable_umh; printk(KERN_INFO "PM: Syncing filesystems ... 
"); sys_sync(); @@ -626,7 +626,7 @@ int hibernate(void) error = freeze_processes(); if (error) - goto Finish; + goto Free_bitmaps; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) @@ -659,8 +659,9 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; - Finish: + Free_bitmaps: free_basic_memory_bitmaps(); + Enable_umh: usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); -- cgit v1.2.3-55-g7522 From 37f08be11be9a7d9351fb1b9b408259519a126f3 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Tue, 21 Feb 2012 23:57:47 +0100 Subject: PM / Freezer: Remove references to TIF_FREEZE in comments This patch removes all the references in the code about the TIF_FREEZE flag removed by commit a3201227f803ad7fd43180c5195dbe5a2bf998aa freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE There still are some references to TIF_FREEZE in Documentation/power/freezing-of-tasks.txt, but it looks like that documentation needs more thorough work to reflect how the new freezer works, and hence merely removing the references to TIF_FREEZE won't really help. So I have not touched that part in this patch. Suggested-by: Srivatsa S. Bhat Signed-off-by: Marcos Paulo de Souza Signed-off-by: Rafael J. Wysocki --- kernel/exit.c | 2 +- kernel/freezer.c | 6 +++--- kernel/power/process.c | 8 +++----- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 294b1709170d..fd0af05e0639 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -424,7 +424,7 @@ void daemonize(const char *name, ...) */ exit_mm(current); /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation + * We don't want to get frozen, in case system-wide hibernation * or suspend transition begins right now. */ current->flags |= (PF_NOFREEZE | PF_KTHREAD); diff --git a/kernel/freezer.c b/kernel/freezer.c index 9815b8d1eed5..11f82a4d4eae 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p) * freeze_task - send a freeze request to given task * @p: task to send the request to * - * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE - * flag and either sending a fake signal to it or waking it up, depending - * on whether it has %PF_FREEZER_NOSIG set. + * If @p is freezing, the freeze request is sent either by sending a fake + * signal (if it's not a kernel thread) or waking it up (if it's a kernel + * thread). * * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise diff --git a/kernel/power/process.c b/kernel/power/process.c index 6aeb5efe00eb..0d2aeb226108 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only) * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. * - * Because freeze_task() goes through p's - * scheduler lock after setting TIF_FREEZE, it's - * guaranteed that either we see TASK_RUNNING or - * try_to_stop() after schedule() in ptrace/signal - * stop sees TIF_FREEZE. + * Because freeze_task() goes through p's scheduler lock, it's + * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING + * transition can't race with task state testing here. 
 	 */
 	if (!task_is_stopped_or_traced(p) &&
 	    !freezer_should_skip(p))
-- cgit v1.2.3-55-g7522


From e06ffa1ede4146cbc261d90f5dff3d63fe2e9d7a Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 9 Mar 2012 18:03:20 +0800
Subject: workqueue: use percpu allocator for cwq on UP

Commit bbddff made the percpu allocator work on UP too, so we no
longer need the magic UP-only code path.

Signed-off-by: Lai Jiangshan
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bec7b5b53e03..5bbba094bfac 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -474,13 +474,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 					    struct workqueue_struct *wq)
 {
 	if (!(wq->flags & WQ_UNBOUND)) {
-		if (likely(cpu < nr_cpu_ids)) {
-#ifdef CONFIG_SMP
+		if (likely(cpu < nr_cpu_ids))
 			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
-#else
-			return wq->cpu_wq.single;
-#endif
-		}
 	} else if (likely(cpu == WORK_CPU_UNBOUND))
 		return wq->cpu_wq.single;
 	return NULL;
@@ -2897,13 +2892,8 @@ static int alloc_cwqs(struct workqueue_struct *wq)
 	const size_t size = sizeof(struct cpu_workqueue_struct);
 	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
 				   __alignof__(unsigned long long));
-#ifdef CONFIG_SMP
-	bool percpu = !(wq->flags & WQ_UNBOUND);
-#else
-	bool percpu = false;
-#endif

-	if (percpu)
+	if (!(wq->flags & WQ_UNBOUND))
 		wq->cpu_wq.pcpu = __alloc_percpu(size, align);
 	else {
 		void *ptr;
@@ -2927,13 +2917,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)

 static void free_cwqs(struct workqueue_struct *wq)
 {
-#ifdef CONFIG_SMP
-	bool percpu = !(wq->flags & WQ_UNBOUND);
-#else
-	bool percpu = false;
-#endif
-
-	if (percpu)
+	if (!(wq->flags & WQ_UNBOUND))
 		free_percpu(wq->cpu_wq.pcpu);
 	else if (wq->cpu_wq.single) {
 		/* the pointer to free is stored right after the cwq */
-- cgit v1.2.3-55-g7522


From 3047817b894ddae62be07787bc8735a616104398 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Fri, 9 Mar 2012 07:20:12 +0100
Subject: padata: Fix race in the serialization path

When a padata object is queued to the serialization queue, another
CPU might process and free the padata object. So don't dereference
it after queueing to the serialization queue.
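The race above is an instance of a general publish-then-hands-off rule:
once an object sits on a queue that another CPU may drain and free, the
producer must not dereference it again, so any field needed afterwards
has to be copied out beforehand. A minimal userspace sketch of that
rule (all names are hypothetical and not taken from the padata code; a
consumer thread that drains and frees items is assumed but not shown):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical queued object; only cb_id matters for the example. */
    struct item {
    	struct item *next;
    	int cb_id;
    };

    static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
    static struct item *qhead;

    /*
     * Publish an item to a queue that a consumer thread (not shown)
     * drains and frees. cb_id is snapshotted before the item becomes
     * visible: dereferencing 'it' after the unlock would race with the
     * consumer's free(), just like the padata->cb_cpu access fixed above.
     */
    static void publish(struct item *it)
    {
    	int cb_id = it->cb_id;		/* copy before publishing */

    	pthread_mutex_lock(&qlock);
    	it->next = qhead;
    	qhead = it;
    	pthread_mutex_unlock(&qlock);

    	/* 'it' may already be freed here; use only the local copy. */
    	printf("queued work for callback %d\n", cb_id);
    }

    int main(void)
    {
    	struct item *it = malloc(sizeof(*it));

    	if (!it)
    		return 1;
    	it->cb_id = 42;
    	publish(it);
    	return 0;
    }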
Signed-off-by: Steffen Klassert
Signed-off-by: Herbert Xu
---
 kernel/padata.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index b45259931512..aa9929545855 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -230,6 +230,7 @@ out:

 static void padata_reorder(struct parallel_data *pd)
 {
+	int cb_cpu;
 	struct padata_priv *padata;
 	struct padata_serial_queue *squeue;
 	struct padata_instance *pinst = pd->pinst;
@@ -270,13 +271,14 @@ static void padata_reorder(struct parallel_data *pd)
 			return;
 		}

-		squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
+		cb_cpu = padata->cb_cpu;
+		squeue = per_cpu_ptr(pd->squeue, cb_cpu);

 		spin_lock(&squeue->serial.lock);
 		list_add_tail(&padata->list, &squeue->serial.list);
 		spin_unlock(&squeue->serial.lock);

-		queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
+		queue_work_on(cb_cpu, pinst->wq, &squeue->work);
 	}

 	spin_unlock_bh(&pd->lock);
-- cgit v1.2.3-55-g7522


From 2dc9b5dbdef09840de852a4f0cc6a9c9eece7220 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Fri, 9 Mar 2012 07:20:49 +0100
Subject: padata: Fix race on sequence number wrap

When padata_do_parallel() is called from multiple CPUs for the same
padata instance, we can get object reordering on sequence number wrap
because testing for sequence number wrap and resetting the sequence
number must happen atomically, but is implemented with two atomic
operations.

This patch fixes this by converting the sequence number from atomic_t
to an unsigned int and protecting the access with a spin_lock. As a
side effect, we get rid of the sequence number wrap handling, because
the sequence number now wraps back to zero without the need to do
anything.

Signed-off-by: Steffen Klassert
Signed-off-by: Herbert Xu
---
 include/linux/padata.h |  6 ++----
 kernel/padata.c        | 38 ++++++++++----------------------
 2 files changed, 12 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 4633b2f726b6..86292beebfe2 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -46,7 +46,6 @@ struct padata_priv {
 	struct list_head	list;
 	struct parallel_data	*pd;
 	int			cb_cpu;
-	int			seq_nr;
 	int			info;
 	void			(*parallel)(struct padata_priv *padata);
 	void			(*serial)(struct padata_priv *padata);
@@ -116,7 +115,6 @@ struct padata_cpumask {
 * @pinst: padata instance.
 * @pqueue: percpu padata queues used for parallelization.
 * @squeue: percpu padata queues used for serialuzation.
- * @seq_nr: The sequence number that will be attached to the next object.
 * @reorder_objects: Number of objects waiting in the reorder queues.
 * @refcnt: Number of objects holding a reference on this parallel_data.
 * @max_seq_nr: Maximal used sequence number.
@@ -129,12 +127,12 @@ struct parallel_data { struct padata_instance *pinst; struct padata_parallel_queue __percpu *pqueue; struct padata_serial_queue __percpu *squeue; - atomic_t seq_nr; atomic_t reorder_objects; atomic_t refcnt; - unsigned int max_seq_nr; struct padata_cpumask cpumask; spinlock_t lock ____cacheline_aligned; + spinlock_t seq_lock; + unsigned int seq_nr; unsigned int processed; struct timer_list timer; }; diff --git a/kernel/padata.c b/kernel/padata.c index aa9929545855..6f10eb285ece 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -29,7 +29,6 @@ #include #include -#define MAX_SEQ_NR (INT_MAX - NR_CPUS) #define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) @@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) return target_cpu; } -static int padata_cpu_hash(struct padata_priv *padata) +static int padata_cpu_hash(struct parallel_data *pd) { int cpu_index; - struct parallel_data *pd; - - pd = padata->pd; /* * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); + + spin_lock(&pd->seq_lock); + cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); + pd->seq_nr++; + spin_unlock(&pd->seq_lock); return padata_index_to_cpu(pd, cpu_index); } @@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->pd = pd; padata->cb_cpu = cb_cpu; - if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) - atomic_set(&pd->seq_nr, -1); - - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - - target_cpu = padata_cpu_hash(padata); + target_cpu = padata_cpu_hash(pd); queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel); static struct padata_priv *padata_get_next(struct parallel_data *pd) { int cpu, num_cpus; - int next_nr, next_index; + unsigned int next_nr, next_index; struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; @@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - if (unlikely(next_nr > pd->max_seq_nr)) { - next_nr = next_nr - pd->max_seq_nr - 1; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); - next_queue = per_cpu_ptr(pd->pqueue, cpu); - pd->processed = 0; - } - padata = NULL; reorder = &next_queue->reorder; @@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) padata = list_entry(reorder->list.next, struct padata_priv, list); - BUG_ON(next_nr != padata->seq_nr); - spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); @@ -402,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd) /* Initialize all percpu queues used by parallel workers */ static void padata_init_pqueues(struct parallel_data *pd) { - int cpu_index, num_cpus, cpu; + int cpu_index, cpu; struct padata_parallel_queue *pqueue; cpu_index = 0; @@ -417,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd) INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } - - num_cpus = cpumask_weight(pd->cpumask.pcpu); - pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; } /* Allocate and initialize the internal cpumask dependend resources. 
*/ @@ -446,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = 0; atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; -- cgit v1.2.3-55-g7522 From d762a50b5b1bb93e91cb3cd90b6ae133da98fe31 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:38 +0800 Subject: kdb: remove the second argument of k[un]map_atomic() Signed-off-by: Cong Wang --- kernel/debug/kdb/kdb_support.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 7d6fb40d2188..d35cc2d3a4cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size) if (!pfn_valid(pfn)) return 1; page = pfn_to_page(pfn); - vaddr = kmap_atomic(page, KM_KDB); + vaddr = kmap_atomic(page); memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); - kunmap_atomic(vaddr, KM_KDB); + kunmap_atomic(vaddr); return 0; } -- cgit v1.2.3-55-g7522 From 0de9a1e28a0d005f42c8cc5456a246710133b9ab Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:38 +0800 Subject: power: remove the second argument of k[un]map_atomic() Acked-by: Rafael J. Wysocki Signed-off-by: Cong Wang --- kernel/power/snapshot.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..3a564ac85f36 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1000,20 +1000,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page, KM_USER0); - dst = kmap_atomic(d_page, KM_USER1); + src = kmap_atomic(s_page); + dst = kmap_atomic(d_page); do_copy_page(dst, src); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst); + kunmap_atomic(src); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page, KM_USER0); + dst = kmap_atomic(d_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); } else { safe_copy_page(page_address(d_page), s_page); } @@ -1728,9 +1728,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2014,9 +2014,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page, KM_USER0); + dst = kmap_atomic(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); last_highmem_page = NULL; } } @@ -2309,13 +2309,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1, KM_USER0); - kaddr2 = kmap_atomic(p2, KM_USER1); + kaddr1 = kmap_atomic(p1); + kaddr2 = kmap_atomic(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2, 
KM_USER1);
-	kunmap_atomic(kaddr1, KM_USER0);
+	kunmap_atomic(kaddr2);
+	kunmap_atomic(kaddr1);
 }

 /**
-- cgit v1.2.3-55-g7522


From 5f8aadd8b9966d71a77bba52b9d499cc2f38269f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 14 Mar 2012 19:55:38 +0100
Subject: CLONE_PARENT shouldn't allow to set ->exit_signal

The child must not control its ->exit_signal; it is the parent who
decides which signal the child should use for notification.

This means that CLONE_PARENT should not use "clone_flags & CSIGNAL":
the forking task is the sibling of the new process, and their common
parent doesn't control exit_signal in this case.

This patch uses ->exit_signal of the forking process, but perhaps we
should simply use SIGCHLD.

We read group_leader->exit_signal lockless; this can race with the
ORIGINAL_SIGNAL -> SIGCHLD transition, but this is fine.

Potentially this change allows us to kill self_exec_id/parent_exec_id.

Signed-off-by: Oleg Nesterov
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 26a7a6707fa7..c4f38a849436 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1340,7 +1340,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	clear_all_latency_tracing(p);

 	/* ok, now we should be set up.. */
-	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
+	if (clone_flags & CLONE_THREAD)
+		p->exit_signal = -1;
+	else if (clone_flags & CLONE_PARENT)
+		p->exit_signal = current->group_leader->exit_signal;
+	else
+		p->exit_signal = (clone_flags & CSIGNAL);
+
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
-- cgit v1.2.3-55-g7522


From e636825346b36a07ccfc8e30946d52855e21f681 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 19 Mar 2012 17:03:22 +0100
Subject: exit_signal: simplify the "we have changed execution domain" logic

exit_notify() checks "tsk->self_exec_id != tsk->parent_exec_id" to
handle the "we have changed execution domain" case.

We can change de_thread() to always set ->exit_signal = SIGCHLD and
remove this check to simplify the code.

We could change setup_new_exec() instead; that looks more logical
because it increments ->self_exec_id. But note that de_thread()
already resets ->exit_signal if it changes the leader, so let's keep
both changes close to each other.

Note that we change ->exit_signal lockless; this changes the rules.
Thereafter ->exit_signal is not stable under tasklist_lock, but this
is fine: the only possible change is OLDSIG -> SIGCHLD. This can race
with eligible_child(), but the race is harmless. We can race with
reparent_leader(), which changes our ->exit_signal in parallel, but it
does the same change to SIGCHLD.

The noticeable user-visible change is that the execing task is not
"visible" to do_wait()->eligible_child(__WCLONE) right after exec. To
me this looks more logical, and this is consistent with the
multi-threaded case.
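The CLONE_PARENT semantics above are visible from user space: the new
task becomes a sibling of the caller, so the exit notification goes to
the caller's parent and the caller itself cannot reap it. A small
illustrative program using the glibc clone() wrapper (a sketch for the
reader, not part of the patch):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static int child_fn(void *arg)
    {
    	(void)arg;
    	return 0;	/* new task exits immediately */
    }

    int main(void)
    {
    	size_t stack_size = 64 * 1024;
    	char *stack = malloc(stack_size);
    	pid_t pid;

    	if (!stack)
    		return 1;
    	/*
    	 * CLONE_PARENT: the new task becomes a sibling of this process.
    	 * With the patch above, its ->exit_signal is taken from the
    	 * forker's group leader, not from the CSIGNAL bits passed here.
    	 */
    	pid = clone(child_fn, stack + stack_size,	/* stack grows down */
    		    CLONE_PARENT | SIGCHLD, NULL);
    	if (pid < 0) {
    		perror("clone");
    		return 1;
    	}
    	/* The new task is not our child, so this fails with ECHILD. */
    	if (waitpid(pid, NULL, 0) < 0 && errno == ECHILD)
    		printf("pid %d is our sibling, not our child\n", (int)pid);
    	return 0;
    }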
Signed-off-by: Oleg Nesterov
Signed-off-by: Linus Torvalds
---
 fs/exec.c     | 3 +++
 kernel/exit.c | 7 +------
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index b0695a9900ef..1e94d2263ae0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -977,6 +977,9 @@ static int de_thread(struct task_struct *tsk)
 	sig->notify_count = 0;

 no_thread_group:
+	/* we have changed execution domain */
+	tsk->exit_signal = SIGCHLD;
+
 	if (current->mm)
 		setmax_mm_hiwater_rss(&sig->maxrss, current->mm);

diff --git a/kernel/exit.c b/kernel/exit.c
index 752d2c0abd19..51ac4ced1313 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -827,14 +827,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * If the parent exec id doesn't match the exec id we saved
 	 * when we started then we know the parent has changed security
 	 * domain.
-	 *
-	 * If our self_exec id doesn't match our parent_exec_id then
-	 * we have changed execution domain as these two values started
-	 * the same after a fork.
 	 */
 	if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
-	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
-	     tsk->self_exec_id != tsk->parent_exec_id))
+	    tsk->parent_exec_id != tsk->real_parent->self_exec_id)
 		tsk->exit_signal = SIGCHLD;

 	if (unlikely(tsk->ptrace)) {
-- cgit v1.2.3-55-g7522


From b6e238dceed36891cc633167afe7151f1f3d83c5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 19 Mar 2012 17:03:41 +0100
Subject: exit_signal: fix the "parent has changed security domain" logic

exit_notify() changes ->exit_signal if the parent already did exec.
This doesn't really work: we are not going to send the signal now if
there is another live thread or the exiting task is traced. The parent
can exec before the last thread dies or before the tracer detaches.

Move this check into do_notify_parent(), which actually sends the
signal.

The user-visible change is that we do not change ->exit_signal, and
thus the exiting task is still "clone children" for
do_wait()->eligible_child(__WCLONE). Hopefully this is fine; the
current logic is racy anyway.

Signed-off-by: Oleg Nesterov
Signed-off-by: Linus Torvalds
---
 kernel/exit.c   | 14 --------------
 kernel/signal.c |  9 +++++++++
 2 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 51ac4ced1313..ce5f758f40bd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -818,20 +818,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);

-	/* Let father know we died
-	 *
-	 * Thread signals are configurable, but you aren't going to use
-	 * that to send signals to arbitrary processes.
-	 * That stops right now.
-	 *
-	 * If the parent exec id doesn't match the exec id we saved
-	 * when we started then we know the parent has changed security
-	 * domain.
-	 */
-	if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
-	    tsk->parent_exec_id != tsk->real_parent->self_exec_id)
-		tsk->exit_signal = SIGCHLD;
-
 	if (unlikely(tsk->ptrace)) {
 		int sig = thread_group_leader(tsk) &&
 				thread_group_empty(tsk) &&
diff --git a/kernel/signal.c b/kernel/signal.c
index 8511e39813c7..e76001ccf5cd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1652,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	BUG_ON(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));

+	if (sig != SIGCHLD) {
+		/*
+		 * This is only possible if parent == real_parent.
+		 * Check if it has changed security domain.
+ */ + if (tsk->parent_exec_id != tsk->parent->self_exec_id) + sig = SIGCHLD; + } + info.si_signo = sig; info.si_errno = 0; /* -- cgit v1.2.3-55-g7522 From 48fde701aff662559b38d9a609574068f22d00fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 22:15:13 -0500 Subject: switch open-coded instances of d_make_root() to new helper Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 4 ++-- arch/s390/hypfs/inode.c | 6 ++---- drivers/misc/ibmasm/ibmasmfs.c | 6 ++---- drivers/oprofile/oprofilefs.c | 6 ++---- drivers/usb/core/inode.c | 9 +-------- drivers/usb/gadget/f_fs.c | 8 ++------ drivers/usb/gadget/inode.c | 4 +--- fs/9p/vfs_super.c | 3 +-- fs/adfs/super.c | 3 +-- fs/affs/super.c | 7 ++----- fs/afs/super.c | 7 ++----- fs/autofs4/inode.c | 10 ++-------- fs/befs/linuxvfs.c | 3 +-- fs/bfs/inode.c | 3 +-- fs/btrfs/super.c | 8 ++------ fs/ceph/super.c | 3 +-- fs/cifs/cifsfs.c | 4 +--- fs/coda/inode.c | 3 +-- fs/configfs/mount.c | 3 +-- fs/cramfs/inode.c | 6 ++---- fs/devpts/inode.c | 3 +-- fs/ecryptfs/main.c | 3 +-- fs/efs/super.c | 3 +-- fs/exofs/super.c | 3 +-- fs/ext2/super.c | 3 +-- fs/ext3/super.c | 3 +-- fs/ext4/super.c | 3 +-- fs/freevxfs/vxfs_super.c | 3 +-- fs/fuse/inode.c | 9 ++------- fs/gfs2/ops_fstype.c | 3 +-- fs/hfs/super.c | 6 ++---- fs/hostfs/hostfs_kern.c | 4 ++-- fs/hpfs/super.c | 6 ++---- fs/hppfs/hppfs.c | 9 ++------- fs/hugetlbfs/inode.c | 13 ++----------- fs/isofs/inode.c | 3 +-- fs/jffs2/fs.c | 6 ++---- fs/jfs/super.c | 3 +-- fs/libfs.c | 6 ++---- fs/logfs/super.c | 6 ++---- fs/ncpfs/inode.c | 6 ++---- fs/nfs/getroot.c | 6 ++---- fs/nilfs2/super.c | 3 +-- fs/ocfs2/dlmfs/dlmfs.c | 14 ++------------ fs/ocfs2/super.c | 3 +-- fs/omfs/inode.c | 6 ++---- fs/openpromfs/inode.c | 3 +-- fs/proc/inode.c | 15 +++------------ fs/pstore/inode.c | 3 +-- fs/qnx4/inode.c | 6 ++---- fs/ramfs/inode.c | 12 ++---------- fs/reiserfs/super.c | 6 ++---- fs/romfs/super.c | 6 ++---- fs/squashfs/super.c | 3 +-- fs/sysfs/mount.c | 3 +-- fs/sysv/super.c | 3 +-- fs/ubifs/super.c | 6 ++---- fs/udf/super.c | 3 +-- fs/ufs/super.c | 6 ++---- fs/xfs/xfs_super.c | 6 ++---- ipc/mqueue.c | 24 +++++++----------------- kernel/cgroup.c | 8 ++------ mm/shmem.c | 6 ++---- net/sunrpc/rpc_pipe.c | 8 ++------ 64 files changed, 105 insertions(+), 264 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index d4a094ca96f3..17b3211e3641 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -757,9 +757,9 @@ spufs_create_root(struct super_block *sb, void *data) goto out_iput; ret = -ENOMEM; - sb->s_root = d_alloc_root(inode); + sb->s_root = d_make_root(inode); if (!sb->s_root) - goto out_iput; + goto out; return 0; out_iput: diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 8a2a887478cc..6a2cb560e968 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -293,11 +293,9 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; root_inode->i_op = &simple_dir_inode_operations; root_inode->i_fop = &simple_dir_operations; - sb->s_root = root_dentry = d_alloc_root(root_inode); - if (!root_dentry) { - iput(root_inode); + sb->s_root = root_dentry = d_make_root(root_inode); + if (!root_dentry) return -ENOMEM; - } if (MACHINE_IS_VM) rc = hypfs_vm_create_files(sb, root_dentry); else diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index 35361753b487..15f24f362208 100644 --- 
a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -129,11 +129,9 @@ static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent) root->i_op = &simple_dir_inode_operations; root->i_fop = ibmasmfs_dir_ops; - root_dentry = d_alloc_root(root); - if (!root_dentry) { - iput(root); + root_dentry = d_make_root(root); + if (!root_dentry) return -ENOMEM; - } sb->s_root = root_dentry; ibmasmfs_create_files(sb, root_dentry); diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 2f0aa0f700e6..277bb70b8d75 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -251,11 +251,9 @@ static int oprofilefs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; root_inode->i_op = &simple_dir_inode_operations; root_inode->i_fop = &simple_dir_operations; - root_dentry = d_alloc_root(root_inode); - if (!root_dentry) { - iput(root_inode); + root_dentry = d_make_root(root_inode); + if (!root_dentry) return -ENOMEM; - } sb->s_root = root_dentry; diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 9e186f3da839..bdaef8e36020 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -462,16 +462,9 @@ static int usbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &usbfs_ops; sb->s_time_gran = 1; inode = usbfs_get_inode(sb, S_IFDIR | 0755, 0); - - if (!inode) { - dbg("%s: could not get inode!",__func__); - return -ENOMEM; - } - - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { dbg("%s: could not get root dentry!",__func__); - iput(inode); return -ENOMEM; } sb->s_root = root; diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c index f63dc6c150d2..d825b248728a 100644 --- a/drivers/usb/gadget/f_fs.c +++ b/drivers/usb/gadget/f_fs.c @@ -1063,13 +1063,9 @@ static int ffs_sb_fill(struct super_block *sb, void *_data, int silent) &simple_dir_operations, &simple_dir_inode_operations, &data->perms); - if (unlikely(!inode)) + sb->s_root = d_make_root(inode); + if (unlikely(!sb->s_root)) goto Enomem; - sb->s_root = d_alloc_root(inode); - if (unlikely(!sb->s_root)) { - iput(inode); - goto Enomem; - } /* EP0 file */ if (unlikely(!ffs_sb_create_file(sb, "ep0", ffs, diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index ae04266dba1b..c95eea43b637 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -2059,10 +2059,8 @@ gadgetfs_fill_super (struct super_block *sb, void *opts, int silent) if (!inode) goto Enomem; inode->i_op = &simple_dir_inode_operations; - if (!(sb->s_root = d_alloc_root (inode))) { - iput(inode); + if (!(sb->s_root = d_make_root (inode))) goto Enomem; - } /* the ep0 file is named after the controller we expect; * user mode code can use it for sanity checks, like we do. 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 7b0cd87b07c2..10b7d3c9dba8 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -155,9 +155,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); retval = -ENOMEM; goto release_sb; } diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 8e3b36ace305..06fdcc9382c4 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -483,10 +483,9 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &adfs_dentry_operations; root = adfs_iget(sb, &root_obj); - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { int i; - iput(root); for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); diff --git a/fs/affs/super.c b/fs/affs/super.c index 8ba73fed7964..0782653a05a2 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -473,7 +473,7 @@ got_root: root_inode = affs_iget(sb, root_block); if (IS_ERR(root_inode)) { ret = PTR_ERR(root_inode); - goto out_error_noinode; + goto out_error; } if (AFFS_SB(sb)->s_flags & SF_INTL) @@ -481,7 +481,7 @@ got_root: else sb->s_d_op = &affs_dentry_operations; - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); goto out_error; @@ -494,9 +494,6 @@ got_root: * Begin the cascaded cleanup ... */ out_error: - if (root_inode) - iput(root_inode); -out_error_noinode: kfree(sbi->s_bitmap); affs_brelse(root_bh); kfree(sbi->s_prefix); diff --git a/fs/afs/super.c b/fs/afs/super.c index 983ec59fc80d..f02b31e7e648 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -301,7 +301,6 @@ static int afs_fill_super(struct super_block *sb, { struct afs_super_info *as = sb->s_fs_info; struct afs_fid fid; - struct dentry *root = NULL; struct inode *inode = NULL; int ret; @@ -327,18 +326,16 @@ static int afs_fill_super(struct super_block *sb, set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); ret = -ENOMEM; - root = d_alloc_root(inode); - if (!root) + sb->s_root = d_make_root(inode); + if (!sb->s_root) goto error; sb->s_d_op = &afs_fs_dentry_operations; - sb->s_root = root; _leave(" = 0"); return 0; error: - iput(inode); _leave(" = %d", ret); return ret; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 06858d955120..d8dc002e9cc3 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -247,12 +247,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (!ino) goto fail_free; root_inode = autofs4_get_inode(s, S_IFDIR | 0755); - if (!root_inode) - goto fail_ino; - - root = d_alloc_root(root_inode); + root = d_make_root(root_inode); if (!root) - goto fail_iput; + goto fail_ino; pipe = NULL; root->d_fsdata = ino; @@ -317,9 +314,6 @@ fail_fput: fail_dput: dput(root); goto fail_free; -fail_iput: - printk("autofs: get root dentry failed\n"); - iput(root_inode); fail_ino: kfree(ino); fail_free: diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 6e6d536767fe..e18da23d42b5 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -852,9 +852,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto unacquire_priv_sbp; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); befs_error(sb, "get root inode failed"); goto unacquire_priv_sbp; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 
b0391bc402b1..e23dc7c8b884 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -367,9 +367,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) ret = PTR_ERR(inode); goto out2; } - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (!s->s_root) { - iput(inode); ret = -ENOMEM; goto out2; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3ce97b217cbe..81df3fec6a6d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -629,7 +629,6 @@ static int btrfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct dentry *root_dentry; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_key key; int err; @@ -660,15 +659,12 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - root_dentry = d_alloc_root(inode); - if (!root_dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) { err = -ENOMEM; goto fail_close; } - sb->s_root = root_dentry; - save_mount_options(sb, data); cleancache_init_fs(sb); sb->s_flags |= MS_ACTIVE; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 00de2c9568cd..256f85221926 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -655,9 +655,8 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, dout("open_root_inode success\n"); if (ceph_ino(inode) == CEPH_INO_ROOT && fsc->sb->s_root == NULL) { - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); root = ERR_PTR(-ENOMEM); goto out; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8b7d7ff88792..418fc42fb8b2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -122,11 +122,9 @@ cifs_read_super(struct super_block *sb) goto out_no_root; } - sb->s_root = d_alloc_root(inode); - + sb->s_root = d_make_root(inode); if (!sb->s_root) { rc = -ENOMEM; - iput(inode); goto out_no_root; } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 32dafc875c14..05156c17b551 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -213,9 +213,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) printk("coda_read_super: rootinode is %ld dev %s\n", root->i_ino, root->i_sb->s_id); - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); error = -EINVAL; goto error; } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 276e15cafd58..07f60455f1c1 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -91,10 +91,9 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; } - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n",__func__); - iput(inode); return -ENOMEM; } config_group_init(&configfs_root_group); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index a2ee8f9f5a38..853480d2b3d1 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -318,11 +318,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) root = get_cramfs_inode(sb, &super.root, 0); if (IS_ERR(root)) goto out; - sb->s_root = d_alloc_root(root); - if (!sb->s_root) { - iput(root); + sb->s_root = d_make_root(root); + if (!sb->s_root) goto out; - } return 0; out: kfree(sbi); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c4e2a58a2e82..57dae0baedf2 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -309,12 +309,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); - s->s_root = 
d_alloc_root(inode); + s->s_root = d_make_root(inode); if (s->s_root) return 0; printk(KERN_ERR "devpts: get root dentry failed\n"); - iput(inode); fail: return -ENOMEM; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index b4a6befb1216..6e0e017e6932 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -550,9 +550,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags if (IS_ERR(inode)) goto out_free; - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (!s->s_root) { - iput(inode); rc = -ENOMEM; goto out_free; } diff --git a/fs/efs/super.c b/fs/efs/super.c index 981106429a9f..e755ec746c69 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -317,10 +317,9 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) goto out_no_fs; } - s->s_root = d_alloc_root(root); + s->s_root = d_make_root(root); if (!(s->s_root)) { printk(KERN_ERR "EFS: get root dentry failed\n"); - iput(root); ret = -ENOMEM; goto out_no_fs; } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 6cafcadfc3c8..7f2b590a36b7 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -819,9 +819,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto free_sbi; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); EXOFS_ERR("ERROR: get root inode failed\n"); ret = -ENOMEM; goto free_sbi; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9f6766a3ac1e..e1025c7a437a 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1088,9 +1088,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount3; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); ext2_msg(sb, KERN_ERR, "error: get root inode failed"); ret = -ENOMEM; goto failed_mount3; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 726c7ef6cdf1..e0b45b93327b 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2046,10 +2046,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); goto failed_mount3; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); - iput(root); ret = -ENOMEM; goto failed_mount3; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 502c61fd7392..d2baea7bcf30 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3735,9 +3735,8 @@ no_journal: iput(root); goto failed_mount4; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); ext4_msg(sb, KERN_ERR, "get root dentry failed"); ret = -ENOMEM; goto failed_mount4; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 9d1c99558389..d4fabd26084e 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -224,9 +224,8 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) ret = PTR_ERR(root); goto out; } - sbp->s_root = d_alloc_root(root); + sbp->s_root = d_make_root(root); if (!sbp->s_root) { - iput(root); printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); goto out_free_ilist; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 64cf8d07393e..4aec5995867e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -988,14 +988,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; root = fuse_get_root_inode(sb, 
d.rootmode); - if (!root) + root_dentry = d_make_root(root); + if (!root_dentry) goto err_put_conn; - - root_dentry = d_alloc_root(root); - if (!root_dentry) { - iput(root); - goto err_put_conn; - } /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 24f609c9ef91..10e848c6d1b5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -431,10 +431,9 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); } - dentry = d_alloc_root(inode); + dentry = d_make_root(inode); if (!dentry) { fs_err(sdp, "can't alloc %s dentry\n", name); - iput(inode); return -ENOMEM; } *dptr = dentry; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 8137fb3e6780..7b4c537d6e13 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -430,15 +430,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &hfs_dentry_operations; res = -ENOMEM; - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) - goto bail_iput; + goto bail_no_root; /* everything's okay */ return 0; -bail_iput: - iput(root_inode); bail_no_root: printk(KERN_ERR "hfs: get root inode failed.\n"); bail: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e130bd46d671..588d45885a6f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -966,9 +966,9 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) } err = -ENOMEM; - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) - goto out_put; + goto out; return 0; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 3690467c944e..54f6eccb79d9 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -625,11 +625,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) hpfs_init_inode(root); hpfs_read_inode(root); unlock_new_inode(root); - s->s_root = d_alloc_root(root); - if (!s->s_root) { - iput(root); + s->s_root = d_make_root(root); + if (!s->s_root) goto bail0; - } /* * find the root directory's . 
pointer & finish filling in the inode diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index d92f4ce80925..a80e45a690ac 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -726,17 +726,12 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) err = -ENOMEM; root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); - if (!root_inode) - goto out_mntput; - - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) - goto out_iput; + goto out_mntput; return 0; - out_iput: - iput(root_inode); out_mntput: mntput(proc_mnt); out: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e85a7ac0217..81932fa1861a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -831,8 +831,6 @@ bad_val: static int hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) { - struct inode * inode; - struct dentry * root; int ret; struct hugetlbfs_config config; struct hugetlbfs_sb_info *sbinfo; @@ -865,16 +863,9 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - inode = hugetlbfs_get_root(sb, &config); - if (!inode) - goto out_free; - - root = d_alloc_root(inode); - if (!root) { - iput(inode); + sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); + if (!sb->s_root) goto out_free; - } - sb->s_root = root; return 0; out_free: kfree(sbinfo); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bd62c76fb5df..29037c365ba4 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -947,9 +947,8 @@ root_found: s->s_d_op = &isofs_dentry_ops[table]; /* get the root dentry */ - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (!(s->s_root)) { - iput(inode); error = -ENOMEM; goto out_no_inode; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2e0123867cb1..c0d5c9d770da 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -561,9 +561,9 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; D1(printk(KERN_DEBUG "jffs2_do_fill_super(): d_alloc_root()\n")); - sb->s_root = d_alloc_root(root_i); + sb->s_root = d_make_root(root_i); if (!sb->s_root) - goto out_root_i; + goto out_root; sb->s_maxbytes = 0xFFFFFFFF; sb->s_blocksize = PAGE_CACHE_SIZE; @@ -573,8 +573,6 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) jffs2_start_garbage_collect_thread(c); return 0; - out_root_i: - iput(root_i); out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4661ad705130..b3bb95504479 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -522,7 +522,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(inode); goto out_no_rw; } - sb->s_root = d_alloc_root(inode); + sb->s_root = d_make_root(inode); if (!sb->s_root) goto out_no_root; @@ -540,7 +540,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) out_no_root: jfs_err("jfs_read_super: get root dentry failed"); - iput(inode); out_no_rw: rc = jfs_umount(sb); diff --git a/fs/libfs.c b/fs/libfs.c index 5b2dbb3ba4fc..7c895a763a1e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -491,11 +491,9 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); - root = d_alloc_root(inode); - if (!root) { - iput(inode); + root = d_make_root(inode); + if (!root) return -ENOMEM; - } for (i = 0; !files->name || 
files->name[0]; i++, files++) { if (!files->name) continue; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index b1a491a5fe78..7de18c3021fe 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -315,11 +315,9 @@ static int logfs_get_sb_final(struct super_block *sb) if (IS_ERR(rootdir)) goto fail; - sb->s_root = d_alloc_root(rootdir); - if (!sb->s_root) { - iput(rootdir); + sb->s_root = d_make_root(rootdir); + if (!sb->s_root) goto fail; - } /* at that point we know that ->put_super() will be called */ super->s_erase_page = alloc_pages(GFP_KERNEL, 0); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 3d1e34f8a68e..49df0e7f8379 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -716,13 +716,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!root_inode) goto out_disconnect; DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber); - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) - goto out_no_root; + goto out_disconnect; return 0; -out_no_root: - iput(root_inode); out_disconnect: ncp_lock_server(server); ncp_disconnect(server); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index dcb61548887f..801d6d830787 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -49,11 +49,9 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i { /* The mntroot acts as the dummy root dentry for this superblock */ if (sb->s_root == NULL) { - sb->s_root = d_alloc_root(inode); - if (sb->s_root == NULL) { - iput(inode); + sb->s_root = d_make_root(inode); + if (sb->s_root == NULL) return -ENOMEM; - } ihold(inode); /* * Ensure that this dentry is invisible to d_find_alias(). diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1fc9ad3c1d14..1099a76cee59 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -917,9 +917,8 @@ static int nilfs_get_root_dentry(struct super_block *sb, if (root->cno == NILFS_CPTREE_CURRENT_CNO) { dentry = d_find_alias(inode); if (!dentry) { - dentry = d_alloc_root(inode); + dentry = d_make_root(inode); if (!dentry) { - iput(inode); ret = -ENOMEM; goto failed_dentry; } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index abfac0d7ae9c..3b5825ef3193 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -582,24 +582,14 @@ static int dlmfs_fill_super(struct super_block * sb, void * data, int silent) { - struct inode * inode; - struct dentry * root; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = DLMFS_MAGIC; sb->s_op = &dlmfs_ops; - inode = dlmfs_get_root_inode(sb); - if (!inode) - return -ENOMEM; - - root = d_alloc_root(inode); - if (!root) { - iput(inode); + sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); + if (!sb->s_root) return -ENOMEM; - } - sb->s_root = root; return 0; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2b1184f7097f..337687c3e233 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1166,9 +1166,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); status = -ENOMEM; mlog_errno(status); goto read_super_error; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 6065bb0ba207..dbc842222589 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -539,11 +539,9 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) goto out_brelse_bh2; } - 
sb->s_root = d_alloc_root(root); - if (!sb->s_root) { - iput(root); + sb->s_root = d_make_root(root); + if (!sb->s_root) goto out_brelse_bh2; - } printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); ret = 0; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index a88c03bc749d..bc49c975d501 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -408,13 +408,12 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) oi->type = op_inode_node; oi->u.node = of_find_node_by_path("/"); - s->s_root = d_alloc_root(root_inode); + s->s_root = d_make_root(root_inode); if (!s->s_root) goto out_no_root_dentry; return 0; out_no_root_dentry: - iput(root_inode); ret = -ENOMEM; out_no_root: printk("openprom_fill_super: get root inode failed\n"); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a70af3a44f45..8461a7b82fdb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -486,8 +486,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) int proc_fill_super(struct super_block *s) { - struct inode * root_inode; - s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -496,17 +494,10 @@ int proc_fill_super(struct super_block *s) s->s_time_gran = 1; pde_get(&proc_root); - root_inode = proc_get_inode(s, &proc_root); - if (!root_inode) - goto out_no_root; - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) { - iput(root_inode); - goto out_no_root; - } - return 0; + s->s_root = d_make_root(proc_get_inode(s, &proc_root)); + if (s->s_root) + return 0; -out_no_root: printk("proc_read_super: get root inode failed\n"); pde_put(&proc_root); return -ENOMEM; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index b3b426edb2fd..ec7d1fb6f35a 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -303,7 +303,7 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent) /* override ramfs "dir" options so we catch unlink(2) */ inode->i_op = &pstore_dir_inode_operations; - root = d_alloc_root(inode); + root = d_make_root(inode); sb->s_root = root; if (!root) { err = -ENOMEM; @@ -314,7 +314,6 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent) return 0; fail: - iput(inode); return err; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 6b009548d2e0..db18d866d981 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -260,15 +260,13 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) } ret = -ENOMEM; - s->s_root = d_alloc_root(root); + s->s_root = d_make_root(root); if (s->s_root == NULL) - goto outi; + goto outb; brelse(bh); return 0; - outi: - iput(root); outb: kfree(qs->BitMap); out: diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index aec766abe3af..b6612d2ed718 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -210,7 +210,6 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) { struct ramfs_fs_info *fsi; struct inode *inode = NULL; - struct dentry *root; int err; save_mount_options(sb, data); @@ -234,14 +233,8 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); - if (!inode) { - err = -ENOMEM; - goto fail; - } - - root = d_alloc_root(inode); - sb->s_root = root; - if (!root) { + sb->s_root = d_make_root(inode); + if (!sb->s_root) { err = -ENOMEM; goto fail; } @@ -250,7 +243,6 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) fail: kfree(fsi); sb->s_fs_info = 
NULL; - iput(inode); return err; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e12d8b97cd4d..208dfd144409 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1874,11 +1874,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) unlock_new_inode(root_inode); } - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) { - iput(root_inode); + s->s_root = d_make_root(root_inode); + if (!s->s_root) goto error; - } // define and initialize hash function sbi->s_hash_function = hash_function(s); if (sbi->s_hash_function == NULL) { diff --git a/fs/romfs/super.c b/fs/romfs/super.c index bb36ab74eb45..e64f6b5f7ae5 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -538,14 +538,12 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(root)) goto error; - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) - goto error_i; + goto error; return 0; -error_i: - iput(root); error: return -EINVAL; error_rsb_inval: diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index ecaa2f7bdb8f..970b1167e7cb 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -316,11 +316,10 @@ check_directory_table: } insert_inode_hash(root); - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (sb->s_root == NULL) { ERROR("Root inode create failed\n"); err = -ENOMEM; - iput(root); goto failed_mount; } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e34f0d99ea4e..2243f8ec64d5 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -61,10 +61,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) } /* instantiate and link root dentry */ - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n",__func__); - iput(inode); return -ENOMEM; } root->d_fsdata = &sysfs_root; diff --git a/fs/sysv/super.c b/fs/sysv/super.c index f467740e088c..7491c33b6468 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -341,9 +341,8 @@ static int complete_read_super(struct super_block *sb, int silent, int size) printk("SysV FS: get root inode failed\n"); return 0; } - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) { - iput(root_inode); printk("SysV FS: get root dentry failed\n"); return 0; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 63765d58445b..76e4e0566ad6 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2076,15 +2076,13 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out_umount; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) - goto out_iput; + goto out_umount; mutex_unlock(&c->umount_mutex); return 0; -out_iput: - iput(root); out_umount: ubifs_umount(c); out_unlock: diff --git a/fs/udf/super.c b/fs/udf/super.c index 8d8b25336fbb..85067b4c7e14 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2037,10 +2037,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } /* Allocate a dentry for the root inode */ - sb->s_root = d_alloc_root(inode); + sb->s_root = d_make_root(inode); if (!sb->s_root) { udf_err(sb, "Couldn't allocate root dentry\n"); - iput(inode); goto error_out; } sb->s_maxbytes = MAX_LFS_FILESIZE; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index ec25d09fcaa8..f636f6b460d0 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1164,10 +1164,10 @@ magic_found: ret = PTR_ERR(inode); goto failed; } - sb->s_root = 
d_alloc_root(inode); + sb->s_root = d_make_root(inode); if (!sb->s_root) { ret = -ENOMEM; - goto dalloc_failed; + goto failed; } ufs_setup_cstotal(sb); @@ -1181,8 +1181,6 @@ magic_found: UFSD("EXIT\n"); return 0; -dalloc_failed: - iput(inode); failed: if (ubh) ubh_brelse_uspi (uspi); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0e4c5c017fba..baf40e378d35 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1362,10 +1362,10 @@ xfs_fs_fill_super( error = EINVAL; goto out_syncd_stop; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { error = ENOMEM; - goto out_iput; + goto out_syncd_stop; } return 0; @@ -1384,8 +1384,6 @@ xfs_fs_fill_super( out: return -error; - out_iput: - iput(root); out_syncd_stop: xfs_syncd_stop(mp); out_unmount: diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 86ee272de210..28bd64ddeda3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -188,30 +188,20 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct ipc_namespace *ns = data; - int error; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = MQUEUE_MAGIC; sb->s_op = &mqueue_super_ops; - inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, - NULL); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto out; - } + inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); - sb->s_root = d_alloc_root(inode); - if (!sb->s_root) { - iput(inode); - error = -ENOMEM; - goto out; - } - error = 0; - -out: - return error; + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + return 0; } static struct dentry *mqueue_mount(struct file_system_type *fs_type, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..711c1a30ceaa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb) struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - struct dentry *dentry; if (!inode) return -ENOMEM; @@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb) inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); - dentry = d_alloc_root(inode); - if (!dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) return -ENOMEM; - } - sb->s_root = dentry; /* for everything else we want ->d_op set */ sb->s_d_op = &cgroup_dops; return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 269d049294ab..154243f0a27c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2232,14 +2232,12 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) goto failed; inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) - goto failed_iput; + goto failed; sb->s_root = root; return 0; -failed_iput: - iput(inode); failed: shmem_put_super(sb); return err; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 63a7a7add21e..7d6dd6efbdbe 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1033,13 +1033,9 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | 0755); - if (!inode) - return -ENOMEM; - sb->s_root = root = d_alloc_root(inode); - if (!root) { - iput(inode); + sb->s_root = root = d_make_root(inode); + if (!root) return -ENOMEM; - } if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; return 0; -- cgit v1.2.3-55-g7522 From 66b3fad3f4c535c92b6a1184d535a97d6aa5d82a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Mar 2012 21:48:20 -0400 Subject: constify path argument of audit_log_d_path() Signed-off-by: Al Viro --- include/linux/audit.h | 2 +- kernel/audit.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 9ff7a2c48b50..ed3ef1972496 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -684,7 +684,7 @@ extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path); + const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_lost(const char *message); diff --git a/kernel/audit.c b/kernel/audit.c index bb0eb5bb9a0a..1c7f2c61416b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path) + const struct path *path) { char *p, *pathname; -- cgit v1.2.3-55-g7522 From 38eff2892628fa5c4fc8962a17b7296f42833ebe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Mar 2012 21:51:10 -0400 Subject: constify path argument of trace_seq_path() Signed-off-by: Al Viro --- include/linux/trace_seq.h | 4 ++-- kernel/trace/trace_output.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 7dadc3df0c77..a32d86ec8bf2 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -44,7 +44,7 @@ extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len); extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); +extern int trace_seq_path(struct trace_seq *s, const struct path *path); #else /* CONFIG_TRACING */ static 
inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) @@ -88,7 +88,7 @@ static inline void *trace_seq_reserve(struct trace_seq *s, size_t len) { return NULL; } -static inline int trace_seq_path(struct trace_seq *s, struct path *path) +static inline int trace_seq_path(struct trace_seq *s, const struct path *path) { return 0; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..690987198ad7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) return ret; } -int trace_seq_path(struct trace_seq *s, struct path *path) +int trace_seq_path(struct trace_seq *s, const struct path *path) { unsigned char *p; -- cgit v1.2.3-55-g7522 From c3f0327f8e9d7a503f0d64573c311eddd61f197d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:33:48 -0700 Subject: mm: add rss counters consistency check Warn about non-zero rss counters at final mmdrop. This check will prevent recurrences of bugs such as that fixed in "mm: fix rss count leakage during migration". I didn't hide this check under CONFIG_VM_DEBUG because it is rather small and the rss counters cover the whole of page-table management, so this is a good invariant. Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c4f38a849436..a9e99f3c18e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -511,6 +511,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) return NULL; } +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif +} + /* * Allocate and initialize an mm_struct. */ @@ -538,9 +555,7 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(mm->pmd_huge_pte); -#endif + check_mm(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); -- cgit v1.2.3-55-g7522 From cc9a6c8776615f9c194ccf0b63a0aa5628235545 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating a large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86.
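For illustration, the read side then reduces to a seqcount retry loop; a minimal sketch using the API as changed in the diffs below, where try_alloc() is a placeholder for the real allocation attempt, not a kernel function:

	unsigned int cookie;
	struct page *page;

	do {
		cookie = get_mems_allowed();	/* read_seqcount_begin() */
		page = try_alloc();		/* may observe a mid-update nodemask */
	} while (!put_mems_allowed(cookie) && !page);
	/* retry only if the allocation failed and the cookie
	 * shows a concurrent mems_allowed update */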
The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were

	                       3.3.0-rc3            3.3.0-rc3
	                     rc3-vanilla       nobarrier-v2r1
	Clients 1 UserTime        0.07 (  0.00%)    0.08 (-14.19%)
	Clients 2 UserTime        0.07 (  0.00%)    0.07 (  2.72%)
	Clients 4 UserTime        0.08 (  0.00%)    0.07 (  3.29%)
	Clients 1 SysTime         0.70 (  0.00%)    0.65 (  6.65%)
	Clients 2 SysTime         0.85 (  0.00%)    0.82 (  3.65%)
	Clients 4 SysTime         1.41 (  0.00%)    1.41 (  0.32%)
	Clients 1 WallTime        0.77 (  0.00%)    0.74 (  4.19%)
	Clients 2 WallTime        0.47 (  0.00%)    0.45 (  3.73%)
	Clients 4 WallTime        0.38 (  0.00%)    0.37 (  1.58%)
	Clients 1 Flt/sec/cpu 497620.28 (  0.00%) 520294.53 (  4.56%)
	Clients 2 Flt/sec/cpu 414639.05 (  0.00%) 429882.01 (  3.68%)
	Clients 4 Flt/sec/cpu 257959.16 (  0.00%) 258761.48 (  0.31%)
	Clients 1 Flt/sec     495161.39 (  0.00%) 517292.87 (  4.47%)
	Clients 2 Flt/sec     820325.95 (  0.00%) 850289.77 (  3.65%)
	Clients 4 Flt/sec    1020068.93 (  0.00%) 1022674.06 (  0.26%)

	MMTests Statistics: duration
	Sys Time Running Test (seconds)       135.68   132.17
	User+Sys Time Running Test (seconds)  164.2    160.13
	Total Elapsed Time (seconds)          123.46   120.87

The overall improvement is small but the System CPU time is much improved, roughly in line with what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed, I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 47 ++++++++++++++++++++--------------------------- include/linux/init_task.h | 8 ++++++++ include/linux/sched.h | 2 +- kernel/cpuset.c | 43 ++++++++----------------------------------- kernel/fork.c | 1 + mm/filemap.c | 11 +++++++---- mm/hugetlb.c | 15 +++++++++++---- mm/mempolicy.c | 28 +++++++++++++++++++++------- mm/page_alloc.c | 33 +++++++++++++++++++++---------- mm/slab.c | 13 ++++++++----- mm/slub.c | 40 +++++++++++++++++++++++++--------------- mm/vmscan.c | 2 -- 12 files changed, 133 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e9eaec522655..7a7e5fd2a277 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); /* - * reading current mems_allowed and mempolicy in the fastpath must protected - * by get_mems_allowed() + * get_mems_allowed is required when making decisions involving mems_allowed + * such as during page allocation.
mems_allowed can be updated in parallel + * and depending on the new value an operation can fail potentially causing + * process failure. A retry loop with get_mems_allowed and put_mems_allowed + * prevents these artificial failures. */ -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { - current->mems_allowed_change_disable++; - - /* - * ensure that reading mems_allowed and mempolicy happens after the - * update of ->mems_allowed_change_disable. - * - * the write-side task finds ->mems_allowed_change_disable is not 0, - * and knows the read-side task is reading mems_allowed or mempolicy, - * so it will clear old bits lazily. - */ - smp_mb(); + return read_seqcount_begin(¤t->mems_allowed_seq); } -static inline void put_mems_allowed(void) +/* + * If this returns false, the operation that took place after get_mems_allowed + * may have failed. It is up to the caller to retry the operation if + * appropriate. + */ +static inline bool put_mems_allowed(unsigned int seq) { - /* - * ensure that reading mems_allowed and mempolicy before reducing - * mems_allowed_change_disable. - * - * the write-side task will know that the read-side task is still - * reading mems_allowed or mempolicy, don't clears old bits in the - * nodemask. - */ - smp_mb(); - --ACCESS_ONCE(current->mems_allowed_change_disable); + return !read_seqcount_retry(¤t->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { task_lock(current); + write_seqcount_begin(¤t->mems_allowed_seq); current->mems_allowed = nodemask; + write_seqcount_end(¤t->mems_allowed_seq); task_unlock(current); } @@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { + return 0; } -static inline void put_mems_allowed(void) +static inline bool put_mems_allowed(unsigned int seq) { + return true; } #endif /* !CONFIG_CPUSETS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f994d51f70f2..e4baff5f7ff4 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -29,6 +29,13 @@ extern struct fs_struct init_fs; #define INIT_GROUP_RWSEM(sig) #endif +#ifdef CONFIG_CPUSETS +#define INIT_CPUSET_SEQ \ + .mems_allowed_seq = SEQCNT_ZERO, +#else +#define INIT_CPUSET_SEQ +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ @@ -192,6 +199,7 @@ extern struct cred init_cred; INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_CPUSET_SEQ \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index e074e1e54f85..0c147a4260a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1514,7 +1514,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ - int mems_allowed_change_disable; + seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5d575836dba6..1010cc61931f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; -repeat: /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. 
@@ -983,45 +982,19 @@ repeat: */ need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. - * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); + if (need_loop) + write_seqcount_begin(&tsk->mems_allowed_seq); - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. - * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + + if (need_loop) + write_seqcount_end(&tsk->mems_allowed_seq); + task_unlock(tsk); } diff --git a/kernel/fork.c b/kernel/fork.c index a9e99f3c18e0..9cc227d54102 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1237,6 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; + seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; diff --git a/mm/filemap.c b/mm/filemap.c index f3230604006c..843042045dc9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -499,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp) struct page *page; if (cpuset_do_page_mem_spread()) { - get_mems_allowed(); - n = cpuset_mem_spread_node(); - page = alloc_pages_exact_node(n, gfp, 0); - put_mems_allowed(); + unsigned int cpuset_mems_cookie; + do { + cpuset_mems_cookie = get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + return page; } return alloc_pages(gfp, 0); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 62f9fada4d6d..b1c314877334 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -454,14 +454,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - struct page *page = NULL; + struct page *page; struct mempolicy *mpol; nodemask_t *nodemask; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + unsigned int cpuset_mems_cookie; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol, &nodemask); /* @@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } } } -err: + mpol_cond_put(mpol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; + +err: + mpol_cond_put(mpol); + return 
NULL; } static void update_and_free_page(struct hstate *h, struct page *page) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 71e1a523e209..cfb6c8678754 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1850,18 +1850,24 @@ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { - struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct mempolicy *pol; struct zonelist *zl; struct page *page; + unsigned int cpuset_mems_cookie; + +retry_cpuset: + pol = get_vma_policy(current, vma, addr); + cpuset_mems_cookie = get_mems_allowed(); - get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } zl = policy_zonelist(gfp, pol, node); @@ -1872,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } /* @@ -1880,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, */ page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } @@ -1907,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; struct page *page; + unsigned int cpuset_mems_cookie; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -1922,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 673596ad9c80..40de6854b980 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2380,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; - struct page *page; + struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); + unsigned int cpuset_mems_cookie; gfp_mask &= gfp_allowed_mask; @@ -2400,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask ? 
: &cpuset_current_mems_allowed, &preferred_zone); - if (!preferred_zone) { - put_mems_allowed(); - return NULL; - } + if (!preferred_zone) + goto out; /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2418,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); + +out: + /* + * When updating a task's mems_allowed, it is possible to race with + * parallel threads in such a way that an allocation can fail while + * the mask is being updated. If a page allocation is about to fail, + * check if the cpuset changed during allocation and if so, retry. + */ + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2634,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) bool skip_free_areas_node(unsigned int flags, int nid) { bool ret = false; + unsigned int cpuset_mems_cookie; if (!(flags & SHOW_MEM_FILTER_NODES)) goto out; - get_mems_allowed(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - put_mems_allowed(); + do { + cpuset_mems_cookie = get_mems_allowed(); + ret = !node_isset(nid, cpuset_current_mems_allowed); + } while (!put_mems_allowed(cpuset_mems_cookie)); out: return ret; } diff --git a/mm/slab.c b/mm/slab.c index f0bd7857ab3b..29c8716eb7a9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_mem_id(); - get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); - put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; int nid; + unsigned int cpuset_mems_cookie; if (flags & __GFP_THISNODE) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + retry: /* * Look through allowed nodes for objects available @@ -3372,7 +3373,9 @@ retry: } } } - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index 4907563ef7ff..f4a6229848fd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *object; + unsigned int cpuset_mems_cookie; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - struct kmem_cache_node *n; - - n 
= get_node(s, zone_to_nid(zone)); - - if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c); - if (object) { - put_mems_allowed(); - return object; + do { + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(zone)); + + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { + object = get_partial_node(s, n, c); + if (object) { + /* + * Return the object even if + * put_mems_allowed indicated that + * the cpuset mems_allowed was + * updated in parallel. It's a + * harmless race between the alloc + * and the cpuset update. + */ + put_mems_allowed(cpuset_mems_cookie); + return object; + } } } - } - put_mems_allowed(); + } while (!put_mems_allowed(cpuset_mems_cookie)); #endif return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 440af1d899b9..55d86c9506f3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2343,7 +2343,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long writeback_threshold; bool aborted_reclaim; - get_mems_allowed(); delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2407,7 +2406,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, out: delayacct_freepages_end(); - put_mems_allowed(); if (sc->nr_reclaimed) return sc->nr_reclaimed; -- cgit v1.2.3-55-g7522 From 05af2e104a0c282dcd9303431e1360750ba76de6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:13 -0700 Subject: mm, counters: remove task argument to sync_mm_rss() and __sync_task_rss_stat() sync_mm_rss() can only be used for current to avoid race conditions in iterating and clearing its per-task counters. Remove the task argument for it and its helper function, __sync_task_rss_stat(), to avoid thinking it can be used safely for anything other than current. 
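The race being avoided, sketched as an assumed interleaving for illustration (counter names as in the mm/memory.c diff below; T stands for some other, still-running task): the per-task deltas are plain integers updated locklessly by their owning task, so iterate-and-clear from anyone else can lose updates:

	/* CPU0: syncing task T's counters      |  CPU1: task T takes a fault */
	int x = T->rss_stat.count[MM_ANONPAGES];    /* reads 5                  */
	                                            /* T: count[...]++, now 6   */
	add_mm_counter(T->mm, MM_ANONPAGES, x);     /* folds 5 into the mm      */
	T->rss_stat.count[MM_ANONPAGES] = 0;        /* T's increment is lost    */

Restricting the API to current makes this interleaving impossible by construction.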
Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm.h | 4 ++-- kernel/exit.c | 2 +- mm/memory.c | 18 +++++++++--------- mm/mmu_context.c | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3908544f5d18..6ed164d20d7d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -824,7 +824,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(tsk, old_mm); + sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/include/linux/mm.h b/include/linux/mm.h index df17ff23d50e..ce2b2a3b2876 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, } #if defined(SPLIT_RSS_COUNTING) -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); +void sync_mm_rss(struct mm_struct *mm); #else -static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +static inline void sync_mm_rss(struct mm_struct *mm) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index 0ed15fed579f..d26acd3c1e2e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -934,7 +934,7 @@ void do_exit(long code) acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) - sync_mm_rss(tsk, tsk->mm); + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/mm/memory.c b/mm/memory.c index a5de734e14a7..2d27239ce4dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -125,17 +125,17 @@ core_initcall(init_zero_pfn); #if defined(SPLIT_RSS_COUNTING) -static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +static void __sync_task_rss_stat(struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { - if (task->rss_stat.count[i]) { - add_mm_counter(mm, i, task->rss_stat.count[i]); - task->rss_stat.count[i] = 0; + if (current->rss_stat.count[i]) { + add_mm_counter(mm, i, current->rss_stat.count[i]); + current->rss_stat.count[i] = 0; } } - task->rss_stat.events = 0; + current->rss_stat.events = 0; } static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) @@ -157,12 +157,12 @@ static void check_sync_rss_stat(struct task_struct *task) if (unlikely(task != current)) return; if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - __sync_task_rss_stat(task, task->mm); + __sync_task_rss_stat(task->mm); } -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +void sync_mm_rss(struct mm_struct *mm) { - __sync_task_rss_stat(task, mm); + __sync_task_rss_stat(mm); } #else /* SPLIT_RSS_COUNTING */ @@ -643,7 +643,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) int i; if (current->mm == mm) - sync_mm_rss(current, mm); + sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index cf332bc0080a..3dcfaf4ed355 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - sync_mm_rss(tsk, mm); + sync_mm_rss(mm); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); -- cgit v1.2.3-55-g7522 From 42aee6c495e07dba7410b863a360db6bb9ec6d66 Mon Sep 17 00:00:00 2001 
From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:21 -0700 Subject: cgroup: revert ss_id_lock to spinlock Commit c1e2ee2dc436 ("memcg: replace ss->id_lock with a rwlock") has now been seen to cause the unfair behavior we should have expected from converting a spinlock to an rwlock: softlockup in cgroup_mkdir(), whose get_new_cssid() is waiting for the wlock, while there are 19 tasks using the rlock in css_get_next() to get on with their memcg workload (in an artificial test, admittedly). Yet lib/idr.c was made suitable for RCU way back: revert that commit, restoring ss->id_lock to a spinlock. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Eric Dumazet Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 501adb1b2f43..5a85b3415c1b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -498,7 +498,7 @@ struct cgroup_subsys { struct list_head sibling; /* used when use_id == true */ struct idr idr; - rwlock_t id_lock; + spinlock_t id_lock; /* should be defined only by modular subsystems */ struct module *module; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c6877fe9a831..8eb90f25bd7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4885,9 +4885,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4913,10 +4913,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4931,9 +4931,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4945,7 +4945,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - rwlock_init(&ss->id_lock); + spin_lock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5040,9 +5040,9 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - read_lock(&ss->id_lock); + spin_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - read_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); if (!tmp) break; -- cgit v1.2.3-55-g7522 From ca464d69b19120a826aa2534de2511a6f542edf5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:21 -0700 Subject: memcg: let css_get_next() rely upon rcu_read_lock() Remove lock and unlock around css_get_next()'s call to idr_get_next(). memcg iterators (only users of css_get_next) already did rcu_read_lock(), and its comment demands that; but add a WARN_ON_ONCE to make sure of it. 
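The iteration pattern this relies on, sketched from the memcg usage the message refers to (signature as in kernel/cgroup.c of this era; ss and root_css stand for the caller's subsystem and root state, and the loop body is condensed):

	struct cgroup_subsys_state *css;
	int found, id = 1;

	rcu_read_lock();	/* now asserted by the WARN_ON_ONCE added below */
	for (;;) {
		css = css_get_next(ss, id, root_css, &found);
		if (!css)
			break;
		/* use css under RCU; idr_get_next() itself is RCU-safe */
		id = found + 1;
	}
	rcu_read_unlock();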
Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Eric Dumazet Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8eb90f25bd7b..391d5e991e5f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5033,6 +5033,8 @@ css_get_next(struct cgroup_subsys *ss, int id, return NULL; BUG_ON(!ss->use_id); + WARN_ON_ONCE(!rcu_read_lock_held()); + /* fill start point for scan */ tmpid = id; while (1) { @@ -5040,10 +5042,7 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - spin_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - spin_unlock(&ss->id_lock); - if (!tmp) break; if (tmp->depth >= depth && tmp->stack[depth] == rootid) { -- cgit v1.2.3-55-g7522 From 9fbe465efc76044dd87afe764db5464ae61aeabc Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 16 Mar 2012 13:17:13 +0100 Subject: kgdb: Respect that flush op is optional Not all kgdb I/O drivers implement a flush operation. Adjust gdbstub_exit accordingly. Signed-off-by: Jan Kiszka Signed-off-by: Jason Wessel --- kernel/debug/gdbstub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index c22d8c28ad84..5a155742ae96 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1129,5 +1129,6 @@ void gdbstub_exit(int status) dbg_io_ops->write_char(hex_asc_lo(checksum)); /* make sure the output is flushed, lest the bootloader clobber it */ - dbg_io_ops->flush(); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); } -- cgit v1.2.3-55-g7522 From 2366e047840e33928803c0442176fb3991423da8 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 16 Mar 2012 14:20:41 -0500 Subject: kgdb,debug-core,gdbstub: Hook the reboot notifier for debugger detach The gdbstub and kdb should get detached if the system is rebooting. Calling gdbstub_exit() will set the proper debug core state and send a message to any debugger that is connected to correctly detach. An attached debugger will receive the exit code from include/linux/reboot.h based on SYS_HALT, SYS_REBOOT, etc... 
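For reference, the detach message is the GDB remote serial protocol exit-status reply; a sketch of the framing gdbstub_exit() performs, mirroring the buffer and checksum handling visible in the gdbstub.c diff below (status is the exit code passed in):

	/* "$W<hh>#<cs>": exited with status <hh>; <cs> = sum of payload bytes mod 256 */
	unsigned char buf[3] = { 'W', hex_asc_hi(status), hex_asc_lo(status) };
	unsigned char csum = buf[0] + buf[1] + buf[2];

	dbg_io_ops->write_char('$');
	dbg_io_ops->write_char(buf[0]);
	dbg_io_ops->write_char(buf[1]);
	dbg_io_ops->write_char(buf[2]);
	dbg_io_ops->write_char('#');
	dbg_io_ops->write_char(hex_asc_hi(csum));
	dbg_io_ops->write_char(hex_asc_lo(csum));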
Reported-by: Jan Kiszka Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 17 +++++++++++++++++ kernel/debug/gdbstub.c | 7 +++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0d7c08784efb..3c1ad4e03543 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -41,6 +41,7 @@ #include #include #include +#include <linux/reboot.h> #include #include #include @@ -784,6 +785,20 @@ void __init dbg_late_init(void) kdb_init(KDB_INIT_FULL); } +static int +dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) +{ + if (!dbg_kdb_mode) + gdbstub_exit(code); + return NOTIFY_DONE; +} + +static struct notifier_block dbg_reboot_notifier = { + .notifier_call = dbg_notify_reboot, + .next = NULL, + .priority = INT_MAX, +}; + static void kgdb_register_callbacks(void) { if (!kgdb_io_module_registered) { @@ -791,6 +806,7 @@ static void kgdb_register_callbacks(void) kgdb_arch_init(); if (!dbg_is_early) kgdb_arch_late(); + register_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_register(&panic_notifier_list, &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ @@ -812,6 +828,7 @@ static void kgdb_unregister_callbacks(void) */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; + unregister_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_unregister(&panic_notifier_list, &kgdb_panic_event_nb); kgdb_arch_exit(); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 5a155742ae96..ce615e064482 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1111,6 +1111,13 @@ void gdbstub_exit(int status) unsigned char checksum, ch, buffer[3]; int loop; + if (!kgdb_connected) + return; + kgdb_connected = 0; + + if (!dbg_io_ops || dbg_kdb_mode) + return; + buffer[0] = 'W'; buffer[1] = hex_asc_hi(status); buffer[2] = hex_asc_lo(status); -- cgit v1.2.3-55-g7522 From 8f30d411767351656ea62c9e7612120f9b870b59 Mon Sep 17 00:00:00 2001 From: Andrei Warkentin Date: Tue, 28 Feb 2012 06:55:05 -0600 Subject: KDB: Fix usability issues relating to the 'enter' key. This fixes the following problems:
1) Typematic-repeat of 'enter' gives a warning message and leaks make/break if KDB exits. Repeats look something like 0x1c 0x1c .... 0x9c
2) Use of 'keypad enter' gives a warning message and leaks the ENTER break/make code out if KDB exits. KP ENTER repeats look something like 0xe0 0x1c 0xe0 0x1c ... 0xe0 0x9c.
3) Lag on the order of seconds between "break" and "make" when expecting the enter "break" code. Seen under virtualized environments such as VMware ESX.
The existing special enter handler tries to glob the enter break code, but this fails if the other (KP) enter was used, or if there was a key repeat. It also fails if you mashed some keys along with enter, and you ended up with a non-enter make or non-enter break code coming after the enter make code. So first, we modify the handler to handle these cases. But performing these actions on every enter is annoying, since now you can't hold ENTER down to scroll d messages in KDB. This special behaviour is only necessary to handle exiting KDB ('g' + ENTER) without leaking scancodes to the OS, so this cleanup needs to be executed any time the kdb_main loop exits. Tested on QEMU. Set a bp on atkbd.c to verify no scan code was leaked.
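The scancode relationships behind these sequences, for orientation (PS/2 scancode set 1; a background sketch, not code from the patch):

	#define PS2_EXT_PREFIX	0xe0	/* prefix for extended keys such as KP ENTER */
	#define PS2_BREAK_BIT	0x80	/* break (release) = make (press) | 0x80 */

	/* ENTER:    make 0x1c, break 0x9c (0x1c | 0x80) */
	/* KP ENTER: make 0xe0 0x1c, break 0xe0 0x9c     */
	static inline int is_enter_break(unsigned char sc)
	{
		return sc == (0x1c | PS2_BREAK_BIT);	/* == 0x9c, as tested in the patch */
	}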
Cc: Andrei Warkentin [jason.wessel@windriver.com: move cleanup calls to kdb_main.c] Signed-off-by: Andrei Warkentin Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_keyboard.c | 95 +++++++++++++++++++++++++++++++---------- kernel/debug/kdb/kdb_main.c | 3 ++ kernel/debug/kdb/kdb_private.h | 7 +++ 3 files changed, 83 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 4bca634975c0..118527aa60ea 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -25,6 +25,7 @@ #define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ static int kbd_exists; +static int kbd_last_ret; /* * Check if the keyboard controller has a keypress for us. @@ -90,8 +91,11 @@ int kdb_get_kbd_char(void) return -1; } - if ((scancode & 0x80) != 0) + if ((scancode & 0x80) != 0) { + if (scancode == 0x9c) + kbd_last_ret = 0; return -1; + } scancode &= 0x7f; @@ -178,35 +182,82 @@ int kdb_get_kbd_char(void) return -1; /* ignore unprintables */ } - if ((scancode & 0x7f) == 0x1c) { - /* - * enter key. All done. Absorb the release scancode. - */ + if (scancode == 0x1c) { + kbd_last_ret = 1; + return 13; + } + + return keychar & 0xff; +} +EXPORT_SYMBOL_GPL(kdb_get_kbd_char); + +/* + * Best effort cleanup of ENTER break codes on leaving KDB. Called on + * exiting KDB, when we know we processed an ENTER or KP ENTER scan + * code. + */ +void kdb_kbd_cleanup_state(void) +{ + int scancode, scanstatus; + + /* + * Nothing to clean up, since either + * ENTER was never pressed, or has already + * gotten cleaned up. + */ + if (!kbd_last_ret) + return; + + kbd_last_ret = 0; + /* + * Enter key. Need to absorb the break code here, lest it gets + * leaked out if we exit KDB as the result of processing 'g'. + * + * This has several interesting implications: + * + Need to handle KP ENTER, which has break code 0xe0 0x9c. + * + Need to handle repeat ENTER and repeat KP ENTER. Repeats + * only get a break code at the end of the repeated + * sequence. This means we can't propagate the repeated key + * press, and must swallow it away. + * + Need to handle possible PS/2 mouse input. + * + Need to handle mashed keys. + */ + + while (1) { while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) - ; + cpu_relax(); /* - * Fetch the scancode + * Fetch the scancode. */ scancode = inb(KBD_DATA_REG); scanstatus = inb(KBD_STATUS_REG); - while (scanstatus & KBD_STAT_MOUSE_OBF) { - scancode = inb(KBD_DATA_REG); - scanstatus = inb(KBD_STATUS_REG); - } + /* + * Skip mouse input. + */ + if (scanstatus & KBD_STAT_MOUSE_OBF) + continue; - if (scancode != 0x9c) { - /* - * Wasn't an enter-release, why not? - */ - kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", - scancode, scanstatus); - } + /* + * If we see 0xe0, this is either a break code for KP + * ENTER, or a repeat make for KP ENTER. Either way, + * since the second byte is equivalent to an ENTER, + * skip the 0xe0 and try again. + * + * If we see 0x1c, this must be a repeat ENTER or KP + * ENTER (and we swallowed 0xe0 before). Try again. + * + * We can also see make and break codes for other keys + * mashed before or after pressing ENTER. Thus, if we + * see anything other than 0x9c, we have to try again. + * + * Note, if you held some key as ENTER was depressed, + * that break code would get leaked out. 
+ */ + if (scancode != 0x9c) + continue; - return 13; + return; } - - return keychar & 0xff; } -EXPORT_SYMBOL_GPL(kdb_get_kbd_char); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index e2ae7349437f..67b847dfa2bb 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, if (KDB_STATE(DOING_SS)) KDB_STATE_CLEAR(SSBPT); + /* Clean up any keyboard devices before leaving */ + kdb_kbd_cleanup_state(); + return result; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index e381d105b40b..47c4e56e513b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -246,6 +246,13 @@ extern void debug_kusage(void); extern void kdb_set_current_task(struct task_struct *); extern struct task_struct *kdb_current_task; + +#ifdef CONFIG_KDB_KEYBOARD +extern void kdb_kbd_cleanup_state(void); +#else /* ! CONFIG_KDB_KEYBOARD */ +#define kdb_kbd_cleanup_state() +#endif /* ! CONFIG_KDB_KEYBOARD */ + #ifdef CONFIG_MODULES extern struct list_head *kdb_modules; #endif /* CONFIG_MODULES */ -- cgit v1.2.3-55-g7522 From bec4d62ead8096e433d624d9339893f50badd992 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 19 Mar 2012 19:35:55 -0500 Subject: kgdb,debug_core: add the ability to control the reboot notifier Sometimes it is desirable to stop the kernel debugger before allowing a system to reboot either with kdb or kgdb. This patch adds the ability to turn the reboot notifier on and off or enter the debugger and stop kernel execution before rebooting. It is possible to change the setting after booting the kernel with the following: echo 1 > /sys/module/debug_core/parameters/kgdbreboot It is also possible to change this setting using kdb / kgdb to manipulate the variable directly. Using KDB: mm kgdbreboot 1 Using gdb: set kgdbreboot=1 Reported-by: Jan Kiszka Signed-off-by: Jason Wessel --- Documentation/DocBook/kgdb.tmpl | 17 +++++++++++++++++ kernel/debug/debug_core.c | 16 ++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index d71b57fcf116..4ee4ba3509fc 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl @@ -361,6 +361,23 @@ It is possible to use this option with kgdboc on a tty that is not a system console. + + + Run time parameter: kgdbreboot + The kgdbreboot feature allows you to change how the debugger + deals with the reboot notification. You have 3 choices for the + behavior. The default behavior is always set to 0. + + echo -1 > /sys/module/debug_core/parameters/kgdbreboot + Ignore the reboot notification entirely. + + echo 0 > /sys/module/debug_core/parameters/kgdbreboot + Send the detach message to any attached debugger client. + + echo 1 > /sys/module/debug_core/parameters/kgdbreboot + Enter the debugger on reboot notify. 
+ + diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 3c1ad4e03543..3f88a45e6f0a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -76,6 +76,8 @@ static int exception_level; struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); +/* Action for the reboot notifiter, a global allow kdb to change it */ +static int kgdbreboot; /* kgdb console driver is loaded */ static int kgdb_con_registered; /* determine if kgdb console output should be used */ @@ -97,6 +99,7 @@ static int __init opt_kgdb_con(char *str) early_param("kgdbcon", opt_kgdb_con); module_param(kgdb_use_con, int, 0644); +module_param(kgdbreboot, int, 0644); /* * Holds information about breakpoints in a kernel. These breakpoints are @@ -788,8 +791,21 @@ void __init dbg_late_init(void) static int dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { + /* + * Take the following action on reboot notify depending on value: + * 1 == Enter debugger + * 0 == [the default] detatch debug client + * -1 == Do nothing... and use this until the board resets + */ + switch (kgdbreboot) { + case 1: + kgdb_breakpoint(); + case -1: + goto done; + } if (!dbg_kdb_mode) gdbstub_exit(code); +done: return NOTIFY_DONE; } -- cgit v1.2.3-55-g7522 From b8adde8ddec9ff62a21564fa8020b5463e70d4de Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Wed, 21 Sep 2011 13:19:12 -0700 Subject: kdb: Avoid using dbg_io_ops until it is initialized This fixes a bug with setting a breakpoint during kdb initialization (from kdb_cmds). Any call to kdb_printf() before the initialization of the kgdboc serial console driver (which happens much later during bootup than kdb_init) results in a kernel panic due to the use of dbg_io_ops before it is initialized. Signed-off-by: Tim Bird Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 4802eb5840e1..9b5f17da1c56 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -689,7 +689,7 @@ kdb_printit: if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(kdb_buffer, retlen); } else { - if (!dbg_io_ops->is_console) { + if (dbg_io_ops && !dbg_io_ops->is_console) { len = strlen(kdb_buffer); cp = kdb_buffer; while (len--) { -- cgit v1.2.3-55-g7522 From 1ba0c1720eb0de2d0f3abf84c0b128d10af520d1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Sep 2011 13:07:47 -0700 Subject: kdb: Add message about CONFIG_DEBUG_RODATA on failure to install breakpoint On x86, if CONFIG_DEBUG_RODATA is set, one cannot set breakpoints via KDB. Apparently this is a well-known problem, as at least one distribution now ships with both KDB enabled and CONFIG_DEBUG_RODATA=y for security reasons. This patch adds a printk message to the breakpoint failure case, in order to provide suggestions about how to use the debugger.
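Background on why CONFIG_DEBUG_RODATA gets in the way (a conceptual sketch, not the kdb code from the patch): a software breakpoint is armed by writing a trap opcode over the target instruction, and that write fails once kernel text is mapped read-only; hardware breakpoints ("bph") program CPU debug registers instead and need no text write:

	/* x86: arm a software breakpoint at addr, conceptually */
	static const unsigned char break_instr = 0xcc;	/* int3 */
	err = probe_kernel_write(addr, &break_instr, 1);
	/* with CONFIG_DEBUG_RODATA=y the text page is RO, so this
	 * fails with -EFAULT and the kdb_printf below fires */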
Reported-by: Tim Bird Signed-off-by: Jason Wessel Acked-by: Tim Bird --- kernel/debug/kdb/kdb_bp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 20059ef4459a..8418c2f8ec5d 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); +#ifdef CONFIG_DEBUG_RODATA + if (!bp->bp_type) { + kdb_printf("Software breakpoints are unavailable.\n" + " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " OR use hw breaks: help bph\n"); + } +#endif return 1; } return 0; -- cgit v1.2.3-55-g7522 From ebec18a6d3aa1e7d84aab16225e87fd25170ec2b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Mar 2012 15:01:54 -0700 Subject: prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process supervision Userspace service managers/supervisors need to track their started services. Many services daemonize by double-forking and get implicitly re-parented to PID 1. The service manager will no longer be able to receive the SIGCHLD signals for them, and is no longer in charge of reaping the children with wait(). All information about the children is lost at the moment PID 1 cleans up the re-parented processes. With this prctl, a service manager process can mark itself as a sort of 'sub-init', able to stay as the parent for all orphaned processes created by the started services. All SIGCHLD signals will be delivered to the service manager. Receiving SIGCHLD and doing wait() is, in the case of a service manager, much preferred over any possible asynchronous notification about specific PIDs, because the service manager has full access to the child process data in /proc and the PID can not be re-used until the wait(), which the service manager itself is in charge of, has happened. As a side effect, the relevant parent PID information does not get lost by a double-fork, which results in a more elaborate process tree and 'ps' output:

before:

	# ps afx
	253 ?   Ss   0:00 /bin/dbus-daemon --system --nofork
	294 ?   Sl   0:00 /usr/libexec/polkit-1/polkitd
	328 ?   S    0:00 /usr/sbin/modem-manager
	608 ?   Sl   0:00 /usr/libexec/colord
	658 ?   Sl   0:00 /usr/libexec/upowerd
	819 ?   Sl   0:00 /usr/libexec/imsettings-daemon
	916 ?   Sl   0:00 /usr/libexec/udisks-daemon
	917 ?   S    0:00  \_ udisks-daemon: not polling any devices

after:

	# ps afx
	294 ?   Ss   0:00 /bin/dbus-daemon --system --nofork
	426 ?   Sl   0:00  \_ /usr/libexec/polkit-1/polkitd
	449 ?   S    0:00  \_ /usr/sbin/modem-manager
	635 ?   Sl   0:00  \_ /usr/libexec/colord
	705 ?   Sl   0:00  \_ /usr/libexec/upowerd
	959 ?   Sl   0:00  \_ /usr/libexec/udisks-daemon
	960 ?   S    0:00  |   \_ udisks-daemon: not polling any devices
	977 ?   Sl   0:00  \_ /usr/libexec/packagekitd

This prctl is orthogonal to PID namespaces. PID namespaces are isolated from each other, while a service management process usually requires the services to live in the same namespace, to be able to talk to each other. Users of this will be the systemd per-user instance, which provides init-like functionality for the user's login session, and D-Bus, which activates bus services on-demand. Both need init-like capabilities to be able to properly keep track of the services they start. Many thanks to Oleg for several rounds of review and insights.
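A minimal userspace sketch of the new interface (constants as added in the include/linux/prctl.h diff below; error handling elided):

	#include <sys/prctl.h>
	#include <sys/wait.h>

	#ifndef PR_SET_CHILD_SUBREAPER
	#define PR_SET_CHILD_SUBREAPER 36	/* value from this patch */
	#endif

	int status;

	/* mark this service manager as sub-init: its orphaned
	 * descendants re-parent here instead of to PID 1 */
	prctl(PR_SET_CHILD_SUBREAPER, 1);

	/* double-forked daemons now deliver SIGCHLD here and
	 * can be reaped with an ordinary wait loop */
	while (waitpid(-1, &status, 0) > 0)
		;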
[akpm@linux-foundation.org: fix comment layout and spelling] [akpm@linux-foundation.org: add lengthy code comment from Oleg] Reviewed-by: Oleg Nesterov Signed-off-by: Lennart Poettering Signed-off-by: Kay Sievers Acked-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 3 +++ include/linux/sched.h | 12 ++++++++++++ kernel/exit.c | 33 ++++++++++++++++++++++++++++----- kernel/fork.c | 3 +++ kernel/sys.c | 8 ++++++++ 5 files changed, 54 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a0413ac3abe8..e0cfec2490aa 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -121,4 +121,7 @@ #define PR_SET_PTRACER 0x59616d61 # define PR_SET_PTRACER_ANY ((unsigned long)-1) +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 0c147a4260a5..0c3854b0d4b1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -553,6 +553,18 @@ struct signal_struct { int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ + /* + * PR_SET_CHILD_SUBREAPER marks a process, like a service + * manager, to re-parent orphan (double-forking) child processes + * to this process instead of 'init'. The service manager is + * able to receive SIGCHLD signals and is able to investigate + * the process until it calls wait(). All children of this + * process will inherit a flag if they should look for a + * child_subreaper process at exit. + */ + unsigned int is_child_subreaper:1; + unsigned int has_child_subreaper:1; + /* POSIX.1b Interval Timers */ struct list_head posix_timers; diff --git a/kernel/exit.c b/kernel/exit.c index 16b07bfac224..456329fd4ea3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk) } /* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. + * When we die, we re-parent all our children, and try to: + * 1. give them to another thread in our thread group, if such a member exists + * 2. give them to the first ancestor process which prctl'd itself as a + * child_subreaper for its children (like a service manager) + * 3. give them to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father) __releases(&tasklist_lock) @@ -722,6 +722,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * forget_original_parent() must move them somewhere. */ pid_ns->child_reaper = init_pid_ns.child_reaper; + } else if (father->signal->has_child_subreaper) { + struct task_struct *reaper; + + /* + * Find the first ancestor marked as child_subreaper. + * Note that the code below checks same_thread_group(reaper, + * pid_ns->child_reaper). This is what we need to DTRT in a + * PID namespace.
However we still need the check above, see + * http://marc.info/?l=linux-kernel&m=131385460420380 + */ + for (reaper = father->real_parent; + reaper != &init_task; + reaper = reaper->real_parent) { + if (same_thread_group(reaper, pid_ns->child_reaper)) + break; + if (!reaper->signal->is_child_subreaper) + continue; + thread = reaper; + do { + if (!(thread->flags & PF_EXITING)) + return reaper; + } while_each_thread(reaper, thread); + } } return pid_ns->child_reaper; diff --git a/kernel/fork.c b/kernel/fork.c index 37674ec55cde..b9372a0bff18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; + sig->has_child_subreaper = current->signal->has_child_subreaper || + current->signal->is_child_subreaper; + mutex_init(&sig->cred_guard_mutex); return 0; diff --git a/kernel/sys.c b/kernel/sys.c index 888d227fd195..9eb7fcab8df6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + error = 0; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *) arg2); + break; default: error = -EINVAL; break; -- cgit v1.2.3-55-g7522 From 397a21f24d455982a8a6f9bc11b5f3326ce3c6ef Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:01:54 -0700 Subject: kernel/exit.c: if init dies, log a signal which killed it, if any I just received another user's pleas for help when their init mysteriously died. I again explained that they need to check whether it died because of a bad instruction, a segv, or something else. Which was an annoying detour into writing a trivial C program to spawn their init and print its exit code: http://lists.busybox.net/pipermail/busybox/2012-January/077172.html I hear you saying "just test it under /bin/sh". Well, the crashing init _was_ /bin/sh. Which prompted me to make the kernel do this first step automatically. We can print the exit code, which makes it possible to see that death was from e.g. SIGILL without writing test programs. [akpm@linux-foundation.org: add 0x to hex number output] Signed-off-by: Denys Vlasenko Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 456329fd4ea3..3db1909faed9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -711,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father) if (unlikely(pid_ns->child_reaper == father)) { write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: + father->exit_code); + } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); -- cgit v1.2.3-55-g7522 From 7a05c0f7bbae91d08b7d0acf016fdb42dbc912ae Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 23 Mar 2012 15:01:55 -0700 Subject: watchdog: make sure the watchdog thread gets CPU on loaded system If the system is loaded while hotplugging a CPU, we might end up with a bogus hardlockup detection.
This has been seen during an LTP pounder test executed in parallel with a hotplug test. The main problem is that watchdog_enable (called when a CPU is brought up) registers a perf event which periodically checks a per-cpu counter (hrtimer_interrupts), updated from a hrtimer callback, but the hrtimer is started from the watchdog kernel thread. This means that while we already do check for the hard lockup, the kernel thread might be sitting on the runqueue with zillions of tasks, so there is nobody to update the value we rely on, and so we KABOOM. Let's fix this by boosting the watchdog thread priority before we wake it up rather than when it's already running. This still doesn't handle a case where we have the same number of high prio FIFO tasks, but that doesn't seem to be common. The current implementation doesn't handle that case anyway, so this is not worse at least. Unfortunately, we cannot start the perf counter from the watchdog thread because we could miss a real lockup, and we also cannot start the hrtimer from watchdog_enable because there is no way (at least none I know of) to start a hrtimer from a different CPU. [dzickus@redhat.com: fix compile issue with param] Cc: Ingo Molnar Cc: Peter Zijlstra Reviewed-by: Mandeep Singh Baines Signed-off-by: Michal Hocko Signed-off-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14bc092fb12c..203fc6e1a285 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -319,11 +319,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - sched_setscheduler(current, SCHED_FIFO, &param); - /* initialize timestamp */ __touch_watchdog(); @@ -350,7 +348,6 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); - param.sched_priority = 0; sched_setscheduler(current, SCHED_NORMAL, &param); return 0; } @@ -439,6 +436,7 @@ static int watchdog_enable(int cpu) /* create the watchdog thread */ if (!p) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); @@ -450,6 +448,7 @@ static int watchdog_enable(int cpu) } goto out; } + sched_setscheduler(p, SCHED_FIFO, &param); kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; per_cpu(softlockup_watchdog, cpu) = p; -- cgit v1.2.3-55-g7522 From 4501980aae221ed8120dee3491f799ecd75187ad Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Mar 2012 15:01:55 -0700 Subject: kernel/watchdog.c: convert to pr_foo() It fixes some 80-col wordwrappings and adds some consistency. Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 203fc6e1a285..a01cb03b045a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,6 +9,8 @@ * to those contributors as well.
*/ +#define pr_fmt(fmt) "NMI watchdog: " fmt + #include #include #include @@ -373,18 +375,20 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (!IS_ERR(event)) { - printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + pr_info("enabled, takes one hw-pmu counter.\n"); goto out_save; } /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) - printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + pr_warning("disabled (cpu%i): hardware events not enabled\n", + cpu); else - printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ @@ -439,7 +443,7 @@ static int watchdog_enable(int cpu) struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { - printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + pr_err("softlockup watchdog for %i failed\n", cpu); if (!err) { /* if hardlockup hasn't already set this */ err = PTR_ERR(p); @@ -495,7 +499,7 @@ static void watchdog_enable_all_cpus(void) watchdog_enabled = 1; if (!watchdog_enabled) - printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + pr_err("failed to be enabled on some cpus\n"); } -- cgit v1.2.3-55-g7522 From b60f796c4ca72545327a069f12938360d833cce7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Mar 2012 15:01:56 -0700 Subject: kernel/watchdog.c: add comment to watchdog() exit path Revelation from Peter. Cc: Peter Zijlstra Cc: Don Zickus Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a01cb03b045a..df30ee08bdd4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -349,6 +349,10 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } + /* + * Drop the policy/priority elevation during thread exit to avoid a + * scheduling latency spike. + */ __set_current_state(TASK_RUNNING); sched_setscheduler(current, SCHED_NORMAL, &param); return 0; } -- cgit v1.2.3-55-g7522 From 8c5cf9e5c50dc902713897e10201aa71f3546aa1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:40 -0700 Subject: ptrace: don't modify flags on PTRACE_SETOPTIONS failure On ptrace(PTRACE_SETOPTIONS, pid, 0, <opts>), we used to set those option bits which are known, and then fail with -EINVAL if there are some unknown bits in <opts>. This is inconsistent with typical error handling, which does not change any state if input is invalid. This patch changes PTRACE_SETOPTIONS behavior so that in this case, we return -EINVAL and don't change any bits in task->ptrace. It's very unlikely that there is userspace code in the wild which will be affected by this change: it should have the form ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_BOGUSOPT) where PTRACE_O_BOGUSOPT is a constant unknown to the kernel.
But kernel headers, naturally, don't contain any PTRACE_O_BOGUSOPTs, thus the only way userspace can use one is if it defines one itself. I can't see why anyone would do such a thing deliberately. Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 00ab2ca5ed11..273f56ea39d2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -528,6 +528,9 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; + child->ptrace &= ~PT_TRACE_MASK; if (data & PTRACE_O_TRACESYSGOOD) @@ -551,7 +554,7 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & PTRACE_O_TRACEEXIT) child->ptrace |= PT_TRACE_EXIT; - return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; + return 0; } static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) -- cgit v1.2.3-55-g7522 From 86b6c1f301faf085de5a3f9ce16b8de6e69c729b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:41 -0700 Subject: ptrace: simplify PTRACE_foo constants and PTRACE_SETOPTIONS code Exchange PT_TRACESYSGOOD and PT_PTRACE_CAP bit positions, which makes PT_option bits contiguous and therefore makes code in ptrace_setoptions() much simpler. Every PTRACE_O_TRACEevent is defined to (1 << PTRACE_EVENT_event) instead of using explicit numeric constants, to ensure we don't mess up the relationship between bit positions and event ids. PT_EVENT_FLAG_SHIFT was not particularly useful; PT_OPT_FLAG_SHIFT, with a value of PT_EVENT_FLAG_SHIFT-1, is easier to use. The PT_TRACE_MASK constant is nuked; its only use is replaced by (PTRACE_O_MASK << PT_OPT_FLAG_SHIFT). Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 33 +++++++++++++++------------------ kernel/ptrace.c | 31 ++++++++----------------------- 2 files changed, 23 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 6fdb196caa3e..6f1260ee5be5 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -54,17 +54,6 @@ /* flags in @data for PTRACE_SEIZE */ #define PTRACE_SEIZE_DEVEL 0x80000000 /* temp flag for development */ -/* options set using PTRACE_SETOPTIONS */ -#define PTRACE_O_TRACESYSGOOD 0x00000001 -#define PTRACE_O_TRACEFORK 0x00000002 -#define PTRACE_O_TRACEVFORK 0x00000004 -#define PTRACE_O_TRACECLONE 0x00000008 -#define PTRACE_O_TRACEEXEC 0x00000010 -#define PTRACE_O_TRACEVFORKDONE 0x00000020 -#define PTRACE_O_TRACEEXIT 0x00000040 - -#define PTRACE_O_MASK 0x0000007f - /* Wait extended result codes for the above trace options.
*/ #define PTRACE_EVENT_FORK 1 #define PTRACE_EVENT_VFORK 2 @@ -74,6 +63,17 @@ #define PTRACE_EVENT_EXIT 6 #define PTRACE_EVENT_STOP 7 +/* options set using PTRACE_SETOPTIONS */ +#define PTRACE_O_TRACESYSGOOD 1 +#define PTRACE_O_TRACEFORK (1 << PTRACE_EVENT_FORK) +#define PTRACE_O_TRACEVFORK (1 << PTRACE_EVENT_VFORK) +#define PTRACE_O_TRACECLONE (1 << PTRACE_EVENT_CLONE) +#define PTRACE_O_TRACEEXEC (1 << PTRACE_EVENT_EXEC) +#define PTRACE_O_TRACEVFORKDONE (1 << PTRACE_EVENT_VFORK_DONE) +#define PTRACE_O_TRACEEXIT (1 << PTRACE_EVENT_EXIT) + +#define PTRACE_O_MASK 0x0000007f + #include #ifdef __KERNEL__ @@ -88,13 +88,12 @@ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ -#define PT_TRACESYSGOOD 0x00000004 -#define PT_PTRACE_CAP 0x00000008 /* ptracer can follow suid-exec */ +#define PT_PTRACE_CAP 0x00000004 /* ptracer can follow suid-exec */ +#define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ -#define PT_EVENT_FLAG_SHIFT 4 -#define PT_EVENT_FLAG(event) (1 << (PT_EVENT_FLAG_SHIFT + (event) - 1)) - +#define PT_EVENT_FLAG(event) (1 << (PT_OPT_FLAG_SHIFT + (event))) +#define PT_TRACESYSGOOD PT_EVENT_FLAG(0) #define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK) #define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK) #define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE) @@ -102,8 +101,6 @@ #define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE) #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) -#define PT_TRACE_MASK 0x000003f4 - /* single stepping state bits (used on ARM and PA-RISC) */ #define PT_SINGLESTEP_BIT 31 #define PT_SINGLESTEP (1<<PT_SINGLESTEP_BIT) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 273f56ea39d2..9acd07a6f5bb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) { + unsigned long flags; + if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; - child->ptrace &= ~PT_TRACE_MASK; - - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - - if (data & PTRACE_O_TRACEFORK) - child->ptrace |= PT_TRACE_FORK; - - if (data & PTRACE_O_TRACEVFORK) - child->ptrace |= PT_TRACE_VFORK; - - if (data & PTRACE_O_TRACECLONE) - child->ptrace |= PT_TRACE_CLONE; - - if (data & PTRACE_O_TRACEEXEC) - child->ptrace |= PT_TRACE_EXEC; - - if (data & PTRACE_O_TRACEVFORKDONE) - child->ptrace |= PT_TRACE_VFORK_DONE; - - if (data & PTRACE_O_TRACEEXIT) - child->ptrace |= PT_TRACE_EXIT; + /* Avoid intermediate state when all opts are cleared */ + flags = child->ptrace; + flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); + flags |= (data << PT_OPT_FLAG_SHIFT); + child->ptrace = flags; return 0; } -- cgit v1.2.3-55-g7522 From aa9147c98f27550bd39416eca5a5844e54bced26 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:42 -0700 Subject: ptrace: make PTRACE_SEIZE set ptrace options specified in 'data' parameter This can be used to close a few corner cases in strace where we get unwanted racy behavior after attach, but before we have a chance to set options (the notorious post-execve SIGTRAP comes to mind), and removes the need to track "did we set opts for this task" state in strace internals. While we are at it: Make it possible to extend SEIZE in the future with more functionality by passing a non-zero 'addr' parameter. To that end, error out if 'addr' is non-zero. PTRACE_ATTACH did not (and still does not) have such a check, and users (strace) do pass garbage there... let's avoid repeating this mistake with SEIZE. Set all task->ptrace bits in one operation - before this change, we were adding PT_SEIZED and PT_PTRACE_CAP with task->ptrace |= BIT ops. This was probably ok (not a bug), but let's be on the safe side.
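[Editor's illustration, not part of the patch: a minimal userspace sketch of the attach-time interface this commit creates; target_pid is a placeholder, and the call is shown as it works once the SEIZE_DEVEL flag is dropped by a later commit in this series.]

#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE 0x4206	/* from include/linux/ptrace.h */
#endif

/* Attach without stopping the tracee and switch on event reporting
 * in the same call, so there is no window between PTRACE_ATTACH and
 * a later PTRACE_SETOPTIONS in which events could be missed. */
static long seize_with_options(pid_t target_pid)
{
	return ptrace(PTRACE_SEIZE, target_pid, 0L,
		      (long)(PTRACE_O_TRACEEXEC | PTRACE_O_TRACEEXIT));
}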
Changes since v2: use (unsigned long) casts instead of (long) ones, and move the PTRACE_SEIZE_DEVEL-related code to separate lines. Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Cc: Pedro Alves Reviewed-by: Oleg Nesterov Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9acd07a6f5bb..4661c5bc07e5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -231,6 +231,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) } static int ptrace_attach(struct task_struct *task, long request, + unsigned long addr, unsigned long flags) { bool seize = (request == PTRACE_SEIZE); @@ -238,19 +239,29 @@ static int ptrace_attach(struct task_struct *task, long request, /* * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL is used to prevent applications + * gradually. SEIZE_DEVEL bit is used to prevent applications * expecting full SEIZE behaviors trapping on kernel commits which * are still in the process of implementing them. * * Only test programs for new ptrace behaviors being implemented * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. * - * Once SEIZE behaviors are completely implemented, this flag and - * the following test will be removed. + * Once SEIZE behaviors are completely implemented, this flag + * will be removed. */ retval = -EIO; - if (seize && !(flags & PTRACE_SEIZE_DEVEL)) - goto out; + if (seize) { + if (addr != 0) + goto out; + if (!(flags & PTRACE_SEIZE_DEVEL)) + goto out; + flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL; + if (flags & ~(unsigned long)PTRACE_O_MASK) + goto out; + flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); + } else { + flags = PT_PTRACED; + } audit_ptrace(task); @@ -282,11 +293,11 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - task->ptrace = PT_PTRACED; if (seize) - task->ptrace |= PT_SEIZED; + flags |= PT_SEIZED; if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; + flags |= PT_PTRACE_CAP; + task->ptrace = flags; __ptrace_link(task, current); @@ -879,7 +890,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -1022,7 +1033,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. -- cgit v1.2.3-55-g7522 From ee00560c7dac1dbbf048446a8489550d0a5765b7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:43 -0700 Subject: ptrace: remove PTRACE_SEIZE_DEVEL bit The PTRACE_SEIZE code is tested and ready for production use; remove the code which requires a special bit in the data argument to make PTRACE_SEIZE work. The strace team is preparing a new release of strace, and we would like to ship the code which uses PTRACE_SEIZE, preferably after this change goes into a released kernel.
Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 5 +---- kernel/ptrace.c | 15 --------------- 2 files changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 30be18064dfd..407c678d2e30 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -51,9 +51,6 @@ #define PTRACE_INTERRUPT 0x4207 #define PTRACE_LISTEN 0x4208 -/* flags in @data for PTRACE_SEIZE */ -#define PTRACE_SEIZE_DEVEL 0x80000000 /* temp flag for development */ - /* Wait extended result codes for the above trace options. */ #define PTRACE_EVENT_FORK 1 #define PTRACE_EVENT_VFORK 2 @@ -64,7 +61,7 @@ /* Extended result codes which enabled by means other than options. */ #define PTRACE_EVENT_STOP 128 -/* options set using PTRACE_SETOPTIONS */ +/* Options set using PTRACE_SETOPTIONS or using PTRACE_SEIZE @data param */ #define PTRACE_O_TRACESYSGOOD 1 #define PTRACE_O_TRACEFORK (1 << PTRACE_EVENT_FORK) #define PTRACE_O_TRACEVFORK (1 << PTRACE_EVENT_VFORK) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4661c5bc07e5..ee8d49b9c309 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -237,25 +237,10 @@ static int ptrace_attach(struct task_struct *task, long request, bool seize = (request == PTRACE_SEIZE); int retval; - /* - * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL bit is used to prevent applications - * expecting full SEIZE behaviors trapping on kernel commits which - * are still in the process of implementing them. - * - * Only test programs for new ptrace behaviors being implemented - * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. - * - * Once SEIZE behaviors are completely implemented, this flag - * will be removed. - */ retval = -EIO; if (seize) { if (addr != 0) goto out; - if (!(flags & PTRACE_SEIZE_DEVEL)) - goto out; - flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL; if (flags & ~(unsigned long)PTRACE_O_MASK) goto out; flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); -- cgit v1.2.3-55-g7522 From 629d362b9950166c6fac2aa8425db34d824bb043 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:44 -0700 Subject: signal: give SEND_SIG_FORCED more power to beat SIGNAL_UNKILLABLE force_sig_info() and friends have special semantics for synchronous signals; this interface should not be used if the target is not current. And it needs fixes: in particular, the clearing of SIGNAL_UNKILLABLE is not exactly right. However there are callers which have to use force_ exactly because it clears SIGNAL_UNKILLABLE and thus it can kill the CLONE_NEWPID tasks, although this is almost always wrong for various reasons. With this patch SEND_SIG_FORCED ignores SIGNAL_UNKILLABLE, like we do if the signal comes from the ancestor namespace. This makes the naming in prepare_signal() paths insane; that is fixed by the next cleanup. Note: this only affects SIGKILL/SIGSTOP, but this is enough for force_sig() abusers. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W.
Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e76001ccf5cd..2584f5a91fbe 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1059,7 +1059,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, from_ancestor_ns)) + if (!prepare_signal(sig, t, + from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; pending = group ? &t->signal->shared_pending : &t->pending; -- cgit v1.2.3-55-g7522 From def8cf72562e17ec8316ce0cb5697c7afd6400e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:45 -0700 Subject: signal: cosmetic, s/from_ancestor_ns/force/ in prepare_signal() paths Cosmetic, rename the from_ancestor_ns argument in prepare_signal() paths. After the previous change it doesn't match the reality. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2584f5a91fbe..d523da02dd14 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, - int from_ancestor_ns) +static int sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !from_ancestor_ns) + handler == SIG_DFL && !force) return 1; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +static int sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, from_ancestor_ns)) + if (!sig_task_ignored(t, sig, force)) return 0; /* @@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal should be actually delivered, otherwise * it should be dropped. 
*/ -static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) +static int prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } } - return !sig_ignored(p, sig, from_ancestor_ns); + return !sig_ignored(p, sig, force); } /* @@ -1602,7 +1601,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) ret = 1; /* the signal is ignored */ result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, 0)) + if (!prepare_signal(sig, t, false)) goto out; ret = 0; -- cgit v1.2.3-55-g7522 From a02d6fd643cbd4c559113b35b31d3b04e4ec60c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:46 -0700 Subject: signal: zap_pid_ns_processes: s/SEND_SIG_NOINFO/SEND_SIG_FORCED/ Change zap_pid_ns_processes() to use SEND_SIG_FORCED; it looks clearer than SEND_SIG_NOINFO, which relies on the from_ancestor_ns logic in send_signal(). It is also more efficient if we need to kill a lot of tasks because it doesn't allocate a sigqueue. While at it, add the __fatal_signal_pending(task) check as a minor optimization. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a8968396046d..17b232869a04 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) while (nr > 0) { rcu_read_lock(); - /* - * Any nested-container's init processes won't ignore the - * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). - */ task = pid_task(find_vpid(nr), PIDTYPE_PID); - if (task) - send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); + if (task && !__fatal_signal_pending(task)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, task); rcu_read_unlock(); -- cgit v1.2.3-55-g7522 From b3449922502f5a161ee2b5022a33aec8472fbf18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:47 -0700 Subject: usermodehelper: introduce umh_complete(sub_info) Preparation. Add the new trivial helper, umh_complete(). Currently it simply does complete(sub_info->complete). Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index a0a88543934e..8ea25944ce33 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -199,6 +199,11 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) } EXPORT_SYMBOL(call_usermodehelper_freeinfo); +static void umh_complete(struct subprocess_info *sub_info) +{ + complete(sub_info->complete); +} + /* Keventd can't block, but this (a child) can.
*/ static int wait_for_helper(void *data) { @@ -235,7 +240,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + umh_complete(sub_info); return 0; } @@ -269,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work) case UMH_WAIT_EXEC: if (pid < 0) sub_info->retval = pid; - complete(sub_info->complete); + umh_complete(sub_info); } } -- cgit v1.2.3-55-g7522 From d0bd587a80960d7ba7e0c8396e154028c9045c54 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:47 -0700 Subject: usermodehelper: implement UMH_KILLABLE Implement UMH_KILLABLE; it should be used along with UMH_WAIT_EXEC/PROC. The caller must ensure that subprocess_info->path/etc cannot go away until call_usermodehelper_freeinfo(). call_usermodehelper_exec(UMH_KILLABLE) does wait_for_completion_killable. If it fails, it uses xchg(&sub_info->complete, NULL) to serialize with umh_complete() which does the same xchg() to access sub_info->complete. If call_usermodehelper_exec wins, it can safely return. umh_complete() should get NULL and call call_usermodehelper_freeinfo(). Otherwise we know that umh_complete() was already called; in this case call_usermodehelper_exec() falls back to wait_for_completion() which should succeed "very soon". Note: UMH_NO_WAIT == -1 but it obviously should not be used with UMH_KILLABLE. We delay the necessary cleanup to simplify the backporting. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 2 ++ kernel/kmod.c | 27 +++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 722f477c4ef7..1b5985855ffc 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -54,6 +54,8 @@ enum umh_wait { UMH_WAIT_PROC = 1, /* wait for the process to complete */ }; +#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ + struct subprocess_info { struct work_struct work; struct completion *complete; diff --git a/kernel/kmod.c b/kernel/kmod.c index 8ea25944ce33..f92f917c450c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -201,7 +201,15 @@ EXPORT_SYMBOL(call_usermodehelper_freeinfo); static void umh_complete(struct subprocess_info *sub_info) { - complete(sub_info->complete); + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); } /* Keventd can't block, but this (a child) can. */ @@ -252,6 +260,9 @@ static void __call_usermodehelper(struct work_struct *work) enum umh_wait wait = sub_info->wait; pid_t pid; + if (wait != UMH_NO_WAIT) + wait &= ~UMH_KILLABLE; + /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully. We need the data structures to stay around * until that is done.
*/ @@ -461,9 +472,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, queue_work(khelper_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + wait_for_completion(&done); +wait_done: retval = sub_info->retval; - out: call_usermodehelper_freeinfo(sub_info); unlock: -- cgit v1.2.3-55-g7522 From 9d944ef32e83405a07376f112e9f02161d3e9731 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:48 -0700 Subject: usermodehelper: kill umh_wait, renumber UMH_* constants No functional changes. It is not sane to use UMH_KILLABLE with enum umh_wait, but obviously we do not want another argument in call_usermodehelper_* helpers. Kill this enum, use the plain int. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 18 +++++++----------- kernel/kmod.c | 8 ++------ security/keys/request_key.c | 2 +- 3 files changed, 10 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 1b5985855ffc..9efeae679106 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -48,12 +48,9 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; struct cred; struct file; -enum umh_wait { - UMH_NO_WAIT = -1, /* don't wait at all */ - UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */ - UMH_WAIT_PROC = 1, /* wait for the process to complete */ -}; - +#define UMH_NO_WAIT 0 /* don't wait at all */ +#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ +#define UMH_WAIT_PROC 2 /* wait for the process to complete */ #define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ struct subprocess_info { @@ -62,7 +59,7 @@ struct subprocess_info { char *path; char **argv; char **envp; - enum umh_wait wait; + int wait; int retval; int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); @@ -80,15 +77,14 @@ void call_usermodehelper_setfns(struct subprocess_info *info, void *data); /* Actually execute the sub-process */ -int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait); +int call_usermodehelper_exec(struct subprocess_info *info, int wait); /* Free the subprocess_info. 
This is only needed if you're not going to call call_usermodehelper_exec */ void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int -call_usermodehelper_fns(char *path, char **argv, char **envp, - enum umh_wait wait, +call_usermodehelper_fns(char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data) { @@ -106,7 +102,7 @@ call_usermodehelper_fns(char *path, char **argv, char **envp, } static inline int -call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) +call_usermodehelper(char *path, char **argv, char **envp, int wait) { return call_usermodehelper_fns(path, argv, envp, wait, NULL, NULL, NULL); diff --git a/kernel/kmod.c b/kernel/kmod.c index f92f917c450c..8341de91613f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -257,12 +257,9 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - enum umh_wait wait = sub_info->wait; + int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; - if (wait != UMH_NO_WAIT) - wait &= ~UMH_KILLABLE; - /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully. We need the data structures to stay around * until that is done. */ @@ -451,8 +448,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, - enum umh_wait wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); int retval = 0; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 82465328c39b..cc3790315d2f 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -91,7 +91,7 @@ static void umh_keys_cleanup(struct subprocess_info *info) * Call a usermode helper with a specific session keyring. */ static int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, enum umh_wait wait) + struct key *session_keyring, int wait) { gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; struct subprocess_info *info = -- cgit v1.2.3-55-g7522 From 5b9bd473e3b8a8c6c4ae99be475e6e9b27568555 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:49 -0700 Subject: usermodehelper: ____call_usermodehelper() doesn't need do_exit() Minor cleanup. ____call_usermodehelper() can simply return; there is no need to call do_exit() explicitly. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8341de91613f..685b246b13b0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -188,7 +188,7 @@ static int ____call_usermodehelper(void *data) /* Exec failed? */ fail: sub_info->retval = retval; - do_exit(0); + return 0; } void call_usermodehelper_freeinfo(struct subprocess_info *info) -- cgit v1.2.3-55-g7522 From 3e63a93b987685f02421e18b2aa452d20553a88b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:49 -0700 Subject: kmod: introduce call_modprobe() helper No functional changes. Move the call_usermodehelper code from __request_module() into the new simple helper, call_modprobe().
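[Editor's illustration, not part of the patch: __request_module() is normally reached through the request_module() macro. A hedged sketch of a typical call site; "net-pf-%d" is the conventional module alias socket code uses for protocol families, and load_family_module() is a made-up wrapper.]

#include <linux/kmod.h>

/* Ask modprobe to load the module aliased to this protocol family
 * and wait for it to finish before retrying the protocol lookup. */
static int load_family_module(int family)
{
	return request_module("net-pf-%d", family);
}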
Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 685b246b13b0..56a29e812ff0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,21 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static int call_modprobe(char *module_name, int wait) +{ + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + + char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + + return call_usermodehelper_fns(modprobe_path, argv, envp, + wait, NULL, NULL, NULL); +} + /** * __request_module - try to load a kernel module * @wait: wait (or not) for the operation to complete @@ -81,11 +96,6 @@ int __request_module(bool wait, const char *fmt, ...) char module_name[MODULE_NAME_LEN]; unsigned int max_modprobes; int ret; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; static atomic_t kmod_concurrent = ATOMIC_INIT(0); #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; @@ -128,9 +138,7 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper_fns(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, - NULL, NULL, NULL); + ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; -- cgit v1.2.3-55-g7522 From 1cc684ab75123efe7ff446eb821d44375ba8fa30 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:50 -0700 Subject: kmod: make __request_module() killable As Tetsuo Handa pointed out, request_module() can stress the system while the oom-killed caller sleeps in TASK_UNINTERRUPTIBLE. The task T uses "almost all" memory, then it does something which triggers request_module(). Say, it can simply call sys_socket(). This in turn needs more memory and leads to OOM. The oom-killer correctly chooses T and kills it, but this can't help because T sleeps in TASK_UNINTERRUPTIBLE, and after that the oom-killer becomes "disabled" by the TIF_MEMDIE task T. Make __request_module() killable. The only necessary change is that call_modprobe() should kmalloc argv and module_name; they can't live on the stack if we use UMH_KILLABLE. This memory is freed via call_usermodehelper_freeinfo()->cleanup.
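[Editor's illustration, not part of the patch: the same constraint applies to any UMH_KILLABLE user, not just modprobe. A minimal sketch of the heap-argv-plus-cleanup pattern; run_true(), helper_cleanup() and the "/bin/true" path are made-up examples.]

#include <linux/kmod.h>
#include <linux/slab.h>

static void helper_cleanup(struct subprocess_info *info)
{
	kfree(info->argv);	/* matches the kmalloc() in run_true() */
}

static int run_true(void)
{
	char **argv = kmalloc(2 * sizeof(char *), GFP_KERNEL);

	if (!argv)
		return -ENOMEM;
	argv[0] = "/bin/true";
	argv[1] = NULL;

	/* If the caller is killed while waiting, the side that loses the
	 * xchg() race frees the info, and with it argv, via the cleanup
	 * callback rather than the caller's (gone) stack frame. */
	return call_usermodehelper_fns(argv[0], argv, NULL,
				       UMH_WAIT_EXEC | UMH_KILLABLE,
				       NULL, helper_cleanup, NULL);
}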
Reported-by: Tetsuo Handa Signed-off-by: Oleg Nesterov Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 56a29e812ff0..957a7aab8ebc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,12 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static void free_modprobe_argv(struct subprocess_info *info) +{ + kfree(info->argv[3]); /* check call_modprobe() */ + kfree(info->argv); +} + static int call_modprobe(char *module_name, int wait) { static char *envp[] = { @@ -69,10 +75,26 @@ static int call_modprobe(char *module_name, int wait) NULL }; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); + if (!argv) + goto out; + + module_name = kstrdup(module_name, GFP_KERNEL); + if (!module_name) + goto free_argv; + + argv[0] = modprobe_path; + argv[1] = "-q"; + argv[2] = "--"; + argv[3] = module_name; /* check free_modprobe_argv() */ + argv[4] = NULL; return call_usermodehelper_fns(modprobe_path, argv, envp, - wait, NULL, NULL, NULL); + wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); +free_argv: + kfree(argv); +out: + return -ENOMEM; } /** -- cgit v1.2.3-55-g7522
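[Editor's illustration, not part of any patch above: the ownership handoff that makes UMH_KILLABLE safe is the xchg() on the completion pointer introduced in the UMH_KILLABLE commit. A condensed sketch of that idiom, with my_work, my_complete() and my_wait() as made-up stand-ins for subprocess_info and its helpers.]

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/slab.h>

struct my_work {
	struct completion *done;
};

/* Worker side: whoever xchg()es ->done to NULL first owns the object. */
static void my_complete(struct my_work *w)
{
	struct completion *comp = xchg(&w->done, NULL);

	if (comp)
		complete(comp);	/* waiter is still there and will free w */
	else
		kfree(w);	/* waiter gave up; freeing is now our job */
}

/* Caller side: may be interrupted by SIGKILL while waiting. */
static int my_wait(struct my_work *w)
{
	DECLARE_COMPLETION_ONSTACK(done);

	w->done = &done;
	/* ... hand w to a worker that eventually calls my_complete(w) ... */

	if (wait_for_completion_killable(&done)) {
		/* If we clear ->done first, the worker frees w for us. */
		if (xchg(&w->done, NULL))
			return -EINTR;
		/* Worker already swapped; the completion is imminent. */
		wait_for_completion(&done);
	}
	kfree(w);
	return 0;
}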