summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/async.c1
-rw-r--r--kernel/audit.c29
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_fsnotify.c2
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arraymap.c189
-rw-r--r--kernel/bpf/core.c4
-rw-r--r--kernel/bpf/hashtab.c504
-rw-r--r--kernel/bpf/helpers.c2
-rw-r--r--kernel/bpf/inode.c20
-rw-r--r--kernel/bpf/percpu_freelist.c100
-rw-r--r--kernel/bpf/percpu_freelist.h31
-rw-r--r--kernel/bpf/stackmap.c290
-rw-r--r--kernel/bpf/syscall.c112
-rw-r--r--kernel/bpf/verifier.c60
-rw-r--r--kernel/cgroup.c1398
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cgroup_pids.c6
-rw-r--r--kernel/context_tracking.c4
-rw-r--r--kernel/cpu.c1218
-rw-r--r--kernel/cpuset.c85
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/debug/kdb/kdb_bp.c4
-rw-r--r--kernel/debug/kdb/kdb_main.c4
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/events/callchain.c32
-rw-r--r--kernel/events/core.c1575
-rw-r--r--kernel/events/hw_breakpoint.c2
-rw-r--r--kernel/events/internal.h2
-rw-r--r--kernel/events/ring_buffer.c40
-rw-r--r--kernel/events/uprobes.c14
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c48
-rw-r--r--kernel/futex.c293
-rw-r--r--kernel/futex_compat.c2
-rw-r--r--kernel/gcov/base.c7
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/chip.c13
-rw-r--r--kernel/irq/handle.c7
-rw-r--r--kernel/irq/internals.h7
-rw-r--r--kernel/irq/ipi.c326
-rw-r--r--kernel/irq/irqdesc.c40
-rw-r--r--kernel/irq/irqdomain.c35
-rw-r--r--kernel/irq/manage.c42
-rw-r--r--kernel/irq/msi.c66
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/kallsyms.c42
-rw-r--r--kernel/kcmp.c4
-rw-r--r--kernel/kexec.c10
-rw-r--r--kernel/kexec_core.c45
-rw-r--r--kernel/kexec_file.c83
-rw-r--r--kernel/kexec_internal.h21
-rw-r--r--kernel/ksysfs.c26
-rw-r--r--kernel/latencytop.c14
-rw-r--r--kernel/livepatch/core.c327
-rw-r--r--kernel/locking/lockdep.c189
-rw-r--r--kernel/locking/mcs_spinlock.h8
-rw-r--r--kernel/locking/mutex.c5
-rw-r--r--kernel/locking/qspinlock.c85
-rw-r--r--kernel/locking/qspinlock_paravirt.h259
-rw-r--r--kernel/locking/qspinlock_stat.h290
-rw-r--r--kernel/locking/rtmutex.c135
-rw-r--r--kernel/memremap.c257
-rw-r--r--kernel/module.c569
-rw-r--r--kernel/module_signing.c7
-rw-r--r--kernel/panic.c77
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/power/Kconfig2
-rw-r--r--kernel/power/hibernate.c17
-rw-r--r--kernel/power/main.c17
-rw-r--r--kernel/power/power.h9
-rw-r--r--kernel/power/process.c12
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/printk/printk.c217
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c49
-rw-r--r--kernel/rcu/rcutorture.c44
-rw-r--r--kernel/rcu/srcu.c2
-rw-r--r--kernel/rcu/tiny_plugin.h15
-rw-r--r--kernel/rcu/tree.c543
-rw-r--r--kernel/rcu/tree.h97
-rw-r--r--kernel/rcu/tree_plugin.h123
-rw-r--r--kernel/rcu/tree_trace.c39
-rw-r--r--kernel/rcu/update.c23
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/resource.c165
-rw-r--r--kernel/sched/Makefile3
-rw-r--r--kernel/sched/auto_group.c2
-rw-r--r--kernel/sched/clock.c7
-rw-r--r--kernel/sched/core.c698
-rw-r--r--kernel/sched/cpuacct.c2
-rw-r--r--kernel/sched/cpufreq.c37
-rw-r--r--kernel/sched/cputime.c130
-rw-r--r--kernel/sched/deadline.c125
-rw-r--r--kernel/sched/debug.c415
-rw-r--r--kernel/sched/fair.c625
-rw-r--r--kernel/sched/idle.c18
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/rt.c116
-rw-r--r--kernel/sched/sched.h211
-rw-r--r--kernel/sched/stats.h8
-rw-r--r--kernel/sched/swait.c123
-rw-r--r--kernel/seccomp.c22
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/smp.c10
-rw-r--r--kernel/smpboot.c6
-rw-r--r--kernel/smpboot.h6
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/stop_machine.c88
-rw-r--r--kernel/sys.c25
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c140
-rw-r--r--kernel/time/alarmtimer.c17
-rw-r--r--kernel/time/clocksource.c56
-rw-r--r--kernel/time/hrtimer.c73
-rw-r--r--kernel/time/itimer.c2
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c54
-rw-r--r--kernel/time/ntp_internal.h2
-rw-r--r--kernel/time/posix-clock.c4
-rw-r--r--kernel/time/posix-cpu-timers.c52
-rw-r--r--kernel/time/posix-timers.c2
-rw-r--r--kernel/time/tick-sched.c237
-rw-r--r--kernel/time/tick-sched.h1
-rw-r--r--kernel/time/time.c9
-rw-r--r--kernel/time/timekeeping.c331
-rw-r--r--kernel/time/timekeeping_internal.h8
-rw-r--r--kernel/time/timer.c4
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/trace/blktrace.c12
-rw-r--r--kernel/trace/bpf_trace.c20
-rw-r--r--kernel/trace/ftrace.c449
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/ring_buffer.c57
-rw-r--r--kernel/trace/trace.c2
-rw-r--r--kernel/trace/trace.h6
-rw-r--r--kernel/trace/trace_event_perf.c2
-rw-r--r--kernel/trace/trace_events.c45
-rw-r--r--kernel/trace/trace_events_filter.c13
-rw-r--r--kernel/trace/trace_events_trigger.c25
-rw-r--r--kernel/trace/trace_kprobe.c19
-rw-r--r--kernel/trace/trace_stack.c13
-rw-r--r--kernel/trace/trace_syscalls.c16
-rw-r--r--kernel/tsacct.c54
-rw-r--r--kernel/user_namespace.c21
-rw-r--r--kernel/watchdog.c29
-rw-r--r--kernel/workqueue.c304
-rw-r--r--kernel/workqueue_internal.h2
153 files changed, 10347 insertions, 4926 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 53abf008ecb3..baa55e50a315 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -14,8 +14,7 @@ obj-y = fork.o exec_domain.o panic.o \
obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
-# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+# Do not trace internal ftrace files
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
diff --git a/kernel/async.c b/kernel/async.c
index 4c3773c0bf63..d2edd6efec56 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -326,3 +326,4 @@ bool current_is_async(void)
return worker && worker->current_func == async_run_entry_fn;
}
+EXPORT_SYMBOL_GPL(current_is_async);
diff --git a/kernel/audit.c b/kernel/audit.c
index 8fa7533bf106..678c3f000191 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -938,7 +938,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err == 1) { /* match or error */
err = 0;
if (msg_type == AUDIT_USER_TTY) {
- err = tty_audit_push_current();
+ err = tty_audit_push();
if (err)
break;
}
@@ -1048,20 +1048,19 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
- struct task_struct *tsk = current;
+ unsigned int t;
- spin_lock(&tsk->sighand->siglock);
- s.enabled = tsk->signal->audit_tty;
- s.log_passwd = tsk->signal->audit_tty_log_passwd;
- spin_unlock(&tsk->sighand->siglock);
+ t = READ_ONCE(current->signal->audit_tty);
+ s.enabled = t & AUDIT_TTY_ENABLE;
+ s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
struct audit_tty_status s, old;
- struct task_struct *tsk = current;
struct audit_buffer *ab;
+ unsigned int t;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
@@ -1071,14 +1070,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
(s.log_passwd != 0 && s.log_passwd != 1))
err = -EINVAL;
- spin_lock(&tsk->sighand->siglock);
- old.enabled = tsk->signal->audit_tty;
- old.log_passwd = tsk->signal->audit_tty_log_passwd;
- if (!err) {
- tsk->signal->audit_tty = s.enabled;
- tsk->signal->audit_tty_log_passwd = s.log_passwd;
+ if (err)
+ t = READ_ONCE(current->signal->audit_tty);
+ else {
+ t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD);
+ t = xchg(&current->signal->audit_tty, t);
}
- spin_unlock(&tsk->sighand->siglock);
+ old.enabled = t & AUDIT_TTY_ENABLE;
+ old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
@@ -1737,7 +1736,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
/* Copy inode data into an audit_names. */
void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
+ struct inode *inode)
{
name->ino = inode->i_ino;
name->dev = inode->i_sb->s_dev;
diff --git a/kernel/audit.h b/kernel/audit.h
index de6cbb7cf547..cbbe6bb6496e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -207,7 +207,7 @@ extern u32 audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
- const struct inode *inode);
+ struct inode *inode);
extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap);
extern void audit_log_name(struct audit_context *context,
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 27c6046c2c3d..f84f8d06e1f6 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
if (IS_ERR(dentry))
return (void *)dentry; /* returning an error */
inode = path.dentry->d_inode;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
if (unlikely(!audit_mark)) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0348b12b5a4d..3cf1c5978d39 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
+ inode_unlock(d_backing_inode(parent->dentry));
if (d_is_positive(d)) {
/* update watch filter fields */
watch->dev = d_backing_inode(d)->i_sb->s_dev;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc04959de..195ffaee50b9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
@@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file)
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct inode *parent,
+void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
const unsigned char type)
{
struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct inode *inode = d_backing_inode(dentry);
const char *dname = dentry->d_name.name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 13272582eee0..eed911d091da 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,4 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced518..76d5a794e426 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -17,15 +17,43 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
+static void bpf_array_free_percpu(struct bpf_array *array)
+{
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ free_percpu(array->pptrs[i]);
+}
+
+static int bpf_array_alloc_percpu(struct bpf_array *array)
+{
+ void __percpu *ptr;
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++) {
+ ptr = __alloc_percpu_gfp(array->elem_size, 8,
+ GFP_USER | __GFP_NOWARN);
+ if (!ptr) {
+ bpf_array_free_percpu(array);
+ return -ENOMEM;
+ }
+ array->pptrs[i] = ptr;
+ }
+
+ return 0;
+}
+
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
struct bpf_array *array;
- u32 elem_size, array_size;
+ u64 array_size;
+ u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0)
+ attr->value_size == 0 || attr->map_flags)
return ERR_PTR(-EINVAL);
if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
@@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
- /* check round_up into zero and u32 overflow */
- if (elem_size == 0 ||
- attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+ array_size = sizeof(*array);
+ if (percpu)
+ array_size += (u64) attr->max_entries * sizeof(void *);
+ else
+ array_size += (u64) attr->max_entries * elem_size;
+
+ /* make sure there is no u32 overflow later in round_up() */
+ if (array_size >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
- array_size = sizeof(*array) + attr->max_entries * elem_size;
/* allocate all map elements and zero-initialize them */
array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
@@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
}
/* copy mandatory map attributes */
+ array->map.map_type = attr->map_type;
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
- array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
array->elem_size = elem_size;
+ if (!percpu)
+ goto out;
+
+ array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
+
+ if (array_size >= U32_MAX - PAGE_SIZE ||
+ elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+ kvfree(array);
+ return ERR_PTR(-ENOMEM);
+ }
+out:
+ array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
+
return &array->map;
}
@@ -67,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
- if (index >= array->map.max_entries)
+ if (unlikely(index >= array->map.max_entries))
return NULL;
return array->value + array->elem_size * index;
}
+/* Called from eBPF program */
+static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return this_cpu_ptr(array->pptrs[index]);
+}
+
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(index >= array->map.max_entries))
+ return -ENOENT;
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ pptr = array->pptrs[index];
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
@@ -99,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
- if (map_flags > BPF_EXIST)
+ if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
- if (index >= array->map.max_entries)
+ if (unlikely(index >= array->map.max_entries))
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (unlikely(map_flags == BPF_NOEXIST))
+ /* all elements already exist */
+ return -EEXIST;
+
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ memcpy(this_cpu_ptr(array->pptrs[index]),
+ value, map->value_size);
+ else
+ memcpy(array->value + array->elem_size * index,
+ value, map->value_size);
+ return 0;
+}
+
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ if (unlikely(index >= array->map.max_entries))
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
- if (map_flags == BPF_NOEXIST)
+ if (unlikely(map_flags == BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
- memcpy(array->value + array->elem_size * index, value, map->value_size);
+ /* the user space will provide round_up(value_size, 8) bytes that
+ * will be copied into per-cpu area. bpf programs can only access
+ * value_size of it. During lookup the same extra bytes will be
+ * returned or zeros which were zero-filled by percpu_alloc,
+ * so no kernel data leaks possible
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ pptr = array->pptrs[index];
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ off += size;
+ }
+ rcu_read_unlock();
return 0;
}
@@ -133,6 +259,9 @@ static void array_map_free(struct bpf_map *map)
*/
synchronize_rcu();
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ bpf_array_free_percpu(array);
+
kvfree(array);
}
@@ -150,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = {
.type = BPF_MAP_TYPE_ARRAY,
};
+static const struct bpf_map_ops percpu_array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = percpu_array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list percpu_array_type __read_mostly = {
+ .ops = &percpu_array_ops,
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+};
+
static int __init register_array_map(void)
{
bpf_register_map_type(&array_type);
+ bpf_register_map_type(&percpu_array_type);
return 0;
}
late_initcall(register_array_map);
@@ -291,10 +435,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
{
struct perf_event *event;
const struct perf_event_attr *attr;
+ struct file *file;
- event = perf_event_get(fd);
- if (IS_ERR(event))
- return event;
+ file = perf_event_get(fd);
+ if (IS_ERR(file))
+ return file;
+
+ event = file->private_data;
attr = perf_event_attrs(event);
if (IS_ERR(attr))
@@ -304,24 +451,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
goto err;
if (attr->type == PERF_TYPE_RAW)
- return event;
+ return file;
if (attr->type == PERF_TYPE_HARDWARE)
- return event;
+ return file;
if (attr->type == PERF_TYPE_SOFTWARE &&
attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return event;
+ return file;
err:
- perf_event_release_kernel(event);
+ fput(file);
return ERR_PTR(-EINVAL);
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- struct perf_event *event = ptr;
-
- perf_event_release_kernel(event);
+ fput((struct file *)ptr);
}
static const struct bpf_map_ops perf_event_array_ops = {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bdd572c..972d9a8e4ac4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
ARG1 = (u64) (unsigned long) ctx;
- /* Registers used in classic BPF programs need to be reset first. */
- regs[BPF_REG_A] = 0;
- regs[BPF_REG_X] = 0;
-
select_insn:
goto *jumptable[insn->code];
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34777b3746fa..fff3650d52fc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -13,38 +14,127 @@
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
+#include "percpu_freelist.h"
+
+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
struct bpf_htab {
struct bpf_map map;
- struct hlist_head *buckets;
- raw_spinlock_t lock;
- u32 count; /* number of elements in this hashtable */
+ struct bucket *buckets;
+ void *elems;
+ struct pcpu_freelist freelist;
+ atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
- struct hlist_node hash_node;
+ union {
+ struct hlist_node hash_node;
+ struct bpf_htab *htab;
+ struct pcpu_freelist_node fnode;
+ };
struct rcu_head rcu;
u32 hash;
char key[0] __aligned(8);
};
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+ void __percpu *pptr)
+{
+ *(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+ return *(void __percpu **)(l->key + key_size);
+}
+
+static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
+{
+ return (struct htab_elem *) (htab->elems + i * htab->elem_size);
+}
+
+static void htab_free_elems(struct bpf_htab *htab)
+{
+ int i;
+
+ if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ goto free_elems;
+
+ for (i = 0; i < htab->map.max_entries; i++) {
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
+ htab->map.key_size);
+ free_percpu(pptr);
+ }
+free_elems:
+ vfree(htab->elems);
+}
+
+static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+{
+ int err = -ENOMEM, i;
+
+ htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+ if (!htab->elems)
+ return -ENOMEM;
+
+ if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ goto skip_percpu_elems;
+
+ for (i = 0; i < htab->map.max_entries; i++) {
+ u32 size = round_up(htab->map.value_size, 8);
+ void __percpu *pptr;
+
+ pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ goto free_elems;
+ htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
+ pptr);
+ }
+
+skip_percpu_elems:
+ err = pcpu_freelist_init(&htab->freelist);
+ if (err)
+ goto free_elems;
+
+ pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
+ htab->map.max_entries);
+ return 0;
+
+free_elems:
+ htab_free_elems(htab);
+ return err;
+}
+
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
struct bpf_htab *htab;
int err, i;
+ u64 cost;
+
+ if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+ /* reserved bits should not be used */
+ return ERR_PTR(-EINVAL);
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
/* mandatory map attributes */
+ htab->map.map_type = attr->map_type;
htab->map.key_size = attr->key_size;
htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries;
+ htab->map.map_flags = attr->map_flags;
/* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set
@@ -73,43 +163,65 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
+ if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ /* make sure the size for pcpu_alloc() is reasonable */
+ goto free_htab;
+
htab->elem_size = sizeof(struct htab_elem) +
- round_up(htab->map.key_size, 8) +
- htab->map.value_size;
+ round_up(htab->map.key_size, 8);
+ if (percpu)
+ htab->elem_size += sizeof(void *);
+ else
+ htab->elem_size += round_up(htab->map.value_size, 8);
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
- (u64) htab->elem_size * htab->map.max_entries >=
- U32_MAX - PAGE_SIZE)
+ cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+ (u64) htab->elem_size * htab->map.max_entries;
+
+ if (percpu)
+ cost += (u64) round_up(htab->map.value_size, 8) *
+ num_possible_cpus() * htab->map.max_entries;
+
+ if (cost >= U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
goto free_htab;
- htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
- htab->elem_size * htab->map.max_entries,
- PAGE_SIZE) >> PAGE_SHIFT;
+ htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(htab->map.pages);
+ if (err)
+ goto free_htab;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
GFP_USER | __GFP_NOWARN);
if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
if (!htab->buckets)
goto free_htab;
}
- for (i = 0; i < htab->n_buckets; i++)
- INIT_HLIST_HEAD(&htab->buckets[i]);
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
- raw_spin_lock_init(&htab->lock);
- htab->count = 0;
+ if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+ err = prealloc_elems_and_freelist(htab);
+ if (err)
+ goto free_buckets;
+ }
return &htab->map;
+free_buckets:
+ kvfree(htab->buckets);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -120,11 +232,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
return jhash(key, key_len, 0);
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
void *key, u32 key_size)
{
@@ -138,7 +255,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
}
/* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
@@ -156,6 +273,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
l = lookup_elem_raw(head, hash, key, key_size);
+ return l;
+}
+
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
if (l)
return l->key + round_up(map->key_size, 8);
@@ -216,90 +340,254 @@ find_first_elem:
}
}
- /* itereated over all buckets and all elements */
+ /* iterated over all buckets and all elements */
return -ENOENT;
}
+static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
+{
+ if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+ free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ kfree(l);
+
+}
+
+static void htab_elem_free_rcu(struct rcu_head *head)
+{
+ struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+ struct bpf_htab *htab = l->htab;
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+ * we're calling kfree, otherwise deadlock is possible if kprobes
+ * are placed somewhere inside of slub
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ htab_elem_free(htab, l);
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+}
+
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+ pcpu_freelist_push(&htab->freelist, &l->fnode);
+ } else {
+ atomic_dec(&htab->count);
+ l->htab = htab;
+ call_rcu(&l->rcu, htab_elem_free_rcu);
+ }
+}
+
+static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
+ void *value, u32 key_size, u32 hash,
+ bool percpu, bool onallcpus)
+{
+ u32 size = htab->map.value_size;
+ bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+ struct htab_elem *l_new;
+ void __percpu *pptr;
+
+ if (prealloc) {
+ l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
+ if (!l_new)
+ return ERR_PTR(-E2BIG);
+ } else {
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(l_new->key, key, key_size);
+ if (percpu) {
+ /* round up value_size to 8 bytes */
+ size = round_up(size, 8);
+
+ if (prealloc) {
+ pptr = htab_elem_get_ptr(l_new, key_size);
+ } else {
+ /* alloc_percpu zero-fills */
+ pptr = __alloc_percpu_gfp(size, 8,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!pptr) {
+ kfree(l_new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ if (!onallcpus) {
+ /* copy true value_size bytes */
+ memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ } else {
+ int off = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
+ if (!prealloc)
+ htab_elem_set_ptr(l_new, key_size, pptr);
+ } else {
+ memcpy(l_new->key + round_up(key_size, 8), value, size);
+ }
+
+ l_new->hash = hash;
+ return l_new;
+}
+
+static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
+ u64 map_flags)
+{
+ if (l_old && map_flags == BPF_NOEXIST)
+ /* elem already exists */
+ return -EEXIST;
+
+ if (!l_old && map_flags == BPF_EXIST)
+ /* elem doesn't exist, cannot update it */
+ return -ENOENT;
+
+ return 0;
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new, *l_old;
+ struct htab_elem *l_new = NULL, *l_old;
struct hlist_head *head;
unsigned long flags;
- u32 key_size;
+ struct bucket *b;
+ u32 key_size, hash;
int ret;
- if (map_flags > BPF_EXIST)
+ if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held());
- /* allocate new element outside of lock */
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return -ENOMEM;
-
key_size = map->key_size;
- memcpy(l_new->key, key, key_size);
- memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+ hash = htab_map_hash(key, key_size);
- l_new->hash = htab_map_hash(l_new->key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, l_new->hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
- l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+ l_old = lookup_elem_raw(head, hash, key, key_size);
- if (!l_old && unlikely(htab->count >= map->max_entries)) {
- /* if elem with this 'key' doesn't exist and we've reached
- * max_entries limit, fail insertion of new elem
- */
- ret = -E2BIG;
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
goto err;
- }
- if (l_old && map_flags == BPF_NOEXIST) {
- /* elem already exists */
- ret = -EEXIST;
+ l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+ if (IS_ERR(l_new)) {
+ /* all pre-allocated elements are in use or memory exhausted */
+ ret = PTR_ERR(l_new);
goto err;
}
- if (!l_old && map_flags == BPF_EXIST) {
- /* elem doesn't exist, cannot update it */
- ret = -ENOENT;
- goto err;
- }
-
- /* add new element to the head of the list, so that concurrent
- * search will find it before old elem
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
*/
hlist_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_del_rcu(&l_old->hash_node);
- kfree_rcu(l_old, rcu);
- } else {
- htab->count++;
+ free_htab_elem(htab, l_old);
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ ret = 0;
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ return ret;
+}
- return 0;
+static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new = NULL, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ if (l_old) {
+ void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
+ u32 size = htab->map.value_size;
+
+ /* per-cpu hash map can update value in-place */
+ if (!onallcpus) {
+ memcpy(this_cpu_ptr(pptr), value, size);
+ } else {
+ int off = 0, cpu;
+
+ size = round_up(size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
+ } else {
+ l_new = alloc_htab_elem(htab, key, value, key_size,
+ hash, true, onallcpus);
+ if (IS_ERR(l_new)) {
+ ret = PTR_ERR(l_new);
+ goto err;
+ }
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ }
+ ret = 0;
err:
- raw_spin_unlock_irqrestore(&htab->lock, flags);
- kfree(l_new);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return ret;
}
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
+ struct bucket *b;
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
@@ -310,21 +598,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;
hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
- kfree_rcu(l, rcu);
+ free_htab_elem(htab, l);
ret = 0;
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return ret;
}
@@ -339,12 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
- htab->count--;
- kfree(l);
+ htab_elem_free(htab, l);
}
}
}
-
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -357,10 +642,16 @@ static void htab_map_free(struct bpf_map *map)
*/
synchronize_rcu();
- /* some of kfree_rcu() callbacks for elements of this map may not have
- * executed. It's ok. Proceed to free residual elements and map itself
+ /* some of free_htab_elem() callbacks for elements of this map may
+ * not have executed. Wait for them.
*/
- delete_all_elements(htab);
+ rcu_barrier();
+ if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+ delete_all_elements(htab);
+ } else {
+ htab_free_elems(htab);
+ pcpu_freelist_destroy(&htab->freelist);
+ }
kvfree(htab->buckets);
kfree(htab);
}
@@ -379,9 +670,76 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH,
};
+/* Called from eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l)
+ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+ else
+ return NULL;
+}
+
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct htab_elem *l;
+ void __percpu *pptr;
+ int ret = -ENOENT;
+ int cpu, off = 0;
+ u32 size;
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ l = __htab_map_lookup_elem(map, key);
+ if (!l)
+ goto out;
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off,
+ per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const struct bpf_map_ops htab_percpu_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_update_elem = htab_percpu_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+ .ops = &htab_percpu_ops,
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
+ bpf_register_map_type(&htab_percpu_type);
return 0;
}
late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4504ca66118d..50da680c479f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -166,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
if (!task)
return -EINVAL;
- memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+ strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
return 0;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8a797d50b7..f2ece3c174a5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
}
}
+static int bpf_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry)
+{
+ if (bpf_dname_reserved(new_dentry))
+ return -EPERM;
+
+ return simple_link(old_dentry, dir, new_dentry);
+}
+
+static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ if (bpf_dname_reserved(new_dentry))
+ return -EPERM;
+
+ return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static const struct inode_operations bpf_dir_iops = {
.lookup = simple_lookup,
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.rmdir = simple_rmdir,
+ .rename = bpf_rename,
+ .link = bpf_link,
.unlink = simple_unlink,
};
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
new file mode 100644
index 000000000000..5c51d1985b51
--- /dev/null
+++ b/kernel/bpf/percpu_freelist.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "percpu_freelist.h"
+
+int pcpu_freelist_init(struct pcpu_freelist *s)
+{
+ int cpu;
+
+ s->freelist = alloc_percpu(struct pcpu_freelist_head);
+ if (!s->freelist)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
+
+ raw_spin_lock_init(&head->lock);
+ head->first = NULL;
+ }
+ return 0;
+}
+
+void pcpu_freelist_destroy(struct pcpu_freelist *s)
+{
+ free_percpu(s->freelist);
+}
+
+static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
+{
+ raw_spin_lock(&head->lock);
+ node->next = head->first;
+ head->first = node;
+ raw_spin_unlock(&head->lock);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
+
+ __pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
+ u32 nr_elems)
+{
+ struct pcpu_freelist_head *head;
+ unsigned long flags;
+ int i, cpu, pcpu_entries;
+
+ pcpu_entries = nr_elems / num_possible_cpus() + 1;
+ i = 0;
+
+ /* disable irq to workaround lockdep false positive
+ * in bpf usage pcpu_freelist_populate() will never race
+ * with pcpu_freelist_push()
+ */
+ local_irq_save(flags);
+ for_each_possible_cpu(cpu) {
+again:
+ head = per_cpu_ptr(s->freelist, cpu);
+ __pcpu_freelist_push(head, buf);
+ i++;
+ buf += elem_size;
+ if (i == nr_elems)
+ break;
+ if (i % pcpu_entries)
+ goto again;
+ }
+ local_irq_restore(flags);
+}
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_head *head;
+ struct pcpu_freelist_node *node;
+ int orig_cpu, cpu;
+
+ orig_cpu = cpu = raw_smp_processor_id();
+ while (1) {
+ head = per_cpu_ptr(s->freelist, cpu);
+ raw_spin_lock(&head->lock);
+ node = head->first;
+ if (node) {
+ head->first = node->next;
+ raw_spin_unlock(&head->lock);
+ return node;
+ }
+ raw_spin_unlock(&head->lock);
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = 0;
+ if (cpu == orig_cpu)
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
new file mode 100644
index 000000000000..3049aae8ea1e
--- /dev/null
+++ b/kernel/bpf/percpu_freelist.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __PERCPU_FREELIST_H__
+#define __PERCPU_FREELIST_H__
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+struct pcpu_freelist_head {
+ struct pcpu_freelist_node *first;
+ raw_spinlock_t lock;
+};
+
+struct pcpu_freelist {
+ struct pcpu_freelist_head __percpu *freelist;
+};
+
+struct pcpu_freelist_node {
+ struct pcpu_freelist_node *next;
+};
+
+void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
+ u32 nr_elems);
+int pcpu_freelist_init(struct pcpu_freelist *);
+void pcpu_freelist_destroy(struct pcpu_freelist *s);
+#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
new file mode 100644
index 000000000000..499d9e933f8e
--- /dev/null
+++ b/kernel/bpf/stackmap.c
@@ -0,0 +1,290 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
+#include "percpu_freelist.h"
+
+struct stack_map_bucket {
+ struct pcpu_freelist_node fnode;
+ u32 hash;
+ u32 nr;
+ u64 ip[];
+};
+
+struct bpf_stack_map {
+ struct bpf_map map;
+ void *elems;
+ struct pcpu_freelist freelist;
+ u32 n_buckets;
+ struct stack_map_bucket *buckets[];
+};
+
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+ u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+ int err;
+
+ smap->elems = vzalloc(elem_size * smap->map.max_entries);
+ if (!smap->elems)
+ return -ENOMEM;
+
+ err = pcpu_freelist_init(&smap->freelist);
+ if (err)
+ goto free_elems;
+
+ pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+ smap->map.max_entries);
+ return 0;
+
+free_elems:
+ vfree(smap->elems);
+ return err;
+}
+
+/* Called from syscall */
+static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
+{
+ u32 value_size = attr->value_size;
+ struct bpf_stack_map *smap;
+ u64 cost, n_buckets;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (attr->map_flags)
+ return ERR_PTR(-EINVAL);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ value_size < 8 || value_size % 8 ||
+ value_size / 8 > PERF_MAX_STACK_DEPTH)
+ return ERR_PTR(-EINVAL);
+
+ /* hash table size must be power of 2 */
+ n_buckets = roundup_pow_of_two(attr->max_entries);
+
+ cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+ if (cost >= U32_MAX - PAGE_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
+ if (!smap) {
+ smap = vzalloc(cost);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = -E2BIG;
+ cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_smap;
+
+ smap->map.map_type = attr->map_type;
+ smap->map.key_size = attr->key_size;
+ smap->map.value_size = value_size;
+ smap->map.max_entries = attr->max_entries;
+ smap->n_buckets = n_buckets;
+ smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ err = bpf_map_precharge_memlock(smap->map.pages);
+ if (err)
+ goto free_smap;
+
+ err = get_callchain_buffers();
+ if (err)
+ goto free_smap;
+
+ err = prealloc_elems_and_freelist(smap);
+ if (err)
+ goto put_buffers;
+
+ return &smap->map;
+
+put_buffers:
+ put_callchain_buffers();
+free_smap:
+ kvfree(smap);
+ return ERR_PTR(err);
+}
+
+static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+ struct pt_regs *regs = (struct pt_regs *) (long) r1;
+ struct bpf_map *map = (struct bpf_map *) (long) r2;
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct perf_callchain_entry *trace;
+ struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+ u32 max_depth = map->value_size / 8;
+ /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
+ u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 hash, id, trace_nr, trace_len;
+ bool user = flags & BPF_F_USER_STACK;
+ bool kernel = !user;
+ u64 *ips;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+
+ if (unlikely(!trace))
+ /* couldn't fetch the stack trace */
+ return -EFAULT;
+
+ /* get_perf_callchain() guarantees that trace->nr >= init_nr
+ * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+ */
+ trace_nr = trace->nr - init_nr;
+
+ if (trace_nr <= skip)
+ /* skipping more than usable stack trace */
+ return -EFAULT;
+
+ trace_nr -= skip;
+ trace_len = trace_nr * sizeof(u64);
+ ips = trace->ip + skip + init_nr;
+ hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
+ id = hash & (smap->n_buckets - 1);
+ bucket = READ_ONCE(smap->buckets[id]);
+
+ if (bucket && bucket->hash == hash) {
+ if (flags & BPF_F_FAST_STACK_CMP)
+ return id;
+ if (bucket->nr == trace_nr &&
+ memcmp(bucket->ip, ips, trace_len) == 0)
+ return id;
+ }
+
+ /* this call stack is not in the map, try to add it */
+ if (bucket && !(flags & BPF_F_REUSE_STACKID))
+ return -EEXIST;
+
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+
+ memcpy(new_bucket->ip, ips, trace_len);
+ new_bucket->hash = hash;
+ new_bucket->nr = trace_nr;
+
+ old_bucket = xchg(&smap->buckets[id], new_bucket);
+ if (old_bucket)
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return id;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto = {
+ .func = bpf_get_stackid,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/* Called from eBPF program */
+static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *bucket, *old_bucket;
+ u32 id = *(u32 *)key, trace_len;
+
+ if (unlikely(id >= smap->n_buckets))
+ return -ENOENT;
+
+ bucket = xchg(&smap->buckets[id], NULL);
+ if (!bucket)
+ return -ENOENT;
+
+ trace_len = bucket->nr * sizeof(u64);
+ memcpy(value, bucket->ip, trace_len);
+ memset(value + trace_len, 0, map->value_size - trace_len);
+
+ old_bucket = xchg(&smap->buckets[id], bucket);
+ if (old_bucket)
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return 0;
+}
+
+static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EINVAL;
+}
+
+static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *old_bucket;
+ u32 id = *(u32 *)key;
+
+ if (unlikely(id >= smap->n_buckets))
+ return -E2BIG;
+
+ old_bucket = xchg(&smap->buckets[id], NULL);
+ if (old_bucket) {
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void stack_map_free(struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+
+ /* wait for bpf programs to complete before freeing stack map */
+ synchronize_rcu();
+
+ vfree(smap->elems);
+ pcpu_freelist_destroy(&smap->freelist);
+ kvfree(smap);
+ put_callchain_buffers();
+}
+
+static const struct bpf_map_ops stack_map_ops = {
+ .map_alloc = stack_map_alloc,
+ .map_free = stack_map_free,
+ .map_get_next_key = stack_map_get_next_key,
+ .map_lookup_elem = stack_map_lookup_elem,
+ .map_update_elem = stack_map_update_elem,
+ .map_delete_elem = stack_map_delete_elem,
+};
+
+static struct bpf_map_type_list stack_map_type __read_mostly = {
+ .ops = &stack_map_ops,
+ .type = BPF_MAP_TYPE_STACK_TRACE,
+};
+
+static int __init register_stack_map(void)
+{
+ bpf_register_map_type(&stack_map_type);
+ return 0;
+}
+late_initcall(register_stack_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3b39550d8485..2a2efe1bc76c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
#include <linux/filter.h>
#include <linux/version.h>
+DEFINE_PER_CPU(int, bpf_prog_active);
+
int sysctl_unprivileged_bpf_disabled __read_mostly;
static LIST_HEAD(bpf_map_types);
@@ -46,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+int bpf_map_precharge_memlock(u32 pages)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit, cur;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ cur = atomic_long_read(&user->locked_vm);
+ free_uid(user);
+ if (cur + pages > memlock_limit)
+ return -EPERM;
+ return 0;
+}
+
static int bpf_map_charge_memlock(struct bpf_map *map)
{
struct user_struct *user = get_current_user();
@@ -113,8 +128,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
return 0;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct bpf_map *map = filp->private_data;
+
+ seq_printf(m,
+ "map_type:\t%u\n"
+ "key_size:\t%u\n"
+ "value_size:\t%u\n"
+ "max_entries:\t%u\n",
+ map->map_type,
+ map->key_size,
+ map->value_size,
+ map->max_entries);
+}
+#endif
+
static const struct file_operations bpf_map_fops = {
- .release = bpf_map_release,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_map_show_fdinfo,
+#endif
+ .release = bpf_map_release,
};
int bpf_map_new_fd(struct bpf_map *map)
@@ -131,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD max_entries
+#define BPF_MAP_CREATE_LAST_FIELD map_flags
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -209,6 +244,11 @@ static void __user *u64_to_ptr(__u64 val)
return (void __user *) (unsigned long) val;
}
+int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+{
+ return -ENOTSUPP;
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
@@ -219,6 +259,7 @@ static int map_lookup_elem(union bpf_attr *attr)
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value, *ptr;
+ u32 value_size;
struct fd f;
int err;
@@ -239,23 +280,37 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else
+ value_size = map->value_size;
+
err = -ENOMEM;
- value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
- rcu_read_lock();
- ptr = map->ops->map_lookup_elem(map, key);
- if (ptr)
- memcpy(value, ptr, map->value_size);
- rcu_read_unlock();
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ err = bpf_percpu_hash_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ err = bpf_stackmap_copy(map, key, value);
+ } else {
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (ptr)
+ memcpy(value, ptr, value_size);
+ rcu_read_unlock();
+ err = ptr ? 0 : -ENOENT;
+ }
- err = -ENOENT;
- if (!ptr)
+ if (err)
goto free_value;
err = -EFAULT;
- if (copy_to_user(uvalue, value, map->value_size) != 0)
+ if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
err = 0;
@@ -278,6 +333,7 @@ static int map_update_elem(union bpf_attr *attr)
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
+ u32 value_size;
struct fd f;
int err;
@@ -298,21 +354,37 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else
+ value_size = map->value_size;
+
err = -ENOMEM;
- value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
err = -EFAULT;
- if (copy_from_user(value, uvalue, map->value_size) != 0)
+ if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
- /* eBPF program that use maps are running under rcu_read_lock(),
- * therefore all map accessors rely on this fact, so do the same here
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+ * inside bpf map update or delete otherwise deadlocks are possible
*/
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- rcu_read_unlock();
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ err = bpf_percpu_hash_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_update(map, key, value, attr->flags);
+ } else {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+ }
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
free_value:
kfree(value);
@@ -351,9 +423,13 @@ static int map_delete_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
free_key:
kfree(key);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a7945d10b378..2e08f8e9b771 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -246,6 +246,7 @@ static const struct {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
+ {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid},
};
static void print_verifier_state(struct verifier_env *env)
@@ -778,15 +779,24 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
* bytes from that pointer, make sure that it's within stack boundary
* and all elements of stack are initialized
*/
-static int check_stack_boundary(struct verifier_env *env,
- int regno, int access_size)
+static int check_stack_boundary(struct verifier_env *env, int regno,
+ int access_size, bool zero_size_allowed)
{
struct verifier_state *state = &env->cur_state;
struct reg_state *regs = state->regs;
int off, i;
- if (regs[regno].type != PTR_TO_STACK)
+ if (regs[regno].type != PTR_TO_STACK) {
+ if (zero_size_allowed && access_size == 0 &&
+ regs[regno].type == CONST_IMM &&
+ regs[regno].imm == 0)
+ return 0;
+
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[regs[regno].type],
+ reg_type_str[PTR_TO_STACK]);
return -EACCES;
+ }
off = regs[regno].imm;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
@@ -829,15 +839,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return 0;
}
- if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ } else if (arg_type == ARG_CONST_STACK_SIZE ||
+ arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
expected_type = CONST_IMM;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
} else if (arg_type == ARG_PTR_TO_CTX) {
expected_type = PTR_TO_CTX;
+ } else if (arg_type == ARG_PTR_TO_STACK) {
+ expected_type = PTR_TO_STACK;
+ /* One exception here. In case function allows for NULL to be
+ * passed in as argument, it's a CONST_IMM type. Final test
+ * happens during stack boundary checking.
+ */
+ if (reg->type == CONST_IMM && reg->imm == 0)
+ expected_type = CONST_IMM;
} else {
verbose("unsupported arg_type %d\n", arg_type);
return -EFAULT;
@@ -867,8 +886,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno, (*mapp)->key_size);
-
+ err = check_stack_boundary(env, regno, (*mapp)->key_size,
+ false);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
@@ -878,9 +897,12 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->value\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno, (*mapp)->value_size);
+ err = check_stack_boundary(env, regno, (*mapp)->value_size,
+ false);
+ } else if (arg_type == ARG_CONST_STACK_SIZE ||
+ arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+ bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
- } else if (arg_type == ARG_CONST_STACK_SIZE) {
/* bpf_xxx(..., buf, len) call will access 'len' bytes
* from stack pointer 'buf'. Check it
* note: regno == len, regno - 1 == buf
@@ -890,7 +912,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno - 1, reg->imm);
+ err = check_stack_boundary(env, regno - 1, reg->imm,
+ zero_size_allowed);
}
return err;
@@ -911,8 +934,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
* don't allow any other map type to be passed into
* the special func;
*/
- if (bool_func && bool_map != bool_func)
+ if (bool_func && bool_map != bool_func) {
+ verbose("cannot pass map_type %d into func %d\n",
+ map->map_type, func_id);
return -EINVAL;
+ }
}
return 0;
@@ -1121,6 +1147,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if ((opcode == BPF_LSH || opcode == BPF_RSH ||
+ opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
+ int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
+
+ if (insn->imm < 0 || insn->imm >= size) {
+ verbose("invalid shift %d\n", insn->imm);
+ return -EINVAL;
+ }
+ }
+
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
regs[insn->dst_reg].type == FRAME_PTR &&
@@ -2072,7 +2108,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
/* adjust offset of jmps if necessary */
if (i < pos && i + insn->off + 1 > pos)
insn->off += delta;
- else if (i > pos && i + insn->off + 1 < pos)
+ else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
insn->off -= delta;
}
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 470f6536b9e8..3fe02c152799 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,8 +57,9 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
#include <linux/delay.h>
-
#include <linux/atomic.h>
+#include <linux/cpuset.h>
+#include <net/sock.h>
/*
* pidlists linger the following amount before being destroyed. The goal
@@ -177,10 +178,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
-static bool cgrp_dfl_root_visible;
+static bool cgrp_dfl_visible;
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
/* some controllers are not supported in the default hierarchy */
-static unsigned long cgrp_dfl_root_inhibit_ss_mask;
+static u16 cgrp_dfl_inhibit_ss_mask;
+
+/* some controllers are implicitly enabled on the default hierarchy */
+static unsigned long cgrp_dfl_implicit_ss_mask;
/* The list of hierarchy roots */
@@ -204,22 +211,25 @@ static u64 css_serial_nr_next = 1;
* fork/exit handlers to call. This avoids us having to do extra work in the
* fork/exit path to check which subsystems have fork/exit callbacks.
*/
-static unsigned long have_fork_callback __read_mostly;
-static unsigned long have_exit_callback __read_mostly;
-static unsigned long have_free_callback __read_mostly;
+static u16 have_fork_callback __read_mostly;
+static u16 have_exit_callback __read_mostly;
+static u16 have_free_callback __read_mostly;
/* Ditto for the can_fork callback. */
-static unsigned long have_canfork_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
+static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
-static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned long ss_mask);
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- bool visible);
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
@@ -236,9 +246,17 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
static bool cgroup_ssid_enabled(int ssid)
{
+ if (CGROUP_SUBSYS_COUNT == 0)
+ return false;
+
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
+static bool cgroup_ssid_no_v1(int ssid)
+{
+ return cgroup_no_v1_mask & (1 << ssid);
+}
+
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
@@ -337,6 +355,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp)
return NULL;
}
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ u16 root_ss_mask = cgrp->root->subsys_mask;
+
+ if (parent)
+ return parent->subtree_control;
+
+ if (cgroup_on_dfl(cgrp))
+ root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+ cgrp_dfl_implicit_ss_mask);
+ return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ if (parent)
+ return parent->subtree_ss_mask;
+
+ return cgrp->root->subsys_mask;
+}
+
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
@@ -376,16 +420,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
if (!ss)
return &cgrp->self;
- if (!(cgrp->root->subsys_mask & (1 << ss->id)))
- return NULL;
-
/*
* This function is used while updating css associations and thus
- * can't test the csses directly. Use ->child_subsys_mask.
+ * can't test the csses directly. Test ss_mask.
*/
- while (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
cgrp = cgroup_parent(cgrp);
+ if (!cgrp)
+ return NULL;
+ }
return cgroup_css(cgrp, ss);
}
@@ -440,11 +483,6 @@ static bool cgroup_tryget(struct cgroup *cgrp)
return css_tryget(&cgrp->self);
}
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -465,25 +503,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
}
EXPORT_SYMBOL_GPL(of_css);
-/**
- * cgroup_is_descendant - test ancestry
- * @cgrp: the cgroup to be tested
- * @ancestor: possible ancestor of @cgrp
- *
- * Test whether @cgrp is a descendant of @ancestor. It also returns %true
- * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
- * and @ancestor are accessible.
- */
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
-{
- while (cgrp) {
- if (cgrp == ancestor)
- return true;
- cgrp = cgroup_parent(cgrp);
- }
- return false;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -528,22 +547,28 @@ static int notify_on_release(const struct cgroup *cgrp)
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
/**
- * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- * @ss_maskp: a pointer to the bitmask
+ * @ss_mask: the bitmask
*
* The block will only run for cases where the ssid-th bit (1 << ssid) of
- * mask is set to 1.
+ * @ss_mask is set.
*/
-#define for_each_subsys_which(ss, ssid, ss_maskp) \
- if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \
+#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
+ unsigned long __ss_mask = (ss_mask); \
+ if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
(ssid) = 0; \
- else \
- for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
- if (((ss) = cgroup_subsys[ssid]) && false) \
- break; \
- else
+ break; \
+ } \
+ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
+ (ss) = cgroup_subsys[ssid]; \
+ {
+
+#define while_each_subsys_mask() \
+ } \
+ } \
+} while (false)
/* iterate across the hierarchies */
#define for_each_root(root) \
@@ -557,6 +582,24 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
+/* walk live descendants in preorder */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
+ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
+ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
static void cgroup_release_agent(struct work_struct *work);
static void check_for_release(struct cgroup *cgrp);
@@ -687,6 +730,9 @@ static void css_set_move_task(struct task_struct *task,
{
lockdep_assert_held(&css_set_lock);
+ if (to_cset && !css_set_populated(to_cset))
+ css_set_update_populated(to_cset, true);
+
if (from_cset) {
struct css_task_iter *it, *pos;
@@ -720,8 +766,6 @@ static void css_set_move_task(struct task_struct *task,
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
- if (!css_set_populated(to_cset))
- css_set_update_populated(to_cset, true);
rcu_assign_pointer(task->cgroups, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
@@ -1124,13 +1168,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
/* Rebind all subsystems back to the default hierarchy */
- rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
/*
* Release all the links from cset_links to this hierarchy's
@@ -1270,46 +1314,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
}
/**
- * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
- * @cgrp: the target cgroup
+ * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
* @subtree_control: the new subtree_control mask to consider
+ * @this_ss_mask: available subsystems
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
* This function calculates which subsystems need to be enabled if
- * @subtree_control is to be applied to @cgrp. The returned mask is always
- * a superset of @subtree_control and follows the usual hierarchy rules.
+ * @subtree_control is to be applied while restricted to @this_ss_mask.
*/
-static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
- unsigned long subtree_control)
+static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
- struct cgroup *parent = cgroup_parent(cgrp);
- unsigned long cur_ss_mask = subtree_control;
+ u16 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
- if (!cgroup_on_dfl(cgrp))
- return cur_ss_mask;
+ cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
- unsigned long new_ss_mask = cur_ss_mask;
+ u16 new_ss_mask = cur_ss_mask;
- for_each_subsys_which(ss, ssid, &cur_ss_mask)
+ do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
+ } while_each_subsys_mask();
/*
* Mask out subsystems which aren't available. This can
* happen only if some depended-upon subsystems were bound
* to non-default hierarchies.
*/
- if (parent)
- new_ss_mask &= parent->child_subsys_mask;
- else
- new_ss_mask &= cgrp->root->subsys_mask;
+ new_ss_mask &= this_ss_mask;
if (new_ss_mask == cur_ss_mask)
break;
@@ -1320,19 +1358,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
}
/**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
- * @cgrp: the target cgroup
- *
- * Update @cgrp->child_subsys_mask according to the current
- * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
- */
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
-{
- cgrp->child_subsys_mask =
- cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
-}
-
-/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
@@ -1360,19 +1385,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
/**
* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
+ * @drain_offline: perform offline draining on the cgroup
*
* This helper is to be used by a cgroup kernfs method currently servicing
* @kn. It breaks the active protection, performs cgroup locking and
* verifies that the associated cgroup is alive. Returns the cgroup if
* alive; otherwise, %NULL. A successful return should be undone by a
- * matching cgroup_kn_unlock() invocation.
+ * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
+ * cgroup is drained of offlining csses before return.
*
* Any cgroup kernfs method implementation which requires locking the
* associated cgroup should use this helper. It avoids nesting cgroup
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
+ bool drain_offline)
{
struct cgroup *cgrp;
@@ -1391,7 +1419,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
return NULL;
kernfs_break_active_protection(kn);
- mutex_lock(&cgroup_mutex);
+ if (drain_offline)
+ cgroup_lock_and_drain_offline(cgrp);
+ else
+ mutex_lock(&cgroup_mutex);
if (!cgroup_is_dead(cgrp))
return cgrp;
@@ -1421,14 +1452,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
/**
* css_clear_dir - remove subsys files in a cgroup directory
* @css: taget css
- * @cgrp_override: specify if target cgroup is different from css->cgroup
*/
-static void css_clear_dir(struct cgroup_subsys_state *css,
- struct cgroup *cgrp_override)
+static void css_clear_dir(struct cgroup_subsys_state *css)
{
- struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cgroup *cgrp = css->cgroup;
struct cftype *cfts;
+ if (!(css->flags & CSS_VISIBLE))
+ return;
+
+ css->flags &= ~CSS_VISIBLE;
+
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
}
@@ -1436,17 +1470,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css,
/**
* css_populate_dir - create subsys files in a cgroup directory
* @css: target css
- * @cgrp_overried: specify if target cgroup is different from css->cgroup
*
* On failure, no file is added.
*/
-static int css_populate_dir(struct cgroup_subsys_state *css,
- struct cgroup *cgrp_override)
+static int css_populate_dir(struct cgroup_subsys_state *css)
{
- struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cgroup *cgrp = css->cgroup;
struct cftype *cfts, *failed_cfts;
int ret;
+ if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+ return 0;
+
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
cfts = cgroup_dfl_base_files;
@@ -1463,6 +1498,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css,
goto err;
}
}
+
+ css->flags |= CSS_VISIBLE;
+
return 0;
err:
list_for_each_entry(cfts, &css->ss->cfts, node) {
@@ -1473,67 +1511,30 @@ err:
return ret;
}
-static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned long ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- unsigned long tmp_ss_mask;
int ssid, i, ret;
lockdep_assert_held(&cgroup_mutex);
- for_each_subsys_which(ss, ssid, &ss_mask) {
- /* if @ss has non-root csses attached to it, can't move */
- if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ do_each_subsys_mask(ss, ssid, ss_mask) {
+ /*
+ * If @ss has non-root csses attached to it, can't move.
+ * If @ss is an implicit controller, it is exempt from this
+ * rule and can be stolen.
+ */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
+ !ss->implicit_on_dfl)
return -EBUSY;
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
- }
+ } while_each_subsys_mask();
- /* skip creating root files on dfl_root for inhibited subsystems */
- tmp_ss_mask = ss_mask;
- if (dst_root == &cgrp_dfl_root)
- tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
-
- for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
- struct cgroup *scgrp = &ss->root->cgrp;
- int tssid;
-
- ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
- if (!ret)
- continue;
-
- /*
- * Rebinding back to the default root is not allowed to
- * fail. Using both default and non-default roots should
- * be rare. Moving subsystems back and forth even more so.
- * Just warn about it and continue.
- */
- if (dst_root == &cgrp_dfl_root) {
- if (cgrp_dfl_root_visible) {
- pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
- ret, ss_mask);
- pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
- }
- continue;
- }
-
- for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
- if (tssid == ssid)
- break;
- css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
- }
- return ret;
- }
-
- /*
- * Nothing can fail from this point on. Remove files for the
- * removed subsystems and rebind each subsystem.
- */
- for_each_subsys_which(ss, ssid, &ss_mask) {
+ do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
@@ -1541,8 +1542,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
WARN_ON(!css || cgroup_css(dcgrp, ss));
- css_clear_dir(css, NULL);
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ /* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
@@ -1554,23 +1559,23 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
&dcgrp->e_csets[ss->id]);
spin_unlock_bh(&css_set_lock);
- src_root->subsys_mask &= ~(1 << ssid);
- scgrp->subtree_control &= ~(1 << ssid);
- cgroup_refresh_child_subsys_mask(scgrp);
-
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
} else {
dcgrp->subtree_control |= 1 << ssid;
- cgroup_refresh_child_subsys_mask(dcgrp);
static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
}
+ ret = cgroup_apply_control(dcgrp);
+ if (ret)
+ pr_warn("partial failure to rebind %s controller (err=%d)\n",
+ ss->name, ret);
+
if (ss->bind)
ss->bind(css);
- }
+ } while_each_subsys_mask();
kernfs_activate(dcgrp->kn);
return 0;
@@ -1606,7 +1611,7 @@ static int cgroup_show_options(struct seq_file *seq,
}
struct cgroup_sb_opts {
- unsigned long subsys_mask;
+ u16 subsys_mask;
unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
@@ -1619,13 +1624,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned long mask = -1UL;
+ u16 mask = U16_MAX;
struct cgroup_subsys *ss;
int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
- mask = ~(1U << cpuset_cgrp_id);
+ mask = ~((u16)1 << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -1647,10 +1652,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
- if (!strcmp(token, "__DEVEL__sane_behavior")) {
- opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
- continue;
- }
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
@@ -1704,6 +1705,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
if (!cgroup_ssid_enabled(i))
continue;
+ if (cgroup_ssid_no_v1(i))
+ continue;
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
@@ -1717,15 +1720,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
- pr_err("sane_behavior: no other mount options allowed\n");
- return -EINVAL;
- }
- return 0;
- }
-
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
@@ -1733,7 +1727,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i))
+ if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
opts->subsys_mask |= (1 << i);
/*
@@ -1763,14 +1757,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
int ret = 0;
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
- unsigned long added_mask, removed_mask;
+ u16 added_mask, removed_mask;
if (root == &cgrp_dfl_root) {
pr_err("remount is not allowed\n");
return -EINVAL;
}
- mutex_lock(&cgroup_mutex);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
@@ -1803,7 +1797,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
if (ret)
goto out_unlock;
- rebind_subsystems(&cgrp_dfl_root, removed_mask);
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
if (opts.release_agent) {
spin_lock(&release_agent_path_lock);
@@ -1911,7 +1905,7 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1924,6 +1918,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
if (ret < 0)
goto out;
root_cgrp->id = ret;
+ root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
GFP_KERNEL);
@@ -1933,10 +1928,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
/*
* We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding
- * cgroup_lock, and that's us. The worst that can happen is that we
- * have some link structures left over
+ * cgroup_lock, and that's us. Later rebinding may disable
+ * controllers on the default hierarchy and thus create new csets,
+ * which can't be more than the existing ones. Allocate 2x.
*/
- ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
if (ret)
goto cancel_ref;
@@ -1953,7 +1949,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- ret = css_populate_dir(&root_cgrp->self, NULL);
+ ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
@@ -2004,6 +2000,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
struct cgroup_root *root;
@@ -2020,22 +2017,24 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
- mutex_lock(&cgroup_mutex);
+ if (is_v2) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ goto out_mount;
+ }
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
- /* look for a matching existing root */
- if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
- cgrp_dfl_root_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- ret = 0;
- goto out_unlock;
- }
-
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
@@ -2146,9 +2145,10 @@ out_free:
if (ret)
return ERR_PTR(ret);
-
+out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
- CGROUP_SUPER_MAGIC, &new_sb);
+ is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+ &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
@@ -2191,6 +2191,12 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
+static struct file_system_type cgroup2_fs_type = {
+ .name = "cgroup2",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
+
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -2362,38 +2368,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
- * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * cgroup_taskset_migrate - migrate a taskset
* @tset: taget taskset
- * @dst_cgrp: destination cgroup
+ * @root: cgroup root the migration is taking place on
*
- * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
- * ->can_attach callbacks fails and guarantees that either all or none of
- * the tasks in @tset are migrated. @tset is consumed regardless of
- * success.
+ * Migrate tasks in @tset as setup by migration preparation functions.
+ * This function fails iff one of the ->can_attach callbacks fails and
+ * guarantees that either all or none of the tasks in @tset are migrated.
+ * @tset is consumed regardless of success.
*/
static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
- struct cgroup *dst_cgrp)
+ struct cgroup_root *root)
{
- struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset;
- int i, ret;
+ int ssid, failed_ssid, ret;
/* methods shouldn't be called if no task is actually migrating */
if (list_empty(&tset->src_csets))
return 0;
/* check that we can legitimately attach to the cgroup */
- for_each_e_css(css, i, dst_cgrp) {
- if (css->ss->can_attach) {
- tset->ssid = i;
- ret = css->ss->can_attach(tset);
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
if (ret) {
- failed_css = css;
+ failed_ssid = ssid;
goto out_cancel_attach;
}
}
- }
+ } while_each_subsys_mask();
/*
* Now that we're guaranteed success, proceed to move all tasks to
@@ -2420,25 +2426,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
*/
tset->csets = &tset->dst_csets;
- for_each_e_css(css, i, dst_cgrp) {
- if (css->ss->attach) {
- tset->ssid = i;
- css->ss->attach(tset);
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
}
- }
+ } while_each_subsys_mask();
ret = 0;
goto out_release_tset;
out_cancel_attach:
- for_each_e_css(css, i, dst_cgrp) {
- if (css == failed_css)
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ssid == failed_ssid)
break;
- if (css->ss->cancel_attach) {
- tset->ssid = i;
- css->ss->cancel_attach(tset);
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
}
- }
+ } while_each_subsys_mask();
out_release_tset:
spin_lock_bh(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2451,6 +2457,20 @@ out_release_tset:
}
/**
+ * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * @dst_cgrp: destination cgroup to test
+ *
+ * On the default hierarchy, except for the root, subtree_control must be
+ * zero for migration destination cgroups with tasks so that child cgroups
+ * don't compete against tasks.
+ */
+static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+{
+ return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
+ !dst_cgrp->subtree_control;
+}
+
+/**
* cgroup_migrate_finish - cleanup after attach
* @preloaded_csets: list of preloaded css_sets
*
@@ -2466,6 +2486,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
spin_lock_bh(&css_set_lock);
list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
@@ -2498,58 +2519,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
+ /*
+ * If ->dead, @src_set is associated with one or more dead cgroups
+ * and doesn't contain any migratable tasks. Ignore it early so
+ * that the rest of migration path doesn't get confused by it.
+ */
+ if (src_cset->dead)
+ return;
+
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
if (!list_empty(&src_cset->mg_preload_node))
return;
WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
WARN_ON(!list_empty(&src_cset->mg_node));
src_cset->mg_src_cgrp = src_cgrp;
+ src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
list_add(&src_cset->mg_preload_node, preloaded_csets);
}
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup (may be %NULL)
* @preloaded_csets: list of preloaded source css_sets
*
- * Tasks are about to be moved to @dst_cgrp and all the source css_sets
- * have been preloaded to @preloaded_csets. This function looks up and
- * pins all destination css_sets, links each to its source, and append them
- * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
- * source css_set is assumed to be its cgroup on the default hierarchy.
+ * Tasks are about to be moved and all the source css_sets have been
+ * preloaded to @preloaded_csets. This function looks up and pins all
+ * destination css_sets, links each to its source, and append them to
+ * @preloaded_csets.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
* @preloaded_csets.
*/
-static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
- struct list_head *preloaded_csets)
+static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
{
LIST_HEAD(csets);
struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
- /*
- * Except for the root, child_subsys_mask must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
- dst_cgrp->child_subsys_mask)
- return -EBUSY;
-
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
struct css_set *dst_cset;
- dst_cset = find_css_set(src_cset,
- dst_cgrp ?: src_cset->dfl_cgrp);
+ dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
goto err;
@@ -2562,6 +2581,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
*/
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
+ src_cset->mg_dst_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
@@ -2587,11 +2607,11 @@ err:
* cgroup_migrate - migrate a process or task to a cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
- * @cgrp: the destination cgroup
+ * @root: cgroup root migration is taking place on
*
- * Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding cgroup_threadgroup_rwsem. The
- * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * Migrate a process or task denoted by @leader. If migrating a process,
+ * the caller must be holding cgroup_threadgroup_rwsem. The caller is also
+ * responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
*
@@ -2602,7 +2622,7 @@ err:
* actually starting migrating.
*/
static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
- struct cgroup *cgrp)
+ struct cgroup_root *root)
{
struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
struct task_struct *task;
@@ -2623,7 +2643,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
rcu_read_unlock();
spin_unlock_bh(&css_set_lock);
- return cgroup_taskset_migrate(&tset, cgrp);
+ return cgroup_taskset_migrate(&tset, root);
}
/**
@@ -2641,6 +2661,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *task;
int ret;
+ if (!cgroup_may_migrate_to(dst_cgrp))
+ return -EBUSY;
+
/* look up all src csets */
spin_lock_bh(&css_set_lock);
rcu_read_lock();
@@ -2655,9 +2678,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
spin_unlock_bh(&css_set_lock);
/* prepare dst csets and commit */
- ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (!ret)
- ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
+ ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
cgroup_migrate_finish(&preloaded_csets);
return ret;
@@ -2720,7 +2743,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL;
- cgrp = cgroup_kn_lock_live(of->kn);
+ cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
@@ -2764,6 +2787,7 @@ out_unlock_rcu:
out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
cgroup_kn_unlock(of->kn);
+ cpuset_post_attach_flush();
return ret ?: nbytes;
}
@@ -2817,7 +2841,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
- cgrp = cgroup_kn_lock_live(of->kn);
+ cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
spin_lock(&release_agent_path_lock);
@@ -2845,38 +2869,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
return 0;
}
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
int ssid;
- for_each_subsys_which(ss, ssid, &ss_mask) {
+ do_each_subsys_mask(ss, ssid, ss_mask) {
if (printed)
seq_putc(seq, ' ');
seq_printf(seq, "%s", ss->name);
printed = true;
- }
+ } while_each_subsys_mask();
if (printed)
seq_putc(seq, '\n');
}
-/* show controllers which are currently attached to the default hierarchy */
-static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
- ~cgrp_dfl_root_inhibit_ss_mask);
- return 0;
-}
-
/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
+ cgroup_print_ss_mask(seq, cgroup_control(cgrp));
return 0;
}
@@ -2893,16 +2907,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
* cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
* @cgrp: root of the subtree to update csses for
*
- * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
- * css associations need to be updated accordingly. This function looks up
- * all css_sets which are attached to the subtree, creates the matching
- * updated css_sets and migrates the tasks to the new ones.
+ * @cgrp's control masks have changed and its subtree's css associations
+ * need to be updated accordingly. This function looks up all css_sets
+ * which are attached to the subtree, creates the matching updated css_sets
+ * and migrates the tasks to the new ones.
*/
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
LIST_HEAD(preloaded_csets);
struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
- struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup *dsct;
struct css_set *src_cset;
int ret;
@@ -2912,21 +2927,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_bh(&css_set_lock);
- css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
- /* self is not affected by child_subsys_mask change */
- if (css->cgroup == cgrp)
- continue;
-
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, cgrp,
+ list_for_each_entry(link, &dsct->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, dsct,
&preloaded_csets);
}
spin_unlock_bh(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
- ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (ret)
goto out_finish;
@@ -2944,20 +2955,272 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
}
spin_unlock_bh(&css_set_lock);
- ret = cgroup_taskset_migrate(&tset, cgrp);
+ ret = cgroup_taskset_migrate(&tset, cgrp->root);
out_finish:
cgroup_migrate_finish(&preloaded_csets);
percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
+/**
+ * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
+ * @cgrp: root of the target subtree
+ *
+ * Because css offlining is asynchronous, userland may try to re-enable a
+ * controller while the previous css is still around. This function grabs
+ * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
+ */
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+ __acquires(&cgroup_mutex)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+restart:
+ mutex_lock(&cgroup_mutex);
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+ DEFINE_WAIT(wait);
+
+ if (!css || !percpu_ref_is_dying(&css->refcnt))
+ continue;
+
+ cgroup_get(dsct);
+ prepare_to_wait(&dsct->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ mutex_unlock(&cgroup_mutex);
+ schedule();
+ finish_wait(&dsct->offline_waitq, &wait);
+
+ cgroup_put(dsct);
+ goto restart;
+ }
+ }
+}
+
+/**
+ * cgroup_save_control - save control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Save ->subtree_control and ->subtree_ss_mask to the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_save_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ dsct->old_subtree_control = dsct->subtree_control;
+ dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+ }
+}
+
+/**
+ * cgroup_propagate_control - refresh control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
+ * ->subtree_control and propagate controller availability through the
+ * subtree so that descendants don't have unavailable controllers enabled.
+ */
+static void cgroup_propagate_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ dsct->subtree_control &= cgroup_control(dsct);
+ dsct->subtree_ss_mask =
+ cgroup_calc_subtree_ss_mask(dsct->subtree_control,
+ cgroup_ss_mask(dsct));
+ }
+}
+
+/**
+ * cgroup_restore_control - restore control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ */
+static void cgroup_restore_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ dsct->subtree_control = dsct->old_subtree_control;
+ dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+ }
+}
+
+static bool css_visible(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ if (cgroup_control(cgrp) & (1 << ss->id))
+ return true;
+ if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
+ return false;
+ return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
+}
+
+/**
+ * cgroup_apply_control_enable - enable or show csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and create new csses or make the existing ones
+ * visible. A css is created invisible if it's being implicitly enabled
+ * through dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
+ *
+ * Returns 0 on success, -errno on failure. On failure, csses which have
+ * been processed already aren't cleaned up. The caller is responsible for
+ * cleaning up with cgroup_apply_control_disble().
+ */
+static int cgroup_apply_control_enable(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid, ret;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+ WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+ if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
+ continue;
+
+ if (!css) {
+ css = css_create(dsct, ss);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+ }
+
+ if (css_visible(css)) {
+ ret = css_populate_dir(css);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * cgroup_apply_control_disable - kill or hide csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and kill and hide csses so that they match
+ * cgroup_ss_mask() and cgroup_visible_mask().
+ *
+ * A css is hidden when the userland requests it to be disabled while other
+ * subsystems are still depending on it. The css must not actively control
+ * resources and be in the vanilla state if it's made visible again later.
+ * Controllers which may be depended upon should provide ->css_reset() for
+ * this purpose.
+ */
+static void cgroup_apply_control_disable(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+ WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+ if (!css)
+ continue;
+
+ if (css->parent &&
+ !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
+ kill_css(css);
+ } else if (!css_visible(css)) {
+ css_clear_dir(css);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
+ }
+}
+
+/**
+ * cgroup_apply_control - apply control mask updates to the subtree
+ * @cgrp: root of the target subtree
+ *
+ * subsystems can be enabled and disabled in a subtree using the following
+ * steps.
+ *
+ * 1. Call cgroup_save_control() to stash the current state.
+ * 2. Update ->subtree_control masks in the subtree as desired.
+ * 3. Call cgroup_apply_control() to apply the changes.
+ * 4. Optionally perform other related operations.
+ * 5. Call cgroup_finalize_control() to finish up.
+ *
+ * This function implements step 3 and propagates the mask changes
+ * throughout @cgrp's subtree, updates csses accordingly and perform
+ * process migrations.
+ */
+static int cgroup_apply_control(struct cgroup *cgrp)
+{
+ int ret;
+
+ cgroup_propagate_control(cgrp);
+
+ ret = cgroup_apply_control_enable(cgrp);
+ if (ret)
+ return ret;
+
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * cgroup_finalize_control - finalize control mask update
+ * @cgrp: root of the target subtree
+ * @ret: the result of the update
+ *
+ * Finalize control mask update. See cgroup_apply_control() for more info.
+ */
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
+{
+ if (ret) {
+ cgroup_restore_control(cgrp);
+ cgroup_propagate_control(cgrp);
+ }
+
+ cgroup_apply_control_disable(cgrp);
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- unsigned long enable = 0, disable = 0;
- unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+ u16 enable = 0, disable = 0;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2969,11 +3232,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
*/
buf = strstrip(buf);
while ((tok = strsep(&buf, " "))) {
- unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
-
if (tok[0] == '\0')
continue;
- for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+ do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
if (!cgroup_ssid_enabled(ssid) ||
strcmp(tok + 1, ss->name))
continue;
@@ -2988,12 +3249,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
return -EINVAL;
}
break;
- }
+ } while_each_subsys_mask();
if (ssid == CGROUP_SUBSYS_COUNT)
return -EINVAL;
}
- cgrp = cgroup_kn_lock_live(of->kn);
+ cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENODEV;
@@ -3004,10 +3265,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
continue;
}
- /* unavailable or not enabled on the parent? */
- if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
- (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ if (!(cgroup_control(cgrp) & (1 << ssid))) {
ret = -ENOENT;
goto out_unlock;
}
@@ -3041,150 +3299,21 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
goto out_unlock;
}
- /*
- * Update subsys masks and calculate what needs to be done. More
- * subsystems than specified may need to be enabled or disabled
- * depending on subsystem dependencies.
- */
- old_sc = cgrp->subtree_control;
- old_ss = cgrp->child_subsys_mask;
- new_sc = (old_sc | enable) & ~disable;
- new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
+ /* save and update control masks and prepare csses */
+ cgroup_save_control(cgrp);
- css_enable = ~old_ss & new_ss;
- css_disable = old_ss & ~new_ss;
- enable |= css_enable;
- disable |= css_disable;
+ cgrp->subtree_control |= enable;
+ cgrp->subtree_control &= ~disable;
- /*
- * Because css offlining is asynchronous, userland might try to
- * re-enable the same controller while the previous instance is
- * still around. In such cases, wait till it's gone using
- * offline_waitq.
- */
- for_each_subsys_which(ss, ssid, &css_enable) {
- cgroup_for_each_live_child(child, cgrp) {
- DEFINE_WAIT(wait);
-
- if (!cgroup_css(child, ss))
- continue;
-
- cgroup_get(child);
- prepare_to_wait(&child->offline_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- cgroup_kn_unlock(of->kn);
- schedule();
- finish_wait(&child->offline_waitq, &wait);
- cgroup_put(child);
-
- return restart_syscall();
- }
- }
+ ret = cgroup_apply_control(cgrp);
- cgrp->subtree_control = new_sc;
- cgrp->child_subsys_mask = new_ss;
-
- /*
- * Create new csses or make the existing ones visible. A css is
- * created invisible if it's being implicitly enabled through
- * dependency. An invisible css is made visible when the userland
- * explicitly enables it.
- */
- for_each_subsys(ss, ssid) {
- if (!(enable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- if (css_enable & (1 << ssid))
- ret = create_css(child, ss,
- cgrp->subtree_control & (1 << ssid));
- else
- ret = css_populate_dir(cgroup_css(child, ss),
- NULL);
- if (ret)
- goto err_undo_css;
- }
- }
-
- /*
- * At this point, cgroup_e_css() results reflect the new csses
- * making the following cgroup_update_dfl_csses() properly update
- * css associations of all tasks in the subtree.
- */
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- goto err_undo_css;
-
- /*
- * All tasks are migrated out of disabled csses. Kill or hide
- * them. A css is hidden when the userland requests it to be
- * disabled while other subsystems are still depending on it. The
- * css must not actively control resources and be in the vanilla
- * state if it's made visible again later. Controllers which may
- * be depended upon should provide ->css_reset() for this purpose.
- */
- for_each_subsys(ss, ssid) {
- if (!(disable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
- if (css_disable & (1 << ssid)) {
- kill_css(css);
- } else {
- css_clear_dir(css, NULL);
- if (ss->css_reset)
- ss->css_reset(css);
- }
- }
- }
-
- /*
- * The effective csses of all the descendants (excluding @cgrp) may
- * have changed. Subsystems can optionally subscribe to this event
- * by implementing ->css_e_css_changed() which is invoked if any of
- * the effective csses seen from the css's cgroup may have changed.
- */
- for_each_subsys(ss, ssid) {
- struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
- struct cgroup_subsys_state *css;
-
- if (!ss->css_e_css_changed || !this_css)
- continue;
-
- css_for_each_descendant_pre(css, this_css)
- if (css != this_css)
- ss->css_e_css_changed(css);
- }
+ cgroup_finalize_control(cgrp, ret);
kernfs_activate(cgrp->kn);
ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
-
-err_undo_css:
- cgrp->subtree_control = old_sc;
- cgrp->child_subsys_mask = old_ss;
-
- for_each_subsys(ss, ssid) {
- if (!(enable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
- if (!css)
- continue;
-
- if (css_enable & (1 << ssid))
- kill_css(css);
- else
- css_clear_dir(css, NULL);
- }
- }
- goto out_unlock;
}
static int cgroup_events_show(struct seq_file *seq, void *v)
@@ -3382,7 +3511,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
bool is_add)
{
struct cftype *cft, *cft_end = NULL;
- int ret;
+ int ret = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -3411,7 +3540,7 @@ restart:
cgroup_rm_file(cgrp, cft);
}
}
- return 0;
+ return ret;
}
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
@@ -3428,7 +3557,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
- if (cgroup_is_dead(cgrp))
+ if (!(css->flags & CSS_VISIBLE))
continue;
ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
@@ -4049,6 +4178,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
struct task_struct *task;
int ret;
+ if (!cgroup_may_migrate_to(to))
+ return -EBUSY;
+
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
@@ -4057,12 +4189,12 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
spin_unlock_bh(&css_set_lock);
- ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (ret)
goto out_err;
/*
- * Migrate tasks one-by-one until @form is empty. This fails iff
+ * Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
@@ -4073,7 +4205,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
css_task_iter_end(&it);
if (task) {
- ret = cgroup_migrate(task, false, to);
+ ret = cgroup_migrate(task, false, to->root);
put_task_struct(task);
}
} while (task && !ret);
@@ -4580,12 +4712,6 @@ static struct cftype cgroup_dfl_base_files[] = {
},
{
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_root_controllers_show,
- },
- {
- .name = "cgroup.controllers",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_controllers_show,
},
{
@@ -4680,14 +4806,15 @@ static void css_free_work_fn(struct work_struct *work)
if (ss) {
/* css free path */
+ struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
- if (css->parent)
- css_put(css->parent);
-
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
+
+ if (parent)
+ css_put(parent);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
@@ -4753,7 +4880,9 @@ static void css_release_work_fn(struct work_struct *work)
* Those are supported by RCU protecting clearing of
* cgrp->kn->priv backpointer.
*/
- RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+ if (cgrp->kn)
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
+ NULL);
}
mutex_unlock(&cgroup_mutex);
@@ -4783,6 +4912,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
css->serial_nr = css_serial_nr_next++;
+ atomic_set(&css->online_cnt, 0);
if (cgroup_parent(cgrp)) {
css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4805,6 +4935,10 @@ static int online_css(struct cgroup_subsys_state *css)
if (!ret) {
css->flags |= CSS_ONLINE;
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+ atomic_inc(&css->online_cnt);
+ if (css->parent)
+ atomic_inc(&css->parent->online_cnt);
}
return ret;
}
@@ -4819,6 +4953,9 @@ static void offline_css(struct cgroup_subsys_state *css)
if (!(css->flags & CSS_ONLINE))
return;
+ if (ss->css_reset)
+ ss->css_reset(css);
+
if (ss->css_offline)
ss->css_offline(css);
@@ -4829,17 +4966,16 @@ static void offline_css(struct cgroup_subsys_state *css)
}
/**
- * create_css - create a cgroup_subsys_state
+ * css_create - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
- * @visible: whether to create control knobs for the new css or not
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
- * css is online and installed in @cgrp with all interface files created if
- * @visible. Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp. This function doesn't create the
+ * interface files. Returns 0 on success, -errno on failure.
*/
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- bool visible)
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4850,7 +4986,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
css = ss->css_alloc(parent_css);
if (IS_ERR(css))
- return PTR_ERR(css);
+ return css;
init_and_link_css(css, ss, cgrp);
@@ -4863,12 +4999,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
goto err_free_percpu_ref;
css->id = err;
- if (visible) {
- err = css_populate_dir(css, NULL);
- if (err)
- goto err_free_id;
- }
-
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -4886,45 +5016,30 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
ss->warned_broken_hierarchy = true;
}
- return 0;
+ return css;
err_list_del:
list_del_rcu(&css->sibling);
- css_clear_dir(css, NULL);
-err_free_id:
cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
percpu_ref_exit(&css->refcnt);
err_free_css:
call_rcu(&css->rcu_head, css_free_rcu_fn);
- return err;
+ return ERR_PTR(err);
}
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+static struct cgroup *cgroup_create(struct cgroup *parent)
{
- struct cgroup *parent, *cgrp;
- struct cgroup_root *root;
- struct cgroup_subsys *ss;
- struct kernfs_node *kn;
- int ssid, ret;
-
- /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
- */
- if (strchr(name, '\n'))
- return -EINVAL;
-
- parent = cgroup_kn_lock_live(parent_kn);
- if (!parent)
- return -ENODEV;
- root = parent->root;
+ struct cgroup_root *root = parent->root;
+ struct cgroup *cgrp, *tcgrp;
+ int level = parent->level + 1;
+ int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
- if (!cgrp) {
- ret = -ENOMEM;
- goto out_unlock;
- }
+ cgrp = kzalloc(sizeof(*cgrp) +
+ sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
+ if (!cgrp)
+ return ERR_PTR(-ENOMEM);
ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
if (ret)
@@ -4944,6 +5059,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
cgrp->self.parent = &parent->self;
cgrp->root = root;
+ cgrp->level = level;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4951,20 +5070,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
- /* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
- if (IS_ERR(kn)) {
- ret = PTR_ERR(kn);
- goto out_free_id;
- }
- cgrp->kn = kn;
-
- /*
- * This extra ref will be put in cgroup_free_fn() and guarantees
- * that @cgrp->kn is always accessible.
- */
- kernfs_get(kn);
-
cgrp->self.serial_nr = css_serial_nr_next++;
/* allocation complete, commit to creation */
@@ -4978,51 +5083,90 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
*/
cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
- ret = cgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * subtree_control from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp))
+ cgrp->subtree_control = cgroup_control(cgrp);
- ret = css_populate_dir(&cgrp->self, NULL);
+ cgroup_propagate_control(cgrp);
+
+ /* @cgrp doesn't have dir yet so the following will only create csses */
+ ret = cgroup_apply_control_enable(cgrp);
if (ret)
goto out_destroy;
- /* let's create and online css's */
- for_each_subsys(ss, ssid) {
- if (parent->child_subsys_mask & (1 << ssid)) {
- ret = create_css(cgrp, ss,
- parent->subtree_control & (1 << ssid));
- if (ret)
- goto out_destroy;
- }
+ return cgrp;
+
+out_cancel_ref:
+ percpu_ref_exit(&cgrp->self.refcnt);
+out_free_cgrp:
+ kfree(cgrp);
+ return ERR_PTR(ret);
+out_destroy:
+ cgroup_destroy_locked(cgrp);
+ return ERR_PTR(ret);
+}
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct cgroup *parent, *cgrp;
+ struct kernfs_node *kn;
+ int ret;
+
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ parent = cgroup_kn_lock_live(parent_kn, false);
+ if (!parent)
+ return -ENODEV;
+
+ cgrp = cgroup_create(parent);
+ if (IS_ERR(cgrp)) {
+ ret = PTR_ERR(cgrp);
+ goto out_unlock;
}
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_destroy;
+ }
+ cgrp->kn = kn;
+
/*
- * On the default hierarchy, a child doesn't automatically inherit
- * subtree_control from the parent. Each is configured manually.
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
*/
- if (!cgroup_on_dfl(cgrp)) {
- cgrp->subtree_control = parent->subtree_control;
- cgroup_refresh_child_subsys_mask(cgrp);
- }
+ kernfs_get(kn);
+
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ ret = css_populate_dir(&cgrp->self);
+ if (ret)
+ goto out_destroy;
+
+ ret = cgroup_apply_control_enable(cgrp);
+ if (ret)
+ goto out_destroy;
+ /* let's create and online css's */
kernfs_activate(kn);
ret = 0;
goto out_unlock;
-out_free_id:
- cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
-out_cancel_ref:
- percpu_ref_exit(&cgrp->self.refcnt);
-out_free_cgrp:
- kfree(cgrp);
+out_destroy:
+ cgroup_destroy_locked(cgrp);
out_unlock:
cgroup_kn_unlock(parent_kn);
return ret;
-
-out_destroy:
- cgroup_destroy_locked(cgrp);
- goto out_unlock;
}
/*
@@ -5036,10 +5180,15 @@ static void css_killed_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
mutex_lock(&cgroup_mutex);
- offline_css(css);
- mutex_unlock(&cgroup_mutex);
- css_put(css);
+ do {
+ offline_css(css);
+ css_put(css);
+ /* @css can't go away while we're holding cgroup_mutex */
+ css = css->parent;
+ } while (css && atomic_dec_and_test(&css->online_cnt));
+
+ mutex_unlock(&cgroup_mutex);
}
/* css kill confirmation processing requires process context, bounce */
@@ -5048,8 +5197,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ if (atomic_dec_and_test(&css->online_cnt)) {
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+ }
}
/**
@@ -5069,7 +5220,7 @@ static void kill_css(struct cgroup_subsys_state *css)
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
*/
- css_clear_dir(css, NULL);
+ css_clear_dir(css);
/*
* Killing would put the base ref, but we need to keep it alive
@@ -5118,6 +5269,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup_subsys_state *css;
+ struct cgrp_cset_link *link;
int ssid;
lockdep_assert_held(&cgroup_mutex);
@@ -5138,11 +5290,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return -EBUSY;
/*
- * Mark @cgrp dead. This prevents further task migration and child
- * creation by disabling cgroup_lock_live_group().
+ * Mark @cgrp and the associated csets dead. The former prevents
+ * further task migration and child creation by disabling
+ * cgroup_lock_live_group(). The latter makes the csets ignored by
+ * the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
+ spin_lock_bh(&css_set_lock);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ link->cset->dead = true;
+ spin_unlock_bh(&css_set_lock);
+
/* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
kill_css(css);
@@ -5166,7 +5325,7 @@ static int cgroup_rmdir(struct kernfs_node *kn)
struct cgroup *cgrp;
int ret = 0;
- cgrp = cgroup_kn_lock_live(kn);
+ cgrp = cgroup_kn_lock_live(kn, false);
if (!cgrp)
return 0;
@@ -5188,7 +5347,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
- printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ pr_debug("Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
@@ -5256,7 +5415,7 @@ int __init cgroup_init_early(void)
for_each_subsys(ss, i) {
WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
- "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
ss->id, ss->name);
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
@@ -5273,7 +5432,7 @@ int __init cgroup_init_early(void)
return 0;
}
-static unsigned long cgroup_disable_mask __initdata;
+static u16 cgroup_disable_mask __initdata;
/**
* cgroup_init - cgroup initialization
@@ -5284,18 +5443,21 @@ static unsigned long cgroup_disable_mask __initdata;
int __init cgroup_init(void)
{
struct cgroup_subsys *ss;
- unsigned long key;
int ssid;
+ BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
mutex_lock(&cgroup_mutex);
- /* Add init_css_set to the hash table */
- key = css_set_hash(init_css_set.subsys);
- hash_add(css_set_table, &init_css_set.hlist, key);
+ /*
+ * Add init_css_set to the hash table so that dfl_root can link to
+ * it during init.
+ */
+ hash_add(css_set_table, &init_css_set.hlist,
+ css_set_hash(init_css_set.subsys));
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
@@ -5328,10 +5490,16 @@ int __init cgroup_init(void)
continue;
}
+ if (cgroup_ssid_no_v1(ssid))
+ printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
+ ss->name);
+
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
- if (!ss->dfl_cftypes)
- cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+ if (ss->implicit_on_dfl)
+ cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
+ else if (!ss->dfl_cftypes)
+ cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
@@ -5344,8 +5512,14 @@ int __init cgroup_init(void)
ss->bind(init_css_set.subsys[ssid]);
}
+ /* init_css_set.subsys[] has been updated, re-hash */
+ hash_del(&init_css_set.hlist);
+ hash_add(css_set_table, &init_css_set.hlist,
+ css_set_hash(init_css_set.subsys));
+
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
return 0;
@@ -5401,7 +5575,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -5489,19 +5663,6 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
-static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
- return &ss_priv[i - CGROUP_CANFORK_START];
- return NULL;
-}
-
-static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
-{
- void **private = subsys_canfork_priv_p(ss_priv, i);
- return private ? *private : NULL;
-}
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5524,17 +5685,16 @@ void cgroup_fork(struct task_struct *child)
* returns an error, the fork aborts with that error code. This allows for
* a cgroup subsystem to conditionally allow or deny new forks.
*/
-int cgroup_can_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+int cgroup_can_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i, j, ret;
- for_each_subsys_which(ss, i, &have_canfork_callback) {
- ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ do_each_subsys_mask(ss, i, have_canfork_callback) {
+ ret = ss->can_fork(child);
if (ret)
goto out_revert;
- }
+ } while_each_subsys_mask();
return 0;
@@ -5543,7 +5703,7 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ ss->cancel_fork(child);
}
return ret;
@@ -5556,15 +5716,14 @@ out_revert:
* This calls the cancel_fork() callbacks if a fork failed *after*
* cgroup_can_fork() succeded.
*/
-void cgroup_cancel_fork(struct task_struct *child,
- void *ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_cancel_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+ ss->cancel_fork(child);
}
/**
@@ -5577,8 +5736,7 @@ void cgroup_cancel_fork(struct task_struct *child,
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child,
- void *old_ss_priv[CGROUP_CANFORK_COUNT])
+void cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
@@ -5621,8 +5779,9 @@ void cgroup_post_fork(struct task_struct *child,
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
- for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
+ do_each_subsys_mask(ss, i, have_fork_callback) {
+ ss->fork(child);
+ } while_each_subsys_mask();
}
/**
@@ -5665,8 +5824,9 @@ void cgroup_exit(struct task_struct *tsk)
}
/* see cgroup_post_fork() for details */
- for_each_subsys_which(ss, i, &have_exit_callback)
+ do_each_subsys_mask(ss, i, have_exit_callback) {
ss->exit(tsk);
+ } while_each_subsys_mask();
}
void cgroup_free(struct task_struct *task)
@@ -5675,8 +5835,9 @@ void cgroup_free(struct task_struct *task)
struct cgroup_subsys *ss;
int ssid;
- for_each_subsys_which(ss, ssid, &have_free_callback)
+ do_each_subsys_mask(ss, ssid, have_free_callback) {
ss->free(task);
+ } while_each_subsys_mask();
put_css_set(cset);
}
@@ -5769,6 +5930,33 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+static int __init cgroup_no_v1(char *str)
+{
+ struct cgroup_subsys *ss;
+ char *token;
+ int i;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ if (!strcmp(token, "all")) {
+ cgroup_no_v1_mask = U16_MAX;
+ break;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ cgroup_no_v1_mask |= 1 << i;
+ }
+ }
+ return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5782,12 +5970,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys *ss)
{
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct file_system_type *s_type = dentry->d_sb->s_type;
struct cgroup_subsys_state *css = NULL;
struct cgroup *cgrp;
/* is @dentry a cgroup dir? */
- if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
- kernfs_type(kn) != KERNFS_DIR)
+ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
+ !kn || kernfs_type(kn) != KERNFS_DIR)
return ERR_PTR(-EBADF);
rcu_read_lock();
@@ -5822,6 +6011,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
}
+/**
+ * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
+ * @path: path on the default hierarchy
+ *
+ * Find the cgroup at @path on the default hierarchy, increment its
+ * reference count and return it. Returns pointer to the found cgroup on
+ * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
+ * if @path points to a non-directory.
+ */
+struct cgroup *cgroup_get_from_path(const char *path)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+
+ kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+ if (kn) {
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ cgrp = kn->priv;
+ cgroup_get(cgrp);
+ } else {
+ cgrp = ERR_PTR(-ENOTDIR);
+ }
+ kernfs_put(kn);
+ } else {
+ cgrp = ERR_PTR(-ENOENT);
+ }
+
+ mutex_unlock(&cgroup_mutex);
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+
+/*
+ * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+
+DEFINE_SPINLOCK(cgroup_sk_update_lock);
+static bool cgroup_sk_alloc_disabled __read_mostly;
+
+void cgroup_sk_alloc_disable(void)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+ pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+ cgroup_sk_alloc_disabled = true;
+}
+
+#else
+
+#define cgroup_sk_alloc_disabled false
+
+#endif
+
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+{
+ if (cgroup_sk_alloc_disabled)
+ return;
+
+ rcu_read_lock();
+
+ while (true) {
+ struct css_set *cset;
+
+ cset = task_css_set(current);
+ if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+ skcd->val = (unsigned long)cset->dfl_cgrp;
+ break;
+ }
+ cpu_relax();
+ }
+
+ rcu_read_unlock();
+}
+
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
+{
+ cgroup_put(sock_cgroup_ptr(skcd));
+}
+
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2d3df82c54f2..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task, void *private)
+static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index b50d5a167fda..303097b37429 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
- * succeded, otherwise -EAGAIN.
+ * succeeded, otherwise -EAGAIN.
*/
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
@@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on threadgroup_change_begin() held by the copy_process().
*/
-static int pids_can_fork(struct task_struct *task, void **priv_p)
+static int pids_can_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
@@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
return pids_try_charge(pids, 1);
}
-static void pids_cancel_fork(struct task_struct *task, void *priv)
+static void pids_cancel_fork(struct task_struct *task)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index d8560ee3bab7..9ad37b9e44a7 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -24,7 +24,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
-struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(context_tracking_enabled);
EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
@@ -191,7 +191,7 @@ void __init context_tracking_cpu_set(int cpu)
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
- static_key_slow_inc(&context_tracking_enabled);
+ static_branch_inc(&context_tracking_enabled);
}
if (initialized)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 85ff5e26e23b..6ea42e8da861 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -22,13 +22,88 @@
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/smpboot.h>
+
#include <trace/events/power.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpuhp.h>
#include "smpboot.h"
+/**
+ * cpuhp_cpu_state - Per cpu hotplug state storage
+ * @state: The current cpu state
+ * @target: The target state
+ * @thread: Pointer to the hotplug thread
+ * @should_run: Thread should execute
+ * @cb_stat: The state for a single callback (install/uninstall)
+ * @cb: Single callback function (install/uninstall)
+ * @result: Result of the operation
+ * @done: Signal completion to the issuer of the task
+ */
+struct cpuhp_cpu_state {
+ enum cpuhp_state state;
+ enum cpuhp_state target;
+#ifdef CONFIG_SMP
+ struct task_struct *thread;
+ bool should_run;
+ enum cpuhp_state cb_state;
+ int (*cb)(unsigned int cpu);
+ int result;
+ struct completion done;
+#endif
+};
+
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+
+/**
+ * cpuhp_step - Hotplug state machine step
+ * @name: Name of the step
+ * @startup: Startup function of the step
+ * @teardown: Teardown function of the step
+ * @skip_onerr: Do not invoke the functions on error rollback
+ * Will go away once the notifiers are gone
+ * @cant_stop: Bringup/teardown can't be stopped at this step
+ */
+struct cpuhp_step {
+ const char *name;
+ int (*startup)(unsigned int cpu);
+ int (*teardown)(unsigned int cpu);
+ bool skip_onerr;
+ bool cant_stop;
+};
+
+static DEFINE_MUTEX(cpuhp_state_mutex);
+static struct cpuhp_step cpuhp_bp_states[];
+static struct cpuhp_step cpuhp_ap_states[];
+
+/**
+ * cpuhp_invoke_callback _ Invoke the callbacks for a given state
+ * @cpu: The cpu for which the callback should be invoked
+ * @step: The step in the state machine
+ * @cb: The callback function to invoke
+ *
+ * Called from cpu hotplug and from the state register machinery
+ */
+static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step,
+ int (*cb)(unsigned int))
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret = 0;
+
+ if (cb) {
+ trace_cpuhp_enter(cpu, st->target, step, cb);
+ ret = cb(cpu);
+ trace_cpuhp_exit(cpu, st->state, step, ret);
+ }
+ return ret;
+}
+
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
+bool cpuhp_tasks_frozen;
+EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
/*
* The following two APIs (cpu_maps_update_begin/done) must be used when
@@ -207,31 +282,281 @@ int __register_cpu_notifier(struct notifier_block *nb)
return raw_notifier_chain_register(&cpu_chain, nb);
}
-static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call,
int *nr_calls)
{
+ unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ void *hcpu = (void *)(long)cpu;
+
int ret;
- ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+ ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call,
nr_calls);
return notifier_to_errno(ret);
}
-static int cpu_notify(unsigned long val, void *v)
+static int cpu_notify(unsigned long val, unsigned int cpu)
{
- return __cpu_notify(val, v, -1, NULL);
+ return __cpu_notify(val, cpu, -1, NULL);
}
-#ifdef CONFIG_HOTPLUG_CPU
+/* Notifier wrappers for transitioning to state machine */
+static int notify_prepare(unsigned int cpu)
+{
+ int nr_calls = 0;
+ int ret;
-static void cpu_notify_nofail(unsigned long val, void *v)
+ ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls);
+ if (ret) {
+ nr_calls--;
+ printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
+ __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL);
+ }
+ return ret;
+}
+
+static int notify_online(unsigned int cpu)
+{
+ cpu_notify(CPU_ONLINE, cpu);
+ return 0;
+}
+
+static int notify_starting(unsigned int cpu)
{
- BUG_ON(cpu_notify(val, v));
+ cpu_notify(CPU_STARTING, cpu);
+ return 0;
}
+
+static int bringup_wait_for_ap(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ wait_for_completion(&st->done);
+ return st->result;
+}
+
+static int bringup_cpu(unsigned int cpu)
+{
+ struct task_struct *idle = idle_thread_get(cpu);
+ int ret;
+
+ /* Arch-specific enabling code. */
+ ret = __cpu_up(cpu, idle);
+ if (ret) {
+ cpu_notify(CPU_UP_CANCELED, cpu);
+ return ret;
+ }
+ ret = bringup_wait_for_ap(cpu);
+ BUG_ON(!cpu_online(cpu));
+ return ret;
+}
+
+/*
+ * Hotplug state machine related functions
+ */
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st,
+ struct cpuhp_step *steps)
+{
+ for (st->state++; st->state < st->target; st->state++) {
+ struct cpuhp_step *step = steps + st->state;
+
+ if (!step->skip_onerr)
+ cpuhp_invoke_callback(cpu, st->state, step->startup);
+ }
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ struct cpuhp_step *steps, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ for (; st->state > target; st->state--) {
+ struct cpuhp_step *step = steps + st->state;
+
+ ret = cpuhp_invoke_callback(cpu, st->state, step->teardown);
+ if (ret) {
+ st->target = prev_state;
+ undo_cpu_down(cpu, st, steps);
+ break;
+ }
+ }
+ return ret;
+}
+
+static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st,
+ struct cpuhp_step *steps)
+{
+ for (st->state--; st->state > st->target; st->state--) {
+ struct cpuhp_step *step = steps + st->state;
+
+ if (!step->skip_onerr)
+ cpuhp_invoke_callback(cpu, st->state, step->teardown);
+ }
+}
+
+static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ struct cpuhp_step *steps, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ while (st->state < target) {
+ struct cpuhp_step *step;
+
+ st->state++;
+ step = steps + st->state;
+ ret = cpuhp_invoke_callback(cpu, st->state, step->startup);
+ if (ret) {
+ st->target = prev_state;
+ undo_cpu_up(cpu, st, steps);
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * The cpu hotplug threads manage the bringup and teardown of the cpus
+ */
+static void cpuhp_create(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ init_completion(&st->done);
+}
+
+static int cpuhp_should_run(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+
+ return st->should_run;
+}
+
+/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
+static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+ enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
+
+ return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target);
+}
+
+/* Execute the online startup callbacks. Used to be CPU_ONLINE */
+static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+ return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target);
+}
+
+/*
+ * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
+ * callbacks when a state gets [un]installed at runtime.
+ */
+static void cpuhp_thread_fun(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+ int ret = 0;
+
+ /*
+ * Paired with the mb() in cpuhp_kick_ap_work and
+ * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+ */
+ smp_mb();
+ if (!st->should_run)
+ return;
+
+ st->should_run = false;
+
+ /* Single callback invocation for [un]install ? */
+ if (st->cb) {
+ if (st->cb_state < CPUHP_AP_ONLINE) {
+ local_irq_disable();
+ ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+ local_irq_enable();
+ } else {
+ ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
+ }
+ } else {
+ /* Cannot happen .... */
+ BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+
+ /* Regular hotplug work */
+ if (st->state < st->target)
+ ret = cpuhp_ap_online(cpu, st);
+ else if (st->state > st->target)
+ ret = cpuhp_ap_offline(cpu, st);
+ }
+ st->result = ret;
+ complete(&st->done);
+}
+
+/* Invoke a single callback on a remote cpu */
+static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
+ int (*cb)(unsigned int))
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ st->cb_state = state;
+ st->cb = cb;
+ /*
+ * Make sure the above stores are visible before should_run becomes
+ * true. Paired with the mb() above in cpuhp_thread_fun()
+ */
+ smp_mb();
+ st->should_run = true;
+ wake_up_process(st->thread);
+ wait_for_completion(&st->done);
+ return st->result;
+}
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
+{
+ st->result = 0;
+ st->cb = NULL;
+ /*
+ * Make sure the above stores are visible before should_run becomes
+ * true. Paired with the mb() above in cpuhp_thread_fun()
+ */
+ smp_mb();
+ st->should_run = true;
+ wake_up_process(st->thread);
+}
+
+static int cpuhp_kick_ap_work(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ enum cpuhp_state state = st->state;
+
+ trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
+ __cpuhp_kick_ap_work(st);
+ wait_for_completion(&st->done);
+ trace_cpuhp_exit(cpu, st->state, state, st->result);
+ return st->result;
+}
+
+static struct smp_hotplug_thread cpuhp_threads = {
+ .store = &cpuhp_state.thread,
+ .create = &cpuhp_create,
+ .thread_should_run = cpuhp_should_run,
+ .thread_fn = cpuhp_thread_fun,
+ .thread_comm = "cpuhp/%u",
+ .selfparking = true,
+};
+
+void __init cpuhp_threads_init(void)
+{
+ BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
+ kthread_unpark(this_cpu_read(cpuhp_state.thread));
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);
-
void unregister_cpu_notifier(struct notifier_block *nb)
{
cpu_maps_update_begin();
@@ -311,57 +636,60 @@ static inline void check_for_tasks(int dead_cpu)
read_unlock(&tasklist_lock);
}
-struct take_cpu_down_param {
- unsigned long mod;
- void *hcpu;
-};
+static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
+{
+ BUG_ON(cpu_notify(val, cpu));
+}
+
+static int notify_down_prepare(unsigned int cpu)
+{
+ int err, nr_calls = 0;
+
+ err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls);
+ if (err) {
+ nr_calls--;
+ __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL);
+ pr_warn("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
+ }
+ return err;
+}
+
+static int notify_dying(unsigned int cpu)
+{
+ cpu_notify(CPU_DYING, cpu);
+ return 0;
+}
/* Take this CPU down. */
static int take_cpu_down(void *_param)
{
- struct take_cpu_down_param *param = _param;
- int err;
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+ enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
+ int err, cpu = smp_processor_id();
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
- cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Invoke the former CPU_DYING callbacks */
+ for (; st->state > target; st->state--) {
+ struct cpuhp_step *step = cpuhp_ap_states + st->state;
+
+ cpuhp_invoke_callback(cpu, st->state, step->teardown);
+ }
/* Give up timekeeping duties */
tick_handover_do_timer();
/* Park the stopper thread */
- stop_machine_park((long)param->hcpu);
+ stop_machine_park(cpu);
return 0;
}
-/* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu, int tasks_frozen)
+static int takedown_cpu(unsigned int cpu)
{
- int err, nr_calls = 0;
- void *hcpu = (void *)(long)cpu;
- unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
- struct take_cpu_down_param tcd_param = {
- .mod = mod,
- .hcpu = hcpu,
- };
-
- if (num_online_cpus() == 1)
- return -EBUSY;
-
- if (!cpu_online(cpu))
- return -EINVAL;
-
- cpu_hotplug_begin();
-
- err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
- if (err) {
- nr_calls--;
- __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
- pr_warn("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
- goto out_release;
- }
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int err;
/*
* By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -378,6 +706,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
else
synchronize_rcu();
+ /* Park the smpboot threads */
+ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
smpboot_park_threads(cpu);
/*
@@ -389,12 +719,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
- err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+ err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
- cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+ cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
irq_unlock_sparse();
- goto out_release;
+ return err;
}
BUG_ON(cpu_online(cpu));
@@ -405,10 +735,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
*
* Wait for the stop thread to go away.
*/
- while (!per_cpu(cpu_dead_idle, cpu))
- cpu_relax();
- smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
- per_cpu(cpu_dead_idle, cpu) = false;
+ wait_for_completion(&st->done);
+ BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
/* Interrupts are moved away from the dying cpu, reenable alloc/free */
irq_unlock_sparse();
@@ -417,20 +745,104 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
/* This actually kills the CPU. */
__cpu_die(cpu);
- /* CPU is completely dead: tell everyone. Too late to complain. */
tick_cleanup_dead_cpu(cpu);
- cpu_notify_nofail(CPU_DEAD | mod, hcpu);
+ return 0;
+}
+static int notify_dead(unsigned int cpu)
+{
+ cpu_notify_nofail(CPU_DEAD, cpu);
check_for_tasks(cpu);
+ return 0;
+}
+
+static void cpuhp_complete_idle_dead(void *arg)
+{
+ struct cpuhp_cpu_state *st = arg;
+
+ complete(&st->done);
+}
+
+void cpuhp_report_idle_dead(void)
+{
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+
+ BUG_ON(st->state != CPUHP_AP_OFFLINE);
+ rcu_report_dead(smp_processor_id());
+ st->state = CPUHP_AP_IDLE_DEAD;
+ /*
+ * We cannot call complete after rcu_report_dead() so we delegate it
+ * to an online cpu.
+ */
+ smp_call_function_single(cpumask_first(cpu_online_mask),
+ cpuhp_complete_idle_dead, st, 0);
+}
+
+#else
+#define notify_down_prepare NULL
+#define takedown_cpu NULL
+#define notify_dead NULL
+#define notify_dying NULL
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Requires cpu_add_remove_lock to be held */
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
+ enum cpuhp_state target)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int prev_state, ret = 0;
+ bool hasdied = false;
+
+ if (num_online_cpus() == 1)
+ return -EBUSY;
+
+ if (!cpu_present(cpu))
+ return -EINVAL;
+
+ cpu_hotplug_begin();
-out_release:
+ cpuhp_tasks_frozen = tasks_frozen;
+
+ prev_state = st->state;
+ st->target = target;
+ /*
+ * If the current CPU state is in the range of the AP hotplug thread,
+ * then we need to kick the thread.
+ */
+ if (st->state > CPUHP_TEARDOWN_CPU) {
+ ret = cpuhp_kick_ap_work(cpu);
+ /*
+ * The AP side has done the error rollback already. Just
+ * return the error code..
+ */
+ if (ret)
+ goto out;
+
+ /*
+ * We might have stopped still in the range of the AP hotplug
+ * thread. Nothing to do anymore.
+ */
+ if (st->state > CPUHP_TEARDOWN_CPU)
+ goto out;
+ }
+ /*
+ * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
+ * to do the further cleanups.
+ */
+ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+
+ hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
+out:
cpu_hotplug_done();
- if (!err)
- cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
- return err;
+ /* This post dead nonsense must die */
+ if (!ret && hasdied)
+ cpu_notify_nofail(CPU_POST_DEAD, cpu);
+ return ret;
}
-int cpu_down(unsigned int cpu)
+static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
{
int err;
@@ -441,100 +853,131 @@ int cpu_down(unsigned int cpu)
goto out;
}
- err = _cpu_down(cpu, 0);
+ err = _cpu_down(cpu, 0, target);
out:
cpu_maps_update_done();
return err;
}
+int cpu_down(unsigned int cpu)
+{
+ return do_cpu_down(cpu, CPUHP_OFFLINE);
+}
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/
-/*
- * Unpark per-CPU smpboot kthreads at CPU-online time.
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
-static int smpboot_thread_call(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+void notify_cpu_starting(unsigned int cpu)
{
- int cpu = (long)hcpu;
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
- switch (action & ~CPU_TASKS_FROZEN) {
+ while (st->state < target) {
+ struct cpuhp_step *step;
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- smpboot_unpark_threads(cpu);
- break;
-
- default:
- break;
+ st->state++;
+ step = cpuhp_ap_states + st->state;
+ cpuhp_invoke_callback(cpu, st->state, step->startup);
}
-
- return NOTIFY_OK;
}
-static struct notifier_block smpboot_thread_notifier = {
- .notifier_call = smpboot_thread_call,
- .priority = CPU_PRI_SMPBOOT,
-};
-
-void smpboot_thread_init(void)
+/*
+ * Called from the idle task. We need to set active here, so we can kick off
+ * the stopper thread and unpark the smpboot threads. If the target state is
+ * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
+ * cpu further.
+ */
+void cpuhp_online_idle(enum cpuhp_state state)
{
- register_cpu_notifier(&smpboot_thread_notifier);
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+ unsigned int cpu = smp_processor_id();
+
+ /* Happens for the boot cpu */
+ if (state != CPUHP_AP_ONLINE_IDLE)
+ return;
+
+ st->state = CPUHP_AP_ONLINE_IDLE;
+
+ /* The cpu is marked online, set it active now */
+ set_cpu_active(cpu, true);
+ /* Unpark the stopper thread and the hotplug thread of this cpu */
+ stop_machine_unpark(cpu);
+ kthread_unpark(st->thread);
+
+ /* Should we go further up ? */
+ if (st->target > CPUHP_AP_ONLINE_IDLE)
+ __cpuhp_kick_ap_work(st);
+ else
+ complete(&st->done);
}
/* Requires cpu_add_remove_lock to be held */
-static int _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
- int ret, nr_calls = 0;
- void *hcpu = (void *)(long)cpu;
- unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct task_struct *idle;
+ int ret = 0;
cpu_hotplug_begin();
- if (cpu_online(cpu) || !cpu_present(cpu)) {
+ if (!cpu_present(cpu)) {
ret = -EINVAL;
goto out;
}
- idle = idle_thread_get(cpu);
- if (IS_ERR(idle)) {
- ret = PTR_ERR(idle);
- goto out;
- }
-
- ret = smpboot_create_threads(cpu);
- if (ret)
+ /*
+ * The caller of do_cpu_up might have raced with another
+ * caller. Ignore it for now.
+ */
+ if (st->state >= target)
goto out;
- ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
- if (ret) {
- nr_calls--;
- pr_warn("%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
- goto out_notify;
+ if (st->state == CPUHP_OFFLINE) {
+ /* Let it fail before we try to bring the cpu up */
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
}
- /* Arch-specific enabling code. */
- ret = __cpu_up(cpu, idle);
-
- if (ret != 0)
- goto out_notify;
- BUG_ON(!cpu_online(cpu));
+ cpuhp_tasks_frozen = tasks_frozen;
- /* Now call notifier in preparation. */
- cpu_notify(CPU_ONLINE | mod, hcpu);
+ st->target = target;
+ /*
+ * If the current CPU state is in the range of the AP hotplug thread,
+ * then we need to kick the thread once more.
+ */
+ if (st->state > CPUHP_BRINGUP_CPU) {
+ ret = cpuhp_kick_ap_work(cpu);
+ /*
+ * The AP side has done the error rollback already. Just
+ * return the error code..
+ */
+ if (ret)
+ goto out;
+ }
-out_notify:
- if (ret != 0)
- __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ /*
+ * Try to reach the target state. We max out on the BP at
+ * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
+ * responsible for bringing it up to the target state.
+ */
+ target = min((int)target, CPUHP_BRINGUP_CPU);
+ ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target);
out:
cpu_hotplug_done();
-
return ret;
}
-int cpu_up(unsigned int cpu)
+static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
{
int err = 0;
@@ -558,12 +1001,16 @@ int cpu_up(unsigned int cpu)
goto out;
}
- err = _cpu_up(cpu, 0);
-
+ err = _cpu_up(cpu, 0, target);
out:
cpu_maps_update_done();
return err;
}
+
+int cpu_up(unsigned int cpu)
+{
+ return do_cpu_up(cpu, CPUHP_ONLINE);
+}
EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
@@ -586,7 +1033,7 @@ int disable_nonboot_cpus(void)
if (cpu == first_cpu)
continue;
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
- error = _cpu_down(cpu, 1);
+ error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
@@ -636,7 +1083,7 @@ void enable_nonboot_cpus(void)
for_each_cpu(cpu, frozen_cpus) {
trace_suspend_resume(TPS("CPU_ON"), cpu, true);
- error = _cpu_up(cpu, 1);
+ error = _cpu_up(cpu, 1, CPUHP_ONLINE);
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
@@ -709,26 +1156,463 @@ core_initcall(cpu_hotplug_pm_sync_init);
#endif /* CONFIG_PM_SLEEP_SMP */
+#endif /* CONFIG_SMP */
+
+/* Boot processor state steps */
+static struct cpuhp_step cpuhp_bp_states[] = {
+ [CPUHP_OFFLINE] = {
+ .name = "offline",
+ .startup = NULL,
+ .teardown = NULL,
+ },
+#ifdef CONFIG_SMP
+ [CPUHP_CREATE_THREADS]= {
+ .name = "threads:create",
+ .startup = smpboot_create_threads,
+ .teardown = NULL,
+ .cant_stop = true,
+ },
+ /*
+ * Preparatory and dead notifiers. Will be replaced once the notifiers
+ * are converted to states.
+ */
+ [CPUHP_NOTIFY_PREPARE] = {
+ .name = "notify:prepare",
+ .startup = notify_prepare,
+ .teardown = notify_dead,
+ .skip_onerr = true,
+ .cant_stop = true,
+ },
+ /* Kicks the plugged cpu into life */
+ [CPUHP_BRINGUP_CPU] = {
+ .name = "cpu:bringup",
+ .startup = bringup_cpu,
+ .teardown = NULL,
+ .cant_stop = true,
+ },
+ /*
+ * Handled on controll processor until the plugged processor manages
+ * this itself.
+ */
+ [CPUHP_TEARDOWN_CPU] = {
+ .name = "cpu:teardown",
+ .startup = NULL,
+ .teardown = takedown_cpu,
+ .cant_stop = true,
+ },
+#endif
+};
+
+/* Application processor state steps */
+static struct cpuhp_step cpuhp_ap_states[] = {
+#ifdef CONFIG_SMP
+ /* Final state before CPU kills itself */
+ [CPUHP_AP_IDLE_DEAD] = {
+ .name = "idle:dead",
+ },
+ /*
+ * Last state before CPU enters the idle loop to die. Transient state
+ * for synchronization.
+ */
+ [CPUHP_AP_OFFLINE] = {
+ .name = "ap:offline",
+ .cant_stop = true,
+ },
+ /*
+ * Low level startup/teardown notifiers. Run with interrupts
+ * disabled. Will be removed once the notifiers are converted to
+ * states.
+ */
+ [CPUHP_AP_NOTIFY_STARTING] = {
+ .name = "notify:starting",
+ .startup = notify_starting,
+ .teardown = notify_dying,
+ .skip_onerr = true,
+ .cant_stop = true,
+ },
+ /* Entry state on starting. Interrupts enabled from here on. Transient
+ * state for synchronsization */
+ [CPUHP_AP_ONLINE] = {
+ .name = "ap:online",
+ },
+ /* Handle smpboot threads park/unpark */
+ [CPUHP_AP_SMPBOOT_THREADS] = {
+ .name = "smpboot:threads",
+ .startup = smpboot_unpark_threads,
+ .teardown = NULL,
+ },
+ /*
+ * Online/down_prepare notifiers. Will be removed once the notifiers
+ * are converted to states.
+ */
+ [CPUHP_AP_NOTIFY_ONLINE] = {
+ .name = "notify:online",
+ .startup = notify_online,
+ .teardown = notify_down_prepare,
+ },
+#endif
+ /*
+ * The dynamically registered state space is here
+ */
+
+ /* CPU is fully up and running. */
+ [CPUHP_ONLINE] = {
+ .name = "online",
+ .startup = NULL,
+ .teardown = NULL,
+ },
+};
+
+/* Sanity check for callbacks */
+static int cpuhp_cb_check(enum cpuhp_state state)
+{
+ if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
+ return -EINVAL;
+ return 0;
+}
+
+static bool cpuhp_is_ap_state(enum cpuhp_state state)
+{
+ /*
+ * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
+ * purposes as that state is handled explicitely in cpu_down.
+ */
+ return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
+}
+
+static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
+{
+ struct cpuhp_step *sp;
+
+ sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
+ return sp + state;
+}
+
+static void cpuhp_store_callbacks(enum cpuhp_state state,
+ const char *name,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu))
+{
+ /* (Un)Install the callbacks for further cpu hotplug operations */
+ struct cpuhp_step *sp;
+
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(state);
+ sp->startup = startup;
+ sp->teardown = teardown;
+ sp->name = name;
+ mutex_unlock(&cpuhp_state_mutex);
+}
+
+static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
+{
+ return cpuhp_get_step(state)->teardown;
+}
+
+/*
+ * Call the startup/teardown function for a step either on the AP or
+ * on the current CPU.
+ */
+static int cpuhp_issue_call(int cpu, enum cpuhp_state state,
+ int (*cb)(unsigned int), bool bringup)
+{
+ int ret;
+
+ if (!cb)
+ return 0;
+ /*
+ * The non AP bound callbacks can fail on bringup. On teardown
+ * e.g. module removal we crash for now.
+ */
+#ifdef CONFIG_SMP
+ if (cpuhp_is_ap_state(state))
+ ret = cpuhp_invoke_ap_callback(cpu, state, cb);
+ else
+ ret = cpuhp_invoke_callback(cpu, state, cb);
+#else
+ ret = cpuhp_invoke_callback(cpu, state, cb);
+#endif
+ BUG_ON(ret && !bringup);
+ return ret;
+}
+
+/*
+ * Called from __cpuhp_setup_state on a recoverable failure.
+ *
+ * Note: The teardown callbacks for rollback are not allowed to fail!
+ */
+static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
+ int (*teardown)(unsigned int cpu))
+{
+ int cpu;
+
+ if (!teardown)
+ return;
+
+ /* Roll back the already executed steps on the other cpus */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpu >= failedcpu)
+ break;
+
+ /* Did we invoke the startup call on that cpu ? */
+ if (cpustate >= state)
+ cpuhp_issue_call(cpu, state, teardown, false);
+ }
+}
+
+/*
+ * Returns a free for dynamic slot assignment of the Online state. The states
+ * are protected by the cpuhp_slot_states mutex and an empty slot is identified
+ * by having no name assigned.
+ */
+static int cpuhp_reserve_state(enum cpuhp_state state)
+{
+ enum cpuhp_state i;
+
+ mutex_lock(&cpuhp_state_mutex);
+ for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
+ if (cpuhp_ap_states[i].name)
+ continue;
+
+ cpuhp_ap_states[i].name = "Reserved";
+ mutex_unlock(&cpuhp_state_mutex);
+ return i;
+ }
+ mutex_unlock(&cpuhp_state_mutex);
+ WARN(1, "No more dynamic states available for CPU hotplug\n");
+ return -ENOSPC;
+}
+
/**
- * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
- * @cpu: cpu that just started
+ * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state
+ * @state: The state to setup
+ * @invoke: If true, the startup function is invoked for cpus where
+ * cpu state >= @state
+ * @startup: startup callback function
+ * @teardown: teardown callback function
*
- * This function calls the cpu_chain notifiers with CPU_STARTING.
- * It must be called by the arch code on the new cpu, before the new cpu
- * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ * Returns 0 if successful, otherwise a proper error code
*/
-void notify_cpu_starting(unsigned int cpu)
+int __cpuhp_setup_state(enum cpuhp_state state,
+ const char *name, bool invoke,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu))
{
- unsigned long val = CPU_STARTING;
+ int cpu, ret = 0;
+ int dyn_state = 0;
-#ifdef CONFIG_PM_SLEEP_SMP
- if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
- val = CPU_STARTING_FROZEN;
-#endif /* CONFIG_PM_SLEEP_SMP */
- cpu_notify(val, (void *)(long)cpu);
+ if (cpuhp_cb_check(state) || !name)
+ return -EINVAL;
+
+ get_online_cpus();
+
+ /* currently assignments for the ONLINE state are possible */
+ if (state == CPUHP_AP_ONLINE_DYN) {
+ dyn_state = 1;
+ ret = cpuhp_reserve_state(state);
+ if (ret < 0)
+ goto out;
+ state = ret;
+ }
+
+ cpuhp_store_callbacks(state, name, startup, teardown);
+
+ if (!invoke || !startup)
+ goto out;
+
+ /*
+ * Try to call the startup callback for each present cpu
+ * depending on the hotplug state of the cpu.
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate < state)
+ continue;
+
+ ret = cpuhp_issue_call(cpu, state, startup, true);
+ if (ret) {
+ cpuhp_rollback_install(cpu, state, teardown);
+ cpuhp_store_callbacks(state, NULL, NULL, NULL);
+ goto out;
+ }
+ }
+out:
+ put_online_cpus();
+ if (!ret && dyn_state)
+ return state;
+ return ret;
}
+EXPORT_SYMBOL(__cpuhp_setup_state);
-#endif /* CONFIG_SMP */
+/**
+ * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
+ * @state: The state to remove
+ * @invoke: If true, the teardown function is invoked for cpus where
+ * cpu state >= @state
+ *
+ * The teardown callback is currently not allowed to fail. Think
+ * about module removal!
+ */
+void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+{
+ int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state);
+ int cpu;
+
+ BUG_ON(cpuhp_cb_check(state));
+
+ get_online_cpus();
+
+ if (!invoke || !teardown)
+ goto remove;
+
+ /*
+ * Call the teardown callback for each present cpu depending
+ * on the hotplug state of the cpu. This function is not
+ * allowed to fail currently!
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate >= state)
+ cpuhp_issue_call(cpu, state, teardown, false);
+ }
+remove:
+ cpuhp_store_callbacks(state, NULL, NULL, NULL);
+ put_online_cpus();
+}
+EXPORT_SYMBOL(__cpuhp_remove_state);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
+static ssize_t show_cpuhp_state(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->state);
+}
+static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
+
+static ssize_t write_cpuhp_target(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+ struct cpuhp_step *sp;
+ int target, ret;
+
+ ret = kstrtoint(buf, 10, &target);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
+ if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
+ return -EINVAL;
+#else
+ if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
+ return -EINVAL;
+#endif
+
+ ret = lock_device_hotplug_sysfs();
+ if (ret)
+ return ret;
+
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(target);
+ ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
+ mutex_unlock(&cpuhp_state_mutex);
+ if (ret)
+ return ret;
+
+ if (st->state < target)
+ ret = do_cpu_up(dev->id, target);
+ else
+ ret = do_cpu_down(dev->id, target);
+
+ unlock_device_hotplug();
+ return ret ? ret : count;
+}
+
+static ssize_t show_cpuhp_target(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->target);
+}
+static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
+
+static struct attribute *cpuhp_cpu_attrs[] = {
+ &dev_attr_state.attr,
+ &dev_attr_target.attr,
+ NULL
+};
+
+static struct attribute_group cpuhp_cpu_attr_group = {
+ .attrs = cpuhp_cpu_attrs,
+ .name = "hotplug",
+ NULL
+};
+
+static ssize_t show_cpuhp_states(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ ssize_t cur, res = 0;
+ int i;
+
+ mutex_lock(&cpuhp_state_mutex);
+ for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
+ struct cpuhp_step *sp = cpuhp_get_step(i);
+
+ if (sp->name) {
+ cur = sprintf(buf, "%3d: %s\n", i, sp->name);
+ buf += cur;
+ res += cur;
+ }
+ }
+ mutex_unlock(&cpuhp_state_mutex);
+ return res;
+}
+static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
+
+static struct attribute *cpuhp_cpu_root_attrs[] = {
+ &dev_attr_states.attr,
+ NULL
+};
+
+static struct attribute_group cpuhp_cpu_root_attr_group = {
+ .attrs = cpuhp_cpu_root_attrs,
+ .name = "hotplug",
+ NULL
+};
+
+static int __init cpuhp_sysfs_init(void)
+{
+ int cpu, ret;
+
+ ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
+ &cpuhp_cpu_root_attr_group);
+ if (ret)
+ return ret;
+
+ for_each_possible_cpu(cpu) {
+ struct device *dev = get_cpu_device(cpu);
+
+ if (!dev)
+ continue;
+ ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+device_initcall(cpuhp_sysfs_init);
+#endif
/*
* cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -759,71 +1643,55 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
- = CPU_BITS_ALL;
+struct cpumask __cpu_possible_mask __read_mostly
+ = {CPU_BITS_ALL};
#else
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+struct cpumask __cpu_possible_mask __read_mostly;
#endif
-const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
-EXPORT_SYMBOL(cpu_possible_mask);
+EXPORT_SYMBOL(__cpu_possible_mask);
-static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
-EXPORT_SYMBOL(cpu_online_mask);
+struct cpumask __cpu_online_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_online_mask);
-static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
-EXPORT_SYMBOL(cpu_present_mask);
+struct cpumask __cpu_present_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_present_mask);
-static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
-EXPORT_SYMBOL(cpu_active_mask);
+struct cpumask __cpu_active_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_active_mask);
-void set_cpu_possible(unsigned int cpu, bool possible)
-{
- if (possible)
- cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
-}
-
-void set_cpu_present(unsigned int cpu, bool present)
+void init_cpu_present(const struct cpumask *src)
{
- if (present)
- cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
+ cpumask_copy(&__cpu_present_mask, src);
}
-void set_cpu_online(unsigned int cpu, bool online)
+void init_cpu_possible(const struct cpumask *src)
{
- if (online) {
- cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- } else {
- cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
- }
+ cpumask_copy(&__cpu_possible_mask, src);
}
-void set_cpu_active(unsigned int cpu, bool active)
+void init_cpu_online(const struct cpumask *src)
{
- if (active)
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
+ cpumask_copy(&__cpu_online_mask, src);
}
-void init_cpu_present(const struct cpumask *src)
+/*
+ * Activate the first processor.
+ */
+void __init boot_cpu_init(void)
{
- cpumask_copy(to_cpumask(cpu_present_bits), src);
-}
+ int cpu = smp_processor_id();
-void init_cpu_possible(const struct cpumask *src)
-{
- cpumask_copy(to_cpumask(cpu_possible_bits), src);
+ /* Mark the boot cpu "present", "online" etc for SMP and UP case */
+ set_cpu_online(cpu, true);
+ set_cpu_active(cpu, true);
+ set_cpu_present(cpu, true);
+ set_cpu_possible(cpu, true);
}
-void init_cpu_online(const struct cpumask *src)
+/*
+ * Must be called _AFTER_ setting up the per_cpu areas
+ */
+void __init boot_cpu_state_init(void)
{
- cpumask_copy(to_cpumask(cpu_online_bits), src);
+ per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02a8ea5c9963..90899837ea78 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -51,6 +51,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
+#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
@@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
struct fmeter {
int cnt; /* unprocessed events count */
int val; /* most recent output value */
- time_t time; /* clock (secs) when val computed */
+ time64_t time; /* clock (secs) when val computed */
spinlock_t lock; /* guards read or write of above */
};
@@ -286,6 +287,8 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
/*
* CPU / memory hotplug is handled asynchronously.
*/
@@ -971,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
/*
- * cpuset_migrate_mm
- *
- * Migrate memory region from one set of nodes to another.
- *
- * Temporarilly set tasks mems_allowed to target nodes of migration,
- * so that the migration code can allocate pages on these nodes.
- *
- * While the mm_struct we are migrating is typically from some
- * other task, the task_struct mems_allowed that we are hacking
- * is for our current task, which must allocate new pages for that
- * migrating memory region.
+ * Migrate memory region from one set of nodes to another. This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management. All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
*/
+struct cpuset_migrate_mm_work {
+ struct work_struct work;
+ struct mm_struct *mm;
+ nodemask_t from;
+ nodemask_t to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+ struct cpuset_migrate_mm_work *mwork =
+ container_of(work, struct cpuset_migrate_mm_work, work);
+
+ /* on a wq worker, no need to worry about %current's mems_allowed */
+ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+ mmput(mwork->mm);
+ kfree(mwork);
+}
+
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
- struct task_struct *tsk = current;
-
- tsk->mems_allowed = *to;
+ struct cpuset_migrate_mm_work *mwork;
- do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+ mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+ if (mwork) {
+ mwork->mm = mm;
+ mwork->from = *from;
+ mwork->to = *to;
+ INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+ queue_work(cpuset_migrate_mm_wq, &mwork->work);
+ } else {
+ mmput(mm);
+ }
+}
- rcu_read_lock();
- guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
- rcu_read_unlock();
+void cpuset_post_attach_flush(void)
+{
+ flush_workqueue(cpuset_migrate_mm_wq);
}
/*
@@ -1096,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
- mmput(mm);
+ else
+ mmput(mm);
}
css_task_iter_end(&it);
@@ -1374,7 +1398,7 @@ out:
*/
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
-#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
#define FM_SCALE 1000 /* faux fixed point scale */
@@ -1390,8 +1414,11 @@ static void fmeter_init(struct fmeter *fmp)
/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
- time_t now = get_seconds();
- time_t ticks = now - fmp->time;
+ time64_t now;
+ u32 ticks;
+
+ now = ktime_get_seconds();
+ ticks = now - fmp->time;
if (ticks == 0)
return;
@@ -1541,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs)) {
+ if (is_memory_migrate(cs))
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- }
- mmput(mm);
+ else
+ mmput(mm);
}
}
@@ -1710,6 +1737,7 @@ out_unlock:
mutex_unlock(&cpuset_mutex);
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
+ flush_workqueue(cpuset_migrate_mm_wq);
return retval ?: nbytes;
}
@@ -2061,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach,
.bind = cpuset_bind,
.legacy_cftypes = files,
- .early_init = 1,
+ .early_init = true,
};
/**
@@ -2355,6 +2383,9 @@ void __init cpuset_init_smp(void)
top_cpuset.effective_mems = node_states[N_MEMORY];
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+ cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+ BUG_ON(!cpuset_migrate_mm_wq);
}
/**
diff --git a/kernel/cred.c b/kernel/cred.c
index 71179a09c1d6..0c0cd8a62285 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds);
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
- cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
}
/**
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index e1dbf4a2c69e..90ff129c88a2 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
if (!bp->bp_type) {
kdb_printf("Software breakpoints are unavailable.\n"
- " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " Boot the kernel with rodata=off\n"
" OR use hw breaks: help bph\n");
}
-#endif
return 1;
}
return 0;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4121345498e0..2a20c0dfdafc 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv)
continue;
kdb_printf("%-20s%8u 0x%p ", mod->name,
- mod->core_size, (void *)mod);
+ mod->core_layout.size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
kdb_printf("%4d ", module_refcount(mod));
#endif
@@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
- kdb_printf(" 0x%p", mod->module_core);
+ kdb_printf(" 0x%p", mod->core_layout.base);
#ifdef CONFIG_MODULE_UNLOAD
{
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ef90b04d783f..435c14a45118 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
- delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 9c418002b8c1..343c22f5e867 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -159,15 +159,24 @@ put_callchain_entry(int rctx)
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
- int rctx;
- struct perf_callchain_entry *entry;
-
- int kernel = !event->attr.exclude_callchain_kernel;
- int user = !event->attr.exclude_callchain_user;
+ bool kernel = !event->attr.exclude_callchain_kernel;
+ bool user = !event->attr.exclude_callchain_user;
+ /* Disallow cross-task user callchains. */
+ bool crosstask = event->ctx->task && event->ctx->task != current;
if (!kernel && !user)
return NULL;
+ return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+}
+
+struct perf_callchain_entry *
+get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+ bool crosstask, bool add_mark)
+{
+ struct perf_callchain_entry *entry;
+ int rctx;
+
entry = get_callchain_entry(&rctx);
if (rctx == -1)
return NULL;
@@ -175,10 +184,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!entry)
goto exit_put;
- entry->nr = 0;
+ entry->nr = init_nr;
if (kernel && !user_mode(regs)) {
- perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+ if (add_mark)
+ perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
perf_callchain_kernel(entry, regs);
}
@@ -191,13 +201,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
}
if (regs) {
- /*
- * Disallow cross-task user callchains.
- */
- if (event->ctx->task && event->ctx->task != current)
+ if (crosstask)
goto exit_put;
- perf_callchain_store(entry, PERF_CONTEXT_USER);
+ if (add_mark)
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
perf_callchain_user(entry, regs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cfc227ccfceb..712570dddacd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@
#include <asm/irq_regs.h>
-static struct workqueue_struct *perf_wq;
-
typedef int (*remote_function_f)(void *);
struct remote_function_call {
@@ -66,8 +64,17 @@ static void remote_function(void *data)
struct task_struct *p = tfc->p;
if (p) {
- tfc->ret = -EAGAIN;
- if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+ /* -EAGAIN */
+ if (task_cpu(p) != smp_processor_id())
+ return;
+
+ /*
+ * Now that we're on right CPU with IRQs disabled, we can test
+ * if we hit the right task without races.
+ */
+
+ tfc->ret = -ESRCH; /* No such (running) process */
+ if (p != current)
return;
}
@@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
.p = p,
.func = func,
.info = info,
- .ret = -ESRCH, /* No such (running) process */
+ .ret = -EAGAIN,
};
+ int ret;
- if (task_curr(p))
- smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+ do {
+ ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+ if (!ret)
+ ret = data.ret;
+ } while (ret == -EAGAIN);
- return data.ret;
+ return ret;
}
/**
@@ -126,11 +137,168 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-#define EVENT_OWNER_KERNEL ((void *) -1)
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx)
+ raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (ctx)
+ raw_spin_unlock(&ctx->lock);
+ raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
+#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
{
- return event->owner == EVENT_OWNER_KERNEL;
+ return READ_ONCE(event->owner) == TASK_TOMBSTONE;
+}
+
+/*
+ * On task ctx scheduling...
+ *
+ * When !ctx->nr_events a task context will not be scheduled. This means
+ * we can disable the scheduler hooks (for performance) without leaving
+ * pending task ctx state.
+ *
+ * This however results in two special cases:
+ *
+ * - removing the last event from a task ctx; this is relatively straight
+ * forward and is done in __perf_remove_from_context.
+ *
+ * - adding the first event to a task ctx; this is tricky because we cannot
+ * rely on ctx->is_active and therefore cannot use event_function_call().
+ * See perf_install_in_context().
+ *
+ * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
+ */
+
+typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
+ struct perf_event_context *, void *);
+
+struct event_function_struct {
+ struct perf_event *event;
+ event_f func;
+ void *data;
+};
+
+static int event_function(void *info)
+{
+ struct event_function_struct *efs = info;
+ struct perf_event *event = efs->event;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+ int ret = 0;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ /*
+ * Since we do the IPI call without holding ctx->lock things can have
+ * changed, double check we hit the task we set out to hit.
+ */
+ if (ctx->task) {
+ if (ctx->task != current) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+
+ /*
+ * We only use event_function_call() on established contexts,
+ * and event_function() is only ever called when active (or
+ * rather, we'll have bailed in task_function_call() or the
+ * above ctx->task != current test), therefore we must have
+ * ctx->is_active here.
+ */
+ WARN_ON_ONCE(!ctx->is_active);
+ /*
+ * And since we have ctx->is_active, cpuctx->task_ctx must
+ * match.
+ */
+ WARN_ON_ONCE(task_ctx != ctx);
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ efs->func(event, cpuctx, ctx, efs->data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
+
+ return ret;
+}
+
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };
+
+ int ret = event_function(&efs);
+ WARN_ON_ONCE(ret);
+}
+
+static void event_function_call(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };
+
+ if (!event->parent) {
+ /*
+ * If this is a !child event, we must hold ctx::mutex to
+ * stabilize the the event->ctx relation. See
+ * perf_event_ctx_lock().
+ */
+ lockdep_assert_held(&ctx->mutex);
+ }
+
+ if (!task) {
+ cpu_function_call(event->cpu, event_function, &efs);
+ return;
+ }
+
+ if (task == TASK_TOMBSTONE)
+ return;
+
+again:
+ if (!task_function_call(task, event_function, &efs))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ func(event, NULL, ctx, data);
+ raw_spin_unlock_irq(&ctx->lock);
}
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
@@ -148,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event)
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
+ EVENT_TIME = 0x4,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
@@ -155,7 +324,13 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
@@ -337,28 +512,6 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock();
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
-static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- raw_spin_lock(&cpuctx->ctx.lock);
- if (ctx)
- raw_spin_lock(&ctx->lock);
-}
-
-static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
-}
-
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -548,13 +701,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
* we are holding the rcu lock
*/
cgrp1 = perf_cgroup_from_task(task, NULL);
-
- /*
- * next is NULL when called from perf_event_enable_on_exec()
- * that will systematically cause a cgroup_switch()
- */
- if (next)
- cgrp2 = perf_cgroup_from_task(next, NULL);
+ cgrp2 = perf_cgroup_from_task(next, NULL);
/*
* only schedule out current cgroup events if we know
@@ -580,8 +727,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
* we are holding the rcu lock
*/
cgrp1 = perf_cgroup_from_task(task, NULL);
-
- /* prev can never be NULL */
cgrp2 = perf_cgroup_from_task(prev, NULL);
/*
@@ -886,7 +1031,7 @@ static void put_ctx(struct perf_event_context *ctx)
if (atomic_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
- if (ctx->task)
+ if (ctx->task && ctx->task != TASK_TOMBSTONE)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
}
@@ -903,9 +1048,8 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex nests and those are:
*
* - perf_event_exit_task_context() [ child , 0 ]
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event() [ parent, 1 ]
+ * perf_event_exit_event()
+ * put_event() [ parent, 1 ]
*
* - perf_event_init_context() [ parent, 0 ]
* inherit_task_group()
@@ -948,8 +1092,8 @@ static void put_ctx(struct perf_event_context *ctx)
* Lock order:
* task_struct::perf_event_mutex
* perf_event_context::mutex
- * perf_event_context::lock
* perf_event::child_mutex;
+ * perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
*/
@@ -1047,6 +1191,7 @@ static u64 primary_event_id(struct perf_event *event)
/*
* Get the perf_event_context for a task and lock it.
+ *
* This has to cope with with the fact that until it is locked,
* the context could get moved to another task.
*/
@@ -1087,9 +1232,12 @@ retry:
goto retry;
}
- if (!atomic_inc_not_zero(&ctx->refcount)) {
+ if (ctx->task == TASK_TOMBSTONE ||
+ !atomic_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
+ } else {
+ WARN_ON_ONCE(ctx->task != task);
}
}
rcu_read_unlock();
@@ -1149,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event)
/*
* Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
*/
static void update_event_times(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
u64 run_end;
+ lockdep_assert_held(&ctx->lock);
+
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
+
/*
* in cgroup mode, time_enabled represents
* the time the event was enabled AND active
@@ -1215,6 +1365,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+ lockdep_assert_held(&ctx->lock);
+
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
@@ -1417,11 +1569,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event)) {
ctx->nr_cgroups--;
+ /*
+ * Because cgroup events are always per-cpu events, this will
+ * always be called from the right CPU.
+ */
cpuctx = __get_cpu_context(ctx);
/*
- * if there are no more cgroup events
- * then cler cgrp to avoid stale pointer
- * in update_cgrp_time_from_cpuctx()
+ * If there are no more cgroup events then clear cgrp to avoid
+ * stale pointer in update_cgrp_time_from_cpuctx().
*/
if (!ctx->nr_cgroups)
cpuctx->cgrp = NULL;
@@ -1499,45 +1654,11 @@ out:
perf_event__header_size(tmp);
}
-/*
- * User event without the task.
- */
static bool is_orphaned_event(struct perf_event *event)
{
- return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
- return is_orphaned_event(event->parent);
+ return event->state == PERF_EVENT_STATE_DEAD;
}
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
- if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
- return;
-
- if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
- get_ctx(ctx);
- ctx->orphans_remove_sched = true;
- }
-}
-
-static int __init perf_workqueue_init(void)
-{
- perf_wq = create_singlethread_workqueue("perf");
- WARN(!perf_wq, "failed to create perf workqueue\n");
- return perf_wq ? 0 : -1;
-}
-
-core_initcall(perf_workqueue_init);
-
static inline int pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
@@ -1580,14 +1701,14 @@ event_sched_out(struct perf_event *event,
perf_pmu_disable(event->pmu);
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
- event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -1598,9 +1719,6 @@ event_sched_out(struct perf_event *event,
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
perf_pmu_enable(event->pmu);
}
@@ -1624,10 +1742,7 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
-struct remove_event {
- struct perf_event *event;
- bool detach_group;
-};
+#define DETACH_GROUP 0x01UL
/*
* Cross CPU call to remove a performance event
@@ -1635,34 +1750,31 @@ struct remove_event {
* We disable the event on the hardware level first. After that we
* remove it from the context list.
*/
-static int __perf_remove_from_context(void *info)
+static void
+__perf_remove_from_context(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct remove_event *re = info;
- struct perf_event *event = re->event;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ unsigned long flags = (unsigned long)info;
- raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
- if (re->detach_group)
+ if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
- if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+
+ if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
- cpuctx->task_ctx = NULL;
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ cpuctx->task_ctx = NULL;
+ }
}
- raw_spin_unlock(&ctx->lock);
-
- return 0;
}
-
/*
* Remove the event from a task's (or a CPU's) list of events.
*
- * CPU events are removed with a smp call. For task events we only
- * call when the task is on a CPU.
- *
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
* remains valid. This is OK when called from perf_release since
@@ -1670,96 +1782,32 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
- struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
- struct remove_event re = {
- .event = event,
- .detach_group = detach_group,
- };
-
- lockdep_assert_held(&ctx->mutex);
-
- if (!task) {
- /*
- * Per cpu events are removed via an smp call. The removal can
- * fail if the CPU is currently offline, but in that case we
- * already called __perf_remove_from_context from
- * perf_event_exit_cpu.
- */
- cpu_function_call(event->cpu, __perf_remove_from_context, &re);
- return;
- }
-
-retry:
- if (!task_function_call(task, __perf_remove_from_context, &re))
- return;
+ lockdep_assert_held(&event->ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * If we failed to find a running task, but find the context active now
- * that we've acquired the ctx->lock, retry.
- */
- if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
- /*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
- */
- task = ctx->task;
- goto retry;
- }
-
- /*
- * Since the task isn't running, its safe to remove the event, us
- * holding the ctx->lock ensures the task won't get scheduled in.
- */
- if (detach_group)
- perf_group_detach(event);
- list_del_event(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
}
/*
* Cross CPU call to disable a performance event
*/
-int __perf_event_disable(void *info)
+static void __perf_event_disable(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- /*
- * If this is a per-task event, need to check whether this
- * event's task is the current task on this cpu.
- *
- * Can trigger due to concurrent perf_event_context_sched_out()
- * flipping contexts around.
- */
- if (ctx->task && cpuctx->task_ctx != ctx)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
-
- /*
- * If the event is on, turn it off.
- * If it is in error state, leave it in error state.
- */
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
- if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
- else
- event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
- }
-
- raw_spin_unlock(&ctx->lock);
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ return;
- return 0;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ update_group_times(event);
+ if (event == event->group_leader)
+ group_sched_out(event, cpuctx, ctx);
+ else
+ event_sched_out(event, cpuctx, ctx);
+ event->state = PERF_EVENT_STATE_OFF;
}
/*
@@ -1770,7 +1818,8 @@ int __perf_event_disable(void *info)
* remains valid. This condition is satisifed when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
- * goes to exit will block in sync_child_event.
+ * goes to exit will block in perf_event_exit_event().
+ *
* When called from perf_pending_event it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
@@ -1778,43 +1827,20 @@ int __perf_event_disable(void *info)
static void _perf_event_disable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
-
- if (!task) {
- /*
- * Disable the event on the cpu that it's on
- */
- cpu_function_call(event->cpu, __perf_event_disable, event);
- return;
- }
-
-retry:
- if (!task_function_call(task, __perf_event_disable, event))
- return;
raw_spin_lock_irq(&ctx->lock);
- /*
- * If the event is still active, we need to retry the cross-call.
- */
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (event->state <= PERF_EVENT_STATE_OFF) {
raw_spin_unlock_irq(&ctx->lock);
- /*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
- */
- task = ctx->task;
- goto retry;
- }
-
- /*
- * Since we have the lock this context can't be scheduled
- * in, so we can change the state safely.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_OFF;
+ return;
}
raw_spin_unlock_irq(&ctx->lock);
+
+ event_function_call(event, __perf_event_disable, NULL);
+}
+
+void perf_event_disable_local(struct perf_event *event)
+{
+ event_function_local(event, __perf_event_disable, NULL);
}
/*
@@ -1927,9 +1953,6 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
out:
perf_pmu_enable(event->pmu);
@@ -2048,13 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event,
event->tstamp_stopped = tstamp;
}
-static void task_ctx_sched_out(struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
struct task_struct *task)
@@ -2067,10 +2104,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}
+static void ctx_resched(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *task_ctx)
+{
+ perf_pmu_disable(cpuctx->ctx.pmu);
+ if (task_ctx)
+ task_ctx_sched_out(cpuctx, task_ctx);
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_pmu_enable(cpuctx->ctx.pmu);
+}
+
/*
* Cross CPU call to install and enable a performance event
*
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
*/
static int __perf_install_in_context(void *info)
{
@@ -2078,72 +2127,59 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- struct task_struct *task = current;
-
- perf_ctx_lock(cpuctx, task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
- /*
- * If there was an active task_ctx schedule it out.
- */
- if (task_ctx)
- task_ctx_sched_out(task_ctx);
+ bool activate = true;
+ int ret = 0;
- /*
- * If the context we're installing events in is not the
- * active task_ctx, flip them.
- */
- if (ctx->task && task_ctx != ctx) {
- if (task_ctx)
- raw_spin_unlock(&task_ctx->lock);
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx->task) {
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- }
- if (task_ctx) {
- cpuctx->task_ctx = task_ctx;
- task = task_ctx->task;
- }
-
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /* If we're on the wrong CPU, try again */
+ if (task_cpu(ctx->task) != smp_processor_id()) {
+ ret = -ESRCH;
+ goto unlock;
+ }
- update_context_time(ctx);
- /*
- * update cgrp time only if current cgrp
- * matches event->cgrp. Must be done before
- * calling add_event_to_ctx()
- */
- update_cgrp_time_from_event(event);
+ /*
+ * If we're on the right CPU, see if the task we target is
+ * current, if not we don't have to activate the ctx, a future
+ * context switch will do that for us.
+ */
+ if (ctx->task != current)
+ activate = false;
+ else
+ WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
- add_event_to_ctx(event, ctx);
+ } else if (task_ctx) {
+ raw_spin_lock(&task_ctx->lock);
+ }
- /*
- * Schedule everything back in
- */
- perf_event_sched_in(cpuctx, task_ctx, task);
+ if (activate) {
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ add_event_to_ctx(event, ctx);
+ ctx_resched(cpuctx, task_ctx);
+ } else {
+ add_event_to_ctx(event, ctx);
+ }
- perf_pmu_enable(cpuctx->ctx.pmu);
+unlock:
perf_ctx_unlock(cpuctx, task_ctx);
- return 0;
+ return ret;
}
/*
- * Attach a performance event to a context
+ * Attach a performance event to a context.
*
- * First we add the event to the list with the hardware enable bit
- * in event->hw_config cleared.
- *
- * If the event is attached to a task which is on a CPU we use a smp
- * call to enable it in the task context. The task might have been
- * scheduled away, but we check this in the smp call again.
+ * Very similar to event_function_call, see comment there.
*/
static void
perf_install_in_context(struct perf_event_context *ctx,
struct perf_event *event,
int cpu)
{
- struct task_struct *task = ctx->task;
+ struct task_struct *task = READ_ONCE(ctx->task);
lockdep_assert_held(&ctx->mutex);
@@ -2152,39 +2188,45 @@ perf_install_in_context(struct perf_event_context *ctx,
event->cpu = cpu;
if (!task) {
- /*
- * Per cpu events are installed via an smp call and
- * the install is always successful.
- */
cpu_function_call(cpu, __perf_install_in_context, event);
return;
}
-retry:
- if (!task_function_call(task, __perf_install_in_context, event))
+ /*
+ * Should not happen, we validate the ctx is still alive before calling.
+ */
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
return;
- raw_spin_lock_irq(&ctx->lock);
/*
- * If we failed to find a running task, but find the context active now
- * that we've acquired the ctx->lock, retry.
+ * Installing events is tricky because we cannot rely on ctx->is_active
+ * to be set in case this is the nr_events 0 -> 1 transition.
*/
- if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+again:
+ /*
+ * Cannot use task_function_call() because we need to run on the task's
+ * CPU regardless of whether its current or not.
+ */
+ if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ task = ctx->task;
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
/*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
+ * Cannot happen because we already checked above (which also
+ * cannot happen), and we hold ctx->mutex, which serializes us
+ * against perf_event_exit_task_context().
*/
- task = ctx->task;
- goto retry;
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
}
-
+ raw_spin_unlock_irq(&ctx->lock);
/*
- * Since the task isn't running, its safe to add the event, us holding
- * the ctx->lock ensures the task won't get scheduled in.
+ * Since !ctx->is_active doesn't mean anything, we must IPI
+ * unconditionally.
*/
- add_event_to_ctx(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
+ goto again;
}
/*
@@ -2211,80 +2253,47 @@ static void __perf_event_mark_enabled(struct perf_event *event)
/*
* Cross CPU call to enable a performance event
*/
-static int __perf_event_enable(void *info)
+static void __perf_event_enable(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
struct perf_event *leader = event->group_leader;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- int err;
+ struct perf_event_context *task_ctx;
- /*
- * There's a time window between 'ctx->is_active' check
- * in perf_event_enable function and this place having:
- * - IRQs on
- * - ctx->lock unlocked
- *
- * where the task could be killed and 'ctx' deactivated
- * by perf_event_exit_task.
- */
- if (!ctx->is_active)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
- update_context_time(ctx);
-
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
- goto unlock;
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state <= PERF_EVENT_STATE_ERROR)
+ return;
- /*
- * set current task's cgroup time reference point
- */
- perf_cgroup_set_timestamp(current, ctx);
+ if (ctx->is_active)
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
__perf_event_mark_enabled(event);
+ if (!ctx->is_active)
+ return;
+
if (!event_filter_match(event)) {
if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
- goto unlock;
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ return;
}
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
- goto unlock;
-
- if (!group_can_go_on(event, cpuctx, 1)) {
- err = -EEXIST;
- } else {
- if (event == leader)
- err = group_sched_in(event, cpuctx, ctx);
- else
- err = event_sched_in(event, cpuctx, ctx);
- }
-
- if (err) {
- /*
- * If this event can't go on and it's part of a
- * group, then the whole group has to come off.
- */
- if (leader != event) {
- group_sched_out(leader, cpuctx, ctx);
- perf_mux_hrtimer_restart(cpuctx);
- }
- if (leader->attr.pinned) {
- update_group_times(leader);
- leader->state = PERF_EVENT_STATE_ERROR;
- }
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ return;
}
-unlock:
- raw_spin_unlock(&ctx->lock);
+ task_ctx = cpuctx->task_ctx;
+ if (ctx->task)
+ WARN_ON_ONCE(task_ctx != ctx);
- return 0;
+ ctx_resched(cpuctx, task_ctx);
}
/*
@@ -2299,58 +2308,26 @@ unlock:
static void _perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
- if (!task) {
- /*
- * Enable the event on the cpu that it's on
- */
- cpu_function_call(event->cpu, __perf_event_enable, event);
+ raw_spin_lock_irq(&ctx->lock);
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state < PERF_EVENT_STATE_ERROR) {
+ raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
- goto out;
-
/*
* If the event is in error state, clear that first.
- * That way, if we see the event in error state below, we
- * know that it has gone back into error state, as distinct
- * from the task having been scheduled away before the
- * cross-call arrived.
+ *
+ * That way, if we see the event in error state below, we know that it
+ * has gone back into error state, as distinct from the task having
+ * been scheduled away before the cross-call arrived.
*/
if (event->state == PERF_EVENT_STATE_ERROR)
event->state = PERF_EVENT_STATE_OFF;
-
-retry:
- if (!ctx->is_active) {
- __perf_event_mark_enabled(event);
- goto out;
- }
-
raw_spin_unlock_irq(&ctx->lock);
- if (!task_function_call(task, __perf_event_enable, event))
- return;
-
- raw_spin_lock_irq(&ctx->lock);
-
- /*
- * If the context is active and the event is still off,
- * we need to retry the cross-call.
- */
- if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
- /*
- * task could have been flipped by a concurrent
- * perf_event_context_sched_out()
- */
- task = ctx->task;
- goto retry;
- }
-
-out:
- raw_spin_unlock_irq(&ctx->lock);
+ event_function_call(event, __perf_event_enable, NULL);
}
/*
@@ -2400,25 +2377,49 @@ static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
- struct perf_event *event;
int is_active = ctx->is_active;
+ struct perf_event *event;
- ctx->is_active &= ~event_type;
- if (likely(!ctx->nr_events))
+ lockdep_assert_held(&ctx->lock);
+
+ if (likely(!ctx->nr_events)) {
+ /*
+ * See __perf_remove_from_context().
+ */
+ WARN_ON_ONCE(ctx->is_active);
+ if (ctx->task)
+ WARN_ON_ONCE(cpuctx->task_ctx);
return;
+ }
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
- if (!ctx->nr_active)
+ ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!ctx->is_active)
+ cpuctx->task_ctx = NULL;
+ }
+
+ is_active ^= ctx->is_active; /* changed bits */
+
+ if (is_active & EVENT_TIME) {
+ /* update (and stop) ctx time */
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ }
+
+ if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
perf_pmu_disable(ctx->pmu);
- if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+ if (is_active & EVENT_PINNED) {
list_for_each_entry(event, &ctx->pinned_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
- if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+ if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry(event, &ctx->flexible_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
@@ -2576,17 +2577,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- /*
- * XXX do we need a memory barrier of sorts
- * wrt to rcu_dereference() of perf_event_ctxp
- */
- task->perf_event_ctxp[ctxn] = next_ctx;
- next->perf_event_ctxp[ctxn] = ctx;
- ctx->task = next;
- next_ctx->task = task;
+ WRITE_ONCE(ctx->task, next);
+ WRITE_ONCE(next_ctx->task, task);
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ /*
+ * RCU_INIT_POINTER here is safe because we've not
+ * modified the ctx and the above modification of
+ * ctx->task and ctx->task_ctx_data are immaterial
+ * since those values are always verified under
+ * ctx->lock which we're now holding.
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
do_switch = 0;
perf_event_sync_stat(ctx, next_ctx);
@@ -2599,8 +2604,7 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
+ task_ctx_sched_out(cpuctx, ctx);
raw_spin_unlock(&ctx->lock);
}
}
@@ -2695,20 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
perf_cgroup_sched_out(task, next);
}
-static void task_ctx_sched_out(struct perf_event_context *ctx)
-{
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- if (!cpuctx->task_ctx)
- return;
-
- if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
- return;
-
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
-}
-
/*
* Called with IRQs disabled
*/
@@ -2783,25 +2773,40 @@ ctx_sched_in(struct perf_event_context *ctx,
enum event_type_t event_type,
struct task_struct *task)
{
- u64 now;
int is_active = ctx->is_active;
+ u64 now;
+
+ lockdep_assert_held(&ctx->lock);
- ctx->is_active |= event_type;
if (likely(!ctx->nr_events))
return;
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
+ ctx->is_active |= (event_type | EVENT_TIME);
+ if (ctx->task) {
+ if (!is_active)
+ cpuctx->task_ctx = ctx;
+ else
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ }
+
+ is_active ^= ctx->is_active; /* changed bits */
+
+ if (is_active & EVENT_TIME) {
+ /* start ctx time */
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, ctx);
+ }
+
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+ if (is_active & EVENT_PINNED)
ctx_pinned_sched_in(ctx, cpuctx);
/* Then walk through the lower prio flexible groups */
- if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+ if (is_active & EVENT_FLEXIBLE)
ctx_flexible_sched_in(ctx, cpuctx);
}
@@ -2831,12 +2836,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* cpu flexible, task flexible.
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-
- if (ctx->nr_events)
- cpuctx->task_ctx = ctx;
-
- perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
-
+ perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
}
@@ -2858,6 +2858,16 @@ void __perf_event_task_sched_in(struct task_struct *prev,
struct perf_event_context *ctx;
int ctxn;
+ /*
+ * If cgroup events exist on this CPU, then we need to check if we have
+ * to switch in PMU state; cgroup event are system-wide mode only.
+ *
+ * Since cgroup events are CPU events, we must schedule these in before
+ * we schedule in the task events.
+ */
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+ perf_cgroup_sched_in(prev, task);
+
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (likely(!ctx))
@@ -2865,13 +2875,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
perf_event_context_sched_in(ctx, task);
}
- /*
- * if cgroup events exist on this CPU, then we need
- * to check if we have to switch in PMU state.
- * cgroup event are system-wide mode only
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -3109,17 +3112,6 @@ done:
return rotate;
}
-#ifdef CONFIG_NO_HZ_FULL
-bool perf_event_can_stop_tick(void)
-{
- if (atomic_read(&nr_freq_events) ||
- __this_cpu_read(perf_throttled_count))
- return false;
- else
- return true;
-}
-#endif
-
void perf_event_task_tick(void)
{
struct list_head *head = this_cpu_ptr(&active_ctx_list);
@@ -3130,6 +3122,7 @@ void perf_event_task_tick(void)
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
+ tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
perf_adjust_freq_unthr_context(ctx, throttled);
@@ -3157,46 +3150,31 @@ static int event_enable_on_exec(struct perf_event *event,
static void perf_event_enable_on_exec(int ctxn)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_cpu_context *cpuctx;
struct perf_event *event;
unsigned long flags;
int enabled = 0;
- int ret;
local_irq_save(flags);
ctx = current->perf_event_ctxp[ctxn];
if (!ctx || !ctx->nr_events)
goto out;
- /*
- * We must ctxsw out cgroup events to avoid conflict
- * when invoking perf_task_event_sched_in() later on
- * in this function. Otherwise we end up trying to
- * ctxswin cgroup events which are already scheduled
- * in.
- */
- perf_cgroup_sched_out(current, NULL);
-
- raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(ctx);
-
- list_for_each_entry(event, &ctx->event_list, event_entry) {
- ret = event_enable_on_exec(event, ctx);
- if (ret)
- enabled = 1;
- }
+ cpuctx = __get_cpu_context(ctx);
+ perf_ctx_lock(cpuctx, ctx);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ list_for_each_entry(event, &ctx->event_list, event_entry)
+ enabled |= event_enable_on_exec(event, ctx);
/*
- * Unclone this context if we enabled any event.
+ * Unclone and reschedule this context if we enabled any event.
*/
- if (enabled)
+ if (enabled) {
clone_ctx = unclone_ctx(ctx);
+ ctx_resched(cpuctx, ctx);
+ }
+ perf_ctx_unlock(cpuctx, ctx);
- raw_spin_unlock(&ctx->lock);
-
- /*
- * Also calls ctxswin for cgroup events, if any:
- */
- perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
@@ -3392,7 +3370,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
- INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
@@ -3434,7 +3411,7 @@ find_lively_task_by_vpid(pid_t vpid)
/* Reuse ptrace permission checks for now. */
err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
goto errout;
return task;
@@ -3577,13 +3554,37 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}
+#ifdef CONFIG_NO_HZ_FULL
+static DEFINE_SPINLOCK(nr_freq_lock);
+#endif
+
+static void unaccount_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ spin_lock(&nr_freq_lock);
+ if (atomic_dec_and_test(&nr_freq_events))
+ tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
+ spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void unaccount_freq_event(void)
+{
+ if (tick_nohz_full_enabled())
+ unaccount_freq_event_nohz();
+ else
+ atomic_dec(&nr_freq_events);
+}
+
static void unaccount_event(struct perf_event *event)
{
+ bool dec = false;
+
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -3591,19 +3592,32 @@ static void unaccount_event(struct perf_event *event)
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
- atomic_dec(&nr_freq_events);
+ unaccount_freq_event();
if (event->attr.context_switch) {
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
atomic_dec(&nr_switch_events);
}
if (is_cgroup_event(event))
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
if (has_branch_stack(event))
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
+
+ if (dec) {
+ if (!atomic_add_unless(&perf_sched_count, -1, 1))
+ schedule_delayed_work(&perf_sched_work, HZ);
+ }
unaccount_event_cpu(event, event->cpu);
}
+static void perf_sched_delayed(struct work_struct *work)
+{
+ mutex_lock(&perf_sched_mutex);
+ if (atomic_dec_and_test(&perf_sched_count))
+ static_branch_disable(&perf_sched_events);
+ mutex_unlock(&perf_sched_mutex);
+}
+
/*
* The following implement mutual exclusion of events on "exclusive" pmus
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -3614,7 +3628,7 @@ static void unaccount_event(struct perf_event *event)
* 3) two matching events on the same context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
- * __free_event()), the latter -- before the first perf_install_in_context().
+ * _free_event()), the latter -- before the first perf_install_in_context().
*/
static int exclusive_event_init(struct perf_event *event)
{
@@ -3689,29 +3703,6 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
-static void __free_event(struct perf_event *event)
-{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
-
- if (event->destroy)
- event->destroy(event);
-
- if (event->ctx)
- put_ctx(event->ctx);
-
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
-
- call_rcu(&event->rcu_head, free_event_rcu);
-}
-
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3733,7 +3724,25 @@ static void _free_event(struct perf_event *event)
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- __free_event(event);
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ perf_event_free_bpf_prog(event);
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu) {
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
+ }
+
+ call_rcu(&event->rcu_head, free_event_rcu);
}
/*
@@ -3760,14 +3769,13 @@ static void perf_remove_from_owner(struct perf_event *event)
struct task_struct *owner;
rcu_read_lock();
- owner = ACCESS_ONCE(event->owner);
/*
- * Matches the smp_wmb() in perf_event_exit_task(). If we observe
- * !owner it means the list deletion is complete and we can indeed
- * free this event, otherwise we need to serialize on
+ * Matches the smp_store_release() in perf_event_exit_task(). If we
+ * observe !owner it means the list deletion is complete and we can
+ * indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- smp_read_barrier_depends();
+ owner = lockless_dereference(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -3795,8 +3803,10 @@ static void perf_remove_from_owner(struct perf_event *event)
* ensured they're done, and we can proceed with freeing the
* event.
*/
- if (event->owner)
+ if (event->owner) {
list_del_init(&event->owner_entry);
+ smp_store_release(&event->owner, NULL);
+ }
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
@@ -3804,37 +3814,111 @@ static void perf_remove_from_owner(struct perf_event *event)
static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx;
-
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *child, *tmp;
+
+ /*
+ * If we got here through err_file: fput(event_file); we will not have
+ * attached to a context yet.
+ */
+ if (!ctx) {
+ WARN_ON_ONCE(event->attach_state &
+ (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+ goto no_ctx;
+ }
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ perf_remove_from_context(event, DETACH_GROUP);
+
+ raw_spin_lock_irq(&ctx->lock);
/*
- * There are two ways this annotation is useful:
+ * Mark this even as STATE_DEAD, there is no external reference to it
+ * anymore.
*
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
+ * Anybody acquiring event->child_mutex after the below loop _must_
+ * also see this, most importantly inherit_event() which will avoid
+ * placing more children on the list.
*
- * 2) there is a lock-inversion with mmap_sem through
- * perf_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
*/
- ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
- WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, true);
+ event->state = PERF_EVENT_STATE_DEAD;
+ raw_spin_unlock_irq(&ctx->lock);
+
perf_event_ctx_unlock(event, ctx);
- _free_event(event);
-}
+again:
+ mutex_lock(&event->child_mutex);
+ list_for_each_entry(child, &event->child_list, child_list) {
-int perf_event_release_kernel(struct perf_event *event)
-{
- put_event(event);
+ /*
+ * Cannot change, child events are not migrated, see the
+ * comment with perf_event_ctx_lock_nested().
+ */
+ ctx = lockless_dereference(child->ctx);
+ /*
+ * Since child_mutex nests inside ctx::mutex, we must jump
+ * through hoops. We start by grabbing a reference on the ctx.
+ *
+ * Since the event cannot get freed while we hold the
+ * child_mutex, the context must also exist and have a !0
+ * reference count.
+ */
+ get_ctx(ctx);
+
+ /*
+ * Now that we have a ctx ref, we can drop child_mutex, and
+ * acquire ctx::mutex without fear of it going away. Then we
+ * can re-acquire child_mutex.
+ */
+ mutex_unlock(&event->child_mutex);
+ mutex_lock(&ctx->mutex);
+ mutex_lock(&event->child_mutex);
+
+ /*
+ * Now that we hold ctx::mutex and child_mutex, revalidate our
+ * state, if child is still the first entry, it didn't get freed
+ * and we can continue doing so.
+ */
+ tmp = list_first_entry_or_null(&event->child_list,
+ struct perf_event, child_list);
+ if (tmp == child) {
+ perf_remove_from_context(child, DETACH_GROUP);
+ list_del(&child->child_list);
+ free_event(child);
+ /*
+ * This matches the refcount bump in inherit_event();
+ * this can't be the last reference.
+ */
+ put_event(event);
+ }
+
+ mutex_unlock(&event->child_mutex);
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+ mutex_unlock(&event->child_mutex);
+
+no_ctx:
+ put_event(event); /* Must be the 'last' reference */
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -3844,46 +3928,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
*/
static int perf_release(struct inode *inode, struct file *file)
{
- put_event(file->private_data);
+ perf_event_release_kernel(file->private_data);
return 0;
}
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = container_of(work, struct perf_event_context,
- orphans_remove.work);
-
- mutex_lock(&ctx->mutex);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
- struct perf_event *parent_event = event->parent;
-
- if (!is_orphaned_child(event))
- continue;
-
- perf_remove_from_context(event, true);
-
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- free_event(event);
- put_event(parent_event);
- }
-
- raw_spin_lock_irq(&ctx->lock);
- ctx->orphans_remove_sched = false;
- raw_spin_unlock_irq(&ctx->lock);
- mutex_unlock(&ctx->mutex);
-
- put_ctx(ctx);
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -4027,7 +4075,7 @@ static bool is_event_hup(struct perf_event *event)
{
bool no_children;
- if (event->state != PERF_EVENT_STATE_EXIT)
+ if (event->state > PERF_EVENT_STATE_EXIT)
return false;
mutex_lock(&event->child_mutex);
@@ -4112,7 +4160,7 @@ static void _perf_event_reset(struct perf_event *event)
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
- * in sync_child_event if it goes to exit, thus satisfying the
+ * in perf_event_exit_event() if it goes to exit, thus satisfying the
* task existence requirements of perf_event_enable/disable.
*/
static void perf_event_for_each_child(struct perf_event *event,
@@ -4144,20 +4192,14 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(sibling, func);
}
-struct period_event {
- struct perf_event *event;
- u64 value;
-};
-
-static int __perf_event_period(void *info)
+static void __perf_event_period(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct period_event *pe = info;
- struct perf_event *event = pe->event;
- struct perf_event_context *ctx = event->ctx;
- u64 value = pe->value;
+ u64 value = *((u64 *)info);
bool active;
- raw_spin_lock(&ctx->lock);
if (event->attr.freq) {
event->attr.sample_freq = value;
} else {
@@ -4177,16 +4219,10 @@ static int __perf_event_period(void *info)
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
- raw_spin_unlock(&ctx->lock);
-
- return 0;
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
- struct period_event pe = { .event = event, };
- struct perf_event_context *ctx = event->ctx;
- struct task_struct *task;
u64 value;
if (!is_sampling_event(event))
@@ -4201,34 +4237,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;
- task = ctx->task;
- pe.value = value;
-
- if (!task) {
- cpu_function_call(event->cpu, __perf_event_period, &pe);
- return 0;
- }
-
-retry:
- if (!task_function_call(task, __perf_event_period, &pe))
- return 0;
-
- raw_spin_lock_irq(&ctx->lock);
- if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
- task = ctx->task;
- goto retry;
- }
-
- if (event->attr.freq) {
- event->attr.sample_freq = value;
- } else {
- event->attr.sample_period = value;
- event->hw.sample_period = value;
- }
-
- local64_set(&event->hw.period_left, 0);
- raw_spin_unlock_irq(&ctx->lock);
+ event_function_call(event, __perf_event_period, &value);
return 0;
}
@@ -4940,9 +4949,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (retval < 0)
return retval;
@@ -5000,7 +5009,7 @@ static void perf_pending_event(struct irq_work *entry)
if (event->pending_disable) {
event->pending_disable = 0;
- __perf_event_disable(event);
+ perf_event_disable_local(event);
}
if (event->pending_wakeup) {
@@ -6427,9 +6436,9 @@ static int __perf_event_overflow(struct perf_event *event,
if (unlikely(throttle
&& hwc->interrupts >= max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
+ tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
- tick_nohz_full_kick();
ret = 1;
}
}
@@ -6788,7 +6797,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
kfree_rcu(hlist, rcu_head);
}
-static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+static void swevent_hlist_put_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -6800,15 +6809,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
-static void swevent_hlist_put(struct perf_event *event)
+static void swevent_hlist_put(void)
{
int cpu;
for_each_possible_cpu(cpu)
- swevent_hlist_put_cpu(event, cpu);
+ swevent_hlist_put_cpu(cpu);
}
-static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+static int swevent_hlist_get_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
int err = 0;
@@ -6831,14 +6840,13 @@ exit:
return err;
}
-static int swevent_hlist_get(struct perf_event *event)
+static int swevent_hlist_get(void)
{
- int err;
- int cpu, failed_cpu;
+ int err, cpu, failed_cpu;
get_online_cpus();
for_each_possible_cpu(cpu) {
- err = swevent_hlist_get_cpu(event, cpu);
+ err = swevent_hlist_get_cpu(cpu);
if (err) {
failed_cpu = cpu;
goto fail;
@@ -6851,7 +6859,7 @@ fail:
for_each_possible_cpu(cpu) {
if (cpu == failed_cpu)
break;
- swevent_hlist_put_cpu(event, cpu);
+ swevent_hlist_put_cpu(cpu);
}
put_online_cpus();
@@ -6867,7 +6875,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent);
static_key_slow_dec(&perf_swevent_enabled[event_id]);
- swevent_hlist_put(event);
+ swevent_hlist_put();
}
static int perf_swevent_init(struct perf_event *event)
@@ -6898,7 +6906,7 @@ static int perf_swevent_init(struct perf_event *event)
if (!event->parent) {
int err;
- err = swevent_hlist_get(event);
+ err = swevent_hlist_get();
if (err)
return err;
@@ -7819,31 +7827,75 @@ static void account_event_cpu(struct perf_event *event, int cpu)
atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}
+/* Freq events need the tick to stay alive (see perf_event_task_tick). */
+static void account_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ /* Lock so we don't race with concurrent unaccount */
+ spin_lock(&nr_freq_lock);
+ if (atomic_inc_return(&nr_freq_events) == 1)
+ tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
+ spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void account_freq_event(void)
+{
+ if (tick_nohz_full_enabled())
+ account_freq_event_nohz();
+ else
+ atomic_inc(&nr_freq_events);
+}
+
+
static void account_event(struct perf_event *event)
{
+ bool inc = false;
+
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
- if (event->attr.freq) {
- if (atomic_inc_return(&nr_freq_events) == 1)
- tick_nohz_full_kick_all();
- }
+ if (event->attr.freq)
+ account_freq_event();
if (event->attr.context_switch) {
atomic_inc(&nr_switch_events);
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
}
if (has_branch_stack(event))
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
if (is_cgroup_event(event))
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
+
+ if (inc) {
+ if (atomic_inc_not_zero(&perf_sched_count))
+ goto enabled;
+
+ mutex_lock(&perf_sched_mutex);
+ if (!atomic_read(&perf_sched_count)) {
+ static_branch_enable(&perf_sched_events);
+ /*
+ * Guarantee that all CPUs observe they key change and
+ * call the perf scheduling hooks before proceeding to
+ * install events that need them.
+ */
+ synchronize_sched();
+ }
+ /*
+ * Now that we have waited for the sync_sched(), allow further
+ * increments to by-pass the mutex.
+ */
+ atomic_inc(&perf_sched_count);
+ mutex_unlock(&perf_sched_mutex);
+ }
+enabled:
account_event_cpu(event, event->cpu);
}
@@ -7979,6 +8031,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
+ /* symmetric to unaccount_event() in _free_event() */
+ account_event(event);
+
return event;
err_per_task:
@@ -8342,8 +8397,6 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- account_event(event);
-
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -8462,10 +8515,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (move_group) {
gctx = group_leader->ctx;
mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ if (gctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
} else {
mutex_lock(&ctx->mutex);
}
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
if (!perf_event_validate_size(event)) {
err = -E2BIG;
goto err_locked;
@@ -8490,11 +8552,11 @@ SYSCALL_DEFINE5(perf_event_open,
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- perf_remove_from_context(group_leader, false);
+ perf_remove_from_context(group_leader, 0);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling, false);
+ perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}
@@ -8547,6 +8609,8 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__header_size(event);
perf_event__id_header_size(event);
+ event->owner = current;
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
@@ -8556,8 +8620,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_online_cpus();
- event->owner = current;
-
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
@@ -8582,7 +8644,12 @@ err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ /*
+ * If event_file is set, the fput() above will have called ->release()
+ * and that will take care of freeing the event.
+ */
+ if (!event_file)
+ free_event(event);
err_cpus:
put_online_cpus();
err_task:
@@ -8624,9 +8691,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
}
/* Mark owner so we could distinguish it from user events. */
- event->owner = EVENT_OWNER_KERNEL;
-
- account_event(event);
+ event->owner = TASK_TOMBSTONE;
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
@@ -8636,12 +8701,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_unlock;
+ }
+
if (!exclusive_event_installable(event, ctx)) {
- mutex_unlock(&ctx->mutex);
- perf_unpin_context(ctx);
- put_ctx(ctx);
err = -EBUSY;
- goto err_free;
+ goto err_unlock;
}
perf_install_in_context(ctx, event, cpu);
@@ -8650,6 +8717,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_unlock:
+ mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_free:
free_event(event);
err:
@@ -8674,7 +8745,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event, false);
+ perf_remove_from_context(event, 0);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
@@ -8741,33 +8812,15 @@ static void sync_child_event(struct perf_event *child_event,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);
-
- /*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Make sure user/parent get notified, that we just
- * lost one event.
- */
- perf_event_wakeup(parent_event);
-
- /*
- * Release the parent event, if this was the last
- * reference to it.
- */
- put_event(parent_event);
}
static void
-__perf_event_exit_task(struct perf_event *child_event,
- struct perf_event_context *child_ctx,
- struct task_struct *child)
+perf_event_exit_event(struct perf_event *child_event,
+ struct perf_event_context *child_ctx,
+ struct task_struct *child)
{
+ struct perf_event *parent_event = child_event->parent;
+
/*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
@@ -8780,57 +8833,86 @@ __perf_event_exit_task(struct perf_event *child_event,
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- perf_remove_from_context(child_event, !!child_event->parent);
+ raw_spin_lock_irq(&child_ctx->lock);
+ WARN_ON_ONCE(child_ctx->is_active);
+
+ if (parent_event)
+ perf_group_detach(child_event);
+ list_del_event(child_event, child_ctx);
+ child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ raw_spin_unlock_irq(&child_ctx->lock);
/*
- * It can happen that the parent exits first, and has events
- * that are still around due to the child reference. These
- * events need to be zapped.
+ * Parent events are governed by their filedesc, retain them.
*/
- if (child_event->parent) {
- sync_child_event(child_event, child);
- free_event(child_event);
- } else {
- child_event->state = PERF_EVENT_STATE_EXIT;
+ if (!parent_event) {
perf_event_wakeup(child_event);
+ return;
}
+ /*
+ * Child events can be cleaned up.
+ */
+
+ sync_child_event(child_event, child);
+
+ /*
+ * Remove this event from the parent's list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Kick perf_poll() for is_event_hup().
+ */
+ perf_event_wakeup(parent_event);
+ free_event(child_event);
+ put_event(parent_event);
}
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event, *next;
struct perf_event_context *child_ctx, *clone_ctx = NULL;
- unsigned long flags;
+ struct perf_event *child_event, *next;
+
+ WARN_ON_ONCE(child != current);
- if (likely(!child->perf_event_ctxp[ctxn]))
+ child_ctx = perf_pin_task_context(child, ctxn);
+ if (!child_ctx)
return;
- local_irq_save(flags);
/*
- * We can't reschedule here because interrupts are disabled,
- * and either child is current or it is a task that can't be
- * scheduled, so we are now safe from rescheduling changing
- * our context.
+ * In order to reduce the amount of tricky in ctx tear-down, we hold
+ * ctx::mutex over the entire thing. This serializes against almost
+ * everything that wants to access the ctx.
+ *
+ * The exception is sys_perf_event_open() /
+ * perf_event_create_kernel_count() which does find_get_context()
+ * without ctx::mutex (it cannot because of the move_group double mutex
+ * lock thing). See the comments in perf_install_in_context().
*/
- child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+ mutex_lock(&child_ctx->mutex);
/*
- * Take the context lock here so that if find_get_context is
- * reading child->perf_event_ctxp, we wait until it has
- * incremented the context's refcount before we do put_ctx below.
+ * In a single ctx::lock section, de-schedule the events and detach the
+ * context from the task such that we cannot ever get it scheduled back
+ * in.
*/
- raw_spin_lock(&child_ctx->lock);
- task_ctx_sched_out(child_ctx);
- child->perf_event_ctxp[ctxn] = NULL;
+ raw_spin_lock_irq(&child_ctx->lock);
+ task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
/*
- * If this context is a clone; unclone it so it can't get
- * swapped to another process while we're removing all
- * the events from it.
+ * Now that the context is inactive, destroy the task <-> ctx relation
+ * and mark the context dead.
*/
+ RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ put_ctx(child_ctx); /* cannot be last */
+ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+ put_task_struct(current); /* cannot be last */
+
clone_ctx = unclone_ctx(child_ctx);
- update_context_time(child_ctx);
- raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+ raw_spin_unlock_irq(&child_ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
@@ -8842,20 +8924,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
perf_event_task(child, child_ctx, 0);
- /*
- * We can recurse on the same lock type through:
- *
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event()
- * mutex_lock(&ctx->mutex)
- *
- * But since its the parent context it won't be the same instance.
- */
- mutex_lock(&child_ctx->mutex);
-
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
+ perf_event_exit_event(child_event, child_ctx, child);
mutex_unlock(&child_ctx->mutex);
@@ -8880,8 +8950,7 @@ void perf_event_exit_task(struct task_struct *child)
* the owner, closes a race against perf_release() where
* we need to serialize on the owner->perf_event_mutex.
*/
- smp_wmb();
- event->owner = NULL;
+ smp_store_release(&event->owner, NULL);
}
mutex_unlock(&child->perf_event_mutex);
@@ -8964,21 +9033,20 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
-struct perf_event *perf_event_get(unsigned int fd)
+struct file *perf_event_get(unsigned int fd)
{
- int err;
- struct fd f;
- struct perf_event *event;
+ struct file *file;
- err = perf_fget_light(fd, &f);
- if (err)
- return ERR_PTR(err);
+ file = fget_raw(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
- event = f.file->private_data;
- atomic_long_inc(&event->refcount);
- fdput(f);
+ if (file->f_op != &perf_fops) {
+ fput(file);
+ return ERR_PTR(-EBADF);
+ }
- return event;
+ return file;
}
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
@@ -9021,8 +9089,16 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ /*
+ * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+ * must be under the same lock in order to serialize against
+ * perf_event_release_kernel(), such that either we must observe
+ * is_orphaned_event() or they will observe us on the child_list.
+ */
+ mutex_lock(&parent_event->child_mutex);
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ mutex_unlock(&parent_event->child_mutex);
free_event(child_event);
return NULL;
}
@@ -9070,8 +9146,6 @@ inherit_event(struct perf_event *parent_event,
/*
* Link this into the parent event's child list
*/
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);
@@ -9276,7 +9350,7 @@ static void perf_event_init_cpu(int cpu)
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
- if (swhash->hlist_refcount > 0) {
+ if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
struct swevent_hlist *hlist;
hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -9289,13 +9363,14 @@ static void perf_event_init_cpu(int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
- struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *event;
- rcu_read_lock();
- list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
- __perf_remove_from_context(&re);
- rcu_read_unlock();
+ raw_spin_lock(&ctx->lock);
+ list_for_each_entry(event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+ raw_spin_unlock(&ctx->lock);
}
static void perf_event_exit_cpu_context(int cpu)
@@ -9351,11 +9426,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_DOWN_FAILED:
perf_event_init_cpu(cpu);
break;
- case CPU_UP_CANCELED:
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
@@ -9384,9 +9457,6 @@ void __init perf_event_init(void)
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&perf_sched_events, HZ);
-
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.
@@ -9406,6 +9476,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
return 0;
}
+EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
static int __init perf_event_sysfs_init(void)
{
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 92ce5f4ccc26..3f8cb1e14588 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
* current task.
*/
if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
- __perf_event_disable(bp);
+ perf_event_disable_local(bp);
else
perf_event_disable(bp);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2bbad9c1274c..4199b6d193f5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -182,8 +182,6 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
/* Callchain handling */
extern struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs);
-extern int get_callchain_buffers(void);
-extern void put_callchain_buffers(void);
static inline int get_recursion_context(int *recursion)
{
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index adfdc0536117..1faad2cfdb9e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
__free_page(page);
}
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+ int pg;
+
+ if (rb->aux_priv) {
+ rb->free_aux(rb->aux_priv);
+ rb->free_aux = NULL;
+ rb->aux_priv = NULL;
+ }
+
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
+
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
+}
+
int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
@@ -547,30 +566,11 @@ out:
if (!ret)
rb->aux_pgoff = pgoff;
else
- rb_free_aux(rb);
+ __rb_free_aux(rb);
return ret;
}
-static void __rb_free_aux(struct ring_buffer *rb)
-{
- int pg;
-
- if (rb->aux_priv) {
- rb->free_aux(rb->aux_priv);
- rb->free_aux = NULL;
- rb->aux_priv = NULL;
- }
-
- if (rb->aux_nr_pages) {
- for (pg = 0; pg < rb->aux_nr_pages; pg++)
- rb_free_aux_page(rb, pg);
-
- kfree(rb->aux_pages);
- rb->aux_nr_pages = 0;
- }
-}
-
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7dad84913abf..5f6ce931f1ea 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ false);
if (err)
return err;
@@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
goto unlock;
get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr);
- mem_cgroup_commit_charge(kpage, memcg, false);
+ page_add_new_anon_rmap(kpage, vma, addr, false);
+ mem_cgroup_commit_charge(kpage, memcg, false, false);
lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter_file(page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
+ mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
@@ -1177,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
goto free_area;
area->xol_mapping.name = "[uprobes]";
+ area->xol_mapping.fault = NULL;
area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])
diff --git a/kernel/exit.c b/kernel/exit.c
index 07110c6020a0..10e088237fed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,8 +59,6 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-static void exit_mm(struct task_struct *tsk);
-
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -1120,8 +1118,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p) &&
- !(p->jobctl & JOBCTL_LISTENING))
+ if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
diff --git a/kernel/fork.c b/kernel/fork.c
index 1155eac61687..accb7221d547 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -164,12 +164,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
+ if (page)
+ memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ 1 << THREAD_SIZE_ORDER);
+
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ struct page *page = virt_to_page(ti);
+
+ memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ -(1 << THREAD_SIZE_ORDER));
+ __free_kmem_pages(page, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
@@ -300,9 +308,9 @@ void __init fork_init(void)
#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
#endif
/* create a slab on which task_structs can be allocated */
- task_struct_cachep =
- kmem_cache_create("task_struct", arch_task_struct_size,
- ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+ task_struct_cachep = kmem_cache_create("task_struct",
+ arch_task_struct_size, ARCH_MIN_TASKALIGN,
+ SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
#endif
/* do the arch specific task caches init */
@@ -414,7 +422,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
mm->total_vm = oldmm->total_vm;
- mm->shared_vm = oldmm->shared_vm;
+ mm->data_vm = oldmm->data_vm;
mm->exec_vm = oldmm->exec_vm;
mm->stack_vm = oldmm->stack_vm;
@@ -433,8 +441,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
- vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -vma_pages(mpnt));
+ vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
charge = 0;
@@ -1250,7 +1257,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1349,9 +1355,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqlock_init(&p->vtime_seqlock);
+ seqcount_init(&p->vtime_seqcount);
p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_SLEEPING;
+ p->vtime_snap_whence = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
@@ -1527,7 +1533,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
- retval = cgroup_can_fork(p, cgrp_ss_priv);
+ retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_free_pid;
@@ -1609,7 +1615,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p, cgrp_ss_priv);
+ cgroup_post_fork(p);
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1619,7 +1625,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
bad_fork_cancel_cgroup:
- cgroup_cancel_fork(p, cgrp_ss_priv);
+ cgroup_cancel_fork(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1849,16 +1855,19 @@ void __init proc_caches_init(void)
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
- SLAB_NOTRACK, sighand_ctor);
+ SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
/*
* FIXME! The "sizeof(struct mm_struct)" currently includes the
* whole struct cpumask for the OFFSTACK case. We could change
@@ -1868,8 +1877,9 @@ void __init proc_caches_init(void)
*/
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+ NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 684d7549825a..a5d2e74c89e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -124,16 +124,16 @@
* futex_wait(futex, val);
*
* waiters++; (a)
- * mb(); (A) <-- paired with -.
- * |
- * lock(hash_bucket(futex)); |
- * |
- * uval = *futex; |
- * | *futex = newval;
- * | sys_futex(WAKE, futex);
- * | futex_wake(futex);
- * |
- * `-------> mb(); (B)
+ * smp_mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `--------> smp_mb(); (B)
* if (uval == val)
* queue();
* unlock(hash_bucket(futex));
@@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key)
/*
* Ensure futex_get_mm() implies a full barrier such that
* get_futex_key() implies a full barrier. This is relied upon
- * as full barrier (B), see the ordering comment above.
+ * as smp_mb(); (B), see the ordering comment above.
*/
smp_mb__after_atomic();
}
@@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode); /* implies MB (B) */
+ ihold(key->shared.inode); /* implies smp_mb(); (B) */
break;
case FUT_OFF_MMSHARED:
- futex_get_mm(key); /* implies MB (B) */
+ futex_get_mm(key); /* implies smp_mb(); (B) */
break;
default:
/*
@@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key)
* mm, therefore the only purpose of calling get_futex_key_refs
* is because we need the barrier for the lockless waiter check.
*/
- smp_mb(); /* explicit MB (B) */
+ smp_mb(); /* explicit smp_mb(); (B) */
}
}
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
+ struct page *page;
+ struct address_space *mapping;
int err, ro = 0;
/*
@@ -496,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key); /* implies MB (B) */
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */
return 0;
}
@@ -519,46 +520,22 @@ again:
else
err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
+ /*
+ * The treatment of mapping from this point on is critical. The page
+ * lock protects many things but in this context the page lock
+ * stabilizes mapping, prevents inode freeing in the shared
+ * file-backed region case and guards against movement to swap cache.
+ *
+ * Strictly speaking the page lock is not needed in all cases being
+ * considered here and page lock forces unnecessarily serialization
+ * From this point on, mapping will be re-verified if necessary and
+ * page lock will be acquired only if it is unavoidable
+ */
+ page = compound_head(page);
+ mapping = READ_ONCE(page->mapping);
/*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
@@ -570,25 +547,38 @@ again:
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
+ * an unlikely race, but we do need to retry for page->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
+ if (unlikely(!mapping)) {
+ int shmem_swizzled;
+
+ /*
+ * Page lock is required to identify which special case above
+ * applies. If this is really a shmem page then the page lock
+ * will prevent unexpected transitions.
+ */
+ lock_page(page);
+ shmem_swizzled = PageSwapCache(page) || page->mapping;
+ unlock_page(page);
+ put_page(page);
+
if (shmem_swizzled)
goto again;
+
return -EFAULT;
}
/*
* Private mappings are handled in a simple way.
*
+ * If the futex key is stored on an anonymous page, then the associated
+ * object is the mm which is implicitly pinned by the calling process.
+ *
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page_head)) {
+ if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -601,17 +591,75 @@ again:
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
+
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
} else {
+ struct inode *inode;
+
+ /*
+ * The associated futex object in this case is the inode and
+ * the page->mapping must be traversed. Ordinarily this should
+ * be stabilised under page lock but it's not strictly
+ * necessary in this case as we just want to pin the inode, not
+ * update the radix tree or anything like that.
+ *
+ * The RCU read lock is taken as the inode is finally freed
+ * under RCU. If the mapping still matches expectations then the
+ * mapping->host can be safely accessed as being a valid inode.
+ */
+ rcu_read_lock();
+
+ if (READ_ONCE(page->mapping) != mapping) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ inode = READ_ONCE(mapping->host);
+ if (!inode) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ /*
+ * Take a reference unless it is about to be freed. Previously
+ * this reference was taken by ihold under the page lock
+ * pinning the inode in place so i_lock was unnecessary. The
+ * only way for this check to fail is if the inode was
+ * truncated in parallel so warn for now if this happens.
+ *
+ * We are not calling into get_futex_key_refs() in file-backed
+ * cases, therefore a successful atomic_inc return below will
+ * guarantee that get_futex_key() will still imply smp_mb(); (B).
+ */
+ if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ /* Should be impossible but lets be paranoid for now */
+ if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
+ err = -EFAULT;
+ rcu_read_unlock();
+ iput(inode);
+
+ goto out;
+ }
+
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = inode;
key->shared.pgoff = basepage_index(page);
+ rcu_read_unlock();
}
- get_futex_key_refs(key); /* implies MB (B) */
-
out:
- unlock_page(page_head);
- put_page(page_head);
+ put_page(page);
return err;
}
@@ -639,7 +687,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE);
+ FAULT_FLAG_WRITE, NULL);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
@@ -725,9 +773,12 @@ static struct futex_pi_state * alloc_pi_state(void)
}
/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ *
* Must be called with the hb lock held.
*/
-static void free_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
{
if (!pi_state)
return;
@@ -1223,7 +1274,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (pi_state->owner != current)
return -EINVAL;
- raw_spin_lock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
@@ -1249,22 +1300,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else if (curval != uval)
ret = -EINVAL;
if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
}
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
- raw_spin_lock_irq(&new_owner->pi_lock);
+ raw_spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
- raw_spin_unlock_irq(&new_owner->pi_lock);
+ raw_spin_unlock(&new_owner->pi_lock);
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
@@ -1706,31 +1757,35 @@ retry_private:
* exist yet, look it up one more time to ensure we have a
* reference to it. If the lock was taken, ret contains the
* vpid of the top waiter task.
+ * If the lock was not taken, we have pi_state and an initial
+ * refcount on it. In case of an error we have nothing.
*/
if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
/*
- * If we acquired the lock, then the user
- * space value of uaddr2 should be vpid. It
- * cannot be changed by the top waiter as it
- * is blocked on hb2 lock if it tries to do
- * so. If something fiddled with it behind our
- * back the pi state lookup might unearth
- * it. So we rather use the known value than
- * rereading and handing potential crap to
- * lookup_pi_state.
+ * If we acquired the lock, then the user space value
+ * of uaddr2 should be vpid. It cannot be changed by
+ * the top waiter as it is blocked on hb2 lock if it
+ * tries to do so. If something fiddled with it behind
+ * our back the pi state lookup might unearth it. So
+ * we rather use the known value than rereading and
+ * handing potential crap to lookup_pi_state.
+ *
+ * If that call succeeds then we have pi_state and an
+ * initial refcount on it.
*/
ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
case 0:
+ /* We hold a reference on the pi state. */
break;
+
+ /* If the above failed, then pi_state is NULL */
case -EFAULT:
- free_pi_state(pi_state);
- pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1746,8 +1801,6 @@ retry_private:
* exit to complete.
* - The user space value changed.
*/
- free_pi_state(pi_state);
- pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1801,30 +1854,58 @@ retry_private:
* of requeue_pi if we couldn't acquire the lock atomically.
*/
if (requeue_pi) {
- /* Prepare the waiter to take the rt_mutex. */
+ /*
+ * Prepare the waiter to take the rt_mutex. Take a
+ * refcount on the pi_state and store the pointer in
+ * the futex_q object of the waiter.
+ */
atomic_inc(&pi_state->refcount);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
this->task);
if (ret == 1) {
- /* We got the lock. */
+ /*
+ * We got the lock. We do neither drop the
+ * refcount on pi_state nor clear
+ * this->pi_state because the waiter needs the
+ * pi_state for cleaning up the user space
+ * value. It will drop the refcount after
+ * doing so.
+ */
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
} else if (ret) {
- /* -EDEADLK */
+ /*
+ * rt_mutex_start_proxy_lock() detected a
+ * potential deadlock when we tried to queue
+ * that waiter. Drop the pi_state reference
+ * which we took above and remove the pointer
+ * to the state from the waiters futex_q
+ * object.
+ */
this->pi_state = NULL;
- free_pi_state(pi_state);
- goto out_unlock;
+ put_pi_state(pi_state);
+ /*
+ * We stop queueing more waiters and let user
+ * space deal with the mess.
+ */
+ break;
}
}
requeue_futex(this, hb1, hb2, &key2);
drop_count++;
}
+ /*
+ * We took an extra initial reference to the pi_state either
+ * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
+ * need to drop it here again.
+ */
+ put_pi_state(pi_state);
+
out_unlock:
- free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
@@ -1866,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock); /* implies MB (A) */
+ spin_lock(&hb->lock); /* implies smp_mb(); (A) */
return hb;
}
@@ -1929,8 +2010,12 @@ static int unqueue_me(struct futex_q *q)
/* In the common case we don't take the spinlock, which is nice. */
retry:
- lock_ptr = q->lock_ptr;
- barrier();
+ /*
+ * q->lock_ptr can change between this read and the following spin_lock.
+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+ * optimizing lock_ptr out of the logic below.
+ */
+ lock_ptr = READ_ONCE(q->lock_ptr);
if (lock_ptr != NULL) {
spin_lock(lock_ptr);
/*
@@ -1973,7 +2058,7 @@ static void unqueue_me_pi(struct futex_q *q)
__unqueue_futex(q);
BUG_ON(!q->pi_state);
- free_pi_state(q->pi_state);
+ put_pi_state(q->pi_state);
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
@@ -2129,11 +2214,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* we returned due to timeout or signal without taking the
* rt_mutex. Too late.
*/
- raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
if (!owner)
owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
@@ -2755,6 +2840,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
+ /*
+ * Drop the reference to the pi state which
+ * the requeue_pi() code acquired for us.
+ */
+ put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
}
} else {
@@ -2881,7 +2971,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
}
ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
goto err_unlock;
head = p->robust_list;
@@ -3046,7 +3136,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
+ cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
}
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 55c8c9349cfe..4ae3232e7a28 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
}
ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
goto err_unlock;
head = p->compat_robust_list;
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 7080ae1eb6c1..2f9df37940a0 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -123,11 +123,6 @@ void gcov_enable_events(void)
}
#ifdef CONFIG_MODULES
-static inline int within(void *addr, void *start, unsigned long size)
-{
- return ((addr >= start) && (addr < start + size));
-}
-
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
@@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
- if (within(info, mod->module_core, mod->core_size)) {
+ if (within_module((unsigned long)info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3b48dab80164..3bbfd6a9c475 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY
bool
select IRQ_DOMAIN
+# Generic IRQ IPI support
+config GENERIC_IRQ_IPI
+ bool
+
# Generic MSI interrupt support
config GENERIC_MSI_IRQ
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2fc9cbdf35b6..2ee42e95a3ce 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 15206453b12a..2f9f2b0e79f2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -338,7 +338,6 @@ void handle_nested_irq(unsigned int irq)
raw_spin_lock_irq(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(desc);
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
@@ -346,6 +345,7 @@ void handle_nested_irq(unsigned int irq)
goto out_unlock;
}
+ kstat_incr_irqs_this_cpu(desc);
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
@@ -412,13 +412,13 @@ void handle_simple_irq(struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(desc);
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
goto out_unlock;
}
+ kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
out_unlock:
@@ -462,7 +462,6 @@ void handle_level_irq(struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
@@ -473,6 +472,7 @@ void handle_level_irq(struct irq_desc *desc)
goto out_unlock;
}
+ kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
cond_unmask_irq(desc);
@@ -532,7 +532,6 @@ void handle_fasteoi_irq(struct irq_desc *desc)
goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
@@ -544,6 +543,7 @@ void handle_fasteoi_irq(struct irq_desc *desc)
goto out;
}
+ kstat_incr_irqs_this_cpu(desc);
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
@@ -950,6 +950,7 @@ void irq_chip_ack_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_ack(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_ack_parent);
/**
* irq_chip_mask_parent - Mask the parent interrupt
@@ -960,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_mask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
/**
* irq_chip_unmask_parent - Unmask the parent interrupt
@@ -970,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_unmask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_unmask_parent);
/**
* irq_chip_eoi_parent - Invoke EOI on the parent interrupt
@@ -980,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_eoi(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_eoi_parent);
/**
* irq_chip_set_affinity_parent - Set affinity on the parent interrupt
@@ -1015,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_type_parent);
/**
* irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a302cf9a2126..a15b5485b446 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -136,9 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int flags = 0, irq = desc->irq_data.irq;
- struct irqaction *action = desc->action;
+ struct irqaction *action;
- do {
+ for_each_action_of_desc(desc, action) {
irqreturn_t res;
trace_irq_handler_entry(irq, action);
@@ -172,8 +172,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
}
retval |= res;
- action = action->next;
- } while (action);
+ }
add_interrupt_randomness(irq, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index fcab63c66905..09be2c903c6d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
+#define for_each_action_of_desc(desc, act) \
+ for (act = desc->act; act; act = act->next)
+
struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
unsigned int check);
@@ -160,6 +163,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
__irq_put_desc_unlock(desc, flags, false);
}
+#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
+
/*
* Manipulation functions for irq_data.state
*/
@@ -188,6 +193,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
return __irqd_to_state(d) & mask;
}
+#undef __irqd_to_state
+
static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
new file mode 100644
index 000000000000..c37f34b00a11
--- /dev/null
+++ b/kernel/irq/ipi.c
@@ -0,0 +1,326 @@
+/*
+ * linux/kernel/irq/ipi.c
+ *
+ * Copyright (C) 2015 Imagination Technologies Ltd
+ * Author: Qais Yousef <qais.yousef@imgtec.com>
+ *
+ * This file contains driver APIs to the IPI subsystem.
+ */
+
+#define pr_fmt(fmt) "genirq/ipi: " fmt
+
+#include <linux/irqdomain.h>
+#include <linux/irq.h>
+
+/**
+ * irq_reserve_ipi() - Setup an IPI to destination cpumask
+ * @domain: IPI domain
+ * @dest: cpumask of cpus which can receive the IPI
+ *
+ * Allocate a virq that can be used to send IPI to any CPU in dest mask.
+ *
+ * On success it'll return linux irq number and 0 on failure
+ */
+unsigned int irq_reserve_ipi(struct irq_domain *domain,
+ const struct cpumask *dest)
+{
+ unsigned int nr_irqs, offset;
+ struct irq_data *data;
+ int virq, i;
+
+ if (!domain ||!irq_domain_is_ipi(domain)) {
+ pr_warn("Reservation on a non IPI domain\n");
+ return 0;
+ }
+
+ if (!cpumask_subset(dest, cpu_possible_mask)) {
+ pr_warn("Reservation is not in possible_cpu_mask\n");
+ return 0;
+ }
+
+ nr_irqs = cpumask_weight(dest);
+ if (!nr_irqs) {
+ pr_warn("Reservation for empty destination mask\n");
+ return 0;
+ }
+
+ if (irq_domain_is_ipi_single(domain)) {
+ /*
+ * If the underlying implementation uses a single HW irq on
+ * all cpus then we only need a single Linux irq number for
+ * it. We have no restrictions vs. the destination mask. The
+ * underlying implementation can deal with holes nicely.
+ */
+ nr_irqs = 1;
+ offset = 0;
+ } else {
+ unsigned int next;
+
+ /*
+ * The IPI requires a seperate HW irq on each CPU. We require
+ * that the destination mask is consecutive. If an
+ * implementation needs to support holes, it can reserve
+ * several IPI ranges.
+ */
+ offset = cpumask_first(dest);
+ /*
+ * Find a hole and if found look for another set bit after the
+ * hole. For now we don't support this scenario.
+ */
+ next = cpumask_next_zero(offset, dest);
+ if (next < nr_cpu_ids)
+ next = cpumask_next(next, dest);
+ if (next < nr_cpu_ids) {
+ pr_warn("Destination mask has holes\n");
+ return 0;
+ }
+ }
+
+ virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
+ if (virq <= 0) {
+ pr_warn("Can't reserve IPI, failed to alloc descs\n");
+ return 0;
+ }
+
+ virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
+ (void *) dest, true);
+
+ if (virq <= 0) {
+ pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
+ goto free_descs;
+ }
+
+ for (i = 0; i < nr_irqs; i++) {
+ data = irq_get_irq_data(virq + i);
+ cpumask_copy(data->common->affinity, dest);
+ data->common->ipi_offset = offset;
+ }
+ return virq;
+
+free_descs:
+ irq_free_descs(virq, nr_irqs);
+ return 0;
+}
+
+/**
+ * irq_destroy_ipi() - unreserve an IPI that was previously allocated
+ * @irq: linux irq number to be destroyed
+ *
+ * Return the IPIs allocated with irq_reserve_ipi() to the system destroying
+ * all virqs associated with them.
+ */
+void irq_destroy_ipi(unsigned int irq)
+{
+ struct irq_data *data = irq_get_irq_data(irq);
+ struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+ struct irq_domain *domain;
+ unsigned int nr_irqs;
+
+ if (!irq || !data || !ipimask)
+ return;
+
+ domain = data->domain;
+ if (WARN_ON(domain == NULL))
+ return;
+
+ if (!irq_domain_is_ipi(domain)) {
+ pr_warn("Trying to destroy a non IPI domain!\n");
+ return;
+ }
+
+ if (irq_domain_is_ipi_per_cpu(domain))
+ nr_irqs = cpumask_weight(ipimask);
+ else
+ nr_irqs = 1;
+
+ irq_domain_free_irqs(irq, nr_irqs);
+}
+
+/**
+ * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu
+ * @irq: linux irq number
+ * @cpu: the target cpu
+ *
+ * When dealing with coprocessors IPI, we need to inform the coprocessor of
+ * the hwirq it needs to use to receive and send IPIs.
+ *
+ * Returns hwirq value on success and INVALID_HWIRQ on failure.
+ */
+irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
+{
+ struct irq_data *data = irq_get_irq_data(irq);
+ struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
+
+ if (!data || !ipimask || cpu > nr_cpu_ids)
+ return INVALID_HWIRQ;
+
+ if (!cpumask_test_cpu(cpu, ipimask))
+ return INVALID_HWIRQ;
+
+ /*
+ * Get the real hardware irq number if the underlying implementation
+ * uses a seperate irq per cpu. If the underlying implementation uses
+ * a single hardware irq for all cpus then the IPI send mechanism
+ * needs to take care of the cpu destinations.
+ */
+ if (irq_domain_is_ipi_per_cpu(data->domain))
+ data = irq_get_irq_data(irq + cpu - data->common->ipi_offset);
+
+ return data ? irqd_to_hwirq(data) : INVALID_HWIRQ;
+}
+EXPORT_SYMBOL_GPL(ipi_get_hwirq);
+
+static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
+ const struct cpumask *dest, unsigned int cpu)
+{
+ struct cpumask *ipimask = irq_data_get_affinity_mask(data);
+
+ if (!chip || !ipimask)
+ return -EINVAL;
+
+ if (!chip->ipi_send_single && !chip->ipi_send_mask)
+ return -EINVAL;
+
+ if (cpu > nr_cpu_ids)
+ return -EINVAL;
+
+ if (dest) {
+ if (!cpumask_subset(dest, ipimask))
+ return -EINVAL;
+ } else {
+ if (!cpumask_test_cpu(cpu, ipimask))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * __ipi_send_single - send an IPI to a target Linux SMP CPU
+ * @desc: pointer to irq_desc of the IRQ
+ * @cpu: destination CPU, must in the destination mask passed to
+ * irq_reserve_ipi()
+ *
+ * This function is for architecture or core code to speed up IPI sending. Not
+ * usable from driver code.
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
+{
+ struct irq_data *data = irq_desc_get_irq_data(desc);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+#ifdef DEBUG
+ /*
+ * Minimise the overhead by omitting the checks for Linux SMP IPIs.
+ * Since the callers should be arch or core code which is generally
+ * trusted, only check for errors when debugging.
+ */
+ if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
+ return -EINVAL;
+#endif
+ if (!chip->ipi_send_single) {
+ chip->ipi_send_mask(data, cpumask_of(cpu));
+ return 0;
+ }
+
+ /* FIXME: Store this information in irqdata flags */
+ if (irq_domain_is_ipi_per_cpu(data->domain) &&
+ cpu != data->common->ipi_offset) {
+ /* use the correct data for that cpu */
+ unsigned irq = data->irq + cpu - data->common->ipi_offset;
+
+ data = irq_get_irq_data(irq);
+ }
+ chip->ipi_send_single(data, cpu);
+ return 0;
+}
+
+/**
+ * ipi_send_mask - send an IPI to target Linux SMP CPU(s)
+ * @desc: pointer to irq_desc of the IRQ
+ * @dest: dest CPU(s), must be a subset of the mask passed to
+ * irq_reserve_ipi()
+ *
+ * This function is for architecture or core code to speed up IPI sending. Not
+ * usable from driver code.
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
+{
+ struct irq_data *data = irq_desc_get_irq_data(desc);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ unsigned int cpu;
+
+#ifdef DEBUG
+ /*
+ * Minimise the overhead by omitting the checks for Linux SMP IPIs.
+ * Since the callers should be arch or core code which is generally
+ * trusted, only check for errors when debugging.
+ */
+ if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
+ return -EINVAL;
+#endif
+ if (chip->ipi_send_mask) {
+ chip->ipi_send_mask(data, dest);
+ return 0;
+ }
+
+ if (irq_domain_is_ipi_per_cpu(data->domain)) {
+ unsigned int base = data->irq;
+
+ for_each_cpu(cpu, dest) {
+ unsigned irq = base + cpu - data->common->ipi_offset;
+
+ data = irq_get_irq_data(irq);
+ chip->ipi_send_single(data, cpu);
+ }
+ } else {
+ for_each_cpu(cpu, dest)
+ chip->ipi_send_single(data, cpu);
+ }
+ return 0;
+}
+
+/**
+ * ipi_send_single - Send an IPI to a single CPU
+ * @virq: linux irq number from irq_reserve_ipi()
+ * @cpu: destination CPU, must in the destination mask passed to
+ * irq_reserve_ipi()
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int ipi_send_single(unsigned int virq, unsigned int cpu)
+{
+ struct irq_desc *desc = irq_to_desc(virq);
+ struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
+ struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
+
+ if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
+ return -EINVAL;
+
+ return __ipi_send_single(desc, cpu);
+}
+EXPORT_SYMBOL_GPL(ipi_send_single);
+
+/**
+ * ipi_send_mask - Send an IPI to target CPU(s)
+ * @virq: linux irq number from irq_reserve_ipi()
+ * @dest: dest CPU(s), must be a subset of the mask passed to
+ * irq_reserve_ipi()
+ *
+ * Returns zero on success and negative error number on failure.
+ */
+int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
+{
+ struct irq_desc *desc = irq_to_desc(virq);
+ struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
+ struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
+
+ if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
+ return -EINVAL;
+
+ return __ipi_send_mask(desc, dest);
+}
+EXPORT_SYMBOL_GPL(ipi_send_mask);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 239e2ae2c947..0ccd028817d7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -24,10 +24,27 @@
static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
+static int __init irq_affinity_setup(char *str)
+{
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ cpulist_parse(str, irq_default_affinity);
+ /*
+ * Set at least the boot cpu. We don't want to end up with
+ * bugreports caused by random comandline masks
+ */
+ cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
+ return 1;
+}
+__setup("irqaffinity=", irq_affinity_setup);
+
static void __init init_irq_default_affinity(void)
{
- alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
- cpumask_setall(irq_default_affinity);
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!irq_default_affinity)
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+#endif
+ if (cpumask_empty(irq_default_affinity))
+ cpumask_setall(irq_default_affinity);
}
#else
static void __init init_irq_default_affinity(void)
@@ -159,6 +176,7 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
raw_spin_lock_init(&desc->lock);
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+ init_rcu_head(&desc->rcu);
desc_set_defaults(irq, desc, node, owner);
@@ -171,6 +189,15 @@ err_desc:
return NULL;
}
+static void delayed_free_desc(struct rcu_head *rhp)
+{
+ struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
+
+ free_masks(desc);
+ free_percpu(desc->kstat_irqs);
+ kfree(desc);
+}
+
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -187,9 +214,12 @@ static void free_desc(unsigned int irq)
delete_irq_desc(irq);
mutex_unlock(&sparse_irq_lock);
- free_masks(desc);
- free_percpu(desc->kstat_irqs);
- kfree(desc);
+ /*
+ * We free the descriptor, masks and stat fields via RCU. That
+ * allows demultiplex interrupts to do rcu based management of
+ * the child interrupts.
+ */
+ call_rcu(&desc->rcu, delayed_free_desc);
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 22aa9612ef7c..3a519a01118b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex);
static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
-static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
- irq_hw_number_t hwirq, int node);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
struct irqchip_fwid {
@@ -60,6 +58,7 @@ struct fwnode_handle *irq_domain_alloc_fwnode(void *data)
fwid->fwnode.type = FWNODE_IRQCHIP;
return &fwid->fwnode;
}
+EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode);
/**
* irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
@@ -70,13 +69,14 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
{
struct irqchip_fwid *fwid;
- if (WARN_ON(fwnode->type != FWNODE_IRQCHIP))
+ if (WARN_ON(!is_fwnode_irqchip(fwnode)))
return;
fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
kfree(fwid->name);
kfree(fwid);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
@@ -573,10 +573,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
unsigned int type = IRQ_TYPE_NONE;
int virq;
- if (fwspec->fwnode)
- domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY);
- else
+ if (fwspec->fwnode) {
+ domain = irq_find_matching_fwnode(fwspec->fwnode,
+ DOMAIN_BUS_WIRED);
+ if (!domain)
+ domain = irq_find_matching_fwnode(fwspec->fwnode,
+ DOMAIN_BUS_ANY);
+ } else {
domain = irq_default_domain;
+ }
if (!domain) {
pr_warn("no irq domain found for %s !\n",
@@ -833,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = {
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
-static int irq_domain_alloc_descs(int virq, unsigned int cnt,
- irq_hw_number_t hwirq, int node)
+int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
+ int node)
{
unsigned int hint;
@@ -888,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
return domain;
}
+EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy);
static void irq_domain_insert_irq(int virq)
{
@@ -1013,6 +1019,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
return NULL;
}
+EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
/**
* irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
@@ -1037,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return 0;
}
+EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
/**
* irq_domain_set_info - Set the complete data for a @virq in @domain
@@ -1058,6 +1066,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
__irq_set_handler(virq, handler, 0, handler_name);
irq_set_handler_data(virq, handler_data);
}
+EXPORT_SYMBOL(irq_domain_set_info);
/**
* irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
@@ -1069,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data)
irq_data->chip = &no_irq_chip;
irq_data->chip_data = NULL;
}
+EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
/**
* irq_domain_free_irqs_common - Clear irq_data and free the parent
@@ -1125,9 +1135,9 @@ static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
}
}
-static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
- unsigned int irq_base,
- unsigned int nr_irqs, void *arg)
+int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
{
int ret = 0;
struct irq_domain *parent = domain->parent;
@@ -1266,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
nr_irqs, arg);
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
/**
* irq_domain_free_irqs_parent - Free interrupts from parent domain
@@ -1283,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
irq_domain_free_irqs_recursive(domain->parent, irq_base,
nr_irqs);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
@@ -1343,6 +1355,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
}
+EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
/**
* irq_domain_set_info - Set the complete data for a @virq in @domain
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6ead200370da..64731e84c982 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq)
*/
void irq_set_thread_affinity(struct irq_desc *desc)
{
- struct irqaction *action = desc->action;
+ struct irqaction *action;
- while (action) {
+ for_each_action_of_desc(desc, action)
if (action->thread)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
- action = action->next;
- }
}
#ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
return;
raw_spin_lock_irqsave(&desc->lock, flags);
- for (action = desc->action; action; action = action->next) {
+ for_each_action_of_desc(desc, action) {
if (action->dev_id == dev_id) {
if (action->thread)
__irq_wake_thread(desc, action);
@@ -1609,6 +1607,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
struct irq_desc *desc;
int retval;
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
+
/*
* Sanity-check: shared interrupts must pass in a real dev-ID,
* otherwise we'll have trouble later trying to figure out
@@ -1699,9 +1700,13 @@ EXPORT_SYMBOL(request_threaded_irq);
int request_any_context_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *name, void *dev_id)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc;
int ret;
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
+
+ desc = irq_to_desc(irq);
if (!desc)
return -EINVAL;
@@ -1743,6 +1748,31 @@ out:
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
+/**
+ * irq_percpu_is_enabled - Check whether the per cpu irq is enabled
+ * @irq: Linux irq number to check for
+ *
+ * Must be called from a non migratable context. Returns the enable
+ * state of a per cpu interrupt on the current cpu.
+ */
+bool irq_percpu_is_enabled(unsigned int irq)
+{
+ unsigned int cpu = smp_processor_id();
+ struct irq_desc *desc;
+ unsigned long flags;
+ bool is_enabled;
+
+ desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return false;
+
+ is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
+ irq_put_desc_unlock(desc, flags);
+
+ return is_enabled;
+}
+EXPORT_SYMBOL_GPL(irq_percpu_is_enabled);
+
void disable_percpu_irq(unsigned int irq)
{
unsigned int cpu = smp_processor_id();
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 6b0c0b74a2a1..38e89ce7b071 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
if (irq_find_mapping(domain, hwirq) > 0)
return -EEXIST;
- ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
- if (ret < 0)
- return ret;
+ if (domain->parent) {
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+ }
for (i = 0; i < nr_irqs; i++) {
ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
@@ -252,6 +254,60 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
&msi_domain_ops, info);
}
+int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ int ret;
+
+ ret = ops->msi_check(domain, info, dev);
+ if (ret == 0)
+ ret = ops->msi_prepare(domain, dev, nvec, arg);
+
+ return ret;
+}
+
+int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
+ int virq, int nvec, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ struct msi_desc *desc;
+ int ret = 0;
+
+ for_each_msi_entry(desc, dev) {
+ /* Don't even try the multi-MSI brain damage. */
+ if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
+ continue;
+
+ ops->set_desc(arg, desc);
+ /* Assumes the domain mutex is held! */
+ ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg);
+ if (ret)
+ break;
+
+ irq_set_msi_desc_off(virq, 0, desc);
+ }
+
+ if (ret) {
+ /* Mop up the damage */
+ for_each_msi_entry(desc, dev) {
+ if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
+ continue;
+
+ irq_domain_free_irqs_common(domain, desc->irq, 1);
+ }
+ }
+
+ return ret;
+}
+
/**
* msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
* @domain: The domain to allocate from
@@ -270,9 +326,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_desc *desc;
int i, ret, virq = -1;
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, &arg);
+ ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
return ret;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a2c02fd5d6d0..4e1b94726818 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
int ret = 1;
raw_spin_lock_irqsave(&desc->lock, flags);
- for (action = desc->action ; action; action = action->next) {
+ for_each_action_of_desc(desc, action) {
if ((action != new_action) && action->name &&
!strcmp(new_action->name, action->name)) {
ret = 0;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 32144175458d..5707f97a3e6a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
* desc->lock here. See synchronize_irq().
*/
raw_spin_lock_irqsave(&desc->lock, flags);
- action = desc->action;
- while (action) {
+ for_each_action_of_desc(desc, action) {
printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
if (action->thread_fn)
printk(KERN_CONT " threaded [<%p>] %pf",
action->thread_fn, action->thread_fn);
printk(KERN_CONT "\n");
- action = action->next;
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5c5987f10819..fafd1a3ef0da 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -38,6 +38,7 @@
* during the second link stage.
*/
extern const unsigned long kallsyms_addresses[] __weak;
+extern const int kallsyms_offsets[] __weak;
extern const u8 kallsyms_names[] __weak;
/*
@@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak;
extern const unsigned long kallsyms_num_syms
__attribute__((weak, section(".rodata")));
+extern const unsigned long kallsyms_relative_base
+__attribute__((weak, section(".rodata")));
+
extern const u8 kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
@@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos)
return name - kallsyms_names;
}
+static unsigned long kallsyms_sym_address(int idx)
+{
+ if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
+ return kallsyms_addresses[idx];
+
+ /* values are unsigned offsets if --absolute-percpu is not in effect */
+ if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))
+ return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+
+ /* ...otherwise, positive offsets are absolute values */
+ if (kallsyms_offsets[idx] >= 0)
+ return kallsyms_offsets[idx];
+
+ /* ...and negative offsets are relative to kallsyms_relative_base - 1 */
+ return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
+}
+
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
@@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name)
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
if (strcmp(namebuf, name) == 0)
- return kallsyms_addresses[i];
+ return kallsyms_sym_address(i);
}
return module_kallsyms_lookup_name(name);
}
@@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
+ ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
if (ret != 0)
return ret;
}
@@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
unsigned long i, low, high, mid;
/* This kernel should never had been booted. */
- BUG_ON(!kallsyms_addresses);
+ if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
+ BUG_ON(!kallsyms_addresses);
+ else
+ BUG_ON(!kallsyms_offsets);
/* Do a binary search on the sorted kallsyms_addresses array. */
low = 0;
@@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
while (high - low > 1) {
mid = low + (high - low) / 2;
- if (kallsyms_addresses[mid] <= addr)
+ if (kallsyms_sym_address(mid) <= addr)
low = mid;
else
high = mid;
@@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
* Search for the first aliased symbol. Aliased
* symbols are symbols with the same address.
*/
- while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+ while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low))
--low;
- symbol_start = kallsyms_addresses[low];
+ symbol_start = kallsyms_sym_address(low);
/* Search for next non-aliased symbol. */
for (i = low + 1; i < kallsyms_num_syms; i++) {
- if (kallsyms_addresses[i] > symbol_start) {
- symbol_end = kallsyms_addresses[i];
+ if (kallsyms_sym_address(i) > symbol_start) {
+ symbol_end = kallsyms_sym_address(i);
break;
}
}
@@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
unsigned off = iter->nameoff;
iter->module_name[0] = '\0';
- iter->value = kallsyms_addresses[iter->pos];
+ iter->value = kallsyms_sym_address(iter->pos);
iter->type = kallsyms_get_symbol_type(off);
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 0aa69ea1d8fd..3a47fa998fe0 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
&task2->signal->cred_guard_mutex);
if (ret)
goto err;
- if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
- !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+ if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
+ !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) {
ret = -EPERM;
goto err_unlock;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index d873b64fbddc..ee70aef5cd81 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -63,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
if (ret)
goto out_free_image;
- ret = sanity_check_segment_list(image);
- if (ret)
- goto out_free_image;
-
- /* Enable the special crash kernel control page allocation policy. */
if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
+
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 11b64a63c0f8..8d34308ea449 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -66,13 +66,15 @@ struct resource crashk_res = {
.name = "Crash kernel",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
};
struct resource crashk_low_res = {
.name = "Crash kernel",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
};
int kexec_should_crash(struct task_struct *p)
@@ -310,12 +312,9 @@ static void kimage_free_pages(struct page *page)
void kimage_free_page_list(struct list_head *list)
{
- struct list_head *pos, *next;
+ struct page *page, *next;
- list_for_each_safe(pos, next, list) {
- struct page *page;
-
- page = list_entry(pos, struct page, lru);
+ list_for_each_entry_safe(page, next, list, lru) {
list_del(&page->lru);
kimage_free_pages(page);
}
@@ -853,7 +852,12 @@ struct kimage *kexec_image;
struct kimage *kexec_crash_image;
int kexec_load_disabled;
-void crash_kexec(struct pt_regs *regs)
+/*
+ * No panic_cpu check version of crash_kexec(). This function is called
+ * only when panic_cpu holds the current CPU number; this is the only CPU
+ * which processes crash_kexec routines.
+ */
+void __crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
@@ -876,6 +880,29 @@ void crash_kexec(struct pt_regs *regs)
}
}
+void crash_kexec(struct pt_regs *regs)
+{
+ int old_cpu, this_cpu;
+
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ this_cpu = raw_smp_processor_id();
+ old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
+ if (old_cpu == PANIC_CPU_INVALID) {
+ /* This is the 1st CPU which comes here, so go ahead. */
+ __crash_kexec(regs);
+
+ /*
+ * Reset panic_cpu to allow another panic()/crash_kexec()
+ * call.
+ */
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+ }
+}
+
size_t crash_get_memory_size(void)
{
size_t size = 0;
@@ -934,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size)
ram_res->start = end;
ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
ram_res->name = "System RAM";
crashk_res.end = end - 1;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b70ada0028d2..c72d2ff5896e 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -18,6 +18,7 @@
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
+#include <linux/fs.h>
#include <crypto/hash.h>
#include <crypto/sha.h>
#include <linux/syscalls.h>
@@ -33,65 +34,6 @@ size_t __weak kexec_purgatory_size = 0;
static int kexec_calculate_store_digests(struct kimage *image);
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
- struct fd f = fdget(fd);
- int ret;
- struct kstat stat;
- loff_t pos;
- ssize_t bytes = 0;
-
- if (!f.file)
- return -EBADF;
-
- ret = vfs_getattr(&f.file->f_path, &stat);
- if (ret)
- goto out;
-
- if (stat.size > INT_MAX) {
- ret = -EFBIG;
- goto out;
- }
-
- /* Don't hand 0 to vmalloc, it whines. */
- if (stat.size == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- *buf = vmalloc(stat.size);
- if (!*buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < stat.size) {
- bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
- stat.size - pos);
- if (bytes < 0) {
- vfree(*buf);
- ret = bytes;
- goto out;
- }
-
- if (bytes == 0)
- break;
- pos += bytes;
- }
-
- if (pos != stat.size) {
- ret = -EBADF;
- vfree(*buf);
- goto out;
- }
-
- *buf_len = pos;
-out:
- fdput(f);
- return ret;
-}
-
/* Architectures can provide this probe function */
int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
unsigned long buf_len)
@@ -109,11 +51,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
return -EINVAL;
}
+#ifdef CONFIG_KEXEC_VERIFY_SIG
int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
unsigned long buf_len)
{
return -EKEYREJECTED;
}
+#endif
/* Apply relocations of type RELA */
int __weak
@@ -180,16 +124,17 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
{
int ret = 0;
void *ldata;
+ loff_t size;
- ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
- &image->kernel_buf_len);
+ ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf,
+ &size, INT_MAX, READING_KEXEC_IMAGE);
if (ret)
return ret;
+ image->kernel_buf_len = size;
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
image->kernel_buf_len);
-
if (ret)
goto out;
@@ -204,10 +149,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
#endif
/* It is possible that there no initramfs is being loaded */
if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
- ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
- &image->initrd_buf_len);
+ ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf,
+ &size, INT_MAX,
+ READING_KEXEC_INITRAMFS);
if (ret)
goto out;
+ image->initrd_buf_len = size;
}
if (cmdline_len) {
@@ -522,10 +469,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
/* Walk the RAM ranges and allocate a suitable range for the buffer */
if (image->type == KEXEC_TYPE_CRASH)
- ret = walk_iomem_res("Crash kernel",
- IORESOURCE_MEM | IORESOURCE_BUSY,
- crashk_res.start, crashk_res.end, kbuf,
- locate_mem_hole_callback);
+ ret = walk_iomem_res_desc(crashk_res.desc,
+ IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
else
ret = walk_system_ram_res(0, -1, kbuf,
locate_mem_hole_callback);
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index e4392a698ad4..0a52315d9c62 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image,
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
+struct kexec_sha_region {
+ unsigned long start;
+ unsigned long len;
+};
+
+/*
+ * Keeps track of buffer parameters as provided by caller for requesting
+ * memory placement of buffer.
+ */
+struct kexec_buf {
+ struct kimage *image;
+ char *buffer;
+ unsigned long bufsz;
+ unsigned long mem;
+ unsigned long memsz;
+ unsigned long buf_align;
+ unsigned long buf_min;
+ unsigned long buf_max;
+ bool top_down; /* allocate from top of memory hole */
+};
+
void kimage_file_post_load_cleanup(struct kimage *image);
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e83b26464061..152da4a48867 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -20,7 +20,7 @@
#include <linux/capability.h>
#include <linux/compiler.h>
-#include <linux/rcupdate.h> /* rcu_expedited */
+#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(fscaps);
+#ifndef CONFIG_TINY_RCU
int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", rcu_expedited);
+ return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj,
}
KERNEL_ATTR_RW(rcu_expedited);
+int rcu_normal;
+static ssize_t rcu_normal_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
+}
+static ssize_t rcu_normal_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (kstrtoint(buf, 0, &rcu_normal))
+ return -EINVAL;
+
+ return count;
+}
+KERNEL_ATTR_RW(rcu_normal);
+#endif /* #ifndef CONFIG_TINY_RCU */
+
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
@@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = {
&kexec_crash_size_attr.attr,
&vmcoreinfo_attr.attr,
#endif
+#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
+ &rcu_normal_attr.attr,
+#endif
NULL
};
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743a7e..b5c30d9f46c5 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -47,12 +47,12 @@
* of times)
*/
-#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
+#include <linux/latencytop.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
proc_create("latency_stats", 0644, NULL, &lstats_fops);
return 0;
}
+
+int sysctl_latencytop(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (latencytop_enabled)
+ force_schedstat_enabled();
+
+ return err;
+}
device_initcall(init_lstats_procfs);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index db545cbcdb89..d68fbf63b083 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include <linux/kallsyms.h>
#include <linux/livepatch.h>
+#include <asm/cacheflush.h>
/**
* struct klp_ops - structure for tracking registered ftrace ops structs
@@ -98,12 +99,12 @@ static void klp_find_object_module(struct klp_object *obj)
/*
* We do not want to block removal of patched modules and therefore
* we do not take a reference here. The patches are removed by
- * a going module handler instead.
+ * klp_module_going() instead.
*/
mod = find_module(obj->name);
/*
- * Do not mess work of the module coming and going notifiers.
- * Note that the patch might still be needed before the going handler
+ * Do not mess work of klp_module_coming() and klp_module_going().
+ * Note that the patch might still be needed before klp_module_going()
* is called. Module functions can be called even in the GOING state
* until mod->exit() finishes. This is especially important for
* patches that modify semantic of the functions.
@@ -135,13 +136,8 @@ struct klp_find_arg {
const char *objname;
const char *name;
unsigned long addr;
- /*
- * If count == 0, the symbol was not found. If count == 1, a unique
- * match was found and addr is set. If count > 1, there is
- * unresolvable ambiguity among "count" number of symbols with the same
- * name in the same object.
- */
unsigned long count;
+ unsigned long pos;
};
static int klp_find_callback(void *data, const char *name,
@@ -158,37 +154,48 @@ static int klp_find_callback(void *data, const char *name,
if (args->objname && strcmp(args->objname, mod->name))
return 0;
- /*
- * args->addr might be overwritten if another match is found
- * but klp_find_object_symbol() handles this and only returns the
- * addr if count == 1.
- */
args->addr = addr;
args->count++;
+ /*
+ * Finish the search when the symbol is found for the desired position
+ * or the position is not defined for a non-unique symbol.
+ */
+ if ((args->pos && (args->count == args->pos)) ||
+ (!args->pos && (args->count > 1)))
+ return 1;
+
return 0;
}
static int klp_find_object_symbol(const char *objname, const char *name,
- unsigned long *addr)
+ unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
.objname = objname,
.name = name,
.addr = 0,
- .count = 0
+ .count = 0,
+ .pos = sympos,
};
mutex_lock(&module_mutex);
kallsyms_on_each_symbol(klp_find_callback, &args);
mutex_unlock(&module_mutex);
- if (args.count == 0)
+ /*
+ * Ensure an address was found. If sympos is 0, ensure symbol is unique;
+ * otherwise ensure the symbol position count matches sympos.
+ */
+ if (args.addr == 0)
pr_err("symbol '%s' not found in symbol table\n", name);
- else if (args.count > 1)
- pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
- args.count, name, objname);
- else {
+ else if (args.count > 1 && sympos == 0) {
+ pr_err("unresolvable ambiguity for symbol '%s' in object '%s'\n",
+ name, objname);
+ } else if (sympos != args.count && sympos > 0) {
+ pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n",
+ sympos, name, objname ? objname : "vmlinux");
+ } else {
*addr = args.addr;
return 0;
}
@@ -197,66 +204,6 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-struct klp_verify_args {
- const char *name;
- const unsigned long addr;
-};
-
-static int klp_verify_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
-{
- struct klp_verify_args *args = data;
-
- if (!mod &&
- !strcmp(args->name, name) &&
- args->addr == addr)
- return 1;
-
- return 0;
-}
-
-static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
-{
- struct klp_verify_args args = {
- .name = name,
- .addr = addr,
- };
- int ret;
-
- mutex_lock(&module_mutex);
- ret = kallsyms_on_each_symbol(klp_verify_callback, &args);
- mutex_unlock(&module_mutex);
-
- if (!ret) {
- pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
- name, addr);
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int klp_find_verify_func_addr(struct klp_object *obj,
- struct klp_func *func)
-{
- int ret;
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* If KASLR has been enabled, adjust old_addr accordingly */
- if (kaslr_enabled() && func->old_addr)
- func->old_addr += kaslr_offset();
-#endif
-
- if (!func->old_addr || klp_is_module(obj))
- ret = klp_find_object_symbol(obj->name, func->old_name,
- &func->old_addr);
- else
- ret = klp_verify_vmlinux_symbol(func->old_name,
- func->old_addr);
-
- return ret;
-}
-
/*
* external symbols are located outside the parent object (where the parent
* object is either vmlinux or the kmod being patched).
@@ -276,14 +223,18 @@ static int klp_find_external_symbol(struct module *pmod, const char *name,
}
preempt_enable();
- /* otherwise check if it's in another .o within the patch module */
- return klp_find_object_symbol(pmod->name, name, addr);
+ /*
+ * Check if it's in another .o within the patch module. This also
+ * checks that the external symbol is unique.
+ */
+ return klp_find_object_symbol(pmod->name, name, 0, addr);
}
static int klp_write_object_relocations(struct module *pmod,
struct klp_object *obj)
{
- int ret;
+ int ret = 0;
+ unsigned long val;
struct klp_reloc *reloc;
if (WARN_ON(!klp_is_object_loaded(obj)))
@@ -292,41 +243,38 @@ static int klp_write_object_relocations(struct module *pmod,
if (WARN_ON(!obj->relocs))
return -EINVAL;
+ module_disable_ro(pmod);
+
for (reloc = obj->relocs; reloc->name; reloc++) {
- if (!klp_is_module(obj)) {
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* If KASLR has been enabled, adjust old value accordingly */
- if (kaslr_enabled())
- reloc->val += kaslr_offset();
-#endif
- ret = klp_verify_vmlinux_symbol(reloc->name,
- reloc->val);
- if (ret)
- return ret;
- } else {
- /* module, reloc->val needs to be discovered */
- if (reloc->external)
- ret = klp_find_external_symbol(pmod,
- reloc->name,
- &reloc->val);
- else
- ret = klp_find_object_symbol(obj->mod->name,
- reloc->name,
- &reloc->val);
- if (ret)
- return ret;
- }
+ /* discover the address of the referenced symbol */
+ if (reloc->external) {
+ if (reloc->sympos > 0) {
+ pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n",
+ reloc->name);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = klp_find_external_symbol(pmod, reloc->name, &val);
+ } else
+ ret = klp_find_object_symbol(obj->name,
+ reloc->name,
+ reloc->sympos,
+ &val);
+ if (ret)
+ goto out;
+
ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
- reloc->val + reloc->addend);
+ val + reloc->addend);
if (ret) {
pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
- reloc->name, reloc->val, ret);
- return ret;
+ reloc->name, val, ret);
+ goto out;
}
}
- return 0;
+out:
+ module_enable_ro(pmod);
+ return ret;
}
static void notrace klp_ftrace_handler(unsigned long ip,
@@ -593,7 +541,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/<object>
- * /sys/kernel/livepatch/<patch>/<object>/<func>
+ * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -738,8 +686,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
INIT_LIST_HEAD(&func->stack_node);
func->state = KLP_DISABLED;
+ /* The format for the sysfs directory is <function,sympos> where sympos
+ * is the nth occurrence of this symbol in kallsyms for the patched
+ * object. If the user selects 0 for old_sympos, then 1 will be used
+ * since a unique symbol will be the first occurrence.
+ */
return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- &obj->kobj, "%s", func->old_name);
+ &obj->kobj, "%s,%lu", func->old_name,
+ func->old_sympos ? func->old_sympos : 1);
}
/* parts of the initialization that is done only when the object is loaded */
@@ -756,7 +710,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
}
klp_for_each_func(obj, func) {
- ret = klp_find_verify_func_addr(obj, func);
+ ret = klp_find_object_symbol(obj->name, func->old_name,
+ func->old_sympos,
+ &func->old_addr);
if (ret)
return ret;
}
@@ -910,103 +866,108 @@ int klp_register_patch(struct klp_patch *patch)
}
EXPORT_SYMBOL_GPL(klp_register_patch);
-static int klp_module_notify_coming(struct klp_patch *patch,
- struct klp_object *obj)
+int klp_module_coming(struct module *mod)
{
- struct module *pmod = patch->mod;
- struct module *mod = obj->mod;
int ret;
+ struct klp_patch *patch;
+ struct klp_object *obj;
- ret = klp_init_object_loaded(patch, obj);
- if (ret) {
- pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n",
- pmod->name, mod->name, ret);
- return ret;
- }
+ if (WARN_ON(mod->state != MODULE_STATE_COMING))
+ return -EINVAL;
- if (patch->state == KLP_DISABLED)
- return 0;
+ mutex_lock(&klp_mutex);
+ /*
+ * Each module has to know that klp_module_coming()
+ * has been called. We never know what module will
+ * get patched by a new patch.
+ */
+ mod->klp_alive = true;
- pr_notice("applying patch '%s' to loading module '%s'\n",
- pmod->name, mod->name);
+ list_for_each_entry(patch, &klp_patches, list) {
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+ continue;
- ret = klp_enable_object(obj);
- if (ret)
- pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
- pmod->name, mod->name, ret);
- return ret;
-}
+ obj->mod = mod;
-static void klp_module_notify_going(struct klp_patch *patch,
- struct klp_object *obj)
-{
- struct module *pmod = patch->mod;
- struct module *mod = obj->mod;
+ ret = klp_init_object_loaded(patch, obj);
+ if (ret) {
+ pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n",
+ patch->mod->name, obj->mod->name, ret);
+ goto err;
+ }
+
+ if (patch->state == KLP_DISABLED)
+ break;
+
+ pr_notice("applying patch '%s' to loading module '%s'\n",
+ patch->mod->name, obj->mod->name);
- if (patch->state == KLP_DISABLED)
- goto disabled;
+ ret = klp_enable_object(obj);
+ if (ret) {
+ pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+ patch->mod->name, obj->mod->name, ret);
+ goto err;
+ }
- pr_notice("reverting patch '%s' on unloading module '%s'\n",
- pmod->name, mod->name);
+ break;
+ }
+ }
- klp_disable_object(obj);
+ mutex_unlock(&klp_mutex);
-disabled:
+ return 0;
+
+err:
+ /*
+ * If a patch is unsuccessfully applied, return
+ * error to the module loader.
+ */
+ pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
+ patch->mod->name, obj->mod->name, obj->mod->name);
+ mod->klp_alive = false;
klp_free_object_loaded(obj);
+ mutex_unlock(&klp_mutex);
+
+ return ret;
}
-static int klp_module_notify(struct notifier_block *nb, unsigned long action,
- void *data)
+void klp_module_going(struct module *mod)
{
- int ret;
- struct module *mod = data;
struct klp_patch *patch;
struct klp_object *obj;
- if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
- return 0;
+ if (WARN_ON(mod->state != MODULE_STATE_GOING &&
+ mod->state != MODULE_STATE_COMING))
+ return;
mutex_lock(&klp_mutex);
-
/*
- * Each module has to know that the notifier has been called.
- * We never know what module will get patched by a new patch.
+ * Each module has to know that klp_module_going()
+ * has been called. We never know what module will
+ * get patched by a new patch.
*/
- if (action == MODULE_STATE_COMING)
- mod->klp_alive = true;
- else /* MODULE_STATE_GOING */
- mod->klp_alive = false;
+ mod->klp_alive = false;
list_for_each_entry(patch, &klp_patches, list) {
klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
- if (action == MODULE_STATE_COMING) {
- obj->mod = mod;
- ret = klp_module_notify_coming(patch, obj);
- if (ret) {
- obj->mod = NULL;
- pr_warn("patch '%s' is in an inconsistent state!\n",
- patch->mod->name);
- }
- } else /* MODULE_STATE_GOING */
- klp_module_notify_going(patch, obj);
+ if (patch->state != KLP_DISABLED) {
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+ klp_disable_object(obj);
+ }
+ klp_free_object_loaded(obj);
break;
}
}
mutex_unlock(&klp_mutex);
-
- return 0;
}
-static struct notifier_block klp_module_nb = {
- .notifier_call = klp_module_notify,
- .priority = INT_MIN+1, /* called late but before ftrace notifier */
-};
-
static int __init klp_init(void)
{
int ret;
@@ -1017,21 +978,11 @@ static int __init klp_init(void)
return -EINVAL;
}
- ret = register_module_notifier(&klp_module_nb);
- if (ret)
- return ret;
-
klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
- if (!klp_root_kobj) {
- ret = -ENOMEM;
- goto unregister;
- }
+ if (!klp_root_kobj)
+ return -ENOMEM;
return 0;
-
-unregister:
- unregister_module_notifier(&klp_module_nb);
- return ret;
}
module_init(klp_init);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 60ace56618f6..53ab2f85d77e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void)
return ret;
}
-static int lockdep_initialized;
-
unsigned long nr_list_entries;
static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
@@ -150,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
}
#ifdef CONFIG_LOCK_STAT
-static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
- cpu_lock_stats);
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats);
static inline u64 lockstat_clock(void)
{
@@ -292,7 +289,7 @@ LIST_HEAD(all_lock_classes);
#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
#define classhashentry(key) (classhash_table + __classhashfn((key)))
-static struct list_head classhash_table[CLASSHASH_SIZE];
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
/*
* We put the lock dependency chains into a hash-table as well, to cache
@@ -303,7 +300,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
-static struct list_head chainhash_table[CHAINHASH_SIZE];
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
/*
* The hash key of the lock dependency chains is a hash itself too:
@@ -434,19 +431,6 @@ unsigned int max_lockdep_depth;
#ifdef CONFIG_DEBUG_LOCKDEP
/*
- * We cannot printk in early bootup code. Not even early_printk()
- * might work. So we mark any initialization errors and printk
- * about it later on, in lockdep_info().
- */
-static int lockdep_init_error;
-static const char *lock_init_error;
-static unsigned long lockdep_init_trace_data[20];
-static struct stack_trace lockdep_init_trace = {
- .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
- .entries = lockdep_init_trace_data,
-};
-
-/*
* Various lockdep statistics:
*/
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
@@ -666,23 +650,9 @@ static inline struct lock_class *
look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * If the architecture calls into lockdep before initializing
- * the hashes then we'll warn about it later. (we cannot printk
- * right now)
- */
- if (unlikely(!lockdep_initialized)) {
- lockdep_init();
- lockdep_init_error = 1;
- lock_init_error = lock->name;
- save_stack_trace(&lockdep_init_trace);
- }
-#endif
-
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
debug_locks_off();
printk(KERN_ERR
@@ -719,7 +689,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
@@ -742,7 +712,7 @@ static inline struct lock_class *
register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -774,7 +744,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We have to do the hash-walk again, to avoid races
* with another CPU:
*/
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key)
goto out_unlock_set;
}
@@ -805,7 +775,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
*/
- list_add_tail_rcu(&class->hash_entry, hash_head);
+ hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
* Add it to the global list of classes:
*/
@@ -1822,7 +1792,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int trylock_loop)
+ struct held_lock *next, int distance, int *stack_saved)
{
struct lock_list *entry;
int ret;
@@ -1883,8 +1853,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
- if (!trylock_loop && !save_trace(&trace))
- return 0;
+ if (!*stack_saved) {
+ if (!save_trace(&trace))
+ return 0;
+ *stack_saved = 1;
+ }
/*
* Ok, all validations passed, add the new lock
@@ -1907,6 +1880,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Debugging printouts:
*/
if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+ /* We drop graph lock, so another thread can overwrite trace. */
+ *stack_saved = 0;
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
@@ -1929,7 +1904,7 @@ static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
- int trylock_loop = 0;
+ int stack_saved = 0;
struct held_lock *hlock;
/*
@@ -1956,7 +1931,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
*/
if (hlock->read != 2 && hlock->check) {
if (!check_prev_add(curr, hlock, next,
- distance, trylock_loop))
+ distance, &stack_saved))
return 0;
/*
* Stop after the first non-trylock entry,
@@ -1979,7 +1954,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
if (curr->held_locks[depth].irq_context !=
curr->held_locks[depth-1].irq_context)
break;
- trylock_loop = 1;
}
return 1;
out_bug:
@@ -2007,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
}
/*
+ * Returns the index of the first held_lock of the current chain
+ */
+static inline int get_first_held_lock(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ int i;
+ struct held_lock *hlock_curr;
+
+ for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+ hlock_curr = curr->held_locks + i;
+ if (hlock_curr->irq_context != hlock->irq_context)
+ break;
+
+ }
+
+ return ++i;
+}
+
+/*
+ * Checks whether the chain and the current held locks are consistent
+ * in depth and also in content. If they are not it most likely means
+ * that there was a collision during the calculation of the chain_key.
+ * Returns: 0 not passed, 1 passed
+ */
+static int check_no_collision(struct task_struct *curr,
+ struct held_lock *hlock,
+ struct lock_chain *chain)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ int i, j, id;
+
+ i = get_first_held_lock(curr, hlock);
+
+ if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
+ return 0;
+
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ id = curr->held_locks[i].class_idx - 1;
+
+ if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+/*
* Look up a dependency chain. If the key is not present yet then
* add it and return 1 - in this case the new dependency chain is
* validated. If the key is already hashed, return 0.
@@ -2017,9 +2038,8 @@ static inline int lookup_chain_cache(struct task_struct *curr,
u64 chain_key)
{
struct lock_class *class = hlock_class(hlock);
- struct list_head *hash_head = chainhashentry(chain_key);
+ struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- struct held_lock *hlock_curr;
int i, j;
/*
@@ -2033,10 +2053,13 @@ static inline int lookup_chain_cache(struct task_struct *curr,
* We can walk it lock-free, because entries only get added
* to the hash:
*/
- list_for_each_entry_rcu(chain, hash_head, entry) {
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(chain_lookup_hits);
+ if (!check_no_collision(curr, hlock, chain))
+ return 0;
+
if (very_verbose(class))
printk("\nhash chain already cached, key: "
"%016Lx tail class: [%p] %s\n",
@@ -2057,7 +2080,7 @@ cache_hit:
/*
* We have to walk the chain again locked - to avoid duplicates:
*/
- list_for_each_entry(chain, hash_head, entry) {
+ hlist_for_each_entry(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
graph_unlock();
goto cache_hit;
@@ -2074,13 +2097,7 @@ cache_hit:
chain = lock_chains + nr_lock_chains++;
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
- /* Find the first held_lock of current chain */
- for (i = curr->lockdep_depth - 1; i >= 0; i--) {
- hlock_curr = curr->held_locks + i;
- if (hlock_curr->irq_context != hlock->irq_context)
- break;
- }
- i++;
+ i = get_first_held_lock(curr, hlock);
chain->depth = curr->lockdep_depth + 1 - i;
if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = nr_chain_hlocks;
@@ -2091,7 +2108,7 @@ cache_hit:
}
chain_hlocks[chain->base + j] = class - lock_classes;
}
- list_add_tail_rcu(&chain->entry, hash_head);
+ hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains();
@@ -2168,7 +2185,7 @@ static void check_chain_key(struct task_struct *curr)
{
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
- unsigned int i, id;
+ unsigned int i;
u64 chain_key = 0;
for (i = 0; i < curr->lockdep_depth; i++) {
@@ -2185,17 +2202,16 @@ static void check_chain_key(struct task_struct *curr)
(unsigned long long)hlock->prev_chain_key);
return;
}
- id = hlock->class_idx - 1;
/*
* Whoops ran out of static storage again?
*/
- if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
return;
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
chain_key = 0;
- chain_key = iterate_chain_key(chain_key, id);
+ chain_key = iterate_chain_key(chain_key, hlock->class_idx);
prev_hlock = hlock;
}
if (chain_key != curr->curr_chain_key) {
@@ -3073,7 +3089,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
struct task_struct *curr = current;
struct lock_class *class = NULL;
struct held_lock *hlock;
- unsigned int depth, id;
+ unsigned int depth;
int chain_head = 0;
int class_idx;
u64 chain_key;
@@ -3176,11 +3192,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* The 'key ID' is what is the most compact key value to drive
* the hash, not class->key.
*/
- id = class - lock_classes;
/*
* Whoops, we did it again.. ran straight out of our static allocation.
*/
- if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
return 0;
chain_key = curr->curr_chain_key;
@@ -3198,7 +3213,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = 0;
chain_head = 1;
}
- chain_key = iterate_chain_key(chain_key, id);
+ chain_key = iterate_chain_key(chain_key, class_idx);
if (nest_lock && !__lock_is_held(nest_lock))
return print_lock_nested_lock_not_held(curr, hlock, ip);
@@ -3875,7 +3890,7 @@ void lockdep_reset(void)
nr_process_chains = 0;
debug_locks = 1;
for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+ INIT_HLIST_HEAD(chainhash_table + i);
raw_local_irq_restore(flags);
}
@@ -3894,7 +3909,7 @@ static void zap_class(struct lock_class *class)
/*
* Unhash the class and remove it from the all_lock_classes list:
*/
- list_del_rcu(&class->hash_entry);
+ hlist_del_rcu(&class->hash_entry);
list_del_rcu(&class->lock_entry);
RCU_INIT_POINTER(class->key, NULL);
@@ -3917,7 +3932,7 @@ static inline int within(const void *addr, void *start, unsigned long size)
void lockdep_free_key_range(void *start, unsigned long size)
{
struct lock_class *class;
- struct list_head *head;
+ struct hlist_head *head;
unsigned long flags;
int i;
int locked;
@@ -3930,9 +3945,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
*/
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
if (within(class->key, start, size))
zap_class(class);
else if (within(class->name, start, size))
@@ -3962,7 +3975,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
void lockdep_reset_lock(struct lockdep_map *lock)
{
struct lock_class *class;
- struct list_head *head;
+ struct hlist_head *head;
unsigned long flags;
int i, j;
int locked;
@@ -3987,9 +4000,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
locked = graph_lock();
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
int match = 0;
for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
@@ -4013,28 +4024,6 @@ out_restore:
raw_local_irq_restore(flags);
}
-void lockdep_init(void)
-{
- int i;
-
- /*
- * Some architectures have their own start_kernel()
- * code which calls lockdep_init(), while we also
- * call lockdep_init() from the start_kernel() itself,
- * and we want to initialize the hashes only once:
- */
- if (lockdep_initialized)
- return;
-
- for (i = 0; i < CLASSHASH_SIZE; i++)
- INIT_LIST_HEAD(classhash_table + i);
-
- for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
-
- lockdep_initialized = 1;
-}
-
void __init lockdep_info(void)
{
printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
@@ -4061,14 +4050,6 @@ void __init lockdep_info(void)
printk(" per task-struct memory footprint: %lu bytes\n",
sizeof(struct held_lock) * MAX_LOCK_DEPTH);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- if (lockdep_init_error) {
- printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
- printk("Call stack leading to lockdep invocation was:\n");
- print_stack_trace(&lockdep_init_trace, 0);
- }
-#endif
}
static void
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 5b9102a47ea5..c835270f0c2f 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;
- prev = xchg_acquire(lock, node);
+ /*
+ * We rely on the full barrier with global transitivity implied by the
+ * below xchg() to order the initialization stores above against any
+ * observation of @node. And to provide the ACQUIRE ordering associated
+ * with a LOCK primitive.
+ */
+ prev = xchg(lock, node);
if (likely(prev == NULL)) {
/*
* Lock acquired, don't need to set node->locked to 1. Threads
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 0551c219c40e..e364b424b019 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -716,6 +716,7 @@ static inline void
__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
{
unsigned long flags;
+ WAKE_Q(wake_q);
/*
* As a performance measurement, release the lock before doing other
@@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
struct mutex_waiter, list);
debug_mutex_wake_waiter(lock, waiter);
-
- wake_up_process(waiter->task);
+ wake_q_add(&wake_q, waiter->task);
}
spin_unlock_mutex(&lock->wait_lock, flags);
+ wake_up_q(&wake_q);
}
/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..ce2f75e32ae1 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
* Peter Zijlstra <peterz@infradead.org>
*/
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
struct __qspinlock *l = (void *)lock;
- return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+ /*
+ * Use release semantics to make sure that the MCS node is properly
+ * initialized before changing the tail code.
+ */
+ return (u32)xchg_release(&l->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Use release semantics to make sure that the MCS node is
+ * properly initialized before changing the tail code.
+ */
+ old = atomic_cmpxchg_release(&lock->val, val, new);
if (old == val)
break;
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+ struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
- struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+ { return 0; }
#define pv_enabled() false
#define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node
#define pv_kick_node __pv_kick_node
-#define pv_wait_head __pv_wait_head
+#define pv_wait_head_or_lock __pv_wait_head_or_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (val == new)
new |= _Q_PENDING_VAL;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Acquire semantic is required here as the function may
+ * return immediately if the lock was free.
+ */
+ old = atomic_cmpxchg_acquire(&lock->val, val, new);
if (old == val)
break;
@@ -342,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* sequentiality; this is because not all clear_pending_set_locked()
* implementations imply full barriers.
*/
- while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
- cpu_relax();
+ smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
/*
* take ownership and clear the pending bit.
@@ -382,6 +397,7 @@ queue:
* p,*,* -> n,*,*
*/
old = xchg_tail(lock, tail);
+ next = NULL;
/*
* if there was a previous node; link it and wait until reaching the
@@ -391,8 +407,18 @@ queue:
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);
- pv_wait_node(node);
+ pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
}
/*
@@ -406,11 +432,22 @@ queue:
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
+ * The PV pv_wait_head_or_lock function, if active, will acquire
+ * the lock and return a non-zero value. So we have to skip the
+ * smp_cond_acquire() call. As the next PV queue head hasn't been
+ * designated yet, there is no way for the locked value to become
+ * _Q_SLOW_VAL. So both the set_locked() and the
+ * atomic_cmpxchg_relaxed() calls will be safe.
+ *
+ * If PV isn't active, 0 will be returned instead.
+ *
*/
- pv_wait_head(lock, node);
- while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
- cpu_relax();
+ if ((val = pv_wait_head_or_lock(lock, node)))
+ goto locked;
+
+ smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+locked:
/*
* claim the lock:
*
@@ -422,11 +459,17 @@ queue:
* to grab the lock.
*/
for (;;) {
- if (val != tail) {
+ /* In the PV case we might already have _Q_LOCKED_VAL set */
+ if ((val & _Q_TAIL_MASK) != tail) {
set_locked(lock);
break;
}
- old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ /*
+ * The smp_cond_acquire() call above has provided the necessary
+ * acquire semantics required for locking. At most two
+ * iterations of this loop may be ran.
+ */
+ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
goto release; /* No contention */
@@ -434,10 +477,12 @@ queue:
}
/*
- * contended path; wait for next, release.
+ * contended path; wait for next if not observed yet, release.
*/
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
+ if (!next) {
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+ }
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
@@ -462,7 +507,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff4829b..21ede57f68b3 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK 0xff
+
+/*
* Queue node uses: vcpu_running & vcpu_halted.
* Queue head uses: vcpu_running & vcpu_hashed.
*/
@@ -41,6 +55,96 @@ struct pv_node {
};
/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enter the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+
+ qstat_inc(qstat_pv_lock_stealing, ret);
+ return ret;
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !READ_ONCE(l->locked) &&
+ (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+ == _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ int val = atomic_read(&lock->val);
+
+ for (;;) {
+ int old, new;
+
+ if (val & _Q_LOCKED_MASK)
+ break;
+
+ /*
+ * Try to clear pending bit & set locked bit
+ */
+ old = val;
+ new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ val = atomic_cmpxchg(&lock->val, old, new);
+
+ if (val == old)
+ return 1;
+ }
+ return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
* Lock and MCS node addresses hash table for fast lookup
*
* Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +204,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
+ int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
+ qstat_hop(hopcnt);
return &he->lock;
}
}
@@ -144,6 +251,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
}
/*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+ if ((loop & PV_PREV_CHECK_MASK) != 0)
+ return false;
+
+ return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
* Initialize the PV part of the mcs_spinlock node.
*/
static void pv_init_node(struct mcs_spinlock *node)
@@ -161,15 +282,23 @@ static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct pv_node *pp = (struct pv_node *)prev;
+ int waitcnt = 0;
int loop;
+ bool wait_early;
- for (;;) {
- for (loop = SPIN_THRESHOLD; loop; loop--) {
+ /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+ for (;; waitcnt++) {
+ for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
+ if (pv_wait_early(pp, loop)) {
+ wait_early = true;
+ break;
+ }
cpu_relax();
}
@@ -184,12 +313,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
*/
smp_store_mb(pn->state, vcpu_halted);
- if (!READ_ONCE(node->locked))
+ if (!READ_ONCE(node->locked)) {
+ qstat_inc(qstat_pv_wait_node, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
+ qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
+ }
/*
- * If pv_kick_node() changed us to vcpu_hashed, retain that value
- * so that pv_wait_head() knows to not also try to hash this lock.
+ * If pv_kick_node() changed us to vcpu_hashed, retain that
+ * value so that pv_wait_head_or_lock() knows to not also try
+ * to hash this lock.
*/
cmpxchg(&pn->state, vcpu_halted, vcpu_running);
@@ -200,6 +334,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
}
/*
@@ -212,8 +347,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
/*
* Called after setting next->locked = 1 when we're the lock owner.
*
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
*/
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
@@ -242,14 +378,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
}
/*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
* __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
*/
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
+ int waitcnt = 0;
int loop;
/*
@@ -259,12 +400,30 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1;
- for (;;) {
+ /*
+ * Tracking # of slowpath locking operations
+ */
+ qstat_inc(qstat_pv_lock_slowpath, true);
+
+ for (;; waitcnt++) {
+ /*
+ * Set correct vCPU state to be used by queue node wait-early
+ * mechanism.
+ */
+ WRITE_ONCE(pn->state, vcpu_running);
+
+ /*
+ * Set the pending bit in the active lock spinning loop to
+ * disable lock stealing before attempting to acquire the lock.
+ */
+ set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) {
- if (!READ_ONCE(l->locked))
- return;
+ if (trylock_clear_pending(lock))
+ goto gotlock;
cpu_relax();
}
+ clear_pending(lock);
+
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
@@ -280,51 +439,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
/*
- * The lock is free and _Q_SLOW_VAL has never
- * been set. Therefore we need to unhash before
- * getting the lock.
+ * The lock was free and now we own the lock.
+ * Change the lock value back to _Q_LOCKED_VAL
+ * and unhash the table.
*/
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
- return;
+ goto gotlock;
}
}
+ WRITE_ONCE(pn->state, vcpu_halted);
+ qstat_inc(qstat_pv_wait_head, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
/*
* The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious
- * wakeup and so the vCPU should wait again after spinning for
- * a while.
+ * wakeup or another vCPU has stolen the lock. The current
+ * vCPU should spin again.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
- * Lock is unlocked now; the caller will acquire it without waiting.
- * As with pv_wait_node() we rely on the caller to do a load-acquire
- * for us.
+ * The cmpxchg() or xchg() call before coming here provides the
+ * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+ * here is to indicate to the compiler that the value will always
+ * be nozero to enable better code optimization.
*/
+gotlock:
+ return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}
/*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
*/
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 locked;
-
- /*
- * We must not unlock if SLOW, because in that case we must first
- * unhash. Otherwise it would be possible to have multiple @lock
- * entries, which would be BAD.
- */
- locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
- return;
if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent,
@@ -338,7 +496,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* so we need a barrier to order the read of the node data in
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
*
- * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
*/
smp_rmb();
@@ -361,14 +519,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
+ qstat_inc(qstat_pv_kick_unlock, true);
pv_kick(node->cpu);
}
+
/*
* Include the architecture specific callee-save thunk of the
* __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
*/
#include <asm/qspinlock_paravirt.h>
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ u8 locked;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
+ return;
+
+ __pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 000000000000..eb2a2c9bc3fc
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,290 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ * pv_hash_hops - average # of hops per hashing operation
+ * pv_kick_unlock - # of vCPU kicks issued at unlock time
+ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
+ * pv_latency_kick - average latency (ns) of vCPU kick operation
+ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
+ * pv_lock_slowpath - # of locking operations via the slowpath
+ * pv_lock_stealing - # of lock stealing operations
+ * pv_spurious_wakeup - # of spurious wakeups
+ * pv_wait_again - # of vCPU wait's that happened after a vCPU kick
+ * pv_wait_early - # of early vCPU wait's
+ * pv_wait_head - # of vCPU wait's at the queue head
+ * pv_wait_node - # of vCPU wait's at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+enum qlock_stats {
+ qstat_pv_hash_hops,
+ qstat_pv_kick_unlock,
+ qstat_pv_kick_wake,
+ qstat_pv_latency_kick,
+ qstat_pv_latency_wake,
+ qstat_pv_lock_slowpath,
+ qstat_pv_lock_stealing,
+ qstat_pv_spurious_wakeup,
+ qstat_pv_wait_again,
+ qstat_pv_wait_early,
+ qstat_pv_wait_head,
+ qstat_pv_wait_node,
+ qstat_num, /* Total number of statistical counters */
+ qstat_reset_cnts = qstat_num,
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+
+static const char * const qstat_names[qstat_num + 1] = {
+ [qstat_pv_hash_hops] = "pv_hash_hops",
+ [qstat_pv_kick_unlock] = "pv_kick_unlock",
+ [qstat_pv_kick_wake] = "pv_kick_wake",
+ [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+ [qstat_pv_latency_kick] = "pv_latency_kick",
+ [qstat_pv_latency_wake] = "pv_latency_wake",
+ [qstat_pv_lock_slowpath] = "pv_lock_slowpath",
+ [qstat_pv_lock_stealing] = "pv_lock_stealing",
+ [qstat_pv_wait_again] = "pv_wait_again",
+ [qstat_pv_wait_early] = "pv_wait_early",
+ [qstat_pv_wait_head] = "pv_wait_head",
+ [qstat_pv_wait_node] = "pv_wait_node",
+ [qstat_reset_cnts] = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ * Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, counter, len;
+ u64 stat = 0, kicks = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if (!file->f_inode) {
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ counter = (long)(file->f_inode->i_private);
+
+ if (counter >= qstat_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu) {
+ stat += per_cpu(qstats[counter], cpu);
+ /*
+ * Need to sum additional counter for some of them
+ */
+ switch (counter) {
+
+ case qstat_pv_latency_kick:
+ case qstat_pv_hash_hops:
+ kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+ break;
+
+ case qstat_pv_latency_wake:
+ kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+ break;
+ }
+ }
+
+ if (counter == qstat_pv_hash_hops) {
+ u64 frac;
+
+ frac = 100ULL * do_div(stat, kicks);
+ frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+
+ /*
+ * Return a X.XX decimal number
+ */
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+ } else {
+ /*
+ * Round to the nearest ns
+ */
+ if ((counter == qstat_pv_latency_kick) ||
+ (counter == qstat_pv_latency_wake)) {
+ stat = 0;
+ if (kicks)
+ stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+ }
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+ }
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * When counter = reset_cnts, reset all the counter values.
+ * Since the counter updates aren't atomic, the resetting is done twice
+ * to make sure that the counters are very likely to be all cleared.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int cpu;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if (!file->f_inode) {
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
+ return count;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ }
+ return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_qstat = {
+ .read = qstat_read,
+ .write = qstat_write,
+ .llseek = default_llseek,
+};
+
+/*
+ * Initialize debugfs for the qspinlock statistical counters
+ */
+static int __init init_qspinlock_stat(void)
+{
+ struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
+ int i;
+
+ if (!d_qstat) {
+ pr_warn("Could not create 'qlockstat' debugfs directory\n");
+ return 0;
+ }
+
+ /*
+ * Create the debugfs files
+ *
+ * As reading from and writing to the stat files can be slow, only
+ * root is allowed to do the read/write to limit impact to system
+ * performance.
+ */
+ for (i = 0; i < qstat_num; i++)
+ debugfs_create_file(qstat_names[i], 0400, d_qstat,
+ (void *)(long)i, &fops_qstat);
+
+ debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+ (void *)(long)qstat_reset_cnts, &fops_qstat);
+ return 0;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistical counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+ if (cond)
+ this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void qstat_hop(int hopcnt)
+{
+ this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+ u64 start = sched_clock();
+
+ per_cpu(pv_kick_time, cpu) = start;
+ pv_kick(cpu);
+ this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+ u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+ *pkick_time = 0;
+ pv_wait(ptr, val);
+ if (*pkick_time) {
+ this_cpu_add(qstats[qstat_pv_latency_wake],
+ sched_clock() - *pkick_time);
+ qstat_inc(qstat_pv_kick_wake, true);
+ }
+}
+
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v) __pv_wait(p, v)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt) { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8251e75dd9c0..3e746607abe5 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
clear_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/*
* If a new waiter comes in between the unlock and the cmpxchg
* we have two situations:
@@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true;
}
#endif
@@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int ret = 0, depth = 0;
struct rt_mutex *lock;
bool detect_deadlock;
- unsigned long flags;
bool requeue = true;
detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* [1] Task cannot go away as we did a get_task() before !
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock_irq(&task->pi_lock);
/*
* [2] Get the waiter on which @task is blocked on.
@@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* operations.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
cpu_relax();
goto retry;
}
@@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* No requeue[7] here. Just release @task [8]
*/
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* If there is no owner of the lock, end of chain.
*/
if (!rt_mutex_owner(lock)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/*
* No requeue [11] here. We just do deadlock detection.
@@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/* If owner is not blocked, end of chain. */
if (!next_lock)
@@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop the locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/*
* Make the actual exit decisions [12], based on the stored
@@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto again;
out_unlock_pi:
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
out_put_task:
put_task_struct(task);
@@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* Try to take an rt-mutex
*
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
@@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
- unsigned long flags;
-
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* case, but conditionals are more expensive than a redundant
* store.
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
task->pi_blocked_on = NULL;
/*
* Finish the lock acquisition. @task is the new owner. If
@@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
takeit:
/* We got the lock. */
@@ -883,7 +882,7 @@ takeit:
*
* Prepare waiter and propagate pi chain
*
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *top_waiter = waiter;
struct rt_mutex *next_lock;
int chain_walk = 0, res;
- unsigned long flags;
/*
* Early deadlock detection. We really don't want the task to
@@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (owner == task)
return -EDEADLK;
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
@@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
task->pi_blocked_on = waiter;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
if (!owner)
return 0;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
@@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Even if full deadlock detection is on, if the owner is not
* blocked itself, we can avoid finding this out in the chain
@@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
return res;
}
@@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* Remove the top waiter from the current tasks pi waiter tree and
* queue it up.
*
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
wake_q_add(wake_q, waiter->task);
}
@@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. I must
* have just failed to try_to_take_rt_mutex().
*/
static void remove_waiter(struct rt_mutex *lock,
@@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock,
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
/*
* Only update priority if the waiter was the highest priority
@@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (!owner || !is_top_waiter)
return;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
rt_mutex_dequeue_pi(owner, waiter);
@@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Don't walk the chain, if the owner task is not blocked
@@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock,
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
next_lock, NULL, current);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/*
@@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task)
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
*
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
@@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
schedule();
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
}
@@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
+ unsigned long flags;
int ret = 0;
debug_rt_mutex_init_waiter(&waiter);
RB_CLEAR_NODE(&waiter.pi_tree_entry);
RB_CLEAR_NODE(&waiter.tree_entry);
- raw_spin_lock(&lock->wait_lock);
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
@@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* Remove pending timer: */
if (unlikely(timeout))
@@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
+ unsigned long flags;
int ret;
/*
@@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
return 0;
/*
- * The mutex has currently no owner. Lock the wait lock and
- * try to acquire the lock.
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
*/
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = try_to_take_rt_mutex(lock, current, NULL);
@@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
@@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
{
- raw_spin_lock(&lock->wait_lock);
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
debug_rt_mutex_unlock(lock);
@@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock) == true)
+ if (unlock_rt_mutex_safe(lock, flags) == true)
return false;
/* Relock the rtmutex and try again */
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
/*
@@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
mark_wakeup_next_waiter(wake_q, lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* check PI boosting */
return true;
@@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 1;
}
@@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(TASK_INTERRUPTIBLE);
@@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return ret;
}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 7658d32c5c78..584febd13e2e 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -10,8 +10,11 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
+#include <linux/radix-tree.h>
+#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
+#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
@@ -26,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
static void *try_ram_remap(resource_size_t offset, size_t size)
{
- struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
+ unsigned long pfn = PHYS_PFN(offset);
/* In the simple case just return the existing linear address */
- if (!PageHighMem(page))
+ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
return __va(offset);
return NULL; /* fallback to ioremap_cache */
}
@@ -44,7 +47,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* being mapped does not have i/o side effects and the __iomem
* annotation is not applicable.
*
- * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * MEMREMAP_WB - matches the default mapping for System RAM on
* the architecture. This is usually a read-allocate write-back cache.
* Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
* memremap() will bypass establishing a new mapping and instead return
@@ -53,11 +56,12 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* MEMREMAP_WT - establish a mapping whereby writes either bypass the
* cache or are written through to memory and never exist in a
* cache-dirty state with respect to program visibility. Attempts to
- * map "System RAM" with this mapping type will fail.
+ * map System RAM with this mapping type will fail.
*/
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
- int is_ram = region_intersects(offset, size, "System RAM");
+ int is_ram = region_intersects(offset, size,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
void *addr = NULL;
if (is_ram == REGION_MIXED) {
@@ -73,7 +77,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
* MEMREMAP_WB is special in that it can be satisifed
* from the direct map. Some archs depend on the
* capability of memremap() to autodetect cases where
- * the requested range is potentially in "System RAM"
+ * the requested range is potentially in System RAM.
*/
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size);
@@ -85,7 +89,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
* If we don't have a mapping yet and more request flags are
* pending then we will be attempting to establish a new virtual
* address mapping. Enforce that this mapping is not aliasing
- * "System RAM"
+ * System RAM.
*/
if (!addr && is_ram == REGION_INTERSECTS && flags) {
WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
@@ -111,7 +115,7 @@ EXPORT_SYMBOL(memunmap);
static void devm_memremap_release(struct device *dev, void *res)
{
- memunmap(res);
+ memunmap(*(void **)res);
}
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
@@ -133,8 +137,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
if (addr) {
*ptr = addr;
devres_add(dev, ptr);
- } else
+ } else {
devres_free(ptr);
+ return ERR_PTR(-ENXIO);
+ }
return addr;
}
@@ -147,25 +153,135 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
+pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
+{
+ return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
+}
+EXPORT_SYMBOL(phys_to_pfn_t);
+
#ifdef CONFIG_ZONE_DEVICE
+static DEFINE_MUTEX(pgmap_lock);
+static RADIX_TREE(pgmap_radix, GFP_KERNEL);
+#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
+#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
struct page_map {
struct resource res;
+ struct percpu_ref *ref;
+ struct dev_pagemap pgmap;
+ struct vmem_altmap altmap;
};
-static void devm_memremap_pages_release(struct device *dev, void *res)
+void get_zone_device_page(struct page *page)
+{
+ percpu_ref_get(page->pgmap->ref);
+}
+EXPORT_SYMBOL(get_zone_device_page);
+
+void put_zone_device_page(struct page *page)
{
- struct page_map *page_map = res;
+ put_dev_pagemap(page->pgmap);
+}
+EXPORT_SYMBOL(put_zone_device_page);
+
+static void pgmap_radix_release(struct resource *res)
+{
+ resource_size_t key, align_start, align_size, align_end;
+
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ align_end = align_start + align_size - 1;
+
+ mutex_lock(&pgmap_lock);
+ for (key = res->start; key <= res->end; key += SECTION_SIZE)
+ radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT);
+ mutex_unlock(&pgmap_lock);
+}
+
+static unsigned long pfn_first(struct page_map *page_map)
+{
+ struct dev_pagemap *pgmap = &page_map->pgmap;
+ const struct resource *res = &page_map->res;
+ struct vmem_altmap *altmap = pgmap->altmap;
+ unsigned long pfn;
+
+ pfn = res->start >> PAGE_SHIFT;
+ if (altmap)
+ pfn += vmem_altmap_offset(altmap);
+ return pfn;
+}
+
+static unsigned long pfn_end(struct page_map *page_map)
+{
+ const struct resource *res = &page_map->res;
+
+ return (res->start + resource_size(res)) >> PAGE_SHIFT;
+}
+
+#define for_each_device_pfn(pfn, map) \
+ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
+
+static void devm_memremap_pages_release(struct device *dev, void *data)
+{
+ struct page_map *page_map = data;
+ struct resource *res = &page_map->res;
+ resource_size_t align_start, align_size;
+ struct dev_pagemap *pgmap = &page_map->pgmap;
+
+ if (percpu_ref_tryget_live(pgmap->ref)) {
+ dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+ percpu_ref_put(pgmap->ref);
+ }
/* pages are dead and unused, undo the arch mapping */
- arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ arch_remove_memory(align_start, align_size);
+ pgmap_radix_release(res);
+ dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
+ "%s: failed to free all reserved pages\n", __func__);
}
-void *devm_memremap_pages(struct device *dev, struct resource *res)
+/* assumes rcu_read_lock() held at entry */
+struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
{
- int is_ram = region_intersects(res->start, resource_size(res),
- "System RAM");
struct page_map *page_map;
- int error, nid;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
+ return page_map ? &page_map->pgmap : NULL;
+}
+
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @res: "host memory" address range
+ * @ref: a live per-cpu reference count
+ * @altmap: optional descriptor for allocating the memmap from @res
+ *
+ * Notes:
+ * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
+ * (or devm release event).
+ *
+ * 2/ @res is expected to be a host memory range that could feasibly be
+ * treated as a "System RAM" range, i.e. not a device mmio range, but
+ * this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct resource *res,
+ struct percpu_ref *ref, struct vmem_altmap *altmap)
+{
+ resource_size_t key, align_start, align_size, align_end;
+ struct dev_pagemap *pgmap;
+ struct page_map *page_map;
+ int error, nid, is_ram;
+ unsigned long pfn;
+
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+ - align_start;
+ is_ram = region_intersects(align_start, align_size,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
if (is_ram == REGION_MIXED) {
WARN_ONCE(1, "%s attempted on mixed region %pr\n",
@@ -176,25 +292,124 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
+ if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
+ dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
+ __func__);
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (!ref)
+ return ERR_PTR(-EINVAL);
+
page_map = devres_alloc_node(devm_memremap_pages_release,
sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
if (!page_map)
return ERR_PTR(-ENOMEM);
+ pgmap = &page_map->pgmap;
memcpy(&page_map->res, res, sizeof(*res));
+ pgmap->dev = dev;
+ if (altmap) {
+ memcpy(&page_map->altmap, altmap, sizeof(*altmap));
+ pgmap->altmap = &page_map->altmap;
+ }
+ pgmap->ref = ref;
+ pgmap->res = &page_map->res;
+
+ mutex_lock(&pgmap_lock);
+ error = 0;
+ align_end = align_start + align_size - 1;
+ for (key = align_start; key <= align_end; key += SECTION_SIZE) {
+ struct dev_pagemap *dup;
+
+ rcu_read_lock();
+ dup = find_dev_pagemap(key);
+ rcu_read_unlock();
+ if (dup) {
+ dev_err(dev, "%s: %pr collides with mapping for %s\n",
+ __func__, res, dev_name(dup->dev));
+ error = -EBUSY;
+ break;
+ }
+ error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT,
+ page_map);
+ if (error) {
+ dev_err(dev, "%s: failed: %d\n", __func__, error);
+ break;
+ }
+ }
+ mutex_unlock(&pgmap_lock);
+ if (error)
+ goto err_radix;
+
nid = dev_to_node(dev);
if (nid < 0)
nid = numa_mem_id();
- error = arch_add_memory(nid, res->start, resource_size(res), true);
- if (error) {
- devres_free(page_map);
- return ERR_PTR(error);
- }
+ error = arch_add_memory(nid, align_start, align_size, true);
+ if (error)
+ goto err_add_memory;
+
+ for_each_device_pfn(pfn, page_map) {
+ struct page *page = pfn_to_page(pfn);
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer. It is a bug if a ZONE_DEVICE page is ever
+ * freed or placed on a driver-private list. Seed the
+ * storage with LIST_POISON* values.
+ */
+ list_del(&page->lru);
+ page->pgmap = pgmap;
+ }
devres_add(dev, page_map);
return __va(res->start);
+
+ err_add_memory:
+ err_radix:
+ pgmap_radix_release(res);
+ devres_free(page_map);
+ return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ return altmap->reserve + altmap->free;
+}
+
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
+{
+ /*
+ * 'memmap_start' is the virtual address for the first "struct
+ * page" in this range of the vmemmap array. In the case of
+ * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
+ * pointer arithmetic, so we can perform this to_vmem_altmap()
+ * conversion without concern for the initialization state of
+ * the struct page fields.
+ */
+ struct page *page = (struct page *) memmap_start;
+ struct dev_pagemap *pgmap;
+
+ /*
+ * Unconditionally retrieve a dev_pagemap associated with the
+ * given physical address, this is only for use in the
+ * arch_{add|remove}_memory() for setting up and tearing down
+ * the memmap.
+ */
+ rcu_read_lock();
+ pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
+ rcu_read_unlock();
+
+ return pgmap ? pgmap->altmap : NULL;
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/module.c b/kernel/module.c
index 38c7bd5583ff..041200ca4a2d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -53,6 +53,7 @@
#include <asm/sections.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
+#include <linux/livepatch.h>
#include <linux/async.h>
#include <linux/percpu.h>
#include <linux/kmemleak.h>
@@ -80,15 +81,6 @@
# define debug_align(X) (X)
#endif
-/*
- * Given BASE and SIZE this macro calculates the number of pages the
- * memory regions occupies
- */
-#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
- (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
- PFN_DOWN((unsigned long)BASE) + 1) \
- : (0UL))
-
/* If this is set, the section belongs in the init part of the module */
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
@@ -108,13 +100,6 @@ static LIST_HEAD(modules);
* Use a latched RB-tree for __module_address(); this allows us to use
* RCU-sched lookups of the address from any context.
*
- * Because modules have two address ranges: init and core, we need two
- * latch_tree_nodes entries. Therefore we need the back-pointer from
- * mod_tree_node.
- *
- * Because init ranges are short lived we mark them unlikely and have placed
- * them outside the critical cacheline in struct module.
- *
* This is conditional on PERF_EVENTS || TRACING because those can really hit
* __module_address() hard by doing a lot of stack unwinding; potentially from
* NMI context.
@@ -122,24 +107,16 @@ static LIST_HEAD(modules);
static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
{
- struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
- struct module *mod = mtn->mod;
-
- if (unlikely(mtn == &mod->mtn_init))
- return (unsigned long)mod->module_init;
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
- return (unsigned long)mod->module_core;
+ return (unsigned long)layout->base;
}
static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
{
- struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
- struct module *mod = mtn->mod;
-
- if (unlikely(mtn == &mod->mtn_init))
- return (unsigned long)mod->init_size;
+ struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
- return (unsigned long)mod->core_size;
+ return (unsigned long)layout->size;
}
static __always_inline bool
@@ -197,23 +174,23 @@ static void __mod_tree_remove(struct mod_tree_node *node)
*/
static void mod_tree_insert(struct module *mod)
{
- mod->mtn_core.mod = mod;
- mod->mtn_init.mod = mod;
+ mod->core_layout.mtn.mod = mod;
+ mod->init_layout.mtn.mod = mod;
- __mod_tree_insert(&mod->mtn_core);
- if (mod->init_size)
- __mod_tree_insert(&mod->mtn_init);
+ __mod_tree_insert(&mod->core_layout.mtn);
+ if (mod->init_layout.size)
+ __mod_tree_insert(&mod->init_layout.mtn);
}
static void mod_tree_remove_init(struct module *mod)
{
- if (mod->init_size)
- __mod_tree_remove(&mod->mtn_init);
+ if (mod->init_layout.size)
+ __mod_tree_remove(&mod->init_layout.mtn);
}
static void mod_tree_remove(struct module *mod)
{
- __mod_tree_remove(&mod->mtn_core);
+ __mod_tree_remove(&mod->core_layout.mtn);
mod_tree_remove_init(mod);
}
@@ -267,9 +244,9 @@ static void __mod_update_bounds(void *base, unsigned int size)
static void mod_update_bounds(struct module *mod)
{
- __mod_update_bounds(mod->module_core, mod->core_size);
- if (mod->init_size)
- __mod_update_bounds(mod->module_init, mod->init_size);
+ __mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
+ if (mod->init_layout.size)
+ __mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
}
#ifdef CONFIG_KGDB_KDB
@@ -327,6 +304,9 @@ struct load_info {
struct _ddebug *debug;
unsigned int num_debug;
bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
struct {
unsigned int sym, str, mod, vers, info, pcpu;
} index;
@@ -1005,6 +985,9 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
mod->exit();
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ ftrace_release_mod(mod);
+
async_synchronize_full();
/* Store the name of the last unloaded module for diagnostic purposes */
@@ -1214,7 +1197,7 @@ struct module_attribute module_uevent =
static ssize_t show_coresize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->core_size);
+ return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
}
static struct module_attribute modinfo_coresize =
@@ -1223,7 +1206,7 @@ static struct module_attribute modinfo_coresize =
static ssize_t show_initsize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->init_size);
+ return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
}
static struct module_attribute modinfo_initsize =
@@ -1873,64 +1856,75 @@ static void mod_sysfs_teardown(struct module *mod)
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
+ *
+ * General layout of module is:
+ * [text] [read-only-data] [writable data]
+ * text_size -----^ ^ ^
+ * ro_size ------------------------| |
+ * size -------------------------------------------|
+ *
+ * These values are always page-aligned (as is base)
*/
-void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+static void frob_text(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
{
- unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
- unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base,
+ layout->text_size >> PAGE_SHIFT);
+}
- if (end_pfn > begin_pfn)
- set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+static void frob_rodata(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->text_size,
+ (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
}
-static void set_section_ro_nx(void *base,
- unsigned long text_size,
- unsigned long ro_size,
- unsigned long total_size)
+static void frob_writable_data(const struct module_layout *layout,
+ int (*set_memory)(unsigned long start, int num_pages))
{
- /* begin and end PFNs of the current subsection */
- unsigned long begin_pfn;
- unsigned long end_pfn;
+ BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
+ set_memory((unsigned long)layout->base + layout->ro_size,
+ (layout->size - layout->ro_size) >> PAGE_SHIFT);
+}
- /*
- * Set RO for module text and RO-data:
- * - Always protect first page.
- * - Do not protect last partial page.
- */
- if (ro_size > 0)
- set_page_attributes(base, base + ro_size, set_memory_ro);
+/* livepatching wants to disable read-only so it can frob module. */
+void module_disable_ro(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_rw);
+ frob_text(&mod->init_layout, set_memory_rw);
+ frob_rodata(&mod->init_layout, set_memory_rw);
+}
- /*
- * Set NX permissions for module data:
- * - Do not protect first partial page.
- * - Always protect last page.
- */
- if (total_size > text_size) {
- begin_pfn = PFN_UP((unsigned long)base + text_size);
- end_pfn = PFN_UP((unsigned long)base + total_size);
- if (end_pfn > begin_pfn)
- set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
- }
+void module_enable_ro(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_ro);
+ frob_rodata(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
+ frob_rodata(&mod->init_layout, set_memory_ro);
}
-static void unset_module_core_ro_nx(struct module *mod)
+static void module_enable_nx(const struct module *mod)
{
- set_page_attributes(mod->module_core + mod->core_text_size,
- mod->module_core + mod->core_size,
- set_memory_x);
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_ro_size,
- set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_nx);
+ frob_writable_data(&mod->core_layout, set_memory_nx);
+ frob_rodata(&mod->init_layout, set_memory_nx);
+ frob_writable_data(&mod->init_layout, set_memory_nx);
}
-static void unset_module_init_ro_nx(struct module *mod)
+static void module_disable_nx(const struct module *mod)
{
- set_page_attributes(mod->module_init + mod->init_text_size,
- mod->module_init + mod->init_size,
- set_memory_x);
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_ro_size,
- set_memory_rw);
+ frob_rodata(&mod->core_layout, set_memory_x);
+ frob_writable_data(&mod->core_layout, set_memory_x);
+ frob_rodata(&mod->init_layout, set_memory_x);
+ frob_writable_data(&mod->init_layout, set_memory_x);
}
/* Iterate through all modules and set each module's text as RW */
@@ -1942,16 +1936,9 @@ void set_all_modules_text_rw(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_rw);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_rw);
- }
+
+ frob_text(&mod->core_layout, set_memory_rw);
+ frob_text(&mod->init_layout, set_memory_rw);
}
mutex_unlock(&module_mutex);
}
@@ -1965,23 +1952,25 @@ void set_all_modules_text_ro(void)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_ro);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_ro);
- }
+
+ frob_text(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_ro);
}
mutex_unlock(&module_mutex);
}
+
+static void disable_ro_nx(const struct module_layout *layout)
+{
+ frob_text(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_rw);
+ frob_rodata(layout, set_memory_x);
+ frob_writable_data(layout, set_memory_x);
+}
+
#else
-static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
-static void unset_module_core_ro_nx(struct module *mod) { }
-static void unset_module_init_ro_nx(struct module *mod) { }
+static void disable_ro_nx(const struct module_layout *layout) { }
+static void module_enable_nx(const struct module *mod) { }
+static void module_disable_nx(const struct module *mod) { }
#endif
void __weak module_memfree(void *module_region)
@@ -2033,19 +2022,19 @@ static void free_module(struct module *mod)
synchronize_sched();
mutex_unlock(&module_mutex);
- /* This may be NULL, but that's OK */
- unset_module_init_ro_nx(mod);
+ /* This may be empty, but that's OK */
+ disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
+ module_memfree(mod->init_layout.base);
kfree(mod->args);
percpu_modfree(mod);
/* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->module_core, mod->core_size);
+ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */
- unset_module_core_ro_nx(mod);
- module_memfree(mod->module_core);
+ disable_ro_nx(&mod->core_layout);
+ module_memfree(mod->core_layout.base);
#ifdef CONFIG_MPU
update_protections(current->mm);
@@ -2248,20 +2237,20 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| strstarts(sname, ".init"))
continue;
- s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
+ s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->core_size = debug_align(mod->core_size);
- mod->core_text_size = mod->core_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.text_size = mod->core_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->core_size = debug_align(mod->core_size);
- mod->core_ro_size = mod->core_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
+ mod->core_layout.ro_size = mod->core_layout.size;
break;
case 3: /* whole core */
- mod->core_size = debug_align(mod->core_size);
+ mod->core_layout.size = debug_align(mod->core_layout.size);
break;
}
}
@@ -2277,21 +2266,21 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| s->sh_entsize != ~0UL
|| !strstarts(sname, ".init"))
continue;
- s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
+ s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
| INIT_OFFSET_MASK);
pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
- mod->init_size = debug_align(mod->init_size);
- mod->init_text_size = mod->init_size;
+ mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.text_size = mod->init_layout.size;
break;
case 1: /* RO: text and ro-data */
- mod->init_size = debug_align(mod->init_size);
- mod->init_ro_size = mod->init_size;
+ mod->init_layout.size = debug_align(mod->init_layout.size);
+ mod->init_layout.ro_size = mod->init_layout.size;
break;
case 3: /* whole init */
- mod->init_size = debug_align(mod->init_size);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
break;
}
}
@@ -2401,7 +2390,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
if (sym->st_shndx == SHN_UNDEF)
return 'U';
- if (sym->st_shndx == SHN_ABS)
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
return 'a';
if (sym->st_shndx >= SHN_LORESERVE)
return '?';
@@ -2430,7 +2419,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum, unsigned int pcpundx)
{
const Elf_Shdr *sec;
@@ -2439,6 +2428,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
|| !src->st_name)
return false;
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
sec = sechdrs + src->st_shndx;
if (!(sec->sh_flags & SHF_ALLOC)
#ifndef CONFIG_KALLSYMS_ALL
@@ -2466,7 +2460,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
+ symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
info->index.sym) | INIT_OFFSET_MASK;
pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
@@ -2476,26 +2470,38 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
strtab_size += strlen(&info->strtab[src[i].st_name])+1;
ndst++;
}
}
/* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_size += strtab_size;
- mod->core_size = debug_align(mod->core_size);
+ info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod->core_layout.size += strtab_size;
+ mod->core_layout.size = debug_align(mod->core_layout.size);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
+ strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
info->index.str) | INIT_OFFSET_MASK;
- mod->init_size = debug_align(mod->init_size);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod->init_layout.size = ALIGN(mod->init_layout.size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod->init_layout.size;
+ mod->init_layout.size += sizeof(struct mod_kallsyms);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
}
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
static void add_kallsyms(struct module *mod, const struct load_info *info)
{
unsigned int i, ndst;
@@ -2504,28 +2510,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
char *s;
Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
- mod->symtab = (void *)symsec->sh_addr;
- mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Set up to point into init section. */
+ mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+
+ mod->kallsyms->symtab = (void *)symsec->sh_addr;
+ mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
/* Make sure we get permanent strtab: don't use info->strtab. */
- mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
/* Set types up while we still have access to sections. */
- for (i = 0; i < mod->num_symtab; i++)
- mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
-
- mod->core_symtab = dst = mod->module_core + info->symoffs;
- mod->core_strtab = s = mod->module_core + info->stroffs;
- src = mod->symtab;
- for (ndst = i = 0; i < mod->num_symtab; i++) {
+ for (i = 0; i < mod->kallsyms->num_symtab; i++)
+ mod->kallsyms->symtab[i].st_info
+ = elf_type(&mod->kallsyms->symtab[i], info);
+
+ /* Now populate the cut down core kallsyms for after init. */
+ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+ mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+ src = mod->kallsyms->symtab;
+ for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_strtab;
- s += strlcpy(s, &mod->strtab[src[i].st_name],
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
KSYM_NAME_LEN) + 1;
}
}
- mod->core_num_syms = ndst;
+ mod->core_kallsyms.num_symtab = ndst;
}
#else
static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -2665,7 +2677,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
if (info->len < sizeof(*(info->hdr)))
return -ENOEXEC;
- err = security_kernel_module_from_file(NULL);
+ err = security_kernel_read_file(NULL, READING_MODULE);
if (err)
return err;
@@ -2683,63 +2695,6 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
return 0;
}
-/* Sets info->hdr and info->len. */
-static int copy_module_from_fd(int fd, struct load_info *info)
-{
- struct fd f = fdget(fd);
- int err;
- struct kstat stat;
- loff_t pos;
- ssize_t bytes = 0;
-
- if (!f.file)
- return -ENOEXEC;
-
- err = security_kernel_module_from_file(f.file);
- if (err)
- goto out;
-
- err = vfs_getattr(&f.file->f_path, &stat);
- if (err)
- goto out;
-
- if (stat.size > INT_MAX) {
- err = -EFBIG;
- goto out;
- }
-
- /* Don't hand 0 to vmalloc, it whines. */
- if (stat.size == 0) {
- err = -EINVAL;
- goto out;
- }
-
- info->hdr = vmalloc(stat.size);
- if (!info->hdr) {
- err = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < stat.size) {
- bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
- stat.size - pos);
- if (bytes < 0) {
- vfree(info->hdr);
- err = bytes;
- goto out;
- }
- if (bytes == 0)
- break;
- pos += bytes;
- }
- info->len = pos;
-
-out:
- fdput(f);
- return err;
-}
-
static void free_copy(struct load_info *info)
{
vfree(info->hdr);
@@ -2964,7 +2919,7 @@ static int move_module(struct module *mod, struct load_info *info)
void *ptr;
/* Do the allocs. */
- ptr = module_alloc(mod->core_size);
+ ptr = module_alloc(mod->core_layout.size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. Just mark it as not being a
@@ -2974,11 +2929,11 @@ static int move_module(struct module *mod, struct load_info *info)
if (!ptr)
return -ENOMEM;
- memset(ptr, 0, mod->core_size);
- mod->module_core = ptr;
+ memset(ptr, 0, mod->core_layout.size);
+ mod->core_layout.base = ptr;
- if (mod->init_size) {
- ptr = module_alloc(mod->init_size);
+ if (mod->init_layout.size) {
+ ptr = module_alloc(mod->init_layout.size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. This block doesn't need to be
@@ -2987,13 +2942,13 @@ static int move_module(struct module *mod, struct load_info *info)
*/
kmemleak_ignore(ptr);
if (!ptr) {
- module_memfree(mod->module_core);
+ module_memfree(mod->core_layout.base);
return -ENOMEM;
}
- memset(ptr, 0, mod->init_size);
- mod->module_init = ptr;
+ memset(ptr, 0, mod->init_layout.size);
+ mod->init_layout.base = ptr;
} else
- mod->module_init = NULL;
+ mod->init_layout.base = NULL;
/* Transfer each section which specifies SHF_ALLOC */
pr_debug("final section addresses:\n");
@@ -3005,10 +2960,10 @@ static int move_module(struct module *mod, struct load_info *info)
continue;
if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->module_init
+ dest = mod->init_layout.base
+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
else
- dest = mod->module_core + shdr->sh_entsize;
+ dest = mod->core_layout.base + shdr->sh_entsize;
if (shdr->sh_type != SHT_NOBITS)
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
@@ -3070,12 +3025,12 @@ static void flush_module_icache(const struct module *mod)
* Do it before processing of module parameters, so the module
* can provide parameter accessor functions of its own.
*/
- if (mod->module_init)
- flush_icache_range((unsigned long)mod->module_init,
- (unsigned long)mod->module_init
- + mod->init_size);
- flush_icache_range((unsigned long)mod->module_core,
- (unsigned long)mod->module_core + mod->core_size);
+ if (mod->init_layout.base)
+ flush_icache_range((unsigned long)mod->init_layout.base,
+ (unsigned long)mod->init_layout.base
+ + mod->init_layout.size);
+ flush_icache_range((unsigned long)mod->core_layout.base,
+ (unsigned long)mod->core_layout.base + mod->core_layout.size);
set_fs(old_fs);
}
@@ -3133,8 +3088,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
- module_memfree(mod->module_core);
+ module_memfree(mod->init_layout.base);
+ module_memfree(mod->core_layout.base);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3221,7 +3176,7 @@ static noinline int do_init_module(struct module *mod)
ret = -ENOMEM;
goto fail;
}
- freeinit->module_init = mod->module_init;
+ freeinit->module_init = mod->init_layout.base;
/*
* We want to find out whether @mod uses async during init. Clear
@@ -3274,17 +3229,16 @@ static noinline int do_init_module(struct module *mod)
module_put(mod);
trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
- mod->num_symtab = mod->core_num_syms;
- mod->symtab = mod->core_symtab;
- mod->strtab = mod->core_strtab;
+ /* Switch to core kallsyms now init is done: kallsyms may be walking! */
+ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
mod_tree_remove_init(mod);
- unset_module_init_ro_nx(mod);
+ disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
- mod->module_init = NULL;
- mod->init_size = 0;
- mod->init_ro_size = 0;
- mod->init_text_size = 0;
+ mod->init_layout.base = NULL;
+ mod->init_layout.size = 0;
+ mod->init_layout.ro_size = 0;
+ mod->init_layout.text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
@@ -3306,6 +3260,8 @@ fail:
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ ftrace_release_mod(mod);
free_module(mod);
wake_up_all(&module_wq);
return ret;
@@ -3373,25 +3329,15 @@ static int complete_formation(struct module *mod, struct load_info *info)
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
-
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
+ /* Set RO and NX regions */
+ module_enable_ro(mod);
+ module_enable_nx(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
mod->state = MODULE_STATE_COMING;
mutex_unlock(&module_mutex);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_COMING, mod);
return 0;
out:
@@ -3399,6 +3345,20 @@ out:
return err;
}
+static int prepare_coming_module(struct module *mod)
+{
+ int err;
+
+ ftrace_module_enable(mod);
+ err = klp_module_coming(mod);
+ if (err)
+ return err;
+
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_COMING, mod);
+ return 0;
+}
+
static int unknown_module_param_cb(char *param, char *val, const char *modname,
void *arg)
{
@@ -3513,13 +3473,17 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto ddebug_cleanup;
+ err = prepare_coming_module(mod);
+ if (err)
+ goto bug_cleanup;
+
/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, NULL,
+ -32768, 32767, mod,
unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
- goto bug_cleanup;
+ goto coming_cleanup;
} else if (after_dashes) {
pr_warn("%s: parameters '%s' after `--' ignored\n",
mod->name, after_dashes);
@@ -3528,7 +3492,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Link in to syfs. */
err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
if (err < 0)
- goto bug_cleanup;
+ goto coming_cleanup;
/* Get rid of temporary copy. */
free_copy(info);
@@ -3538,18 +3502,20 @@ static int load_module(struct load_info *info, const char __user *uargs,
return do_init_module(mod);
+ coming_cleanup:
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+
bug_cleanup:
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
-
/* we can't deallocate the module until we clear memory protection */
- unset_module_init_ro_nx(mod);
- unset_module_core_ro_nx(mod);
+ module_disable_ro(mod);
+ module_disable_nx(mod);
ddebug_cleanup:
dynamic_debug_remove(info->debug);
@@ -3578,7 +3544,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
*/
ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->module_core, mod->core_size);
+ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
module_deallocate(mod, info);
free_copy:
@@ -3608,8 +3574,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
- int err;
struct load_info info = { };
+ loff_t size;
+ void *hdr;
+ int err;
err = may_init_module();
if (err)
@@ -3621,9 +3589,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
|MODULE_INIT_IGNORE_VERMAGIC))
return -EINVAL;
- err = copy_module_from_fd(fd, &info);
+ err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX,
+ READING_MODULE);
if (err)
return err;
+ info.hdr = hdr;
+ info.len = size;
return load_module(&info, uargs, flags);
}
@@ -3646,6 +3617,11 @@ static inline int is_arm_mapping_symbol(const char *str)
&& (str[2] == '\0' || str[2] == '.');
}
+static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
static const char *get_ksymbol(struct module *mod,
unsigned long addr,
unsigned long *size,
@@ -3653,41 +3629,42 @@ static const char *get_ksymbol(struct module *mod,
{
unsigned int i, best = 0;
unsigned long nextval;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
/* At worse, next value is at end of module */
if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->module_init+mod->init_text_size;
+ nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
else
- nextval = (unsigned long)mod->module_core+mod->core_text_size;
+ nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
- for (i = 1; i < mod->num_symtab; i++) {
- if (mod->symtab[i].st_shndx == SHN_UNDEF)
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
continue;
/* We ignore unnamed symbols: they're uninformative
* and inserted at a whim. */
- if (mod->symtab[i].st_value <= addr
- && mod->symtab[i].st_value > mod->symtab[best].st_value
- && *(mod->strtab + mod->symtab[i].st_name) != '\0'
- && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ if (*symname(kallsyms, i) == '\0'
+ || is_arm_mapping_symbol(symname(kallsyms, i)))
+ continue;
+
+ if (kallsyms->symtab[i].st_value <= addr
+ && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
best = i;
- if (mod->symtab[i].st_value > addr
- && mod->symtab[i].st_value < nextval
- && *(mod->strtab + mod->symtab[i].st_name) != '\0'
- && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
- nextval = mod->symtab[i].st_value;
+ if (kallsyms->symtab[i].st_value > addr
+ && kallsyms->symtab[i].st_value < nextval)
+ nextval = kallsyms->symtab[i].st_value;
}
if (!best)
return NULL;
if (size)
- *size = nextval - mod->symtab[best].st_value;
+ *size = nextval - kallsyms->symtab[best].st_value;
if (offset)
- *offset = addr - mod->symtab[best].st_value;
- return mod->strtab + mod->symtab[best].st_name;
+ *offset = addr - kallsyms->symtab[best].st_value;
+ return symname(kallsyms, best);
}
/* For kallsyms to ask for address resolution. NULL means not found. Careful
@@ -3777,19 +3754,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (symnum < mod->num_symtab) {
- *value = mod->symtab[symnum].st_value;
- *type = mod->symtab[symnum].st_info;
- strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
- KSYM_NAME_LEN);
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ *value = kallsyms->symtab[symnum].st_value;
+ *type = kallsyms->symtab[symnum].st_info;
+ strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
strlcpy(module_name, mod->name, MODULE_NAME_LEN);
*exported = is_exported(name, *value, mod);
preempt_enable();
return 0;
}
- symnum -= mod->num_symtab;
+ symnum -= kallsyms->num_symtab;
}
preempt_enable();
return -ERANGE;
@@ -3798,11 +3777,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
static unsigned long mod_find_symname(struct module *mod, const char *name)
{
unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
- for (i = 0; i < mod->num_symtab; i++)
- if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
- mod->symtab[i].st_info != 'U')
- return mod->symtab[i].st_value;
+ for (i = 0; i < kallsyms->num_symtab; i++)
+ if (strcmp(name, symname(kallsyms, i)) == 0 &&
+ kallsyms->symtab[i].st_info != 'U')
+ return kallsyms->symtab[i].st_value;
return 0;
}
@@ -3841,11 +3821,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
module_assert_mutex();
list_for_each_entry(mod, &modules, list) {
+ /* We hold module_mutex: no need for rcu_dereference_sched */
+ struct mod_kallsyms *kallsyms = mod->kallsyms;
+
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- for (i = 0; i < mod->num_symtab; i++) {
- ret = fn(data, mod->strtab + mod->symtab[i].st_name,
- mod, mod->symtab[i].st_value);
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ ret = fn(data, symname(kallsyms, i),
+ mod, kallsyms->symtab[i].st_value);
if (ret != 0)
return ret;
}
@@ -3905,7 +3888,7 @@ static int m_show(struct seq_file *m, void *p)
return 0;
seq_printf(m, "%s %u",
- mod->name, mod->init_size + mod->core_size);
+ mod->name, mod->init_layout.size + mod->core_layout.size);
print_unload_info(m, mod);
/* Informative for users. */
@@ -3914,7 +3897,7 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->module_core);
+ seq_printf(m, " 0x%pK", mod->core_layout.base);
/* Taints info */
if (mod->taints)
@@ -4057,8 +4040,8 @@ struct module *__module_text_address(unsigned long addr)
struct module *mod = __module_address(addr);
if (mod) {
/* Make sure it's within the text section. */
- if (!within(addr, mod->module_init, mod->init_text_size)
- && !within(addr, mod->module_core, mod->core_text_size))
+ if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
+ && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
mod = NULL;
}
return mod;
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6528a79d998d..64b9dead4a07 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -11,10 +11,17 @@
#include <linux/kernel.h>
#include <linux/errno.h>
+#include <linux/string.h>
#include <keys/system_keyring.h>
#include <crypto/public_key.h>
#include "module-internal.h"
+enum pkey_id_type {
+ PKEY_ID_PGP, /* OpenPGP generated key ID */
+ PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */
+ PKEY_ID_PKCS7, /* Signature in PKCS#7 message */
+};
+
/*
* Module signature information block.
*
diff --git a/kernel/panic.c b/kernel/panic.c
index 4b150bc0c6c1..fa400852bf6c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,7 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/console.h>
+#include <linux/bug.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -61,6 +62,17 @@ void __weak panic_smp_self_stop(void)
cpu_relax();
}
+/*
+ * Stop ourselves in NMI context if another CPU has already panicked. Arch code
+ * may override this to prepare for crash dumping, e.g. save regs info.
+ */
+void __weak nmi_panic_self_stop(struct pt_regs *regs)
+{
+ panic_smp_self_stop();
+}
+
+atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -71,17 +83,17 @@ void __weak panic_smp_self_stop(void)
*/
void panic(const char *fmt, ...)
{
- static DEFINE_SPINLOCK(panic_lock);
static char buf[1024];
va_list args;
long i, i_next = 0;
int state = 0;
+ int old_cpu, this_cpu;
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
* there is nothing to prevent an interrupt handler (that runs
- * after the panic_lock is acquired) from invoking panic again.
+ * after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
@@ -94,8 +106,16 @@ void panic(const char *fmt, ...)
* multiple parallel invocations of panic, all other CPUs either
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
+ *
+ * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
+ * comes here, so go ahead.
+ * `old_cpu == this_cpu' means we came from nmi_panic() which sets
+ * panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
- if (!spin_trylock(&panic_lock))
+ this_cpu = raw_smp_processor_id();
+ old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
+
+ if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();
console_verbose();
@@ -117,9 +137,11 @@ void panic(const char *fmt, ...)
* everything else.
* If we want to run this after calling panic_notifiers, pass
* the "crash_kexec_post_notifiers" option to the kernel.
+ *
+ * Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!crash_kexec_post_notifiers)
- crash_kexec(NULL);
+ __crash_kexec(NULL);
/*
* Note smp_send_stop is the usual smp shutdown function, which
@@ -142,9 +164,11 @@ void panic(const char *fmt, ...)
* panic_notifiers and dumping kmsg before kdump.
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
+ *
+ * Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (crash_kexec_post_notifiers)
- crash_kexec(NULL);
+ __crash_kexec(NULL);
bust_spinlocks(0);
@@ -157,8 +181,7 @@ void panic(const char *fmt, ...)
* panic() is not being callled from OOPS.
*/
debug_locks_off();
- console_trylock();
- console_unlock();
+ console_flush_on_panic();
if (!panic_blink)
panic_blink = no_blink;
@@ -427,20 +450,25 @@ void oops_exit(void)
kmsg_dump(KMSG_DUMP_OOPS);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-struct slowpath_args {
+struct warn_args {
const char *fmt;
va_list args;
};
-static void warn_slowpath_common(const char *file, int line, void *caller,
- unsigned taint, struct slowpath_args *args)
+void __warn(const char *file, int line, void *caller, unsigned taint,
+ struct pt_regs *regs, struct warn_args *args)
{
disable_trace_on_warning();
pr_warn("------------[ cut here ]------------\n");
- pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
- raw_smp_processor_id(), current->pid, file, line, caller);
+
+ if (file)
+ pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
+ raw_smp_processor_id(), current->pid, file, line,
+ caller);
+ else
+ pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
+ raw_smp_processor_id(), current->pid, caller);
if (args)
vprintk(args->fmt, args->args);
@@ -457,20 +485,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
}
print_modules();
- dump_stack();
+
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
print_oops_end_marker();
+
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
}
+#ifdef WANT_WARN_ON_SLOWPATH
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
{
- struct slowpath_args args;
+ struct warn_args args;
args.fmt = fmt;
va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0),
- TAINT_WARN, &args);
+ __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
+ &args);
va_end(args.args);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
@@ -478,20 +513,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt);
void warn_slowpath_fmt_taint(const char *file, int line,
unsigned taint, const char *fmt, ...)
{
- struct slowpath_args args;
+ struct warn_args args;
args.fmt = fmt;
va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0),
- taint, &args);
+ __warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
}
EXPORT_SYMBOL(warn_slowpath_fmt_taint);
void warn_slowpath_null(const char *file, int line)
{
- warn_slowpath_common(file, line, __builtin_return_address(0),
- TAINT_WARN, NULL);
+ __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
#endif
diff --git a/kernel/pid.c b/kernel/pid.c
index 78b3d9f80d44..4d73a834c7e6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -588,7 +588,7 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
- /* Veryify no one has done anything silly */
+ /* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
/* bump default and minimum pid_max based on number of cpus */
@@ -604,5 +604,5 @@ void __init pidmap_init(void)
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 02e8dfaa1ce2..68d3ebc12601 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -235,7 +235,7 @@ config PM_TRACE_RTC
config APM_EMULATION
tristate "Advanced Power Management Emulation"
- depends on PM && SYS_SUPPORTS_APM_EMULATION
+ depends on SYS_SUPPORTS_APM_EMULATION
help
APM is a BIOS specification for saving power using several different
techniques. This is mostly useful for battery powered laptops with
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b7342a24f559..aa0f26b58426 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1158,6 +1158,22 @@ static int __init kaslr_nohibernate_setup(char *str)
return nohibernate_setup(str);
}
+static int __init page_poison_nohibernate_setup(char *str)
+{
+#ifdef CONFIG_PAGE_POISONING_ZERO
+ /*
+ * The zeroing option for page poison skips the checks on alloc.
+ * since hibernation doesn't save free pages there's no way to
+ * guarantee the pages will still be zeroed.
+ */
+ if (!strcmp(str, "on")) {
+ pr_info("Disabling hibernation due to page poisoning\n");
+ return nohibernate_setup(str);
+ }
+#endif
+ return 1;
+}
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
@@ -1166,3 +1182,4 @@ __setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
__setup("kaslr", kaslr_nohibernate_setup);
+__setup("page_poison=", page_poison_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b2dd4d999900..27946975eff0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
}
-static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-power_attr(pm_wakeup_irq);
+power_attr_ro(pm_wakeup_irq);
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
@@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
return show_trace_dev_match(buf, PAGE_SIZE);
}
-static ssize_t
-pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-
-power_attr(pm_trace_dev_match);
+power_attr_ro(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index caadb566e82b..efe1b3b17c88 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
+#define power_attr_ro(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = S_IRUGO, \
+ }, \
+ .show = _name##_show, \
+}
+
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 564f786df470..df058bed53ce 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only)
unsigned long end_time;
unsigned int todo;
bool wq_busy = false;
- struct timeval start, end;
- u64 elapsed_msecs64;
+ ktime_t start, end, elapsed;
unsigned int elapsed_msecs;
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
- do_gettimeofday(&start);
+ start = ktime_get_boottime();
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
@@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only)
sleep_usecs *= 2;
}
- do_gettimeofday(&end);
- elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
- do_div(elapsed_msecs64, NSEC_PER_MSEC);
- elapsed_msecs = elapsed_msecs64;
+ end = ktime_get_boottime();
+ elapsed = ktime_sub(end, start);
+ elapsed_msecs = ktime_to_ms(elapsed);
if (todo) {
pr_cont("\n");
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f9fe133c13e2..230a77225e2e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -248,7 +248,7 @@ static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
if (pm_test_level == level) {
- printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n",
+ pr_info("suspend debug: Waiting for %d second(s).\n",
pm_test_delay);
mdelay(pm_test_delay * 1000);
return 1;
@@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: late suspend of devices failed\n");
+ pr_err("PM: late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
@@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ pr_err("PM: noirq suspend of devices failed\n");
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2ce8826f1053..bfbf284e4218 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
#include <linux/uio.h>
#include <asm/uaccess.h>
+#include <asm-generic/sections.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -232,7 +233,11 @@ struct printk_log {
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
-};
+}
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+__packed __aligned(4)
+#endif
+;
/*
* The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
@@ -273,11 +278,7 @@ static u32 clear_idx;
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define LOG_ALIGN 4
-#else
#define LOG_ALIGN __alignof__(struct printk_log)
-#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
@@ -366,16 +367,20 @@ static int logbuf_has_space(u32 msg_size, bool empty)
static int log_make_free_space(u32 msg_size)
{
- while (log_first_seq < log_next_seq) {
- if (logbuf_has_space(msg_size, false))
- return 0;
+ while (log_first_seq < log_next_seq &&
+ !logbuf_has_space(msg_size, false)) {
/* drop old messages until we have enough contiguous space */
log_first_idx = log_next(log_first_idx);
log_first_seq++;
}
+ if (clear_seq < log_first_seq) {
+ clear_seq = log_first_seq;
+ clear_idx = log_first_idx;
+ }
+
/* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, true))
+ if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
return 0;
return -ENOMEM;
@@ -853,6 +858,7 @@ void log_buf_kexec_setup(void)
VMCOREINFO_SYMBOL(log_buf);
VMCOREINFO_SYMBOL(log_buf_len);
VMCOREINFO_SYMBOL(log_first_idx);
+ VMCOREINFO_SYMBOL(clear_idx);
VMCOREINFO_SYMBOL(log_next_idx);
/*
* Export struct printk_log size and field offsets. User space tools can
@@ -1215,12 +1221,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u32 idx;
enum log_flags prev;
- if (clear_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
@@ -1482,58 +1482,6 @@ static void zap_locks(void)
sema_init(&console_sem, 1);
}
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
- */
-static int have_callable_console(void)
-{
- struct console *con;
-
- for_each_console(con)
- if (con->flags & CON_ANYTIME)
- return 1;
-
- return 0;
-}
-
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
- */
-static inline int can_use_console(unsigned int cpu)
-{
- return cpu_online(cpu) || have_callable_console();
-}
-
-/*
- * Try to get console ownership to actually show the kernel
- * messages from a 'printk'. Return true (and with the
- * console_lock held, and 'console_locked' set) if it
- * is successful, false otherwise.
- */
-static int console_trylock_for_printk(void)
-{
- unsigned int cpu = smp_processor_id();
-
- if (!console_trylock())
- return 0;
- /*
- * If we can't use the console, we need to release the console
- * semaphore by hand to avoid flushing the buffer. We need to hold the
- * console semaphore in order to do this test safely.
- */
- if (!can_use_console(cpu)) {
- console_locked = 0;
- up_console_sem();
- return 0;
- }
- return 1;
-}
-
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
@@ -1660,7 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
{
- static int recursion_bug;
+ static bool recursion_bug;
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
size_t text_len = 0;
@@ -1680,7 +1628,6 @@ asmlinkage int vprintk_emit(int facility, int level,
boot_delay_msec(level);
printk_delay();
- /* This stops the holder of console_sem just where we want him */
local_irq_save(flags);
this_cpu = smp_processor_id();
@@ -1696,7 +1643,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* it can be printed at the next appropriate moment:
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
- recursion_bug = 1;
+ recursion_bug = true;
local_irq_restore(flags);
return 0;
}
@@ -1704,6 +1651,7 @@ asmlinkage int vprintk_emit(int facility, int level,
}
lockdep_off();
+ /* This stops the holder of console_sem just where we want him */
raw_spin_lock(&logbuf_lock);
logbuf_cpu = this_cpu;
@@ -1711,7 +1659,7 @@ asmlinkage int vprintk_emit(int facility, int level,
static const char recursion_msg[] =
"BUG: recent printk recursion!";
- recursion_bug = 0;
+ recursion_bug = false;
/* emit KERN_CRIT message */
printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
NULL, 0, recursion_msg,
@@ -1809,20 +1757,12 @@ asmlinkage int vprintk_emit(int facility, int level,
if (!in_sched) {
lockdep_off();
/*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
- */
- preempt_disable();
-
- /*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
- if (console_trylock_for_printk())
+ if (console_trylock())
console_unlock();
- preempt_enable();
lockdep_on();
}
@@ -2173,7 +2113,20 @@ int console_trylock(void)
return 0;
}
console_locked = 1;
- console_may_schedule = 0;
+ /*
+ * When PREEMPT_COUNT disabled we can't reliably detect if it's
+ * safe to schedule (e.g. calling printk while holding a spin_lock),
+ * because preempt_disable()/preempt_enable() are just barriers there
+ * and preempt_count() is always 0.
+ *
+ * RCU read sections have a separate preemption counter when
+ * PREEMPT_RCU enabled thus we must take extra care and check
+ * rcu_preempt_depth(), otherwise RCU read sections modify
+ * preempt_count().
+ */
+ console_may_schedule = !oops_in_progress &&
+ preemptible() &&
+ !rcu_preempt_depth();
return 1;
}
EXPORT_SYMBOL(console_trylock);
@@ -2183,6 +2136,34 @@ int is_console_locked(void)
return console_locked;
}
+/*
+ * Check if we have any console that is capable of printing while cpu is
+ * booting or shutting down. Requires console_sem.
+ */
+static int have_callable_console(void)
+{
+ struct console *con;
+
+ for_each_console(con)
+ if ((con->flags & CON_ENABLED) &&
+ (con->flags & CON_ANYTIME))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
+ */
+static inline int can_use_console(void)
+{
+ return cpu_online(raw_smp_processor_id()) || have_callable_console();
+}
+
static void console_cont_flush(char *text, size_t size)
{
unsigned long flags;
@@ -2233,18 +2214,41 @@ void console_unlock(void)
static u64 seen_seq;
unsigned long flags;
bool wake_klogd = false;
- bool retry;
+ bool do_cond_resched, retry;
if (console_suspended) {
up_console_sem();
return;
}
+ /*
+ * Console drivers are called under logbuf_lock, so
+ * @console_may_schedule should be cleared before; however, we may
+ * end up dumping a lot of lines, for example, if called from
+ * console registration path, and should invoke cond_resched()
+ * between lines if allowable. Not doing so can cause a very long
+ * scheduling stall on a slow console leading to RCU stall and
+ * softlockup warnings which exacerbate the issue with more
+ * messages practically incapacitating the system.
+ */
+ do_cond_resched = console_may_schedule;
console_may_schedule = 0;
+again:
+ /*
+ * We released the console_sem lock, so we need to recheck if
+ * cpu is online and (if not) is there at least one CON_ANYTIME
+ * console.
+ */
+ if (!can_use_console()) {
+ console_locked = 0;
+ up_console_sem();
+ return;
+ }
+
/* flush buffered message fragment immediately to console */
console_cont_flush(text, sizeof(text));
-again:
+
for (;;) {
struct printk_log *msg;
size_t ext_len = 0;
@@ -2311,6 +2315,9 @@ skip:
call_console_drivers(level, ext_text, ext_len, text, len);
start_critical_timings();
local_irq_restore(flags);
+
+ if (do_cond_resched)
+ cond_resched();
}
console_locked = 0;
@@ -2378,6 +2385,25 @@ void console_unblank(void)
console_unlock();
}
+/**
+ * console_flush_on_panic - flush console content on panic
+ *
+ * Immediately output all pending messages no matter what.
+ */
+void console_flush_on_panic(void)
+{
+ /*
+ * If someone else is holding the console lock, trylock will fail
+ * and may_schedule may be set. Ignore and proceed to unlock so
+ * that messages are flushed out. As this can be called from any
+ * context and we don't want to get preempted while flushing,
+ * ensure may_schedule is cleared.
+ */
+ console_trylock();
+ console_may_schedule = 0;
+ console_unlock();
+}
+
/*
* Return the console tty driver structure and its associated index
*/
@@ -2658,13 +2684,36 @@ int unregister_console(struct console *console)
}
EXPORT_SYMBOL(unregister_console);
+/*
+ * Some boot consoles access data that is in the init section and which will
+ * be discarded after the initcalls have been run. To make sure that no code
+ * will access this data, unregister the boot consoles in a late initcall.
+ *
+ * If for some reason, such as deferred probe or the driver being a loadable
+ * module, the real console hasn't registered yet at this point, there will
+ * be a brief interval in which no messages are logged to the console, which
+ * makes it difficult to diagnose problems that occur during this time.
+ *
+ * To mitigate this problem somewhat, only unregister consoles whose memory
+ * intersects with the init section. Note that code exists elsewhere to get
+ * rid of the boot console as soon as the proper console shows up, so there
+ * won't be side-effects from postponing the removal.
+ */
static int __init printk_late_init(void)
{
struct console *con;
for_each_console(con) {
if (!keep_bootcon && con->flags & CON_BOOT) {
- unregister_console(con);
+ /*
+ * Make sure to unregister boot consoles whose data
+ * resides in the init section before the init section
+ * is discarded. Boot consoles whose data will stick
+ * around will automatically be unregistered when the
+ * proper console replaces them.
+ */
+ if (init_section_intersects(con, sizeof(*con)))
+ unregister_console(con);
}
}
hotcpu_notifier(console_cpu_notify, 0);
diff --git a/kernel/profile.c b/kernel/profile.c
index 99513e1160e5..51369697466e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,6 +59,7 @@ int profile_setup(char *str)
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
+ force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b760bae64cf1..2341efe7fe02 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
+ int dumpable = 0;
+ kuid_t caller_uid;
+ kgid_t caller_gid;
+
+ if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) {
+ WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n");
+ return -EPERM;
+ }
/* May we inspect the given task?
* This check is used both for attaching with ptrace
@@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
* because setting up the necessary parent/child relationship
* or halting the specified task is impossible.
*/
- int dumpable = 0;
+
/* Don't let security modules deny introspection */
if (same_thread_group(task, current))
return 0;
rcu_read_lock();
+ if (mode & PTRACE_MODE_FSCREDS) {
+ caller_uid = cred->fsuid;
+ caller_gid = cred->fsgid;
+ } else {
+ /*
+ * Using the euid would make more sense here, but something
+ * in userland might rely on the old behavior, and this
+ * shouldn't be a security problem since
+ * PTRACE_MODE_REALCREDS implies that the caller explicitly
+ * used a syscall that requests access to another process
+ * (and not a filesystem syscall to procfs).
+ */
+ caller_uid = cred->uid;
+ caller_gid = cred->gid;
+ }
tcred = __task_cred(task);
- if (uid_eq(cred->uid, tcred->euid) &&
- uid_eq(cred->uid, tcred->suid) &&
- uid_eq(cred->uid, tcred->uid) &&
- gid_eq(cred->gid, tcred->egid) &&
- gid_eq(cred->gid, tcred->sgid) &&
- gid_eq(cred->gid, tcred->gid))
+ if (uid_eq(caller_uid, tcred->euid) &&
+ uid_eq(caller_uid, tcred->suid) &&
+ uid_eq(caller_uid, tcred->uid) &&
+ gid_eq(caller_gid, tcred->egid) &&
+ gid_eq(caller_gid, tcred->sgid) &&
+ gid_eq(caller_gid, tcred->gid))
goto ok;
if (ptrace_has_cap(tcred->user_ns, mode))
goto ok;
@@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request,
goto out;
task_lock(task);
- retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
task_unlock(task);
if (retval)
goto unlock_creds;
@@ -364,8 +387,14 @@ unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
if (!retval) {
- wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * We do not bother to change retval or clear JOBCTL_TRAPPING
+ * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+ * not return to user-mode, it will exit and clear this bit in
+ * __ptrace_unlink() if it wasn't already cleared by the tracee;
+ * and until then nobody can ptrace this task.
+ */
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
proc_ptrace_connector(task, PTRACE_ATTACH);
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d89328e260df..250ea67c1615 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current;
static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
- rcu_torture_count) = { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
- rcu_torture_batch) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 };
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
@@ -162,6 +160,27 @@ static int rcu_torture_writer_state;
#define RTWS_SYNC 7
#define RTWS_STUTTER 8
#define RTWS_STOPPING 9
+static const char * const rcu_torture_writer_state_names[] = {
+ "RTWS_FIXED_DELAY",
+ "RTWS_DELAY",
+ "RTWS_REPLACE",
+ "RTWS_DEF_FREE",
+ "RTWS_EXP_SYNC",
+ "RTWS_COND_GET",
+ "RTWS_COND_SYNC",
+ "RTWS_SYNC",
+ "RTWS_STUTTER",
+ "RTWS_STOPPING",
+};
+
+static const char *rcu_torture_writer_state_getname(void)
+{
+ unsigned int i = READ_ONCE(rcu_torture_writer_state);
+
+ if (i >= ARRAY_SIZE(rcu_torture_writer_state_names))
+ return "???";
+ return rcu_torture_writer_state_names[i];
+}
#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
#define RCUTORTURE_RUNNABLE_INIT 1
@@ -911,12 +930,14 @@ rcu_torture_writer(void *arg)
int nsynctypes = 0;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- pr_alert("%s" TORTURE_FLAG
- " Grace periods expedited from boot/sysfs for %s,\n",
- torture_type, cur_ops->name);
- pr_alert("%s" TORTURE_FLAG
- " Testing of dynamic grace-period expediting diabled.\n",
- torture_type);
+ if (!can_expedite) {
+ pr_alert("%s" TORTURE_FLAG
+ " Grace periods expedited from boot/sysfs for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " Disabled dynamic grace-period expediting.\n",
+ torture_type);
+ }
/* Initialize synctype[] array. If none set, take default. */
if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
@@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void)
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
- pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n",
+ rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
gpnum, completed, flags);
show_rcu_gp_kthreads();
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index a63a1ea5a41b..9b9cdd549caa 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, rcu_gp_is_expedited()
+ __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
: SYNCHRONIZE_SRCU_TRYCOUNT);
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index e492a5253e0f..196f0302e2f4 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -23,7 +23,7 @@
*/
#include <linux/kthread.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -122,18 +122,7 @@ free_out:
debugfs_remove_recursive(rcudir);
return 1;
}
-
-static void __exit rcutiny_trace_cleanup(void)
-{
- debugfs_remove_recursive(rcudir);
-}
-
-module_init(rcutiny_trace_init);
-module_exit(rcutiny_trace_cleanup);
-
-MODULE_AUTHOR("Paul E. McKenney");
-MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
-MODULE_LICENSE("GPL");
+device_initcall(rcutiny_trace_init);
static void check_cpu_stall(struct rcu_ctrlblk *rcp)
{
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f07343b54fe5..9a535a86e732 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree");
/* Data structures. */
-static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
-
/*
* In order to export the rcu_state name to the tracing tools, it
* needs to be added in the __tracepoint_string section.
@@ -112,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
static struct rcu_state *const rcu_state_p;
-static struct rcu_data __percpu *const rcu_data_p;
LIST_HEAD(rcu_struct_flavors);
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -246,24 +241,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(void)
{
- unsigned long flags;
-
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
- trace_rcu_grace_period(TPS("rcu_sched"),
- __this_cpu_read(rcu_sched_data.gpnum),
- TPS("cpuqs"));
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
- return;
- local_irq_save(flags);
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(&rcu_sched_state,
- this_cpu_ptr(&rcu_sched_data),
- true);
- }
- local_irq_restore(flags);
- }
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
+ return;
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_sched_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
}
void rcu_bh_qs(void)
@@ -300,17 +288,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
* We inform the RCU core by emulating a zero-duration dyntick-idle
* period, which we in turn do by incrementing the ->dynticks counter
* by two.
+ *
+ * The caller must have disabled interrupts.
*/
static void rcu_momentary_dyntick_idle(void)
{
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp;
int resched_mask;
struct rcu_state *rsp;
- local_irq_save(flags);
-
/*
* Yes, we can lose flag-setting operations. This is OK, because
* the flag will be set again after some delay.
@@ -340,13 +327,12 @@ static void rcu_momentary_dyntick_idle(void)
smp_mb__after_atomic(); /* Later stuff after QS. */
break;
}
- local_irq_restore(flags);
}
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
- * The caller must have disabled preemption.
+ * The caller must have disabled interrupts.
*/
void rcu_note_context_switch(void)
{
@@ -376,9 +362,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
*/
void rcu_all_qs(void)
{
+ unsigned long flags;
+
barrier(); /* Avoid RCU read-side critical sections leaking down. */
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
+ local_irq_save(flags);
rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
this_cpu_inc(rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -605,25 +596,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
* The caller must have disabled interrupts to prevent races with
* normal callback registry.
*/
-static int
+static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
int i;
if (rcu_gp_in_progress(rsp))
- return 0; /* No, a grace period is already in progress. */
+ return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
- return 1; /* Yes, a no-CBs CPU needs one. */
+ return true; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
- return 0; /* No, this is a no-CBs (or offline) CPU. */
+ return false; /* No, this is a no-CBs (or offline) CPU. */
if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
- return 1; /* Yes, this CPU has newly registered callbacks. */
+ return true; /* Yes, CPU has newly registered callbacks. */
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
ULONG_CMP_LT(READ_ONCE(rsp->completed),
rdp->nxtcompleted[i]))
- return 1; /* Yes, CBs for future grace period. */
- return 0; /* No grace period needed. */
+ return true; /* Yes, CBs for future grace period. */
+ return false; /* No grace period needed. */
}
/*
@@ -740,7 +731,7 @@ void rcu_user_enter(void)
*
* Exit from an interrupt handler, which might possibly result in entering
* idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur.
+ * sections can occur. The caller must have disabled interrupts.
*
* This code assumes that the idle loop never does anything that might
* result in unbalanced calls to irq_enter() and irq_exit(). If your
@@ -753,11 +744,10 @@ void rcu_user_enter(void)
*/
void rcu_irq_exit(void)
{
- unsigned long flags;
long long oldval;
struct rcu_dynticks *rdtp;
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
@@ -768,6 +758,17 @@ void rcu_irq_exit(void)
else
rcu_eqs_enter_common(oldval, true);
rcu_sysidle_enter(1);
+}
+
+/*
+ * Wrapper for rcu_irq_exit() where interrupts are enabled.
+ */
+void rcu_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_irq_exit();
local_irq_restore(flags);
}
@@ -865,7 +866,7 @@ void rcu_user_exit(void)
*
* Enter an interrupt handler, which might possibly result in exiting
* idle mode, in other words, entering the mode in which read-side critical
- * sections can occur.
+ * sections can occur. The caller must have disabled interrupts.
*
* Note that the Linux kernel is fully capable of entering an interrupt
* handler that it never exits, for example when doing upcalls to
@@ -881,11 +882,10 @@ void rcu_user_exit(void)
*/
void rcu_irq_enter(void)
{
- unsigned long flags;
struct rcu_dynticks *rdtp;
long long oldval;
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
@@ -896,6 +896,17 @@ void rcu_irq_enter(void)
else
rcu_eqs_exit_common(oldval, true);
rcu_sysidle_exit(1);
+}
+
+/*
+ * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ */
+void rcu_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_irq_enter();
local_irq_restore(flags);
}
@@ -1071,13 +1082,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
rcu_sysidle_check_cpu(rdp, isidle, maxj);
if ((rdp->dynticks_snap & 0x1) == 0) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
- return 1;
- } else {
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
rdp->mynode->gpnum))
WRITE_ONCE(rdp->gpwrap, true);
- return 0;
+ return 1;
}
+ return 0;
}
/*
@@ -1161,15 +1171,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
smp_mb(); /* ->cond_resched_completed before *rcrmp. */
WRITE_ONCE(*rcrmp,
READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
- resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
- rdp->rsp->jiffies_resched += 5; /* Enable beating. */
- } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- /* Time to beat on that CPU again! */
- resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
- rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
+ rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
+ /* And if it has been a really long time, kick the CPU as well. */
+ if (ULONG_CMP_GE(jiffies,
+ rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) ||
+ ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs))
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+
return 0;
}
@@ -1187,6 +1198,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
}
/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+ if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
+ return "???";
+ return gp_state_names[gs];
+}
+
+/*
* Complain about starvation of grace-period kthread.
*/
static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
@@ -1196,12 +1217,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
- if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
+ if (j - gpa > 2 * HZ) {
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
rsp->name, j - gpa,
rsp->gpnum, rsp->completed,
- rsp->gp_flags, rsp->gp_state,
- rsp->gp_kthread ? rsp->gp_kthread->state : 0);
+ rsp->gp_flags,
+ gp_state_getname(rsp->gp_state), rsp->gp_state,
+ rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+ if (rsp->gp_kthread)
+ sched_show_task(rsp->gp_kthread);
+ }
}
/*
@@ -1214,13 +1239,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask != 0) {
for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
if (rnp->qsmask & (1UL << cpu))
dump_cpu_task(rnp->grplo + cpu);
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1237,15 +1262,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
/* Only let one CPU complain about others per time interval. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
delta = jiffies - READ_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
WRITE_ONCE(rsp->jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/*
* OK, time to rat on our buddy...
@@ -1256,7 +1281,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
rsp->name);
print_cpu_stall_info_begin();
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
ndetected += rcu_print_task_stall(rnp);
if (rnp->qsmask != 0) {
for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -1266,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
ndetected++;
}
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
print_cpu_stall_info_end();
@@ -1327,11 +1352,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
rcu_dump_cpu_stacks(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
WRITE_ONCE(rsp->jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/*
* Attempt to revive the RCU machinery by forcing a context switch.
@@ -1534,10 +1559,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* hold it, acquire the root rcu_node structure's lock in order to
* start one (if needed).
*/
- if (rnp != rnp_root) {
- raw_spin_lock(&rnp_root->lock);
- smp_mb__after_unlock_lock();
- }
+ if (rnp != rnp_root)
+ raw_spin_lock_rcu_node(rnp_root);
/*
* Get a new grace-period number. If there really is no grace
@@ -1571,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
}
unlock_out:
if (rnp != rnp_root)
- raw_spin_unlock(&rnp_root->lock);
+ raw_spin_unlock_rcu_node(rnp_root);
out:
if (c_out != NULL)
*c_out = c;
@@ -1590,7 +1613,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
int needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- rcu_nocb_gp_cleanup(rsp, rnp);
rnp->need_future_gp[c & 0x1] = 0;
needmore = rnp->need_future_gp[(c + 1) & 0x1];
trace_rcu_future_gp(rnp, rdp, c,
@@ -1611,7 +1633,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
- wake_up(&rsp->gp_wq);
+ swake_up(&rsp->gp_wq);
}
/*
@@ -1786,13 +1808,12 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
rdp->completed == READ_ONCE(rnp->completed) &&
!unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
- !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+ !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
}
- smp_mb__after_unlock_lock();
needwake = __note_gp_changes(rsp, rnp, rdp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake(rsp);
}
@@ -1805,21 +1826,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay)
}
/*
- * Initialize a new grace period. Return 0 if no grace period required.
+ * Initialize a new grace period. Return false if no grace period required.
*/
-static int rcu_gp_init(struct rcu_state *rsp)
+static bool rcu_gp_init(struct rcu_state *rsp)
{
unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
if (!READ_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
- raw_spin_unlock_irq(&rnp->lock);
- return 0;
+ raw_spin_unlock_irq_rcu_node(rnp);
+ return false;
}
WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
@@ -1828,8 +1848,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
* Grace period already in progress, don't start another.
* Not supposed to be able to happen.
*/
- raw_spin_unlock_irq(&rnp->lock);
- return 0;
+ raw_spin_unlock_irq_rcu_node(rnp);
+ return false;
}
/* Advance to a new grace period and initialize state. */
@@ -1837,7 +1857,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
/* Record GP times before starting GP, hence smp_store_release(). */
smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_irq_rcu_node(rnp);
/*
* Apply per-leaf buffered online and offline operations to the
@@ -1847,12 +1867,11 @@ static int rcu_gp_init(struct rcu_state *rsp)
*/
rcu_for_each_leaf_node(rsp, rnp) {
rcu_gp_slow(rsp, gp_preinit_delay);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_irq_rcu_node(rnp);
continue;
}
@@ -1886,7 +1905,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
rcu_cleanup_dead_rnp(rnp);
}
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_irq_rcu_node(rnp);
}
/*
@@ -1904,8 +1923,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
rcu_gp_slow(rsp, gp_init_delay);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
@@ -1918,12 +1936,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_irq_rcu_node(rnp);
cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
}
- return 1;
+ return true;
}
/*
@@ -1973,11 +1991,10 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
}
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
WRITE_ONCE(rsp->gp_flags,
READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_irq_rcu_node(rnp);
}
}
@@ -1991,10 +2008,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
+ struct swait_queue_head *sq;
WRITE_ONCE(rsp->gp_activity, jiffies);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
gp_duration = jiffies - rsp->gp_start;
if (gp_duration > rsp->gp_max)
rsp->gp_max = gp_duration;
@@ -2007,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_irq_rcu_node(rnp);
/*
* Propagate new ->completed value to rcu_node structures so
@@ -2019,8 +2036,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
* grace period is recorded in any of the rcu_node structures.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->completed, rsp->gpnum);
@@ -2029,14 +2045,15 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
- raw_spin_unlock_irq(&rnp->lock);
+ sq = rcu_nocb_gp_get(rnp);
+ raw_spin_unlock_irq_rcu_node(rnp);
+ rcu_nocb_gp_cleanup(sq);
cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
rcu_gp_slow(rsp, gp_cleanup_delay);
}
rnp = rcu_get_root(rsp);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
+ raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
@@ -2052,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
READ_ONCE(rsp->gpnum),
TPS("newreq"));
}
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_irq_rcu_node(rnp);
}
/*
@@ -2076,7 +2093,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- wait_event_interruptible(rsp->gp_wq,
+ swait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2106,7 +2123,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = wait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_interruptible_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2218,19 +2235,21 @@ static bool rcu_start_gp(struct rcu_state *rsp)
}
/*
- * Report a full set of quiescent states to the specified rcu_state
- * data structure. This involves cleaning up after the prior grace
- * period and letting rcu_start_gp() start up the next grace period
- * if one is needed. Note that the caller must hold rnp->lock, which
- * is released before return.
+ * Report a full set of quiescent states to the specified rcu_state data
+ * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period
+ * kthread if another grace period is required. Whether we wake
+ * the grace-period kthread or it awakens itself for the next round
+ * of quiescent-state forcing, that kthread will clean up after the
+ * just-completed grace period. Note that the caller must hold rnp->lock,
+ * which is released before return.
*/
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
- raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
- rcu_gp_kthread_wake(rsp);
+ raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
+ swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
}
/*
@@ -2259,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
* Our bit has already been cleared, or the
* relevant grace period is already over, so done.
*/
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
@@ -2271,7 +2290,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
/* Other bits still set at this level, so done. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
mask = rnp->grpmask;
@@ -2281,11 +2300,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
break;
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rnp_c = rnp;
rnp = rnp->parent;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
oldmask = rnp_c->qsmask;
}
@@ -2314,7 +2332,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return; /* Still need more quiescent states! */
}
@@ -2331,20 +2349,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
/* Report up the rest of the hierarchy, tracking current ->gpnum. */
gps = rnp->gpnum;
mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
}
/*
* Record a quiescent state for the specified CPU to that CPU's rcu_data
- * structure. This must be either called from the specified CPU, or
- * called when the specified CPU is known to be offline (and when it is
- * also known that no other CPU is concurrently trying to help the offline
- * CPU). The lastcomp argument is used to make sure we are still in the
- * grace period of interest. We don't want to end the current grace period
- * based on quiescent states detected in an earlier grace period!
+ * structure. This must be called from the specified CPU.
*/
static void
rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
@@ -2355,8 +2367,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
struct rcu_node *rnp;
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if ((rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
@@ -2370,14 +2381,14 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
*/
rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
mask = rdp->grpmask;
if ((rnp->qsmask & mask) == 0) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
- rdp->core_needs_qs = 0;
+ rdp->core_needs_qs = false;
/*
* This GP can't end until cpu checks in, so all of our
@@ -2582,42 +2593,19 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
rnp = rnp->parent;
if (!rnp)
break;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock(); /* GP memory ordering. */
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
rnp->qsmaskinit &= ~mask;
rnp->qsmask &= ~mask;
if (rnp->qsmaskinit) {
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_unlock_rcu_node(rnp);
+ /* irqs remain disabled. */
return;
}
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
}
/*
- * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
- * function. We now remove it from the rcu_node tree's ->qsmaskinit
- * bit masks.
- */
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
- unsigned long flags;
- unsigned long mask;
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return;
-
- /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
- mask = rdp->grpmask;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
- rnp->qsmaskinitnext &= ~mask;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-/*
* The CPU has been completely removed, and some other CPU is reporting
* this fact from process context. Do the remainder of the cleanup,
* including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2809,8 +2797,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
rcu_for_each_leaf_node(rsp, rnp) {
cond_resched_rcu_qs();
mask = 0;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
if (rcu_state_p == &rcu_sched_state ||
rsp != rcu_state_p ||
@@ -2849,7 +2836,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
} else {
/* Nothing to do here, so just drop the lock. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
}
@@ -2881,17 +2868,16 @@ static void force_quiescent_state(struct rcu_state *rsp)
/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
/* Reached the root of the rcu_node tree, acquire lock. */
- raw_spin_lock_irqsave(&rnp_old->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
raw_spin_unlock(&rnp_old->fqslock);
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
rsp->n_force_qs_lh++;
- raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
return; /* Someone beat us to it. */
}
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
- raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
- rcu_gp_kthread_wake(rsp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
+ swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
}
/*
@@ -2914,9 +2900,9 @@ __rcu_process_callbacks(struct rcu_state *rsp)
/* Does this CPU require a not-yet-started grace period? */
local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
+ raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
needwake = rcu_start_gp(rsp);
- raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
if (needwake)
rcu_gp_kthread_wake(rsp);
} else {
@@ -3005,10 +2991,9 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
if (!rcu_gp_in_progress(rsp)) {
struct rcu_node *rnp_root = rcu_get_root(rsp);
- raw_spin_lock(&rnp_root->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp_root);
needwake = rcu_start_gp(rsp);
- raw_spin_unlock(&rnp_root->lock);
+ raw_spin_unlock_rcu_node(rnp_root);
if (needwake)
rcu_gp_kthread_wake(rsp);
} else {
@@ -3365,7 +3350,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp)
{
unsigned long s;
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
s = (READ_ONCE(*sp) + 3) & ~0x1;
smp_mb(); /* Above access must not bleed into critical section. */
return s;
@@ -3392,6 +3376,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
}
static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
{
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
return rcu_seq_snap(&rsp->expedited_sequence);
}
static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
@@ -3426,17 +3411,16 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
* CPUs for the current rcu_node structure up the rcu_node tree.
*/
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmaskinit == rnp->expmaskinitnext) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
continue; /* No new CPUs, nothing to do. */
}
/* Update this node's mask, track old value for propagation. */
oldmask = rnp->expmaskinit;
rnp->expmaskinit = rnp->expmaskinitnext;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* If was already nonzero, nothing to propagate. */
if (oldmask)
@@ -3447,12 +3431,11 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
rnp_up = rnp->parent;
done = false;
while (rnp_up) {
- raw_spin_lock_irqsave(&rnp_up->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
if (rnp_up->expmaskinit)
done = true;
rnp_up->expmaskinit |= mask;
- raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
if (done)
break;
mask = rnp_up->grpmask;
@@ -3472,11 +3455,10 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
sync_exp_reset_tree_hotplug(rsp);
rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
rnp->expmask = rnp->expmaskinit;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -3517,22 +3499,21 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
if (!rnp->expmask)
rcu_initiate_boost(rnp, flags);
else
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
break;
}
if (rnp->parent == NULL) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
- wake_up(&rsp->expedited_wq);
+ swake_up(&rsp->expedited_wq);
}
break;
}
mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
rnp = rnp->parent;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
WARN_ON_ONCE(!(rnp->expmask & mask));
rnp->expmask &= ~mask;
}
@@ -3549,8 +3530,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
{
unsigned long flags;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
__rcu_report_exp_rnp(rsp, rnp, wake, flags);
}
@@ -3564,10 +3544,9 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
{
unsigned long flags;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!(rnp->expmask & mask)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
rnp->expmask &= ~mask;
@@ -3609,7 +3588,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
*/
static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
{
- struct rcu_data *rdp;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
struct rcu_node *rnp0;
struct rcu_node *rnp1 = NULL;
@@ -3623,7 +3602,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
if (sync_exp_work_done(rsp, rnp0, NULL,
- &rsp->expedited_workdone0, s))
+ &rdp->expedited_workdone0, s))
return NULL;
return rnp0;
}
@@ -3637,14 +3616,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
* can be inexact, as it is just promoting locality and is not
* strictly needed for correctness.
*/
- rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
- if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+ if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
return NULL;
mutex_lock(&rdp->exp_funnel_mutex);
rnp0 = rdp->mynode;
for (; rnp0 != NULL; rnp0 = rnp0->parent) {
if (sync_exp_work_done(rsp, rnp1, rdp,
- &rsp->expedited_workdone2, s))
+ &rdp->expedited_workdone2, s))
return NULL;
mutex_lock(&rnp0->exp_funnel_mutex);
if (rnp1)
@@ -3654,7 +3632,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
rnp1 = rnp0;
}
if (sync_exp_work_done(rsp, rnp1, rdp,
- &rsp->expedited_workdone3, s))
+ &rdp->expedited_workdone3, s))
return NULL;
return rnp1;
}
@@ -3708,8 +3686,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
sync_exp_reset_tree(rsp);
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Each pass checks a CPU for identity, offline, and idle. */
mask_ofl_test = 0;
@@ -3730,7 +3707,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
*/
if (rcu_preempt_has_tasks(rnp))
rnp->exp_tasks = rnp->blkd_tasks.next;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
mask = 1;
@@ -3741,24 +3718,22 @@ retry_ipi:
ret = smp_call_function_single(cpu, func, rsp, 0);
if (!ret) {
mask_ofl_ipi &= ~mask;
- } else {
- /* Failed, raced with offline. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ continue;
+ }
+ /* Failed, raced with offline. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ schedule_timeout_uninterruptible(1);
if (cpu_online(cpu) &&
- (rnp->expmask & mask)) {
- raw_spin_unlock_irqrestore(&rnp->lock,
- flags);
- schedule_timeout_uninterruptible(1);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask))
- goto retry_ipi;
- raw_spin_lock_irqsave(&rnp->lock,
- flags);
- }
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ (rnp->expmask & mask))
+ goto retry_ipi;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Report quiescent states for those that went offline. */
mask_ofl_test |= mask_ofl_ipi;
@@ -3773,6 +3748,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
unsigned long jiffies_stall;
unsigned long jiffies_start;
unsigned long mask;
+ int ndetected;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root(rsp);
int ret;
@@ -3781,28 +3757,30 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_start = jiffies;
for (;;) {
- ret = wait_event_interruptible_timeout(
+ ret = swait_event_timeout(
rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root),
jiffies_stall);
- if (ret > 0)
+ if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
return;
if (ret < 0) {
/* Hit a signal, disable CPU stall warnings. */
- wait_event(rsp->expedited_wq,
+ swait_event(rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root));
return;
}
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rsp->name);
+ ndetected = 0;
rcu_for_each_leaf_node(rsp, rnp) {
- (void)rcu_print_task_exp_stall(rnp);
+ ndetected = rcu_print_task_exp_stall(rnp);
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
struct rcu_data *rdp;
if (!(rnp->expmask & mask))
continue;
+ ndetected++;
rdp = per_cpu_ptr(rsp->rda, cpu);
pr_cont(" %d-%c%c%c", cpu,
"O."[cpu_online(cpu)],
@@ -3811,8 +3789,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
}
mask <<= 1;
}
- pr_cont(" } %lu jiffies s: %lu\n",
- jiffies - jiffies_start, rsp->expedited_sequence);
+ pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+ jiffies - jiffies_start, rsp->expedited_sequence,
+ rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ if (!ndetected) {
+ pr_err("blocking rcu_node structures:");
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (rnp == rnp_root)
+ continue; /* printed unconditionally */
+ if (sync_rcu_preempt_exp_done(rnp))
+ continue;
+ pr_cont(" l=%u:%d-%d:%#lx/%c",
+ rnp->level, rnp->grplo, rnp->grphi,
+ rnp->expmask,
+ ".T"[!!rnp->exp_tasks]);
+ }
+ pr_cont("\n");
+ }
rcu_for_each_leaf_node(rsp, rnp) {
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
@@ -3847,6 +3840,16 @@ void synchronize_sched_expedited(void)
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
+ /* If only one CPU, this is automatically a grace period. */
+ if (rcu_blocking_is_gp())
+ return;
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu_sched);
+ return;
+ }
+
/* Take a snapshot of the sequence number. */
s = rcu_exp_gp_seq_snap(rsp);
@@ -4135,9 +4138,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
rnp = rnp->parent;
if (rnp == NULL)
return;
- raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+ raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
rnp->qsmaskinit |= mask;
- raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+ raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
}
}
@@ -4152,7 +4155,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
@@ -4161,7 +4164,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->rsp = rsp;
mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
@@ -4179,7 +4182,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
@@ -4189,7 +4192,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rcu_sysidle_init_percpu_data(rdp->dynticks);
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
* Add CPU to leaf rcu_node pending-online bitmask. Any needed
@@ -4198,8 +4201,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
*/
rnp = rdp->mynode;
mask = rdp->grpmask;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
rnp->qsmaskinitnext |= mask;
rnp->expmaskinitnext |= mask;
if (!rdp->beenonline)
@@ -4211,7 +4213,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
static void rcu_prepare_cpu(int cpu)
@@ -4222,6 +4224,46 @@ static void rcu_prepare_cpu(int cpu)
rcu_init_percpu_data(cpu, rsp);
}
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function. We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function. We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
+ /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+ mask = rdp->grpmask;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
+ rnp->qsmaskinitnext &= ~mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+void rcu_report_dead(unsigned int cpu)
+{
+ struct rcu_state *rsp;
+
+ /* QS for any half-done expedited RCU-sched GP. */
+ preempt_disable();
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(rcu_sched_state.rda), true);
+ preempt_enable();
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dying_idle_cpu(cpu, rsp);
+}
+#endif
+
/*
* Handle CPU online/offline notification events.
*/
@@ -4253,17 +4295,6 @@ int rcu_cpu_notify(struct notifier_block *self,
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_cpu(rsp);
break;
- case CPU_DYING_IDLE:
- /* QS for any half-done expedited RCU-sched GP. */
- preempt_disable();
- rcu_report_exp_rdp(&rcu_sched_state,
- this_cpu_ptr(rcu_sched_state.rda), true);
- preempt_enable();
-
- for_each_rcu_flavor(rsp) {
- rcu_cleanup_dying_idle_cpu(cpu, rsp);
- }
- break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
@@ -4327,14 +4358,14 @@ static int __init rcu_spawn_gp_kthread(void)
t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
BUG_ON(IS_ERR(t));
rnp = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rsp->gp_kthread = t;
if (kthread_prio) {
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads();
@@ -4385,12 +4416,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
/*
* Helper function for rcu_init() that initializes one rcu_state structure.
*/
-static void __init rcu_init_one(struct rcu_state *rsp,
- struct rcu_data __percpu *rda)
+static void __init rcu_init_one(struct rcu_state *rsp)
{
static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT;
static const char * const exp[] = RCU_EXP_NAME_INIT;
+ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+ static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4422,8 +4455,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
cpustride *= levelspread[i];
rnp = rsp->level[i];
for (j = 0; j < levelcnt[i]; j++, rnp++) {
- raw_spin_lock_init(&rnp->lock);
- lockdep_set_class_and_name(&rnp->lock,
+ raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
+ lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
&rcu_node_class[i], buf[i]);
raw_spin_lock_init(&rnp->fqslock);
lockdep_set_class_and_name(&rnp->fqslock,
@@ -4455,8 +4488,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
}
- init_waitqueue_head(&rsp->gp_wq);
- init_waitqueue_head(&rsp->expedited_wq);
+ init_swait_queue_head(&rsp->gp_wq);
+ init_swait_queue_head(&rsp->expedited_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
@@ -4576,8 +4609,8 @@ void __init rcu_init(void)
rcu_bootup_announce();
rcu_init_geometry();
- rcu_init_one(&rcu_bh_state, &rcu_bh_data);
- rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ rcu_init_one(&rcu_bh_state);
+ rcu_init_one(&rcu_sched_state);
if (dump_tree)
rcu_dump_rcu_node_tree(&rcu_sched_state);
__rcu_init_preempt();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9fb4e238d4dc..df668c0f9e64 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/swait.h>
#include <linux/stop_machine.h>
/*
@@ -149,8 +150,9 @@ struct rcu_dynticks {
* Definition for node within the RCU grace-period-detection hierarchy.
*/
struct rcu_node {
- raw_spinlock_t lock; /* Root rcu_node's lock protects some */
- /* rcu_state fields as well as following. */
+ raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
+ /* some rcu_state fields as well as */
+ /* following. */
unsigned long gpnum; /* Current grace period for this node. */
/* This will either be equal to or one */
/* behind the root rcu_node's gpnum. */
@@ -178,6 +180,8 @@ struct rcu_node {
/* beginning of each expedited GP. */
unsigned long expmaskinitnext;
/* Online CPUs for next expedited GP. */
+ /* Any CPU that has ever been online will */
+ /* have its bit set. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
@@ -241,7 +245,7 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#ifdef CONFIG_RCU_NOCB_CPU
- wait_queue_head_t nocb_gp_wq[2];
+ struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
int need_future_gp[2];
@@ -384,6 +388,10 @@ struct rcu_data {
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
struct mutex exp_funnel_mutex;
+ atomic_long_t expedited_workdone0; /* # done by others #0. */
+ atomic_long_t expedited_workdone1; /* # done by others #1. */
+ atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_workdone3; /* # done by others #3. */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -393,7 +401,7 @@ struct rcu_data {
atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
struct rcu_head **nocb_follower_tail;
- wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -472,7 +480,7 @@ struct rcu_state {
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */
- wait_queue_head_t gp_wq; /* Where GP task waits. */
+ struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
@@ -498,13 +506,9 @@ struct rcu_state {
/* End of fields guarded by barrier_mutex. */
unsigned long expedited_sequence; /* Take a ticket. */
- atomic_long_t expedited_workdone0; /* # done by others #0. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_workdone3; /* # done by others #3. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
- wait_queue_head_t expedited_wq; /* Wait for check-ins. */
+ struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -545,6 +549,18 @@ struct rcu_state {
#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
+#ifndef RCU_TREE_NONCORE
+static const char * const gp_state_names[] = {
+ "RCU_GP_IDLE",
+ "RCU_GP_WAIT_GPS",
+ "RCU_GP_DONE_GPS",
+ "RCU_GP_WAIT_FQS",
+ "RCU_GP_DOING_FQS",
+ "RCU_GP_CLEANUP",
+ "RCU_GP_CLEANED",
+};
+#endif /* #ifndef RCU_TREE_NONCORE */
+
extern struct list_head rcu_struct_flavors;
/* Sequence through rcu_state structures for each RCU flavor. */
@@ -607,7 +623,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
@@ -664,3 +681,61 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#else /* #ifdef CONFIG_PPC */
#define smp_mb__after_unlock_lock() do { } while (0)
#endif /* #else #ifdef CONFIG_PPC */
+
+/*
+ * Wrappers for the rcu_node::lock acquire and release.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values, this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier;
+ * and most importantly transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ *
+ * As ->lock of struct rcu_node is a __private field, therefore one should use
+ * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
+ */
+static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
+{
+ raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
+ smp_mb__after_unlock_lock();
+}
+
+static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
+{
+ raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
+}
+
+static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
+{
+ raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
+ smp_mb__after_unlock_lock();
+}
+
+static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
+{
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
+}
+
+#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
+do { \
+ typecheck(unsigned long, flags); \
+ raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \
+do { \
+ typecheck(unsigned long, flags); \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \
+} while (0)
+
+static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
+{
+ bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
+
+ if (locked)
+ smp_mb__after_unlock_lock();
+ return locked;
+}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 630c19772630..efdf7b61ce12 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
/*
* Check the RCU kernel configuration parameters and print informative
- * messages about anything out of the ordinary. If you like #ifdef, you
- * will love this function.
+ * messages about anything out of the ordinary.
*/
static void __init rcu_bootup_announce_oddness(void)
{
@@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void)
* the corresponding expedited grace period will also be the end of the
* normal grace period.
*/
-static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long flags) __releases(rnp->lock)
+static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
+ __releases(rnp->lock) /* But leaves rrupts disabled. */
{
int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
(rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
@@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
- raw_spin_unlock(&rnp->lock);
+ raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
* Report the quiescent state for the expedited GP. This expedited
@@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
} else {
WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
}
- local_irq_restore(flags);
}
/*
@@ -286,12 +284,11 @@ static void rcu_preempt_qs(void)
* predating the current grace period drain, in other words, until
* rnp->gp_tasks becomes NULL.
*
- * Caller must disable preemption.
+ * Caller must disable interrupts.
*/
static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void)
/* Possibly blocking in an RCU read-side critical section. */
rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp);
t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
@@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void)
(rnp->qsmask & rdp->grpmask)
? rnp->gpnum
: rnp->gpnum + 1);
- rcu_preempt_ctxt_queue(rnp, rdp, flags);
+ rcu_preempt_ctxt_queue(rnp, rdp);
} else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special.s) {
@@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t)
/*
* Remove this task from the list it blocked on. The task
- * now remains queued on the rcu_node corresponding to
- * the CPU it first blocked on, so the first attempt to
- * acquire the task's rcu_node's ->lock will succeed.
- * Keep the loop and add a WARN_ON() out of sheer paranoia.
+ * now remains queued on the rcu_node corresponding to the
+ * CPU it first blocked on, so there is no longer any need
+ * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
*/
- for (;;) {
- rnp = t->rcu_blocked_node;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- if (rnp == t->rcu_blocked_node)
- break;
- WARN_ON_ONCE(1);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
+ rnp = t->rcu_blocked_node;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ WARN_ON_ONCE(rnp != t->rcu_blocked_node);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -500,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t)
!!rnp->gp_tasks);
rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
} else {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Unboost if we were boosted. */
@@ -527,16 +516,16 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
unsigned long flags;
struct task_struct *t;
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
sched_show_task(t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
@@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void)
struct rcu_state *rsp = rcu_state_p;
unsigned long s;
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+
s = rcu_exp_gp_seq_snap(rsp);
rnp_unlock = exp_funnel_lock(rsp, s);
@@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
*/
static void __init __rcu_init_preempt(void)
{
- rcu_init_one(rcu_state_p, rcu_data_p);
+ rcu_init_one(rcu_state_p);
}
/*
@@ -812,7 +807,6 @@ void exit_rcu(void)
#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *const rcu_state_p = &rcu_sched_state;
-static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
/*
* Tell them what RCU they are running.
@@ -989,15 +983,14 @@ static int rcu_boost(struct rcu_node *rnp)
READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
/*
* Recheck under the lock: all tasks in need of boosting
* might exit their RCU read-side critical sections on their own.
*/
if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
}
@@ -1034,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp)
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
@@ -1094,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
rnp->n_balk_exp_gp_tasks++;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
if (rnp->exp_tasks != NULL ||
@@ -1104,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
ULONG_CMP_GE(jiffies, rnp->boost_time))) {
if (rnp->exp_tasks == NULL)
rnp->boost_tasks = rnp->gp_tasks;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
t = rnp->boost_kthread_task;
if (t)
rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
rcu_initiate_boost_trace(rnp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1176,10 +1169,9 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
"rcub/%d", rnp_index);
if (IS_ERR(t))
return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
@@ -1315,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu)
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
static void invoke_rcu_callbacks_kthread(void)
@@ -1524,7 +1516,8 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+ rcu_is_nocb_cpu(smp_processor_id()))
return;
/* Handle nohz enablement switches conservatively. */
@@ -1538,10 +1531,6 @@ static void rcu_prepare_for_idle(void)
if (!tne)
return;
- /* If this is a no-CBs CPU, no callbacks, just return. */
- if (rcu_is_nocb_cpu(smp_processor_id()))
- return;
-
/*
* If a non-lazy callback arrived at a CPU having only lazy
* callbacks, invoke RCU core for the side-effect of recalculating
@@ -1567,10 +1556,9 @@ static void rcu_prepare_for_idle(void)
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
if (needwake)
rcu_gp_kthread_wake(rsp);
}
@@ -1822,9 +1810,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
- wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+ swake_up_all(sq);
}
/*
@@ -1840,10 +1828,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
}
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rnp->completed & 0x1];
+}
+
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
- init_waitqueue_head(&rnp->nocb_gp_wq[0]);
- init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
#ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1868,7 +1861,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
- wake_up(&rdp_leader->nocb_wq);
+ swake_up(&rdp_leader->nocb_wq);
}
}
@@ -2068,10 +2061,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
bool needwake;
struct rcu_node *rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
needwake = rcu_start_future_gp(rnp, rdp, &c);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake(rdp->rsp);
@@ -2081,7 +2073,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/
trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
- wait_event_interruptible(
+ swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d))
@@ -2109,7 +2101,7 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
- wait_event_interruptible(my_rdp->nocb_wq,
+ swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
@@ -2184,7 +2176,7 @@ wait_again:
* List was empty, wake up the follower.
* Memory barriers supplied by atomic_long_add().
*/
- wake_up(&rdp->nocb_wq);
+ swake_up(&rdp->nocb_wq);
}
}
@@ -2205,7 +2197,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep");
- wait_event_interruptible(rdp->nocb_wq,
+ swait_event_interruptible(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) {
/* Don't drown trace log with "Poll"! */
@@ -2364,7 +2356,7 @@ void __init rcu_init_nohz(void)
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
- init_waitqueue_head(&rdp->nocb_wq);
+ init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
@@ -2514,7 +2506,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
return false;
}
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
}
@@ -2522,6 +2514,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{
}
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index ef7093cc9b5c..1088e64f01ad 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -1,5 +1,5 @@
/*
- * Read-Copy Update tracing for classic implementation
+ * Read-Copy Update tracing for hierarchical implementation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -16,6 +16,7 @@
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
+ * Author: Paul E. McKenney
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
@@ -33,9 +34,7 @@
#include <linux/sched.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
-#include <linux/module.h>
#include <linux/completion.h>
-#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = {
static int show_rcuexp(struct seq_file *m, void *v)
{
+ int cpu;
struct rcu_state *rsp = (struct rcu_state *)m->private;
-
+ struct rcu_data *rdp;
+ unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ s0 += atomic_long_read(&rdp->expedited_workdone0);
+ s1 += atomic_long_read(&rdp->expedited_workdone1);
+ s2 += atomic_long_read(&rdp->expedited_workdone2);
+ s3 += atomic_long_read(&rdp->expedited_workdone3);
+ }
seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence,
- atomic_long_read(&rsp->expedited_workdone0),
- atomic_long_read(&rsp->expedited_workdone1),
- atomic_long_read(&rsp->expedited_workdone2),
- atomic_long_read(&rsp->expedited_workdone3),
+ rsp->expedited_sequence, s0, s1, s2, s3,
atomic_long_read(&rsp->expedited_normal),
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
@@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
unsigned long gpmax;
struct rcu_node *rnp = &rsp->node[0];
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
completed = READ_ONCE(rsp->completed);
gpnum = READ_ONCE(rsp->gpnum);
if (completed == gpnum)
@@ -487,16 +492,4 @@ free_out:
debugfs_remove_recursive(rcudir);
return 1;
}
-
-static void __exit rcutree_trace_cleanup(void)
-{
- debugfs_remove_recursive(rcudir);
-}
-
-
-module_init(rcutree_trace_init);
-module_exit(rcutree_trace_cleanup);
-
-MODULE_AUTHOR("Paul E. McKenney");
-MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
-MODULE_LICENSE("GPL");
+device_initcall(rcutree_trace_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5f748c5a40f0..ca828b41c938 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate");
#endif
#define MODULE_PARAM_PREFIX "rcupdate."
+#ifndef CONFIG_TINY_RCU
module_param(rcu_expedited, int, 0);
+module_param(rcu_normal, int, 0);
+static int rcu_normal_after_boot;
+module_param(rcu_normal_after_boot, int, 0);
+#endif /* #ifndef CONFIG_TINY_RCU */
#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
/**
@@ -113,6 +118,18 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
#ifndef CONFIG_TINY_RCU
+/*
+ * Should expedited grace-period primitives always fall back to their
+ * non-expedited counterparts? Intended for use within RCU. Note
+ * that if the user specifies both rcu_expedited and rcu_normal, then
+ * rcu_normal wins.
+ */
+bool rcu_gp_is_normal(void)
+{
+ return READ_ONCE(rcu_normal);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
+
static atomic_t rcu_expedited_nesting =
ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
@@ -157,8 +174,6 @@ void rcu_unexpedite_gp(void)
}
EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
-#endif /* #ifndef CONFIG_TINY_RCU */
-
/*
* Inform RCU of the end of the in-kernel boot sequence.
*/
@@ -166,8 +181,12 @@ void rcu_end_inkernel_boot(void)
{
if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
rcu_unexpedite_gp();
+ if (rcu_normal_after_boot)
+ WRITE_ONCE(rcu_normal, 1);
}
+#endif /* #ifndef CONFIG_TINY_RCU */
+
#ifdef CONFIG_PREEMPT_RCU
/*
diff --git a/kernel/relay.c b/kernel/relay.c
index 0b4570cfacae..074994bcfa9b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
if (!desc->count)
return 0;
- mutex_lock(&file_inode(filp)->i_mutex);
+ inode_lock(file_inode(filp));
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&file_inode(filp)->i_mutex);
+ inode_unlock(file_inode(filp));
return desc->written;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index f150dbbe6f62..2e78ead30934 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -233,9 +233,9 @@ static struct resource * __request_resource(struct resource *root, struct resour
}
}
-static int __release_resource(struct resource *old)
+static int __release_resource(struct resource *old, bool release_child)
{
- struct resource *tmp, **p;
+ struct resource *tmp, **p, *chd;
p = &old->parent->child;
for (;;) {
@@ -243,7 +243,17 @@ static int __release_resource(struct resource *old)
if (!tmp)
break;
if (tmp == old) {
- *p = tmp->sibling;
+ if (release_child || !(tmp->child)) {
+ *p = tmp->sibling;
+ } else {
+ for (chd = tmp->child;; chd = chd->sibling) {
+ chd->parent = tmp->parent;
+ if (!(chd->sibling))
+ break;
+ }
+ *p = tmp->child;
+ chd->sibling = tmp->sibling;
+ }
old->parent = NULL;
return 0;
}
@@ -325,7 +335,7 @@ int release_resource(struct resource *old)
int retval;
write_lock(&resource_lock);
- retval = __release_resource(old);
+ retval = __release_resource(old, true);
write_unlock(&resource_lock);
return retval;
}
@@ -333,13 +343,13 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/*
- * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
- * the caller must specify res->start, res->end, res->flags and "name".
- * If found, returns 0, res is overwritten, if not found, returns -1.
- * This walks through whole tree and not just first level children
- * until and unless first_level_children_only is true.
+ * Finds the lowest iomem resource existing within [res->start.res->end).
+ * The caller must specify res->start, res->end, res->flags, and optionally
+ * desc. If found, returns 0, res is overwritten, if not found, returns -1.
+ * This function walks the whole tree and not just first level children until
+ * and unless first_level_children_only is true.
*/
-static int find_next_iomem_res(struct resource *res, char *name,
+static int find_next_iomem_res(struct resource *res, unsigned long desc,
bool first_level_children_only)
{
resource_size_t start, end;
@@ -358,9 +368,9 @@ static int find_next_iomem_res(struct resource *res, char *name,
read_lock(&resource_lock);
for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
- if (p->flags != res->flags)
+ if ((p->flags & res->flags) != res->flags)
continue;
- if (name && strcmp(p->name, name))
+ if ((desc != IORES_DESC_NONE) && (desc != p->desc))
continue;
if (p->start > end) {
p = NULL;
@@ -385,15 +395,18 @@ static int find_next_iomem_res(struct resource *res, char *name,
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
* All the memory ranges which overlap start,end and also match flags and
- * name are valid candidates.
+ * desc are valid candidates.
*
- * @name: name of resource
- * @flags: resource flags
+ * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
+ * @flags: I/O resource flags
* @start: start addr
* @end: end addr
+ *
+ * NOTE: For a new descriptor search, define a new IORES_DESC in
+ * <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
-int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
- void *arg, int (*func)(u64, u64, void *))
+int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
+ u64 end, void *arg, int (*func)(u64, u64, void *))
{
struct resource res;
u64 orig_end;
@@ -403,23 +416,27 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
res.end = end;
res.flags = flags;
orig_end = res.end;
+
while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, name, false))) {
+ (!find_next_iomem_res(&res, desc, false))) {
+
ret = (*func)(res.start, res.end, arg);
if (ret)
break;
+
res.start = res.end + 1;
res.end = orig_end;
}
+
return ret;
}
/*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM". This function deals with
- * full ranges and not pfn. If resources are not pfn aligned, dealing
- * with pfn can truncate ranges.
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
+ * Now, this function is only for System RAM, it deals with full ranges and
+ * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate
+ * ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(u64, u64, void *))
@@ -430,10 +447,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
res.start = start;
res.end = end;
- res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, "System RAM", true))) {
+ (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
ret = (*func)(res.start, res.end, arg);
if (ret)
break;
@@ -446,9 +463,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM".
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
+ * It is to be used only for System RAM.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -460,10 +477,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
res.start = (u64) start_pfn << PAGE_SHIFT;
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
- res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
- (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
+ (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
@@ -484,7 +501,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
}
/*
* This generic page_is_ram() returns true if specified address is
- * registered as "System RAM" in iomem_resource list.
+ * registered as System RAM in iomem_resource list.
*/
int __weak page_is_ram(unsigned long pfn)
{
@@ -496,30 +513,34 @@ EXPORT_SYMBOL_GPL(page_is_ram);
* region_intersects() - determine intersection of region with known resources
* @start: region start address
* @size: size of region
- * @name: name of resource (in iomem_resource)
+ * @flags: flags of resource (in iomem_resource)
+ * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE
*
* Check if the specified region partially overlaps or fully eclipses a
- * resource identified by @name. Return REGION_DISJOINT if the region
- * does not overlap @name, return REGION_MIXED if the region overlaps
- * @type and another resource, and return REGION_INTERSECTS if the
- * region overlaps @type and no other defined resource. Note, that
- * REGION_INTERSECTS is also returned in the case when the specified
- * region overlaps RAM and undefined memory holes.
+ * resource identified by @flags and @desc (optional with IORES_DESC_NONE).
+ * Return REGION_DISJOINT if the region does not overlap @flags/@desc,
+ * return REGION_MIXED if the region overlaps @flags/@desc and another
+ * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc
+ * and no other defined resource. Note that REGION_INTERSECTS is also
+ * returned in the case when the specified region overlaps RAM and undefined
+ * memory holes.
*
* region_intersect() is used by memory remapping functions to ensure
* the user is not remapping RAM and is a vast speed up over walking
* through the resource table page by page.
*/
-int region_intersects(resource_size_t start, size_t size, const char *name)
+int region_intersects(resource_size_t start, size_t size, unsigned long flags,
+ unsigned long desc)
{
- unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
resource_size_t end = start + size - 1;
int type = 0; int other = 0;
struct resource *p;
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
- bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+ bool is_type = (((p->flags & flags) == flags) &&
+ ((desc == IORES_DESC_NONE) ||
+ (desc == p->desc)));
if (start >= p->start && start <= p->end)
is_type ? type++ : other++;
@@ -538,6 +559,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name)
return REGION_DISJOINT;
}
+EXPORT_SYMBOL_GPL(region_intersects);
void __weak arch_remove_reservations(struct resource *avail)
{
@@ -667,7 +689,7 @@ static int reallocate_resource(struct resource *root, struct resource *old,
old->start = new.start;
old->end = new.end;
} else {
- __release_resource(old);
+ __release_resource(old, true);
*old = new;
conflict = __request_resource(root, old);
BUG_ON(conflict);
@@ -813,6 +835,9 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
* entirely fit within the range of the new resource, then the new
* resource is inserted and the conflicting resources become children of
* the new resource.
+ *
+ * This function is intended for producers of resources, such as FW modules
+ * and bus drivers.
*/
struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
{
@@ -830,6 +855,9 @@ struct resource *insert_resource_conflict(struct resource *parent, struct resour
* @new: new resource to insert
*
* Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is intended for producers of resources, such as FW modules
+ * and bus drivers.
*/
int insert_resource(struct resource *parent, struct resource *new)
{
@@ -838,6 +866,7 @@ int insert_resource(struct resource *parent, struct resource *new)
conflict = insert_resource_conflict(parent, new);
return conflict ? -EBUSY : 0;
}
+EXPORT_SYMBOL_GPL(insert_resource);
/**
* insert_resource_expand_to_fit - Insert a resource into the resource tree
@@ -873,6 +902,32 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
write_unlock(&resource_lock);
}
+/**
+ * remove_resource - Remove a resource in the resource tree
+ * @old: resource to remove
+ *
+ * Returns 0 on success, -EINVAL if the resource is not valid.
+ *
+ * This function removes a resource previously inserted by insert_resource()
+ * or insert_resource_conflict(), and moves the children (if any) up to
+ * where they were before. insert_resource() and insert_resource_conflict()
+ * insert a new resource, and move any conflicting resources down to the
+ * children of the new resource.
+ *
+ * insert_resource(), insert_resource_conflict() and remove_resource() are
+ * intended for producers of resources, such as FW modules and bus drivers.
+ */
+int remove_resource(struct resource *old)
+{
+ int retval;
+
+ write_lock(&resource_lock);
+ retval = __release_resource(old, false);
+ write_unlock(&resource_lock);
+ return retval;
+}
+EXPORT_SYMBOL_GPL(remove_resource);
+
static int __adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size)
{
@@ -948,6 +1003,7 @@ static void __init __reserve_region_with_split(struct resource *root,
res->start = start;
res->end = end;
res->flags = IORESOURCE_BUSY;
+ res->desc = IORES_DESC_NONE;
while (1) {
@@ -982,6 +1038,7 @@ static void __init __reserve_region_with_split(struct resource *root,
next_res->start = conflict->end + 1;
next_res->end = end;
next_res->flags = IORESOURCE_BUSY;
+ next_res->desc = IORES_DESC_NONE;
}
} else {
res->start = conflict->end + 1;
@@ -1071,21 +1128,24 @@ struct resource * __request_region(struct resource *parent,
res->name = name;
res->start = start;
res->end = start + n - 1;
- res->flags = resource_type(parent);
- res->flags |= IORESOURCE_BUSY | flags;
write_lock(&resource_lock);
for (;;) {
struct resource *conflict;
+ res->flags = resource_type(parent) | resource_ext_type(parent);
+ res->flags |= IORESOURCE_BUSY | flags;
+ res->desc = parent->desc;
+
conflict = __request_resource(parent, res);
if (!conflict)
break;
if (conflict != parent) {
- parent = conflict;
- if (!(conflict->flags & IORESOURCE_BUSY))
+ if (!(conflict->flags & IORESOURCE_BUSY)) {
+ parent = conflict;
continue;
+ }
}
if (conflict->flags & flags & IORESOURCE_MUXED) {
add_wait_queue(&muxed_resource_wait, &wait);
@@ -1237,6 +1297,7 @@ int release_mem_region_adjustable(struct resource *parent,
new_res->start = end + 1;
new_res->end = res->end;
new_res->flags = res->flags;
+ new_res->desc = res->desc;
new_res->parent = res->parent;
new_res->sibling = res->sibling;
new_res->child = NULL;
@@ -1412,6 +1473,7 @@ static int __init reserve_setup(char *str)
res->start = io_start;
res->end = io_start + io_num - 1;
res->flags = IORESOURCE_BUSY;
+ res->desc = IORES_DESC_NONE;
res->child = NULL;
if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
reserved = x+1;
@@ -1498,8 +1560,15 @@ int iomem_is_exclusive(u64 addr)
break;
if (p->end < addr)
continue;
- if (p->flags & IORESOURCE_BUSY &&
- p->flags & IORESOURCE_EXCLUSIVE) {
+ /*
+ * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
+ * or CONFIG_IO_STRICT_DEVMEM is enabled and the
+ * resource is busy.
+ */
+ if ((p->flags & IORESOURCE_BUSY) == 0)
+ continue;
+ if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
+ || p->flags & IORESOURCE_EXCLUSIVE) {
err = 1;
break;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..302d6ebd64f7 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,9 +13,10 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
+obj-y += wait.o swait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 750ed601ddf7..a5d966cb8891 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
ag = autogroup_task_get(p);
down_write(&ag->lock);
- err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
+ err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
if (!err)
ag->nice = nice;
up_write(&ag->lock);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index caf4041f5b0a..fedb967a9841 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -61,6 +61,7 @@
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
+#include <linux/tick.h>
/*
* Scheduler clock - returns current time in nanosec units.
@@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void)
{
if (!sched_clock_stable())
static_key_slow_inc(&__sched_clock_stable);
+
+ tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
void set_sched_clock_stable(void)
@@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work)
/* XXX worry about clock continuity */
if (sched_clock_stable())
static_key_slow_dec(&__sched_clock_stable);
+
+ tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
@@ -354,7 +359,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
return;
sched_clock_tick();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 732e993b564b..4ee3ce7ec78d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
* Thomas Gleixner, Mike Kravetz
*/
+#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
@@ -66,12 +67,10 @@
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
-#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
-#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
@@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
#undef SCHED_FEAT
-#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_FEAT(name, enabled) \
- #name ,
-
-static const char * const sched_feat_names[] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static int sched_feat_show(struct seq_file *m, void *v)
-{
- int i;
-
- for (i = 0; i < __SCHED_FEAT_NR; i++) {
- if (!(sysctl_sched_features & (1UL << i)))
- seq_puts(m, "NO_");
- seq_printf(m, "%s ", sched_feat_names[i]);
- }
- seq_puts(m, "\n");
-
- return 0;
-}
-
-#ifdef HAVE_JUMP_LABEL
-
-#define jump_label_key__true STATIC_KEY_INIT_TRUE
-#define jump_label_key__false STATIC_KEY_INIT_FALSE
-
-#define SCHED_FEAT(name, enabled) \
- jump_label_key__##enabled ,
-
-struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
-#include "features.h"
-};
-
-#undef SCHED_FEAT
-
-static void sched_feat_disable(int i)
-{
- static_key_disable(&sched_feat_keys[i]);
-}
-
-static void sched_feat_enable(int i)
-{
- static_key_enable(&sched_feat_keys[i]);
-}
-#else
-static void sched_feat_disable(int i) { };
-static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
-
-static int sched_feat_set(char *cmp)
-{
- int i;
- int neg = 0;
-
- if (strncmp(cmp, "NO_", 3) == 0) {
- neg = 1;
- cmp += 3;
- }
-
- for (i = 0; i < __SCHED_FEAT_NR; i++) {
- if (strcmp(cmp, sched_feat_names[i]) == 0) {
- if (neg) {
- sysctl_sched_features &= ~(1UL << i);
- sched_feat_disable(i);
- } else {
- sysctl_sched_features |= (1UL << i);
- sched_feat_enable(i);
- }
- break;
- }
- }
-
- return i;
-}
-
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[64];
- char *cmp;
- int i;
- struct inode *inode;
-
- if (cnt > 63)
- cnt = 63;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
-
- /* Ensure the static_key remains in a consistent state */
- inode = file_inode(filp);
- mutex_lock(&inode->i_mutex);
- i = sched_feat_set(cmp);
- mutex_unlock(&inode->i_mutex);
- if (i == __SCHED_FEAT_NR)
- return -EINVAL;
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, sched_feat_show, NULL);
-}
-
-static const struct file_operations sched_feat_fops = {
- .open = sched_feat_open,
- .write = sched_feat_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static __init int sched_init_debug(void)
-{
- debugfs_create_file("sched_features", 0644, NULL, NULL,
- &sched_feat_fops);
-
- return 0;
-}
-late_initcall(sched_init_debug);
-#endif /* CONFIG_SCHED_DEBUG */
-
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
@@ -453,20 +320,6 @@ static inline void init_hrtick(void)
}
#endif /* CONFIG_SCHED_HRTICK */
-/*
- * cmpxchg based fetch_or, macro so it works for different integer types
- */
-#define fetch_or(ptr, val) \
-({ typeof(*(ptr)) __old, __val = *(ptr); \
- for (;;) { \
- __old = cmpxchg((ptr), __val, __val | (val)); \
- if (__old == __val) \
- break; \
- __val = __old; \
- } \
- __old; \
-})
-
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
@@ -715,31 +568,36 @@ static inline bool got_nohz_idle_kick(void)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
-bool sched_can_stop_tick(void)
+bool sched_can_stop_tick(struct rq *rq)
{
+ int fifo_nr_running;
+
+ /* Deadline tasks, even if single, need the tick */
+ if (rq->dl.dl_nr_running)
+ return false;
+
/*
- * FIFO realtime policy runs the highest priority task. Other runnable
- * tasks are of a lower priority. The scheduler tick does nothing.
+ * FIFO realtime policy runs the highest priority task (after DEADLINE).
+ * Other runnable tasks are of a lower priority. The scheduler tick
+ * isn't needed.
*/
- if (current->policy == SCHED_FIFO)
+ fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ if (fifo_nr_running)
return true;
/*
* Round-robin realtime tasks time slice with other tasks at the same
- * realtime priority. Is this task the only one at this priority?
+ * realtime priority.
*/
- if (current->policy == SCHED_RR) {
- struct sched_rt_entity *rt_se = &current->rt;
-
- return rt_se->run_list.prev == rt_se->run_list.next;
+ if (rq->rt.rr_nr_running) {
+ if (rq->rt.rr_nr_running == 1)
+ return true;
+ else
+ return false;
}
- /*
- * More than one running task need preemption.
- * nr_running update is assumed to be visible
- * after IPI is sent from wakers.
- */
- if (this_rq()->nr_running > 1)
+ /* Normal multitasking need periodic preemption checks */
+ if (rq->cfs.nr_running > 1)
return false;
return true;
@@ -823,8 +681,8 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(prio_to_weight[prio]);
- load->inv_weight = prio_to_wmult[prio];
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1071,8 +929,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ dequeue_task(rq, p, 0);
set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock);
@@ -1080,8 +938,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
return rq;
@@ -1274,6 +1132,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
+ /*
+ * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ * because schedstat_wait_{start,end} rebase migrating task's wait_start
+ * time relying on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1310,9 +1177,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*
@@ -1905,6 +1774,97 @@ static void ttwu_queue(struct task_struct *p, int cpu)
raw_spin_unlock(&rq->lock);
}
+/*
+ * Notes on Program-Order guarantees on SMP systems.
+ *
+ * MIGRATION
+ *
+ * The basic program-order guarantee on SMP systems is that when a task [t]
+ * migrates, all its activity on its old cpu [c0] happens-before any subsequent
+ * execution on its new cpu [c1].
+ *
+ * For migration (of runnable tasks) this is provided by the following means:
+ *
+ * A) UNLOCK of the rq(c0)->lock scheduling out task t
+ * B) migration for t is required to synchronize *both* rq(c0)->lock and
+ * rq(c1)->lock (if not at the same time, then in that order).
+ * C) LOCK of the rq(c1)->lock scheduling in task
+ *
+ * Transitivity guarantees that B happens after A and C after B.
+ * Note: we only require RCpc transitivity.
+ * Note: the cpu doing B need not be c0 or c1
+ *
+ * Example:
+ *
+ * CPU0 CPU1 CPU2
+ *
+ * LOCK rq(0)->lock
+ * sched-out X
+ * sched-in Y
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(0)->lock // orders against CPU0
+ * dequeue X
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(1)->lock
+ * enqueue X
+ * UNLOCK rq(1)->lock
+ *
+ * LOCK rq(1)->lock // orders against CPU2
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(1)->lock
+ *
+ *
+ * BLOCKING -- aka. SLEEP + WAKEUP
+ *
+ * For blocking we (obviously) need to provide the same guarantee as for
+ * migration. However the means are completely different as there is no lock
+ * chain to provide order. Instead we do:
+ *
+ * 1) smp_store_release(X->on_cpu, 0)
+ * 2) smp_cond_acquire(!X->on_cpu)
+ *
+ * Example:
+ *
+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
+ *
+ * LOCK rq(0)->lock LOCK X->pi_lock
+ * dequeue X
+ * sched-out X
+ * smp_store_release(X->on_cpu, 0);
+ *
+ * smp_cond_acquire(!X->on_cpu);
+ * X->state = WAKING
+ * set_task_cpu(X,2)
+ *
+ * LOCK rq(2)->lock
+ * enqueue X
+ * X->state = RUNNING
+ * UNLOCK rq(2)->lock
+ *
+ * LOCK rq(2)->lock // orders against CPU1
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(2)->lock
+ *
+ * UNLOCK X->pi_lock
+ * UNLOCK rq(0)->lock
+ *
+ *
+ * However; for wakeups there is a second guarantee we must provide, namely we
+ * must observe the state that lead to our wakeup. That is, not only must our
+ * task observe its own prior state, it must also observe the stores prior to
+ * its wakeup.
+ *
+ * This means that any means of doing remote wakeups must order the CPU doing
+ * the wakeup against the CPU the task is going to end up running on. This,
+ * however, is already required for the regular Program-Order guarantee above,
+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ *
+ */
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
@@ -1968,19 +1928,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
- */
- while (p->on_cpu)
- cpu_relax();
- /*
- * Combined with the control dependency above, we have an effective
- * smp_load_acquire() without the need for full barriers.
*
* Pairs with the smp_store_release() in finish_lock_switch().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_rmb();
+ smp_cond_acquire(!p->on_cpu);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -1997,7 +1951,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu);
stat:
- ttwu_stat(p, cpu, wake_flags);
+ if (schedstat_enabled())
+ ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2045,7 +2000,8 @@ static void try_to_wake_up_local(struct task_struct *p)
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
- ttwu_stat(p, smp_processor_id(), 0);
+ if (schedstat_enabled())
+ ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
@@ -2087,7 +2043,6 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
dl_se->dl_yielded = 0;
}
@@ -2109,7 +2064,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
+ /* Even if schedstat is disabled, there should not be garbage */
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -2118,6 +2078,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
+ p->rt.timeout = 0;
+ p->rt.time_slice = sched_rr_timeslice;
+ p->rt.on_rq = 0;
+ p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2181,6 +2145,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif
#endif
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+#ifdef CONFIG_SCHEDSTATS
+static void set_schedstats(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_schedstats);
+ else
+ static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+ if (!schedstat_enabled()) {
+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+ static_branch_enable(&sched_schedstats);
+ }
+}
+
+static int __init setup_schedstats(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ if (!strcmp(str, "enable")) {
+ set_schedstats(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_schedstats(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse schedstats=\n");
+
+ return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_schedstats);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_schedstats(state);
+ return err;
+}
+#endif
+#endif
+
/*
* fork()/clone()-time setup:
*/
@@ -2910,16 +2937,6 @@ u64 scheduler_tick_max_deferment(void)
}
#endif
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
- if (in_lock_functions(addr)) {
- addr = CALLER_ADDR2;
- if (in_lock_functions(addr))
- addr = CALLER_ADDR3;
- }
- return addr;
-}
-
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
@@ -2941,7 +2958,7 @@ void preempt_count_add(int val)
PREEMPT_MASK - 10);
#endif
if (preempt_count() == val) {
- unsigned long ip = get_parent_ip(CALLER_ADDR1);
+ unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip;
#endif
@@ -2968,7 +2985,7 @@ void preempt_count_sub(int val)
#endif
if (preempt_count() == val)
- trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
@@ -3109,7 +3126,6 @@ static void __sched notrace __schedule(bool preempt)
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
/*
@@ -3128,13 +3144,16 @@ static void __sched notrace __schedule(bool preempt)
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ local_irq_disable();
+ rcu_note_context_switch();
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -3155,7 +3174,7 @@ static void __sched notrace __schedule(bool preempt)
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup;
- to_wakeup = wq_worker_sleeping(prev, cpu);
+ to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
@@ -3178,7 +3197,6 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
- cpu = cpu_of(rq);
} else {
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
@@ -3364,7 +3382,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+ int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3392,11 +3410,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
+
+ if (oldprio == prio)
+ queue_flag &= ~DEQUEUE_MOVE;
+
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, queue_flag);
if (running)
put_prev_task(rq, p);
@@ -3414,7 +3436,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- enqueue_flag |= ENQUEUE_REPLENISH;
+ queue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3422,7 +3444,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
- enqueue_flag |= ENQUEUE_HEAD;
+ queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3437,7 +3459,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, enqueue_flag);
+ enqueue_task(rq, p, queue_flag);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3793,6 +3815,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq *rq;
int reset_on_fork;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
@@ -3975,17 +3998,14 @@ change:
* itself.
*/
new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
- }
+ if (new_effective_prio == oldprio)
+ queue_flags &= ~DEQUEUE_MOVE;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
@@ -3995,15 +4015,14 @@ change:
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
- int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
- if (oldprio <= p->prio)
- enqueue_flags |= ENQUEUE_HEAD;
+ if (oldprio < p->prio)
+ queue_flags |= ENQUEUE_HEAD;
- enqueue_task(rq, p, enqueue_flags);
+ enqueue_task(rq, p, queue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
@@ -4994,6 +5013,8 @@ void init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ kasan_unpoison_task_stack(idle);
+
#ifdef CONFIG_SMP
/*
* Its possible that init_idle() gets called multiple times on a task,
@@ -5303,183 +5324,6 @@ static void migrate_tasks(struct rq *dead_rq)
}
#endif /* CONFIG_HOTPLUG_CPU */
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
- {
- .procname = "sched_domain",
- .mode = 0555,
- },
- {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = sd_ctl_dir,
- },
- {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
- struct ctl_table *entry =
- kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
- return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
- struct ctl_table *entry;
-
- /*
- * In the intermediate directories, both the child directory and
- * procname are dynamically allocated and could fail but the mode
- * will always be set. In the lowest directory the names are
- * static strings and all have proc handlers.
- */
- for (entry = *tablep; entry->mode; entry++) {
- if (entry->child)
- sd_free_ctl_entry(&entry->child);
- if (entry->proc_handler == NULL)
- kfree(entry->procname);
- }
-
- kfree(*tablep);
- *tablep = NULL;
-}
-
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
-static void
-set_table_entry(struct ctl_table *entry,
- const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler,
- bool load_idx)
-{
- entry->procname = procname;
- entry->data = data;
- entry->maxlen = maxlen;
- entry->mode = mode;
- entry->proc_handler = proc_handler;
-
- if (load_idx) {
- entry->extra1 = &min_load_idx;
- entry->extra2 = &max_load_idx;
- }
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
-
- if (table == NULL)
- return NULL;
-
- set_table_entry(&table[0], "min_interval", &sd->min_interval,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[1], "max_interval", &sd->max_interval,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[9], "cache_nice_tries",
- &sd->cache_nice_tries,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "max_newidle_lb_cost",
- &sd->max_newidle_lb_cost,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[12], "name", sd->name,
- CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
-
- return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
- struct ctl_table *entry, *table;
- struct sched_domain *sd;
- int domain_num = 0, i;
- char buf[32];
-
- for_each_domain(cpu, sd)
- domain_num++;
- entry = table = sd_alloc_ctl_entry(domain_num + 1);
- if (table == NULL)
- return NULL;
-
- i = 0;
- for_each_domain(cpu, sd) {
- snprintf(buf, 32, "domain%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_domain_table(sd);
- entry++;
- i++;
- }
- return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
- char buf[32];
-
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
-
- if (entry == NULL)
- return;
-
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
- }
-
- WARN_ON(sd_sysctl_header);
- sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
- unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
-
static void set_rq_online(struct rq *rq)
{
if (!rq->online) {
@@ -5590,16 +5434,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
set_cpu_rq_start_time();
return NOTIFY_OK;
- case CPU_ONLINE:
- /*
- * At this point a starting CPU has marked itself as online via
- * set_cpu_online(). But it might not yet have marked itself
- * as active, which is essential from here on.
- */
- set_cpu_active(cpu, true);
- stop_machine_unpark(cpu);
- return NOTIFY_OK;
-
case CPU_DOWN_FAILED:
set_cpu_active(cpu, true);
return NOTIFY_OK;
@@ -6071,11 +5905,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
+ int ret;
+
alloc_bootmem_cpumask_var(&cpu_isolated_map);
- cpulist_parse(str, cpu_isolated_map);
+ ret = cpulist_parse(str, cpu_isolated_map);
+ if (ret) {
+ pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ return 0;
+ }
return 1;
}
-
__setup("isolcpus=", isolated_cpu_setup);
struct s_data {
@@ -6738,7 +6577,7 @@ static void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;
- for (k = 0; k < nr_node_ids; k++) {
+ for_each_node(k) {
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
@@ -7355,6 +7194,9 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7412,11 +7254,12 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
-
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
@@ -7697,7 +7540,7 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kfree(tg);
+ kmem_cache_free(task_group_cache, tg);
}
/* allocate runqueue etc for a new task group */
@@ -7705,7 +7548,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
if (!tg)
return ERR_PTR(-ENOMEM);
@@ -7754,11 +7597,9 @@ void sched_destroy_group(struct task_group *tg)
void sched_offline_group(struct task_group *tg)
{
unsigned long flags;
- int i;
/* end participation in shares distribution */
- for_each_possible_cpu(i)
- unregister_fair_sched_group(tg, i);
+ unregister_fair_sched_group(tg);
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
@@ -7784,7 +7625,7 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE);
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
@@ -7808,7 +7649,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+ enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
task_rq_unlock(rq, tsk, &flags);
}
@@ -8236,7 +8077,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task, void *private)
+static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
@@ -8600,7 +8441,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_files,
- .early_init = 1,
+ .early_init = true,
};
#endif /* CONFIG_CGROUP_SCHED */
@@ -8610,3 +8451,44 @@ void dump_cpu_task(int cpu)
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+const int sched_prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
+
+/*
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetics by turning divisions
+ * into multiplications:
+ */
+const u32 sched_prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb55bbf2..2ddaebf7469a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -279,5 +279,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
.legacy_cftypes = files,
- .early_init = 1,
+ .early_init = true,
};
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100644
index 000000000000..928c4ba32f68
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,37 @@
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ *
+ * Set and publish the update_util_data pointer for the given CPU. That pointer
+ * points to a struct update_util_data object containing a callback function
+ * to call from cpufreq_update_util(). That function will be called from an RCU
+ * read-side critical section, so it must not sleep.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_set_update_util_data(int cpu, struct update_util_data *data)
+{
+ if (WARN_ON(data && !data->func))
+ return;
+
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 05de80b48586..75f98c5498d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -5,6 +5,9 @@
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -259,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
u64 steal;
- cputime_t steal_ct;
+ unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
/*
- * cputime_t may be less precise than nsecs (eg: if it's
- * based on jiffies). Lets cast the result to cputime
+ * steal is in nsecs but our caller is expecting steal
+ * time in jiffies. Lets cast the result to jiffies
* granularity and account the rest on the next rounds.
*/
- steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+ steal_jiffies = nsecs_to_jiffies(steal);
+ this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
- account_steal_time(steal_ct);
- return steal_ct;
+ account_steal_time(jiffies_to_cputime(steal_jiffies));
+ return steal_jiffies;
}
#endif
return false;
@@ -466,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
- if (vtime_accounting_enabled())
+ if (vtime_accounting_cpu_enabled())
return;
if (sched_clock_irqtime) {
@@ -665,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static unsigned long long vtime_delta(struct task_struct *tsk)
+static cputime_t vtime_delta(struct task_struct *tsk)
{
- unsigned long long clock;
+ unsigned long now = READ_ONCE(jiffies);
- clock = local_clock();
- if (clock < tsk->vtime_snap)
+ if (time_before(now, (unsigned long)tsk->vtime_snap))
return 0;
- return clock - tsk->vtime_snap;
+ return jiffies_to_cputime(now - tsk->vtime_snap);
}
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
- unsigned long long delta = vtime_delta(tsk);
+ unsigned long now = READ_ONCE(jiffies);
+ unsigned long delta = now - tsk->vtime_snap;
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
- tsk->vtime_snap += delta;
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
+ tsk->vtime_snap = now;
- /* CHECKME: always safe to convert nsecs to cputime? */
- return nsecs_to_cputime(delta);
+ return jiffies_to_cputime(delta);
}
static void __vtime_account_system(struct task_struct *tsk)
@@ -696,37 +698,44 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ if (!vtime_delta(tsk))
+ return;
+
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
+ write_seqcount_begin(&tsk->vtime_seqcount);
+ if (vtime_delta(tsk))
+ __vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
- write_seqlock(&tsk->vtime_seqlock);
- delta_cpu = get_vtime_delta(tsk);
+ write_seqcount_begin(&tsk->vtime_seqcount);
tsk->vtime_snap_whence = VTIME_SYS;
- account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
- write_sequnlock(&tsk->vtime_seqlock);
+ if (vtime_delta(tsk)) {
+ delta_cpu = get_vtime_delta(tsk);
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ }
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_user_enter(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
+ write_seqcount_begin(&tsk->vtime_seqcount);
+ if (vtime_delta(tsk))
+ __vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_guest_enter(struct task_struct *tsk)
@@ -738,19 +747,20 @@ void vtime_guest_enter(struct task_struct *tsk)
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
+ write_seqcount_begin(&tsk->vtime_seqcount);
+ if (vtime_delta(tsk))
+ __vtime_account_system(tsk);
current->flags |= PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
current->flags &= ~PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -763,24 +773,26 @@ void vtime_account_idle(struct task_struct *tsk)
void arch_vtime_task_switch(struct task_struct *prev)
{
- write_seqlock(&prev->vtime_seqlock);
- prev->vtime_snap_whence = VTIME_SLEEPING;
- write_sequnlock(&prev->vtime_seqlock);
+ write_seqcount_begin(&prev->vtime_seqcount);
+ prev->vtime_snap_whence = VTIME_INACTIVE;
+ write_seqcount_end(&prev->vtime_seqcount);
- write_seqlock(&current->vtime_seqlock);
+ write_seqcount_begin(&current->vtime_seqcount);
current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = sched_clock_cpu(smp_processor_id());
- write_sequnlock(&current->vtime_seqlock);
+ current->vtime_snap = jiffies;
+ write_seqcount_end(&current->vtime_seqcount);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
unsigned long flags;
- write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ local_irq_save(flags);
+ write_seqcount_begin(&t->vtime_seqcount);
t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = sched_clock_cpu(cpu);
- write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+ t->vtime_snap = jiffies;
+ write_seqcount_end(&t->vtime_seqcount);
+ local_irq_restore(flags);
}
cputime_t task_gtime(struct task_struct *t)
@@ -788,17 +800,17 @@ cputime_t task_gtime(struct task_struct *t)
unsigned int seq;
cputime_t gtime;
- if (!context_tracking_is_enabled())
+ if (!vtime_accounting_enabled())
return t->gtime;
do {
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
gtime = t->gtime;
- if (t->flags & PF_VCPU)
+ if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
gtime += vtime_delta(t);
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
return gtime;
}
@@ -821,7 +833,7 @@ fetch_task_cputime(struct task_struct *t,
*udelta = 0;
*sdelta = 0;
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
if (u_dst)
*u_dst = *u_src;
@@ -829,7 +841,7 @@ fetch_task_cputime(struct task_struct *t,
*s_dst = *s_src;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ if (t->vtime_snap_whence == VTIME_INACTIVE ||
is_idle_task(t))
continue;
@@ -845,7 +857,7 @@ fetch_task_cputime(struct task_struct *t,
if (t->vtime_snap_whence == VTIME_SYS)
*sdelta = delta;
}
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
@@ -853,6 +865,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
cputime_t udelta, sdelta;
+ if (!vtime_accounting_enabled()) {
+ if (utime)
+ *utime = t->utime;
+ if (stime)
+ *stime = t->stime;
+ return;
+ }
+
fetch_task_cputime(t, utime, stime, &t->utime,
&t->stime, &udelta, &sdelta);
if (utime)
@@ -866,6 +886,14 @@ void task_cputime_scaled(struct task_struct *t,
{
cputime_t udelta, sdelta;
+ if (!vtime_accounting_enabled()) {
+ if (utimescaled)
+ *utimescaled = t->utimescaled;
+ if (stimescaled)
+ *stimescaled = t->stimescaled;
+ return;
+ }
+
fetch_task_cputime(t, utimescaled, stimescaled,
&t->utimescaled, &t->stimescaled, &udelta, &sdelta);
if (utimescaled)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8b0a15e285f9..affd97ec9f65 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
}
}
- if (leftmost)
+ if (leftmost) {
dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+ dl_rq->earliest_dl.next = p->dl.deadline;
+ }
rb_link_node(&p->pushable_dl_tasks, parent, link);
rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
@@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
next_node = rb_next(&p->pushable_dl_tasks);
dl_rq->pushable_dl_tasks_leftmost = next_node;
+ if (next_node) {
+ dl_rq->earliest_dl.next = rb_entry(next_node,
+ struct task_struct, pushable_dl_tasks)->dl.deadline;
+ }
}
rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
@@ -346,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+ WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+ /*
+ * We are racing with the deadline timer. So, do nothing because
+ * the deadline timer handler will take care of properly recharging
+ * the runtime and postponing the deadline
+ */
+ if (dl_se->dl_throttled)
+ return;
/*
* We use the regular wall clock time to set deadlines in the
@@ -355,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
*/
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
- dl_se->dl_new = 0;
}
/*
@@ -393,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
dl_se->runtime = pi_se->dl_runtime;
}
+ if (dl_se->dl_yielded && dl_se->runtime > 0)
+ dl_se->runtime = 0;
+
/*
* We keep moving the deadline away until we get some
* available runtime for the entity. This ensures correct
@@ -414,7 +430,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* entity.
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
- printk_deferred_once("sched: DL replenish lagged to much\n");
+ printk_deferred_once("sched: DL replenish lagged too much\n");
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
@@ -494,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- /*
- * The arrival of a new instance needs special treatment, i.e.,
- * the actual scheduling parameters have to be "renewed".
- */
- if (dl_se->dl_new) {
- setup_new_dl_entity(dl_se, pi_se);
- return;
- }
-
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@@ -599,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
}
/*
- * This is possible if switched_from_dl() raced against a running
- * callback that took the above !dl_task() path and we've since then
- * switched back into SCHED_DEADLINE.
- *
- * There's nothing to do except drop our task reference.
- */
- if (dl_se->dl_new)
- goto unlock;
-
- /*
* The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled.
*/
@@ -720,6 +717,10 @@ static void update_curr_dl(struct rq *rq)
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
+ /* Kick cpufreq (see the comment in linux/cpufreq.h). */
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_trigger_update(rq_clock(rq));
+
/*
* Consumed budget is computed considering the time as
* observed by schedulable tasks (excluding time spent
@@ -729,8 +730,11 @@ static void update_curr_dl(struct rq *rq)
* approach need further study.
*/
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ if (unlikely((s64)delta_exec <= 0)) {
+ if (unlikely(dl_se->dl_yielded))
+ goto throttle;
return;
+ }
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -743,8 +747,10 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
- dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- if (dl_runtime_exceeded(dl_se)) {
+ dl_se->runtime -= delta_exec;
+
+throttle:
+ if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@@ -782,42 +788,14 @@ static void update_curr_dl(struct rq *rq)
#ifdef CONFIG_SMP
-static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
-
-static inline u64 next_deadline(struct rq *rq)
-{
- struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
-
- if (next && dl_prio(next->prio))
- return next->dl.deadline;
- else
- return 0;
-}
-
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
struct rq *rq = rq_of_dl_rq(dl_rq);
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
- /*
- * If the dl_rq had no -deadline tasks, or if the new task
- * has shorter deadline than the current one on dl_rq, we
- * know that the previous earliest becomes our next earliest,
- * as the new task becomes the earliest itself.
- */
- dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
- } else if (dl_rq->earliest_dl.next == 0 ||
- dl_time_before(deadline, dl_rq->earliest_dl.next)) {
- /*
- * On the other hand, if the new -deadline task has a
- * a later deadline than the earliest one on dl_rq, but
- * it is earlier than the next (if any), we must
- * recompute the next-earliest.
- */
- dl_rq->earliest_dl.next = next_deadline(rq);
}
}
@@ -839,7 +817,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
- dl_rq->earliest_dl.next = next_deadline(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
}
}
@@ -940,7 +917,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
+ if (flags & ENQUEUE_WAKEUP)
update_dl_entity(dl_se, pi_se);
else if (flags & ENQUEUE_REPLENISH)
replenish_dl_entity(dl_se, pi_se);
@@ -1017,18 +994,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*/
static void yield_task_dl(struct rq *rq)
{
- struct task_struct *p = rq->curr;
-
/*
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
* new scheduling parameters (thanks to dl_yielded=1).
*/
- if (p->dl.runtime > 0) {
- rq->curr->dl.dl_yielded = 1;
- p->dl.runtime = 0;
- }
+ rq->curr->dl.dl_yielded = 1;
+
update_rq_clock(rq);
update_curr_dl(rq);
/*
@@ -1274,28 +1247,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
return 0;
}
-/* Returns the second earliest -deadline task, NULL otherwise */
-static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
-{
- struct rb_node *next_node = rq->dl.rb_leftmost;
- struct sched_dl_entity *dl_se;
- struct task_struct *p = NULL;
-
-next_node:
- next_node = rb_next(next_node);
- if (next_node) {
- dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
- p = dl_task_of(dl_se);
-
- if (pick_dl_task(rq, p, cpu))
- return p;
-
- goto next_node;
- }
-
- return NULL;
-}
-
/*
* Return the earliest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise:
@@ -1767,6 +1718,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
+ if (dl_time_before(p->dl.deadline, rq_clock(rq)))
+ setup_new_dl_entity(&p->dl, &p->dl);
+
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@@ -1813,8 +1767,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
resched_curr(rq);
#endif /* CONFIG_SMP */
- } else
- switched_to_dl(rq, p);
+ }
}
const struct sched_class dl_sched_class = {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 641511771ae6..4fbc3bd5ff60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -16,6 +16,7 @@
#include <linux/kallsyms.h>
#include <linux/utsname.h>
#include <linux/mempolicy.h>
+#include <linux/debugfs.h>
#include "sched.h"
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
+#define SCHED_FEAT(name, enabled) \
+ #name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
+ if (!(sysctl_sched_features & (1UL << i)))
+ seq_puts(m, "NO_");
+ seq_printf(m, "%s ", sched_feat_names[i]);
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled) \
+ jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+ static_key_disable(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+ static_key_enable(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+ int i;
+ int neg = 0;
+
+ if (strncmp(cmp, "NO_", 3) == 0) {
+ neg = 1;
+ cmp += 3;
+ }
+
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
+ if (strcmp(cmp, sched_feat_names[i]) == 0) {
+ if (neg) {
+ sysctl_sched_features &= ~(1UL << i);
+ sched_feat_disable(i);
+ } else {
+ sysctl_sched_features |= (1UL << i);
+ sched_feat_enable(i);
+ }
+ break;
+ }
+ }
+
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+ struct inode *inode;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ /* Ensure the static_key remains in a consistent state */
+ inode = file_inode(filp);
+ inode_lock(inode);
+ i = sched_feat_set(cmp);
+ inode_unlock(inode);
+ if (i == __SCHED_FEAT_NR)
+ return -EINVAL;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+ .open = sched_feat_open,
+ .write = sched_feat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int sched_init_debug(void)
+{
+ debugfs_create_file("sched_features", 0644, NULL, NULL,
+ &sched_feat_fops);
+
+ return 0;
+}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_SYSCTL
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ umode_t mode, proc_handler *proc_handler,
+ bool load_idx)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+
+ if (load_idx) {
+ entry->extra1 = &min_load_idx;
+ entry->extra2 = &max_load_idx;
+ }
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[9], "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[10], "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+ /* &table[13] is terminator */
+
+ return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+void register_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_possible_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = entry;
+
+ if (entry == NULL)
+ return;
+
+ for_each_possible_cpu(i) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ entry++;
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+ if (sd_ctl_dir[0].child)
+ sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* CONFIG_SMP */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->vruntime);
PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
- PN(se->statistics.wait_start);
- PN(se->statistics.sleep_start);
- PN(se->statistics.block_start);
- PN(se->statistics.sleep_max);
- PN(se->statistics.block_max);
- PN(se->statistics.exec_max);
- PN(se->statistics.slice_max);
- PN(se->statistics.wait_max);
- PN(se->statistics.wait_sum);
- P(se->statistics.wait_count);
+ if (schedstat_enabled()) {
+ PN(se->statistics.wait_start);
+ PN(se->statistics.sleep_start);
+ PN(se->statistics.block_start);
+ PN(se->statistics.sleep_max);
+ PN(se->statistics.block_max);
+ PN(se->statistics.exec_max);
+ PN(se->statistics.slice_max);
+ PN(se->statistics.wait_max);
+ PN(se->statistics.wait_sum);
+ P(se->statistics.wait_count);
+ }
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw),
p->prio);
#ifdef CONFIG_SCHEDSTATS
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.statistics.wait_sum),
- SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+ if (schedstat_enabled()) {
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ SPLIT_NS(p->se.statistics.wait_sum),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+ }
#else
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
0LL, 0L,
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
{
+ struct dl_bw *dl_bw;
+
SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+#ifdef CONFIG_SMP
+ dl_bw = &cpu_rq(cpu)->rd->dl_bw;
+#else
+ dl_bw = &dl_rq->dl_bw;
+#endif
+ SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
+ SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
}
extern __read_mostly int sched_clock_running;
@@ -313,17 +630,18 @@ do { \
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
- P(yld_count);
-
- P(sched_count);
- P(sched_goidle);
#ifdef CONFIG_SMP
P64(avg_idle);
P64(max_idle_balance_cost);
#endif
- P(ttwu_count);
- P(ttwu_local);
+ if (schedstat_enabled()) {
+ P(yld_count);
+ P(sched_count);
+ P(sched_goidle);
+ P(ttwu_count);
+ P(ttwu_local);
+ }
#undef P
#undef P64
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
- PN(se.statistics.sum_sleep_runtime);
- PN(se.statistics.wait_start);
- PN(se.statistics.sleep_start);
- PN(se.statistics.block_start);
- PN(se.statistics.sleep_max);
- PN(se.statistics.block_max);
- PN(se.statistics.exec_max);
- PN(se.statistics.slice_max);
- PN(se.statistics.wait_max);
- PN(se.statistics.wait_sum);
- P(se.statistics.wait_count);
- PN(se.statistics.iowait_sum);
- P(se.statistics.iowait_count);
P(se.nr_migrations);
- P(se.statistics.nr_migrations_cold);
- P(se.statistics.nr_failed_migrations_affine);
- P(se.statistics.nr_failed_migrations_running);
- P(se.statistics.nr_failed_migrations_hot);
- P(se.statistics.nr_forced_migrations);
- P(se.statistics.nr_wakeups);
- P(se.statistics.nr_wakeups_sync);
- P(se.statistics.nr_wakeups_migrate);
- P(se.statistics.nr_wakeups_local);
- P(se.statistics.nr_wakeups_remote);
- P(se.statistics.nr_wakeups_affine);
- P(se.statistics.nr_wakeups_affine_attempts);
- P(se.statistics.nr_wakeups_passive);
- P(se.statistics.nr_wakeups_idle);
- {
+ if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
+ PN(se.statistics.sum_sleep_runtime);
+ PN(se.statistics.wait_start);
+ PN(se.statistics.sleep_start);
+ PN(se.statistics.block_start);
+ PN(se.statistics.sleep_max);
+ PN(se.statistics.block_max);
+ PN(se.statistics.exec_max);
+ PN(se.statistics.slice_max);
+ PN(se.statistics.wait_max);
+ PN(se.statistics.wait_sum);
+ P(se.statistics.wait_count);
+ PN(se.statistics.iowait_sum);
+ P(se.statistics.iowait_count);
+ P(se.statistics.nr_migrations_cold);
+ P(se.statistics.nr_failed_migrations_affine);
+ P(se.statistics.nr_failed_migrations_running);
+ P(se.statistics.nr_failed_migrations_hot);
+ P(se.statistics.nr_forced_migrations);
+ P(se.statistics.nr_wakeups);
+ P(se.statistics.nr_wakeups_sync);
+ P(se.statistics.nr_wakeups_migrate);
+ P(se.statistics.nr_wakeups_local);
+ P(se.statistics.nr_wakeups_remote);
+ P(se.statistics.nr_wakeups_affine);
+ P(se.statistics.nr_wakeups_affine_attempts);
+ P(se.statistics.nr_wakeups_passive);
+ P(se.statistics.nr_wakeups_idle);
+
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cfdc0e61066c..46d64e4ccfde 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,8 +20,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include <linux/latencytop.h>
#include <linux/sched.h>
+#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
@@ -738,16 +738,52 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta;
+
+ delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
}
/*
* Task is being enqueued - update stats:
*/
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
@@ -757,25 +793,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Mark the end of the wait period if dequeueing a
@@ -783,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
+
+ if (flags & DEQUEUE_SLEEP) {
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+ }
+ }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
}
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
/*
* We are picking a new current task - update its stats:
*/
@@ -880,10 +932,11 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
+ int active_nodes;
struct rcu_head rcu;
- nodemask_t active_nodes;
unsigned long total_faults;
+ unsigned long max_faults_cpu;
/*
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
@@ -942,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+ return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int maxdist, bool task)
@@ -1091,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
return true;
/*
- * Do not migrate if the destination is not a node that
- * is actively used by this numa group.
- */
- if (!node_isset(dst_nid, ng->active_nodes))
- return false;
-
- /*
- * Source is a node that is not actively used by this
- * numa group, while the destination is. Migrate.
+ * Destination node is much more heavily used than the source
+ * node? Allow migration.
*/
- if (!node_isset(src_nid, ng->active_nodes))
+ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+ ACTIVE_NODE_FRACTION)
return true;
/*
- * Both source and destination are nodes in active
- * use by this numa group. Maximize memory bandwidth
- * by migrating from more heavily used groups, to less
- * heavily used ones, spreading the load around.
- * Use a 1/4 hysteresis to avoid spurious page movement.
+ * Distribute memory according to CPU & memory use on each node,
+ * with 3/4 hysteresis to avoid unnecessary memory migrations:
+ *
+ * faults_cpu(dst) 3 faults_cpu(src)
+ * --------------- * - > ---------------
+ * faults_mem(dst) 4 faults_mem(src)
*/
- return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+ group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
static unsigned long weighted_cpuload(const int cpu);
@@ -1193,8 +1254,6 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1262,20 +1321,30 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
+ * No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ else {
+ /*
+ * The task_struct must be protected here to protect the
+ * p->numa_faults access in the task_weight since the
+ * numa_faults could already be freed in the following path:
+ * finish_task_switch()
+ * --> put_task_struct()
+ * --> __put_task_struct()
+ * --> task_numa_free()
+ */
+ get_task_struct(cur);
+ }
+
raw_spin_unlock_irq(&dst_rq->lock);
/*
@@ -1359,6 +1428,7 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1384,9 +1454,16 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
+ assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ /*
+ * The dst_rq->curr isn't assigned. The protection for task_struct is
+ * finished.
+ */
+ if (cur && !assigned)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -1441,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_task = NULL,
.best_imp = 0,
- .best_cpu = -1
+ .best_cpu = -1,
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
@@ -1493,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group &&
- nodes_weight(p->numa_group->active_nodes) > 1)) {
+ if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1529,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
* trying for a better one later. Do not set the preferred node here.
*/
if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+
if (env.best_cpu == -1)
nid = env.src_nid;
else
nid = env.dst_nid;
- if (node_isset(nid, p->numa_group->active_nodes))
+ if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
sched_setnuma(p, env.dst_nid);
}
@@ -1584,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes on the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
*/
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
unsigned long faults, max_faults = 0;
- int nid;
+ int nid, active_nodes = 0;
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
@@ -1607,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
- if (!node_isset(nid, numa_group->active_nodes)) {
- if (faults > max_faults * 6 / 16)
- node_set(nid, numa_group->active_nodes);
- } else if (faults < max_faults * 3 / 16)
- node_clear(nid, numa_group->active_nodes);
+ if (faults * ACTIVE_NODE_FRACTION > max_faults)
+ active_nodes++;
}
+
+ numa_group->max_faults_cpu = max_faults;
+ numa_group->active_nodes = active_nodes;
}
/*
@@ -1903,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) {
- update_numa_active_node_mask(p->numa_group);
+ numa_group_count_active_nodes(p->numa_group);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_group_nid);
}
@@ -1947,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
return;
atomic_set(&grp->refcount, 1);
+ grp->active_nodes = 1;
+ grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
nr_node_ids;
- node_set(task_node(current), grp->active_nodes);
-
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
@@ -2068,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
+ struct numa_group *ng;
int priv;
if (!static_branch_likely(&sched_numa_balancing))
@@ -2108,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- if (!priv && !local && p->numa_group &&
- node_isset(cpu_node, p->numa_group->active_nodes) &&
- node_isset(mem_node, p->numa_group->active_nodes))
+ ng = p->numa_group;
+ if (!priv && !local && ng && ng->active_nodes > 1 &&
+ numa_is_active_node(cpu_node, ng) &&
+ numa_is_active_node(mem_node, ng))
local = 1;
task_numa_placement(p);
@@ -2155,6 +2230,7 @@ void task_numa_work(struct callback_head *work)
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2277,6 +2353,17 @@ out:
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
+
+ /*
+ * Make sure tasks use at least 32x as much time to run other code
+ * than they used here, to limit NUMA PTE scanning overhead to 3% max.
+ * Usually update_task_scan_period slows down scanning enough; on an
+ * overloaded system we need to limit overhead on a per task basis.
+ */
+ if (unlikely(p->se.sum_exec_runtime != runtime)) {
+ u64 diff = p->se.sum_exec_runtime - runtime;
+ p->node_stamp += 32 * diff;
+ }
}
/*
@@ -2670,12 +2757,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+ * We are supposed to update the task to "current" time, then its up to
+ * date and ready to go to new CPU/cfs_rq. But we have difficulty in
+ * getting what current time is, so simply throw away the out-of-date
+ * time. This will result in the wakee task is less decayed, but giving
+ * the wakee more load sounds not bad.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
+ }
+}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2717,7 +2856,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
- int cpu = cpu_of(rq_of(cfs_rq));
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Track task load average for carrying it to new CPU after migrated, and
@@ -2729,6 +2869,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
update_tg_load_avg(cfs_rq, 0);
+
+ if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+ unsigned long max = rq->cpu_capacity_orig;
+
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_clock(rq),
+ min(cfs_rq->avg.util_avg, max), max);
+ }
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2809,48 +2972,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
-
#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
u64 last_update_time_copy;
+ u64 last_update_time;
do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);
-#else
- last_update_time = cfs_rq->avg.last_update_time;
-#endif
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+ return last_update_time;
}
-
-/*
- * Update the rq's load with the elapsed running time before entering
- * idle. if the last scheduled task is not a CFS task, idle_enter will
- * be the only way to update the runnable statistic.
- */
-void idle_enter_fair(struct rq *this_rq)
+#else
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
+ return cfs_rq->avg.last_update_time;
}
+#endif
/*
- * Update the rq's load with the elapsed idle time before a task is
- * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
- * be the only way to update the runnable statistic.
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
*/
-void idle_exit_fair(struct rq *this_rq)
+void remove_entity_load_avg(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ /*
+ * Newly created task or never used group entity should not be removed
+ * from its (source) cfs_rq
+ */
+ if (se->avg.last_update_time == 0)
+ return;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+ atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -2995,6 +3158,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+ if (schedstat_enabled())
+ return;
+
+ /* Force schedstat enabled if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled()) {
+ pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+ "stat_blocked and stat_runtime require the "
+ "kernel parameter schedstats=enabled or "
+ "kernel.sched_schedstats=1\n");
+ }
+#endif
+}
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -3015,11 +3198,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
- enqueue_sleeper(cfs_rq, se);
+ if (schedstat_enabled())
+ enqueue_sleeper(cfs_rq, se);
}
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
+ check_schedstat_required();
+ if (schedstat_enabled()) {
+ update_stats_enqueue(cfs_rq, se);
+ check_spread(cfs_rq, se);
+ }
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -3086,19 +3273,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);
- update_stats_dequeue(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
-
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- }
-#endif
- }
+ if (schedstat_enabled())
+ update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
@@ -3172,7 +3348,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ if (schedstat_enabled())
+ update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
@@ -3185,7 +3362,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
- if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
@@ -3268,9 +3445,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
+ if (schedstat_enabled()) {
+ check_spread(cfs_rq, prev);
+ if (prev->on_rq)
+ update_stats_wait_start(cfs_rq, prev);
+ }
+
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -4240,42 +4421,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ * The exact cpuload calculated at every tick would be:
+ *
+ * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
*
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ * If a cpu misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when cpu may be busy, then we have:
+ *
+ * load_n = (1 - 1/2^i)^n * load_0
+ * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
*
* decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * load' = (1 - 1/2^i)^n * load
+ *
+ * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
+ * This allows us to precompute the above in said factors, thereby allowing the
+ * reduction of an arbitrary n in O(log_2 n) steps. (See also
+ * fixed_power_int())
*
* The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
*/
#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
+
+static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 64, 32, 8, 0, 0, 0, 0, 0 },
+ { 96, 72, 40, 12, 1, 0, 0, 0 },
+ { 112, 98, 75, 43, 15, 1, 0, 0 },
+ { 120, 112, 98, 76, 45, 16, 2, 0 }
+};
/*
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
@@ -4306,14 +4482,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
return load;
}
-/*
+/**
+ * __update_cpu_load - update the rq->cpu_load[] statistics
+ * @this_rq: The rq to update statistics for
+ * @this_load: The current load
+ * @pending_updates: The number of missed updates
+ * @active: !0 for NOHZ_FULL
+ *
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
+ * scheduler tick (TICK_NSEC).
+ *
+ * This function computes a decaying average:
+ *
+ * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
+ *
+ * Because of NOHZ it might not get called on every tick which gives need for
+ * the @pending_updates argument.
+ *
+ * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
+ * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
+ * = A * (A * load[i]_n-2 + B) + B
+ * = A * (A * (A * load[i]_n-3 + B) + B) + B
+ * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
+ * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
+ * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
+ * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
+ *
+ * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
+ * any change in load would have resulted in the tick being turned back on.
+ *
+ * For regular NOHZ, this reduces to:
+ *
+ * load[i]_n = (1 - 1/2^i)^n * load[i]_0
+ *
+ * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
+ * term. See the @active paramter.
*/
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
+ unsigned long pending_updates, int active)
{
+ unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
int i, scale;
this_rq->nr_load_updates++;
@@ -4327,6 +4535,15 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
old_load = this_rq->cpu_load[i];
old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ if (tickless_load) {
+ old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+ /*
+ * old_load can never be a negative value because a
+ * decayed tickless_load cannot be greater than the
+ * original tickless_load.
+ */
+ old_load += tickless_load;
+ }
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -4349,6 +4566,25 @@ static unsigned long weighted_cpuload(const int cpu)
}
#ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load,
+ int active)
+{
+ unsigned long pending_updates;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * In the regular NOHZ case, we were idle, this means load 0.
+ * In the NOHZ_FULL case, we were non-idle, we should consider
+ * its weighted load.
+ */
+ __update_cpu_load(this_rq, load, pending_updates, active);
+ }
+}
+
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4366,46 +4602,31 @@ static unsigned long weighted_cpuload(const int cpu)
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
{
- unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
- unsigned long pending_updates;
-
/*
* bail if there's load or we're actually up-to-date.
*/
- if (load || curr_jiffies == this_rq->last_load_update_tick)
+ if (weighted_cpuload(cpu_of(this_rq)))
return;
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates);
+ __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
}
/*
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
*/
-void update_cpu_load_nohz(void)
+void update_cpu_load_nohz(int active)
{
struct rq *this_rq = this_rq();
unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long pending_updates;
+ unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
- */
- __update_cpu_load(this_rq, 0, pending_updates);
- }
+ __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ */
@@ -4417,10 +4638,10 @@ void update_cpu_load_active(struct rq *this_rq)
{
unsigned long load = weighted_cpuload(cpu_of(this_rq));
/*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
+ __update_cpu_load(this_rq, load, 1, 1);
}
/*
@@ -5007,8 +5228,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. However, the caller only guarantees p->pi_lock is held; no
- * other assumptions, including the state of rq->lock, should be made.
+ * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p)
{
@@ -5721,8 +5941,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
set_task_cpu(p, env->dst_cpu);
}
@@ -5855,8 +6075,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -6302,7 +6522,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
bool *overload)
{
unsigned long load;
- int i;
+ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -6319,7 +6539,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
- if (rq->nr_running > 1)
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
@@ -6327,7 +6548,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
}
@@ -7248,8 +7472,6 @@ static int idle_balance(struct rq *this_rq)
int pulled_task = 0;
u64 curr_cost = 0;
- idle_enter_fair(this_rq);
-
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -7330,10 +7552,8 @@ out:
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
- if (pulled_task) {
- idle_exit_fair(this_rq);
+ if (pulled_task)
this_rq->idle_stamp = 0;
- }
return pulled_task;
}
@@ -7712,7 +7932,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- update_idle_cpu_load(rq);
+ update_cpu_load_idle(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE);
}
@@ -8098,11 +8318,8 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se) {
- if (tg->se[i])
- remove_entity_load_avg(tg->se[i]);
+ if (tg->se)
kfree(tg->se[i]);
- }
}
kfree(tg->cfs_rq);
@@ -8150,21 +8367,29 @@ err:
return 0;
}
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
{
- struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ struct rq *rq;
+ int cpu;
- /*
- * Only empty task groups can be destroyed; so we can speculatively
- * check on_list without danger of it being re-added.
- */
- if (!tg->cfs_rq[cpu]->on_list)
- return;
+ for_each_possible_cpu(cpu) {
+ if (tg->se[cpu])
+ remove_entity_load_avg(tg->se[cpu]);
- raw_spin_lock_irqsave(&rq->lock, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ continue;
+
+ rq = cpu_rq(cpu);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
}
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8246,7 +8471,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4a2ef5a02fd3..bd12c6c714ec 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
+#include <linux/cpuhotplug.h>
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
@@ -97,12 +98,6 @@ void default_idle_call(void)
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
int next_state)
{
- /* Fall back to the default arch idle method on errors. */
- if (next_state < 0) {
- default_idle_call();
- return next_state;
- }
-
/*
* The idle task must be scheduled, it is pointless to go to idle, just
* update no idle residency and return.
@@ -168,7 +163,7 @@ static void cpuidle_idle_call(void)
*/
if (idle_should_freeze()) {
entered_state = cpuidle_enter_freeze(drv, dev);
- if (entered_state >= 0) {
+ if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
@@ -199,8 +194,6 @@ exit_idle:
rcu_idle_exit();
}
-DEFINE_PER_CPU(bool, cpu_dead_idle);
-
/*
* Generic idle loop implementation
*
@@ -219,6 +212,7 @@ static void cpu_idle_loop(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
@@ -226,10 +220,7 @@ static void cpu_idle_loop(void)
rmb();
if (cpu_is_offline(smp_processor_id())) {
- rcu_cpu_notify(NULL, CPU_DYING_IDLE,
- (void *)(long)smp_processor_id());
- smp_mb(); /* all activity before dead. */
- this_cpu_write(cpu_dead_idle, true);
+ cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
@@ -296,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state)
boot_init_stack_canary();
#endif
arch_cpu_idle_prepare();
+ cpuhp_online_idle(state);
cpu_idle_loop();
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c4ae0f1fdf9b..47ce94931f1b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
- idle_exit_fair(rq);
rq_last_tick_reset(rq);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0ea1..c41ea7ac1764 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
- hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+ /*
+ * SCHED_DEADLINE updates the bandwidth, as a run away
+ * RT task with a DL task could hog a CPU. But DL does
+ * not reset the period. If a deadline task was running
+ * without an RT task running, it can cause RT tasks to
+ * throttle when they start up. Kick the timer right away
+ * to update the period.
+ */
+ hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
- return !list_empty(&rt_se->run_list);
+ return rt_se->on_rq;
}
#ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->my_q;
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (!rt_se)
enqueue_top_rt_rq(rt_rq);
else if (!on_rt_rq(rt_se))
- enqueue_rt_entity(rt_se, false);
+ enqueue_rt_entity(rt_se, 0);
if (rt_rq->highest_prio.curr < curr->prio)
resched_curr(rq);
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
if (!rt_se)
dequeue_top_rt_rq(rt_rq);
else if (on_rt_rq(rt_se))
- dequeue_rt_entity(rt_se);
+ dequeue_rt_entity(rt_se, 0);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -945,6 +953,10 @@ static void update_curr_rt(struct rq *rq)
if (curr->sched_class != &rt_sched_class)
return;
+ /* Kick cpufreq (see the comment in linux/cpufreq.h). */
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_trigger_update(rq_clock(rq));
+
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
@@ -1142,12 +1154,27 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
}
static inline
+unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+ struct task_struct *tsk;
+
+ if (group_rq)
+ return group_rq->rr_nr_running;
+
+ tsk = rt_task_of(rt_se);
+
+ return (tsk->policy == SCHED_RR) ? 1 : 0;
+}
+
+static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
+ rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1160,13 +1187,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
+ rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ return false;
+
+ return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+ list_del_init(&rt_se->run_list);
+
+ if (list_empty(array->queue + rt_se_prio(rt_se)))
+ __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+ rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1230,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
* get throttled and the current group doesn't have any other
* active members.
*/
- if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+ if (rt_se->on_list)
+ __delist_rt_entity(rt_se, array);
return;
+ }
- if (head)
- list_add(&rt_se->run_list, queue);
- else
- list_add_tail(&rt_se->run_list, queue);
- __set_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(rt_se->on_list);
+ if (flags & ENQUEUE_HEAD)
+ list_add(&rt_se->run_list, queue);
+ else
+ list_add_tail(&rt_se->run_list, queue);
+
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+ rt_se->on_list = 1;
+ }
+ rt_se->on_rq = 1;
inc_rt_tasks(rt_se, rt_rq);
}
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
- list_del_init(&rt_se->run_list);
- if (list_empty(array->queue + rt_se_prio(rt_se)))
- __clear_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(!rt_se->on_list);
+ __delist_rt_entity(rt_se, array);
+ }
+ rt_se->on_rq = 0;
dec_rt_tasks(rt_se, rt_rq);
}
@@ -1207,7 +1269,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
@@ -1220,31 +1282,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
- __dequeue_rt_entity(rt_se);
+ __dequeue_rt_entity(rt_se, flags);
}
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
- __enqueue_rt_entity(rt_se, head);
+ __enqueue_rt_entity(rt_se, flags);
enqueue_top_rt_rq(&rq->rt);
}
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
- __enqueue_rt_entity(rt_se, false);
+ __enqueue_rt_entity(rt_se, flags);
}
enqueue_top_rt_rq(&rq->rt);
}
@@ -1260,7 +1322,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
- enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+ enqueue_rt_entity(rt_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -1271,7 +1333,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- dequeue_rt_entity(rt_se);
+ dequeue_rt_entity(rt_se, flags);
dequeue_pushable_task(rq, p);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b242775bf670..382848a24ed9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
+#include <linux/binfmts.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
@@ -248,7 +249,12 @@ struct task_group {
unsigned long shares;
#ifdef CONFIG_SMP
- atomic_long_t load_avg;
+ /*
+ * load_avg can be heavily contended at clock tick time, so put
+ * it in its own cacheline separated from the fields above which
+ * will also be accessed at each tick.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
@@ -308,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
-extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
-extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
-#endif
+
+#ifdef CONFIG_SMP
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#else /* CONFIG_CGROUP_SCHED */
@@ -437,6 +450,7 @@ static inline int rt_bandwidth_enabled(void)
struct rt_rq {
struct rt_prio_array active;
unsigned int rt_nr_running;
+ unsigned int rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
@@ -896,6 +910,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
#else
static inline void sched_ttwu_pending(void) { }
@@ -933,6 +959,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif
@@ -1008,6 +1035,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
static inline u64 global_rt_period(void)
{
@@ -1076,7 +1104,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the control dependency and rmb in try_to_wake_up().
+ * Pairs with the smp_cond_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
@@ -1113,59 +1141,43 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WEIGHT_IDLEPRIO 3
#define WMULT_IDLEPRIO 1431655765
-/*
- * Nice levels are multiplicative, with a gentle 10% change for every
- * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- * nice 1, it will get ~10% less CPU time than another CPU-bound task
- * that remained on nice 0.
- *
- * The "10% effect" is relative and cumulative: from _any_ nice level,
- * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- * If a task goes up by ~10% and another task goes down by ~10% then
- * the relative distance between them is ~25%.)
- */
-static const int prio_to_weight[40] = {
- /* -20 */ 88761, 71755, 56483, 46273, 36291,
- /* -15 */ 29154, 23254, 18705, 14949, 11916,
- /* -10 */ 9548, 7620, 6100, 4904, 3906,
- /* -5 */ 3121, 2501, 1991, 1586, 1277,
- /* 0 */ 1024, 820, 655, 526, 423,
- /* 5 */ 335, 272, 215, 172, 137,
- /* 10 */ 110, 87, 70, 56, 45,
- /* 15 */ 36, 29, 23, 18, 15,
-};
+extern const int sched_prio_to_weight[40];
+extern const u32 sched_prio_to_wmult[40];
/*
- * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING - sched_class::task_waking was called
*
- * In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
- * into multiplications:
*/
-static const u32 prio_to_wmult[40] = {
- /* -20 */ 48388, 59856, 76040, 92818, 118348,
- /* -15 */ 147320, 184698, 229616, 287308, 360437,
- /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
-};
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
#define ENQUEUE_WAKEUP 0x01
-#define ENQUEUE_HEAD 0x02
+#define ENQUEUE_RESTORE 0x02
+#define ENQUEUE_MOVE 0x04
+
+#define ENQUEUE_HEAD 0x08
+#define ENQUEUE_REPLENISH 0x10
#ifdef CONFIG_SMP
-#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING 0x20
#else
#define ENQUEUE_WAKING 0x00
#endif
-#define ENQUEUE_REPLENISH 0x08
-#define ENQUEUE_RESTORE 0x10
-
-#define DEQUEUE_SLEEP 0x01
-#define DEQUEUE_SAVE 0x02
#define RETRY_TASK ((void *)-1UL)
@@ -1252,16 +1264,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void idle_enter_fair(struct rq *this_rq);
-extern void idle_exit_fair(struct rq *this_rq);
-
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
-#else
-
-static inline void idle_enter_fair(struct rq *rq) { }
-static inline void idle_exit_fair(struct rq *rq) { }
-
#endif
#ifdef CONFIG_CPU_IDLE
@@ -1310,6 +1314,35 @@ unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
+#ifdef CONFIG_NO_HZ_FULL
+extern bool sched_can_stop_tick(struct rq *rq);
+
+/*
+ * Tick may be needed by tasks in the runqueue depending on their policy and
+ * requirements. If tick is needed, lets send the target an IPI to kick it out of
+ * nohz mode if necessary.
+ */
+static inline void sched_update_tick_dependency(struct rq *rq)
+{
+ int cpu;
+
+ if (!tick_nohz_full_enabled())
+ return;
+
+ cpu = cpu_of(rq);
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (sched_can_stop_tick(rq))
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+ else
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#else
+static inline void sched_update_tick_dependency(struct rq *rq) { }
+#endif
+
static inline void add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
@@ -1321,26 +1354,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
if (!rq->rd->overload)
rq->rd->overload = true;
#endif
-
-#ifdef CONFIG_NO_HZ_FULL
- if (tick_nohz_full_cpu(rq->cpu)) {
- /*
- * Tick is needed if more than one task runs on a CPU.
- * Send the target an IPI to kick it out of nohz mode.
- *
- * We assume that IPI implies full memory barrier and the
- * new value of rq->nr_running is visible on reception
- * from the target.
- */
- tick_nohz_full_kick_cpu(rq->cpu);
- }
-#endif
}
+
+ sched_update_tick_dependency(rq);
}
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
+ /* Check if we still need preemption */
+ sched_update_tick_dependency(rq);
}
static inline void rq_last_tick_reset(struct rq *rq)
@@ -1770,3 +1793,51 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @time: Current time.
+ * @util: Current utilization.
+ * @max: Utilization ceiling.
+ *
+ * This function is called by the scheduler on every invocation of
+ * update_load_avg() on the CPU whose utilization is being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ */
+static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ if (data)
+ data->func(data, time, util, max);
+}
+
+/**
+ * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
+ * @time: Current time.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid. Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_trigger_update(u64 time)
+{
+ cpufreq_update_util(time, ULONG_MAX, 0);
+}
+#else
+static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
+static inline void cpufreq_trigger_update(u64 time) {}
+#endif /* CONFIG_CPU_FREQ */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b0fbc7632de5..70b3b6a20fb0 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val) do { var = (val); } while (0)
+# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
+# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
+# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
+# define schedstat_enabled() 0
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0)
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644
index 000000000000..82f0dff90030
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,123 @@
+#include <linux/sched.h>
+#include <linux/swait.h>
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+ struct lock_class_key *key)
+{
+ raw_spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value; I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+
+ if (list_empty(&q->task_list))
+ return;
+
+ curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+ wake_up_process(curr->task);
+ list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+void swake_up(struct swait_queue_head *q)
+{
+ unsigned long flags;
+
+ if (!swait_active(q))
+ return;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ swake_up_locked(q);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up);
+
+/*
+ * Does not allow usage from IRQ disabled, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+ LIST_HEAD(tmp);
+
+ if (!swait_active(q))
+ return;
+
+ raw_spin_lock_irq(&q->lock);
+ list_splice_init(&q->task_list, &tmp);
+ while (!list_empty(&tmp)) {
+ curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+ wake_up_state(curr->task, TASK_NORMAL);
+ list_del_init(&curr->task_list);
+
+ if (list_empty(&tmp))
+ break;
+
+ raw_spin_unlock_irq(&q->lock);
+ raw_spin_lock_irq(&q->lock);
+ }
+ raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ wait->task = current;
+ if (list_empty(&wait->task_list))
+ list_add(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ __prepare_to_swait(q, wait);
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ if (signal_pending_state(state, current))
+ return -ERESTARTSYS;
+
+ prepare_to_swait(q, wait, state);
+
+ return 0;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ __set_current_state(TASK_RUNNING);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+
+ if (!list_empty_careful(&wait->task_list)) {
+ raw_spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_swait);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 580ac2d4024f..15a1795bbba1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void)
put_seccomp_filter(thread);
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
+
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
/*
* Opt the other thread into seccomp if needed.
* As threads are considered to be trust-realm
* equivalent (see ptrace_may_access), it is safe to
* allow one thread to transition the other.
*/
- if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
- /*
- * Don't let an unprivileged task work around
- * the no_new_privs restriction by creating
- * a thread that sets it up, enters seccomp,
- * then dies.
- */
- if (task_no_new_privs(caller))
- task_set_no_new_privs(thread);
-
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
- }
}
}
diff --git a/kernel/signal.c b/kernel/signal.c
index f3f1f7a972fd..0508544c8ced 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ while (!signal_pending(current)) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
set_restore_sigmask();
return -ERESTARTNOHAND;
}
diff --git a/kernel/smp.c b/kernel/smp.c
index d903c02223af..74165443c240 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -105,13 +105,12 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(struct call_single_data *csd)
{
- while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
- cpu_relax();
+ smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
}
-static void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(struct call_single_data *csd)
{
csd_lock_wait(csd);
csd->flags |= CSD_FLAG_LOCK;
@@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd)
smp_wmb();
}
-static void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(struct call_single_data *csd)
{
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
@@ -569,6 +568,7 @@ void __init smp_init(void)
unsigned int cpu;
idle_threads_init();
+ cpuhp_threads_init();
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d264f59bff56..13bc43d1fb22 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
kthread_unpark(tsk);
}
-void smpboot_unpark_threads(unsigned int cpu)
+int smpboot_unpark_threads(unsigned int cpu)
{
struct smp_hotplug_thread *cur;
@@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu)
if (cpumask_test_cpu(cpu, cur->cpumask))
smpboot_unpark_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
+ return 0;
}
static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
@@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
kthread_park(tsk);
}
-void smpboot_park_threads(unsigned int cpu)
+int smpboot_park_threads(unsigned int cpu)
{
struct smp_hotplug_thread *cur;
@@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu)
list_for_each_entry_reverse(cur, &hotplug_threads, list)
smpboot_park_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
+ return 0;
}
static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 72415a0eb955..485b81cfab34 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -14,7 +14,9 @@ static inline void idle_threads_init(void) { }
#endif
int smpboot_create_threads(unsigned int cpu);
-void smpboot_park_threads(unsigned int cpu);
-void smpboot_unpark_threads(unsigned int cpu);
+int smpboot_park_threads(unsigned int cpu);
+int smpboot_unpark_threads(unsigned int cpu);
+
+void __init cpuhp_threads_init(void);
#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 479e4436f787..8aae49dd7da8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
if (preempt_count() == cnt) {
#ifdef CONFIG_DEBUG_PREEMPT
- current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+ current->preempt_disable_ip = get_lock_parent_ip();
#endif
- trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
}
}
EXPORT_SYMBOL(__local_bh_disable_ip);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a3bbaee77c58..a467e6c28a3b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -28,7 +28,6 @@
*/
struct cpu_stop_done {
atomic_t nr_todo; /* nr left to execute */
- bool executed; /* actually executed? */
int ret; /* collected return value */
struct completion completion; /* fired if nr_todo reaches 0 */
};
@@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
}
/* signal completion unless @done is NULL */
-static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+static void cpu_stop_signal_done(struct cpu_stop_done *done)
{
- if (done) {
- if (executed)
- done->executed = true;
- if (atomic_dec_and_test(&done->nr_todo))
- complete(&done->completion);
- }
+ if (atomic_dec_and_test(&done->nr_todo))
+ complete(&done->completion);
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
@@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
}
/* queue @work to @stopper. if offline, @work is completed immediately */
-static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
unsigned long flags;
+ bool enabled;
spin_lock_irqsave(&stopper->lock, flags);
- if (stopper->enabled)
+ enabled = stopper->enabled;
+ if (enabled)
__cpu_stop_queue_work(stopper, work);
- else
- cpu_stop_signal_done(work->done, false);
+ else if (work->done)
+ cpu_stop_signal_done(work->done);
spin_unlock_irqrestore(&stopper->lock, flags);
+
+ return enabled;
}
/**
@@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
cpu_stop_init_done(&done, 1);
- cpu_stop_queue_work(cpu, &work);
+ if (!cpu_stop_queue_work(cpu, &work))
+ return -ENOENT;
wait_for_completion(&done.completion);
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/* This controls the threads on each CPU. */
@@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
struct cpu_stop_work work1, work2;
struct multi_stop_data msdata;
- preempt_disable();
msdata = (struct multi_stop_data){
.fn = fn,
.data = arg,
@@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
if (cpu1 > cpu2)
swap(cpu1, cpu2);
- if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
- preempt_enable();
+ if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2))
return -ENOENT;
- }
-
- preempt_enable();
wait_for_completion(&done.completion);
-
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/**
@@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
*
* CONTEXT:
* Don't care.
+ *
+ * RETURNS:
+ * true if cpu_stop_work was queued successfully and @fn will be called,
+ * false otherwise.
*/
-void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf)
{
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
- cpu_stop_queue_work(cpu, work_buf);
+ return cpu_stop_queue_work(cpu, work_buf);
}
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
-static void queue_stop_cpus_work(const struct cpumask *cpumask,
+static bool queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
struct cpu_stop_done *done)
{
struct cpu_stop_work *work;
unsigned int cpu;
+ bool queued = false;
/*
* Disable preemption while queueing to avoid getting
@@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
work->fn = fn;
work->arg = arg;
work->done = done;
- cpu_stop_queue_work(cpu, work);
+ if (cpu_stop_queue_work(cpu, work))
+ queued = true;
}
lg_global_unlock(&stop_cpus_lock);
+
+ return queued;
}
static int __stop_cpus(const struct cpumask *cpumask,
@@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask,
struct cpu_stop_done done;
cpu_stop_init_done(&done, cpumask_weight(cpumask));
- queue_stop_cpus_work(cpumask, fn, arg, &done);
+ if (!queue_stop_cpus_work(cpumask, fn, arg, &done))
+ return -ENOENT;
wait_for_completion(&done.completion);
- return done.executed ? done.ret : -ENOENT;
+ return done.ret;
}
/**
@@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct cpu_stop_work *work;
- int ret;
repeat:
work = NULL;
@@ -448,23 +450,19 @@ repeat:
cpu_stop_fn_t fn = work->fn;
void *arg = work->arg;
struct cpu_stop_done *done = work->done;
- char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
-
- /* cpu stop callbacks are not allowed to sleep */
- preempt_disable();
+ int ret;
+ /* cpu stop callbacks must not sleep, make in_atomic() == T */
+ preempt_count_inc();
ret = fn(arg);
- if (ret)
- done->ret = ret;
-
- /* restore preemption and check it's still balanced */
- preempt_enable();
+ if (done) {
+ if (ret)
+ done->ret = ret;
+ cpu_stop_signal_done(done);
+ }
+ preempt_count_dec();
WARN_ONCE(preempt_count(),
- "cpu_stop: %s(%p) leaked preempt count\n",
- kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
- ksym_buf), arg);
-
- cpu_stop_signal_done(done, true);
+ "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg);
goto repeat;
}
}
@@ -531,8 +529,6 @@ static int __init cpu_stop_init(void)
}
early_initcall(cpu_stop_init);
-#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
-
static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
struct multi_stop_data msdata = {
@@ -630,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
mutex_unlock(&stop_cpus_mutex);
return ret ?: done.ret;
}
-
-#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
diff --git a/kernel/sys.c b/kernel/sys.c
index 6af9212ab5aa..cf8ba545c7d3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1853,11 +1853,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
}
- if (prctl_map.exe_fd != (u32)-1)
+ if (prctl_map.exe_fd != (u32)-1) {
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
- down_read(&mm->mmap_sem);
- if (error)
- goto out;
+ if (error)
+ return error;
+ }
+
+ down_write(&mm->mmap_sem);
/*
* We don't validate if these members are pointing to
@@ -1894,10 +1896,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.auxv_size)
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
- error = 0;
-out:
- up_read(&mm->mmap_sem);
- return error;
+ up_write(&mm->mmap_sem);
+ return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
@@ -1963,7 +1963,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EINVAL;
- down_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
vma = find_vma(mm, addr);
prctl_map.start_code = mm->start_code;
@@ -2056,7 +2056,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = 0;
out:
- up_read(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
return error;
}
@@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = perf_event_task_enable();
break;
case PR_GET_TIMERSLACK:
- error = current->timer_slack_ns;
+ if (current->timer_slack_ns > ULONG_MAX)
+ error = ULONG_MAX;
+ else
+ error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
if (arg2 <= 0)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0623787ec67a..2c5e3a8e00d7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid);
cond_syscall(sys_setfsgid);
cond_syscall(sys_capget);
cond_syscall(sys_capset);
+cond_syscall(sys_copy_file_range);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dc6858d6639e..725587f10667 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -126,6 +126,7 @@ static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
+static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -173,7 +174,7 @@ extern int no_unaligned_warning;
#define SYSCTL_WRITES_WARN 0
#define SYSCTL_WRITES_STRICT 1
-static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
+static int sysctl_writes_strict = SYSCTL_WRITES_STRICT;
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -350,6 +351,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
@@ -505,7 +517,7 @@ static struct ctl_table kern_table[] = {
.data = &latencytop_enabled,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = sysctl_latencytop,
},
#endif
#ifdef CONFIG_BLK_DEV_INITRD
@@ -1393,6 +1405,15 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "watermark_scale_factor",
+ .data = &watermark_scale_factor,
+ .maxlen = sizeof(watermark_scale_factor),
+ .mode = 0644,
+ .proc_handler = watermark_scale_factor_sysctl_handler,
+ .extra1 = &one,
+ .extra2 = &one_thousand,
+ },
+ {
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
@@ -1568,6 +1589,28 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
{ }
};
@@ -1735,6 +1778,20 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "pipe-user-pages-hard",
+ .data = &pipe_user_pages_hard,
+ .maxlen = sizeof(pipe_user_pages_hard),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "pipe-user-pages-soft",
+ .data = &pipe_user_pages_soft,
+ .maxlen = sizeof(pipe_user_pages_soft),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{ }
};
@@ -2047,9 +2104,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
void *data)
{
int *i, vleft, first = 1, err = 0;
- unsigned long page = 0;
size_t left;
- char *kbuf;
+ char *kbuf = NULL, *p;
if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
@@ -2078,15 +2134,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- err = -EFAULT;
- goto free;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
}
for (; left && vleft--; i++, first=0) {
@@ -2094,11 +2144,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
bool neg;
if (write) {
- left -= proc_skip_spaces(&kbuf);
+ left -= proc_skip_spaces(&p);
if (!left)
break;
- err = proc_get_long(&kbuf, &left, &lval, &neg,
+ err = proc_get_long(&p, &left, &lval, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
if (err)
@@ -2125,10 +2175,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
if (write && !err && left)
- left -= proc_skip_spaces(&kbuf);
-free:
+ left -= proc_skip_spaces(&p);
if (write) {
- free_page(page);
+ kfree(kbuf);
if (first)
return err ? : -EINVAL;
}
@@ -2310,9 +2359,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
{
unsigned long *i, *min, *max;
int vleft, first = 1, err = 0;
- unsigned long page = 0;
size_t left;
- char *kbuf;
+ char *kbuf = NULL, *p;
if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
*lenp = 0;
@@ -2340,15 +2388,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- err = -EFAULT;
- goto free;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
}
for (; left && vleft--; i++, first = 0) {
@@ -2357,9 +2399,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (write) {
bool neg;
- left -= proc_skip_spaces(&kbuf);
+ left -= proc_skip_spaces(&p);
- err = proc_get_long(&kbuf, &left, &val, &neg,
+ err = proc_get_long(&p, &left, &val, &neg,
proc_wspace_sep,
sizeof(proc_wspace_sep), NULL);
if (err)
@@ -2385,10 +2427,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (!write && !first && left && !err)
err = proc_put_char(&buffer, &left, '\n');
if (write && !err)
- left -= proc_skip_spaces(&kbuf);
-free:
+ left -= proc_skip_spaces(&p);
if (write) {
- free_page(page);
+ kfree(kbuf);
if (first)
return err ? : -EINVAL;
}
@@ -2650,34 +2691,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
if (write) {
- unsigned long page = 0;
- char *kbuf;
+ char *kbuf, *p;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!kbuf)
- return -ENOMEM;
- if (copy_from_user(kbuf, buffer, left)) {
- free_page(page);
- return -EFAULT;
- }
- kbuf[left] = 0;
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
GFP_KERNEL);
if (!tmp_bitmap) {
- free_page(page);
+ kfree(kbuf);
return -ENOMEM;
}
- proc_skip_char(&kbuf, &left, '\n');
+ proc_skip_char(&p, &left, '\n');
while (!err && left) {
unsigned long val_a, val_b;
bool neg;
- err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
+ err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
sizeof(tr_a), &c);
if (err)
break;
@@ -2688,12 +2722,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
val_b = val_a;
if (left) {
- kbuf++;
+ p++;
left--;
}
if (c == '-') {
- err = proc_get_long(&kbuf, &left, &val_b,
+ err = proc_get_long(&p, &left, &val_b,
&neg, tr_b, sizeof(tr_b),
&c);
if (err)
@@ -2704,16 +2738,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
break;
}
if (left) {
- kbuf++;
+ p++;
left--;
}
}
bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
first = 0;
- proc_skip_char(&kbuf, &left, '\n');
+ proc_skip_char(&p, &left, '\n');
}
- free_page(page);
+ kfree(kbuf);
} else {
unsigned long bit_a, bit_b = 0;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 7fbba635a549..e840ed867a5d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -271,11 +271,27 @@ static int alarmtimer_suspend(struct device *dev)
__pm_wakeup_event(ws, MSEC_PER_SEC);
return ret;
}
+
+static int alarmtimer_resume(struct device *dev)
+{
+ struct rtc_device *rtc;
+
+ rtc = alarmtimer_get_rtcdev();
+ if (rtc)
+ rtc_timer_cancel(rtc, &rtctimer);
+ return 0;
+}
+
#else
static int alarmtimer_suspend(struct device *dev)
{
return 0;
}
+
+static int alarmtimer_resume(struct device *dev)
+{
+ return 0;
+}
#endif
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
@@ -800,6 +816,7 @@ out:
/* Suspend hook structures */
static const struct dev_pm_ops alarmtimer_pm_ops = {
.suspend = alarmtimer_suspend,
+ .resume = alarmtimer_resume,
};
static struct platform_driver alarmtimer_driver = {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1347882d131e..56ece145a814 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -218,8 +218,8 @@ static void clocksource_watchdog(unsigned long data)
/* Check the deviation from the watchdog clocksource. */
if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
- pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
- cs->name);
+ pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
+ smp_processor_id(), cs->name);
pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
@@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
/* cs is a watchdog. */
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_select_watchdog(bool fallback)
+{
+ struct clocksource *cs, *old_wd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ /* save current watchdog */
+ old_wd = watchdog;
+ if (fallback)
+ watchdog = NULL;
+
+ list_for_each_entry(cs, &clocksource_list, list) {
+ /* cs is a clocksource to be watched. */
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
+ continue;
+
+ /* Skip current if we were requested for a fallback. */
+ if (fallback && cs == old_wd)
+ continue;
+
/* Pick the best watchdog. */
- if (!watchdog || cs->rating > watchdog->rating) {
+ if (!watchdog || cs->rating > watchdog->rating)
watchdog = cs;
- /* Reset watchdog cycles */
- clocksource_reset_watchdog();
- }
}
+ /* If we failed to find a fallback restore the old one. */
+ if (!watchdog)
+ watchdog = old_wd;
+
+ /* If we changed the watchdog we need to reset cycles. */
+ if (watchdog != old_wd)
+ clocksource_reset_watchdog();
+
/* Check if the watchdog timer needs to be started. */
clocksource_start_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
+static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_kthread(void) { return 0; }
@@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
clocksource_select();
+ clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
mutex_lock(&clocksource_mutex);
__clocksource_change_rating(cs, rating);
clocksource_select();
+ clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);
@@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating);
*/
static int clocksource_unbind(struct clocksource *cs)
{
- /*
- * I really can't convince myself to support this on hardware
- * designed by lobotomized monkeys.
- */
- if (clocksource_is_watchdog(cs))
- return -EBUSY;
+ if (clocksource_is_watchdog(cs)) {
+ /* Select and try to install a replacement watchdog. */
+ clocksource_select_watchdog(true);
+ if (clocksource_is_watchdog(cs))
+ return -EBUSY;
+ }
if (cs == curr_clocksource) {
/* Select and try to install a replacement clock source */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 435b8850dd80..fa0b983290cf 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
/*
* High resolution timer enabled ?
*/
-static int hrtimer_hres_enabled __read_mostly = 1;
+static bool hrtimer_hres_enabled __read_mostly = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
@@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution);
*/
static int __init setup_hrtimer_hres(char *str)
{
- if (!strcmp(str, "off"))
- hrtimer_hres_enabled = 0;
- else if (!strcmp(str, "on"))
- hrtimer_hres_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
__setup("highres=", setup_hrtimer_hres);
@@ -897,10 +891,10 @@ static int enqueue_hrtimer(struct hrtimer *timer,
*/
static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
- unsigned long newstate, int reprogram)
+ u8 newstate, int reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- unsigned int state = timer->state;
+ u8 state = timer->state;
timer->state = newstate;
if (!(state & HRTIMER_STATE_ENQUEUED))
@@ -930,7 +924,7 @@ static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state = timer->state;
+ u8 state = timer->state;
int reprogram;
/*
@@ -954,6 +948,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
return 0;
}
+static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
+ const enum hrtimer_mode mode)
+{
+#ifdef CONFIG_TIME_LOW_RES
+ /*
+ * CONFIG_TIME_LOW_RES indicates that the system has no way to return
+ * granular time values. For relative timers we add hrtimer_resolution
+ * (i.e. one jiffie) to prevent short timeouts.
+ */
+ timer->is_rel = mode & HRTIMER_MODE_REL;
+ if (timer->is_rel)
+ tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
+#endif
+ return tim;
+}
+
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
@@ -963,7 +973,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
* relative (HRTIMER_MODE_REL)
*/
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode)
+ u64 delta_ns, const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -974,19 +984,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Remove an active timer from the queue: */
remove_hrtimer(timer, base, true);
- if (mode & HRTIMER_MODE_REL) {
+ if (mode & HRTIMER_MODE_REL)
tim = ktime_add_safe(tim, base->get_time());
- /*
- * CONFIG_TIME_LOW_RES is a temporary way for architectures
- * to signal that they simply return xtime in
- * do_gettimeoffset(). In this case we want to round up by
- * resolution when starting a relative timer, to avoid short
- * timeouts. This will go away with the GTOD framework.
- */
-#ifdef CONFIG_TIME_LOW_RES
- tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
-#endif
- }
+
+ tim = hrtimer_update_lowres(timer, tim, mode);
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
@@ -1074,19 +1075,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
* hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
+ * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
*/
-ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
unsigned long flags;
ktime_t rem;
lock_hrtimer_base(timer, &flags);
- rem = hrtimer_expires_remaining(timer);
+ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
+ rem = hrtimer_expires_remaining_adjusted(timer);
+ else
+ rem = hrtimer_expires_remaining(timer);
unlock_hrtimer_base(timer, &flags);
return rem;
}
-EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
+EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
#ifdef CONFIG_NO_HZ_COMMON
/**
@@ -1220,6 +1225,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
fn = timer->function;
/*
+ * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
+ * timer is restarted with a period then it becomes an absolute
+ * timer. If its not restarted it does not matter.
+ */
+ if (IS_ENABLED(CONFIG_TIME_LOW_RES))
+ timer->is_rel = false;
+
+ /*
* Because we run timers from hardirq context, there is no chance
* they get migrated to another cpu, therefore its safe to unlock
* the timer base.
@@ -1529,7 +1542,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
- unsigned long slack;
+ u64 slack;
slack = current->timer_slack_ns;
if (dl_task(current) || rt_task(current))
@@ -1705,7 +1718,7 @@ void __init hrtimers_init(void)
* @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
*/
int __sched
-schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
+schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode, int clock)
{
struct hrtimer_sleeper t;
@@ -1773,7 +1786,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
*
* Returns 0 when the timer has expired otherwise -EINTR
*/
-int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode)
{
return schedule_hrtimeout_range_clock(expires, delta, mode,
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 8d262b467573..1d5c7204ddc9 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -26,7 +26,7 @@
*/
static struct timeval itimer_get_remtime(struct hrtimer *timer)
{
- ktime_t rem = hrtimer_get_remaining(timer);
+ ktime_t rem = __hrtimer_get_remaining(timer, true);
/*
* Racy but safe: if the itimer expires after the above
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 347fecf86a3f..555e21f7b966 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
.read = jiffies_read,
- .mask = 0xffffffff, /*32bits*/
+ .mask = CLOCKSOURCE_MASK(32),
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
.max_cycles = 10,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 149cc8086aea..6df8927c58a5 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,8 +16,11 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/rtc.h>
+#include <linux/math64.h>
#include "ntp_internal.h"
+#include "timekeeping_internal.h"
+
/*
* NTP timekeeping variables:
@@ -70,7 +73,7 @@ static long time_esterror = NTP_PHASE_LIMIT;
static s64 time_freq;
/* time at last adjustment (secs): */
-static long time_reftime;
+static time64_t time_reftime;
static long time_adjust;
@@ -297,25 +300,27 @@ static void ntp_update_offset(long offset)
if (!(time_status & STA_PLL))
return;
- if (!(time_status & STA_NANO))
+ if (!(time_status & STA_NANO)) {
+ /* Make sure the multiplication below won't overflow */
+ offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC);
offset *= NSEC_PER_USEC;
+ }
/*
* Scale the phase adjustment and
* clamp to the operating range.
*/
- offset = min(offset, MAXPHASE);
- offset = max(offset, -MAXPHASE);
+ offset = clamp(offset, -MAXPHASE, MAXPHASE);
/*
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- secs = get_seconds() - time_reftime;
+ secs = (long)(__ktime_get_real_seconds() - time_reftime);
if (unlikely(time_status & STA_FREQHOLD))
secs = 0;
- time_reftime = get_seconds();
+ time_reftime = __ktime_get_real_seconds();
offset64 = offset;
freq_adj = ntp_update_offset_fll(offset64, secs);
@@ -390,10 +395,11 @@ ktime_t ntp_get_next_leap(void)
*
* Also handles leap second processing, and returns leap offset
*/
-int second_overflow(unsigned long secs)
+int second_overflow(time64_t secs)
{
s64 delta;
int leap = 0;
+ s32 rem;
/*
* Leap second processing. If in leap-insert state at the end of the
@@ -404,19 +410,19 @@ int second_overflow(unsigned long secs)
case TIME_OK:
if (time_status & STA_INS) {
time_state = TIME_INS;
- ntp_next_leap_sec = secs + SECS_PER_DAY -
- (secs % SECS_PER_DAY);
+ div_s64_rem(secs, SECS_PER_DAY, &rem);
+ ntp_next_leap_sec = secs + SECS_PER_DAY - rem;
} else if (time_status & STA_DEL) {
time_state = TIME_DEL;
- ntp_next_leap_sec = secs + SECS_PER_DAY -
- ((secs+1) % SECS_PER_DAY);
+ div_s64_rem(secs + 1, SECS_PER_DAY, &rem);
+ ntp_next_leap_sec = secs + SECS_PER_DAY - rem;
}
break;
case TIME_INS:
if (!(time_status & STA_INS)) {
ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- } else if (secs % SECS_PER_DAY == 0) {
+ } else if (secs == ntp_next_leap_sec) {
leap = -1;
time_state = TIME_OOP;
printk(KERN_NOTICE
@@ -427,7 +433,7 @@ int second_overflow(unsigned long secs)
if (!(time_status & STA_DEL)) {
ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- } else if ((secs + 1) % SECS_PER_DAY == 0) {
+ } else if (secs == ntp_next_leap_sec) {
leap = 1;
ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
@@ -590,7 +596,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
* reference time to current time.
*/
if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
- time_reftime = get_seconds();
+ time_reftime = __ktime_get_real_seconds();
/* only set allowed bits */
time_status &= STA_RONLY;
@@ -674,8 +680,24 @@ int ntp_validate_timex(struct timex *txc)
return -EINVAL;
}
- if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
- return -EPERM;
+ if (txc->modes & ADJ_SETOFFSET) {
+ /* In order to inject time, you gotta be super-user! */
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (txc->modes & ADJ_NANO) {
+ struct timespec ts;
+
+ ts.tv_sec = txc->time.tv_sec;
+ ts.tv_nsec = txc->time.tv_usec;
+ if (!timespec_inject_offset_valid(&ts))
+ return -EINVAL;
+
+ } else {
+ if (!timeval_inject_offset_valid(&txc->time))
+ return -EINVAL;
+ }
+ }
/*
* Check for potential multiplication overflows that can
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index af924470eac0..d8a7c11fa71a 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -6,7 +6,7 @@ extern void ntp_clear(void);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 ntp_tick_length(void);
extern ktime_t ntp_get_next_leap(void);
-extern int second_overflow(unsigned long secs);
+extern int second_overflow(time64_t secs);
extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ce033c7aa2e8..9cff0ab82b63 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -69,10 +69,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
{
struct posix_clock *clk = get_posix_clock(fp);
- int result = 0;
+ unsigned int result = 0;
if (!clk)
- return -ENODEV;
+ return POLLERR;
if (clk->ops.poll)
result = clk->ops.poll(clk, fp, wait);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index f5e86d282d52..1cafba860b08 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
return err;
}
-
/*
* Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
* This is called from sys_timer_create() and do_cpu_nanosleep() with the
@@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer)
cputime_expires->sched_exp = exp;
break;
}
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
+ else
+ tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
}
}
@@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
return 0;
}
-#ifdef CONFIG_NO_HZ_FULL
-static void nohz_kick_work_fn(struct work_struct *work)
-{
- tick_nohz_full_kick_all();
-}
-
-static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
-
-/*
- * We need the IPIs to be sent from sane process context.
- * The posix cpu timers are always set with irqs disabled.
- */
-static void posix_cpu_timer_kick_nohz(void)
-{
- if (context_tracking_is_enabled())
- schedule_work(&nohz_kick_work);
-}
-
-bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
-{
- if (!task_cputime_zero(&tsk->cputime_expires))
- return false;
-
- /* Check if cputimer is running. This is accessed without locking. */
- if (READ_ONCE(tsk->signal->cputimer.running))
- return false;
-
- return true;
-}
-#else
-static inline void posix_cpu_timer_kick_nohz(void) { }
-#endif
-
/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
@@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
sample_to_timespec(timer->it_clock,
old_incr, &old->it_interval);
}
- if (!ret)
- posix_cpu_timer_kick_nohz();
+
return ret;
}
@@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk,
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
}
+ if (task_cputime_zero(tsk_expires))
+ tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
static inline void stop_process_timers(struct signal_struct *sig)
@@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
/* Turn off cputimer->running. This is done without locking. */
WRITE_ONCE(cputimer->running, false);
+ tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
}
static u32 onecputick;
@@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
arm_timer(timer);
unlock_task_sighand(p, &flags);
- /* Kick full dynticks CPUs in case they need to tick on the new timer */
- posix_cpu_timer_kick_nohz();
out:
timer->it_overrun_last = timer->it_overrun;
timer->it_overrun = -1;
@@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
if (!*newval)
- goto out;
+ return;
*newval += now;
}
@@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
tsk->signal->cputime_expires.virt_exp = *newval;
break;
}
-out:
- posix_cpu_timer_kick_nohz();
+
+ tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31d11ac9fa47..f2826c35e918 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
- remaining = ktime_sub(hrtimer_get_expires(timer), now);
+ remaining = __hrtimer_expires_remaining_adjusted(timer, now);
/* Return 0 only, when the timer is expired and not pending */
if (remaining.tv64 <= 0) {
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7c7ec4515983..195fe7d2caad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -22,7 +22,6 @@
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
-#include <linux/perf_event.h>
#include <linux/context_tracking.h>
#include <asm/irq_regs.h>
@@ -36,16 +35,17 @@
*/
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
-/*
- * The time, when the last jiffy update happened. Protected by jiffies_lock.
- */
-static ktime_t last_jiffies_update;
-
struct tick_sched *tick_get_tick_sched(int cpu)
{
return &per_cpu(tick_cpu_sched, cpu);
}
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+/*
+ * The time, when the last jiffy update happened. Protected by jiffies_lock.
+ */
+static ktime_t last_jiffies_update;
+
/*
* Must be called with interrupts disabled !
*/
@@ -143,7 +143,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
* when we go busy again does not account too much ticks.
*/
if (ts->tick_stopped) {
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
}
@@ -151,59 +151,69 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
}
+#endif
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
+static unsigned long tick_dep_mask;
+
+static void trace_tick_dependency(unsigned long dep)
+{
+ if (dep & TICK_DEP_MASK_POSIX_TIMER) {
+ trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
+ return;
+ }
+
+ if (dep & TICK_DEP_MASK_PERF_EVENTS) {
+ trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
+ return;
+ }
-static bool can_stop_full_tick(void)
+ if (dep & TICK_DEP_MASK_SCHED) {
+ trace_tick_stop(0, TICK_DEP_MASK_SCHED);
+ return;
+ }
+
+ if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
+ trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
+}
+
+static bool can_stop_full_tick(struct tick_sched *ts)
{
WARN_ON_ONCE(!irqs_disabled());
- if (!sched_can_stop_tick()) {
- trace_tick_stop(0, "more than 1 task in runqueue\n");
+ if (tick_dep_mask) {
+ trace_tick_dependency(tick_dep_mask);
return false;
}
- if (!posix_cpu_timers_can_stop_tick(current)) {
- trace_tick_stop(0, "posix timers running\n");
+ if (ts->tick_dep_mask) {
+ trace_tick_dependency(ts->tick_dep_mask);
return false;
}
- if (!perf_event_can_stop_tick()) {
- trace_tick_stop(0, "perf events running\n");
+ if (current->tick_dep_mask) {
+ trace_tick_dependency(current->tick_dep_mask);
return false;
}
- /* sched_clock_tick() needs us? */
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- /*
- * TODO: kick full dynticks CPUs when
- * sched_clock_stable is set.
- */
- if (!sched_clock_stable()) {
- trace_tick_stop(0, "unstable sched clock\n");
- /*
- * Don't allow the user to think they can get
- * full NO_HZ with this machine.
- */
- WARN_ONCE(tick_nohz_full_running,
- "NO_HZ FULL will not work with unstable sched clock");
+ if (current->signal->tick_dep_mask) {
+ trace_tick_dependency(current->signal->tick_dep_mask);
return false;
}
-#endif
return true;
}
-static void nohz_full_kick_work_func(struct irq_work *work)
+static void nohz_full_kick_func(struct irq_work *work)
{
/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
- .func = nohz_full_kick_work_func,
+ .func = nohz_full_kick_func,
};
/*
@@ -212,7 +222,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
* This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
* is NMI safe.
*/
-void tick_nohz_full_kick(void)
+static void tick_nohz_full_kick(void)
{
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
@@ -232,27 +242,112 @@ void tick_nohz_full_kick_cpu(int cpu)
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
-static void nohz_full_kick_ipi(void *info)
-{
- /* Empty, the tick restart happens on tick_nohz_irq_exit() */
-}
-
/*
* Kick all full dynticks CPUs in order to force these to re-evaluate
* their dependency on the tick and restart it if necessary.
*/
-void tick_nohz_full_kick_all(void)
+static void tick_nohz_full_kick_all(void)
{
+ int cpu;
+
if (!tick_nohz_full_running)
return;
preempt_disable();
- smp_call_function_many(tick_nohz_full_mask,
- nohz_full_kick_ipi, NULL, false);
- tick_nohz_full_kick();
+ for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
+ tick_nohz_full_kick_cpu(cpu);
preempt_enable();
}
+static void tick_nohz_dep_set_all(unsigned long *dep,
+ enum tick_dep_bits bit)
+{
+ unsigned long prev;
+
+ prev = fetch_or(dep, BIT_MASK(bit));
+ if (!prev)
+ tick_nohz_full_kick_all();
+}
+
+/*
+ * Set a global tick dependency. Used by perf events that rely on freq and
+ * by unstable clock.
+ */
+void tick_nohz_dep_set(enum tick_dep_bits bit)
+{
+ tick_nohz_dep_set_all(&tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear(enum tick_dep_bits bit)
+{
+ clear_bit(bit, &tick_dep_mask);
+}
+
+/*
+ * Set per-CPU tick dependency. Used by scheduler and perf events in order to
+ * manage events throttling.
+ */
+void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
+{
+ unsigned long prev;
+ struct tick_sched *ts;
+
+ ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+ prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
+ if (!prev) {
+ preempt_disable();
+ /* Perf needs local kick that is NMI safe */
+ if (cpu == smp_processor_id()) {
+ tick_nohz_full_kick();
+ } else {
+ /* Remote irq work not NMI-safe */
+ if (!WARN_ON_ONCE(in_nmi()))
+ tick_nohz_full_kick_cpu(cpu);
+ }
+ preempt_enable();
+ }
+}
+
+void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
+{
+ struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+ clear_bit(bit, &ts->tick_dep_mask);
+}
+
+/*
+ * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
+ * per task timers.
+ */
+void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
+{
+ /*
+ * We could optimize this with just kicking the target running the task
+ * if that noise matters for nohz full users.
+ */
+ tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
+{
+ clear_bit(bit, &tsk->tick_dep_mask);
+}
+
+/*
+ * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
+ * per process timers.
+ */
+void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+{
+ tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+}
+
+void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+{
+ clear_bit(bit, &sig->tick_dep_mask);
+}
+
/*
* Re-evaluate the need for the tick as we switch the current task.
* It might need the tick due to per task/process properties:
@@ -261,15 +356,19 @@ void tick_nohz_full_kick_all(void)
void __tick_nohz_task_switch(void)
{
unsigned long flags;
+ struct tick_sched *ts;
local_irq_save(flags);
if (!tick_nohz_full_cpu(smp_processor_id()))
goto out;
- if (tick_nohz_tick_stopped() && !can_stop_full_tick())
- tick_nohz_full_kick();
+ ts = this_cpu_ptr(&tick_cpu_sched);
+ if (ts->tick_stopped) {
+ if (current->tick_dep_mask || current->signal->tick_dep_mask)
+ tick_nohz_full_kick();
+ }
out:
local_irq_restore(flags);
}
@@ -387,20 +486,14 @@ void __init tick_nohz_init(void)
/*
* NO HZ enabled ?
*/
-static int tick_nohz_enabled __read_mostly = 1;
+bool tick_nohz_enabled __read_mostly = true;
unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
static int __init setup_tick_nohz(char *str)
{
- if (!strcmp(str, "off"))
- tick_nohz_enabled = 0;
- else if (!strcmp(str, "on"))
- tick_nohz_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &tick_nohz_enabled) == 0);
}
__setup("nohz=", setup_tick_nohz);
@@ -430,7 +523,7 @@ static void tick_nohz_update_jiffies(ktime_t now)
tick_do_update_jiffies64(now);
local_irq_restore(flags);
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
/*
@@ -603,15 +696,31 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
/*
* If the tick is due in the next period, keep it ticking or
- * restart it proper.
+ * force prod the timer.
*/
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
tick.tv64 = 0;
+ /*
+ * We've not stopped the tick yet, and there's a timer in the
+ * next period, so no point in stopping it either, bail.
+ */
if (!ts->tick_stopped)
goto out;
+
+ /*
+ * If, OTOH, we did stop it, but there's a pending (expired)
+ * timer reprogram the timer hardware to fire now.
+ *
+ * We will not restart the tick proper, just prod the timer
+ * hardware into firing an interrupt to process the pending
+ * timers. Just like tick_irq_exit() will not restart the tick
+ * for 'normal' interrupts.
+ *
+ * Only once we exit the idle loop will we re-enable the tick,
+ * see tick_nohz_idle_exit().
+ */
if (delta == 0) {
- /* Tick is stopped, but required now. Enforce it */
tick_nohz_restart(ts, now);
goto out;
}
@@ -671,7 +780,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
- trace_tick_stop(1, " ");
+ trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
/*
@@ -694,14 +803,14 @@ out:
return tick;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
- update_cpu_load_nohz();
+ update_cpu_load_nohz(active);
calc_load_exit_idle();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
/*
* Cancel the scheduled timer and restore the tick
*/
@@ -722,10 +831,10 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (can_stop_full_tick())
+ if (can_stop_full_tick(ts))
tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
else if (ts->tick_stopped)
- tick_nohz_restart_sched_tick(ts, ktime_get());
+ tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
#endif
}
@@ -875,7 +984,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
- if (vtime_accounting_enabled())
+ if (vtime_accounting_cpu_enabled())
return;
/*
* We stopped the tick in idle. Update process times would miss the
@@ -916,7 +1025,7 @@ void tick_nohz_idle_exit(void)
tick_nohz_stop_idle(ts, now);
if (ts->tick_stopped) {
- tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_restart_sched_tick(ts, now, 0);
tick_nohz_account_idle_ticks(ts);
}
@@ -977,9 +1086,9 @@ static void tick_nohz_switch_to_nohz(void)
/* Get the next period */
next = tick_init_jiffy_update();
- hrtimer_forward_now(&ts->sched_timer, tick_period);
hrtimer_set_expires(&ts->sched_timer, next);
- tick_program_event(next, 1);
+ hrtimer_forward_now(&ts->sched_timer, tick_period);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index a4a8d4e9baa1..eb4e32566a83 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -60,6 +60,7 @@ struct tick_sched {
u64 next_timer;
ktime_t idle_expires;
int do_timer_last;
+ unsigned long tick_dep_mask;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 86751c68e08d..be115b020d27 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -322,6 +322,13 @@ EXPORT_SYMBOL(timespec_trunc);
* -year/100+year/400 terms, and add 10.]
*
* This algorithm was first published by Gauss (I think).
+ *
+ * A leap second can be indicated by calling this function with sec as
+ * 60 (allowable under ISO 8601). The leap second is treated the same
+ * as the following second since they don't exist in UNIX time.
+ *
+ * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
+ * tomorrow - (allowable under ISO 8601) is supported.
*/
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
const unsigned int day, const unsigned int hour,
@@ -338,7 +345,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
return ((((time64_t)
(year/4 - year/100 + year/400 + 367*mon/12 + day) +
year*365 - 719499
- )*24 + hour /* now have hours */
+ )*24 + hour /* now have hours - midnight tomorrow handled here */
)*60 + min /* now have minutes */
)*60 + sec; /* finally seconds */
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d563c1960302..479d25cd3d4f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
} else {
if (offset > (max_cycles >> 1)) {
- printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n",
+ printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
offset, name, max_cycles >> 1);
printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
}
@@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
+ ++tk->cs_was_changed_seq;
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
tk->tkr_mono.read = clock->read;
@@ -298,13 +299,11 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
-static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
+static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
+ cycle_t delta)
{
- cycle_t delta;
s64 nsec;
- delta = timekeeping_get_delta(tkr);
-
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
@@ -312,6 +311,24 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
return nsec + arch_gettimeoffset();
}
+static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
+{
+ cycle_t delta;
+
+ delta = timekeeping_get_delta(tkr);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
+
+static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
+ cycle_t cycles)
+{
+ cycle_t delta;
+
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
+
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
@@ -846,43 +863,274 @@ time64_t ktime_get_real_seconds(void)
}
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
-#ifdef CONFIG_NTP_PPS
+/**
+ * __ktime_get_real_seconds - The same as ktime_get_real_seconds
+ * but without the sequence counter protect. This internal function
+ * is called just when timekeeping lock is already held.
+ */
+time64_t __ktime_get_real_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return tk->xtime_sec;
+}
/**
- * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format
- * @ts_raw: pointer to the timespec to be set to raw monotonic time
- * @ts_real: pointer to the timespec to be set to the time of day
- *
- * This function reads both the time of day and raw monotonic time at the
- * same time atomically and stores the resulting timestamps in timespec
- * format.
+ * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
+ * @systime_snapshot: pointer to struct receiving the system time snapshot
*/
-void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real)
+void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
- s64 nsecs_raw, nsecs_real;
+ ktime_t base_raw;
+ ktime_t base_real;
+ s64 nsec_raw;
+ s64 nsec_real;
+ cycle_t now;
WARN_ON_ONCE(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts_raw = tk->raw_time;
- ts_real->tv_sec = tk->xtime_sec;
- ts_real->tv_nsec = 0;
+ now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
+ systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
+ base_real = ktime_add(tk->tkr_mono.base,
+ tk_core.timekeeper.offs_real);
+ base_raw = tk->tkr_raw.base;
+ nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
+ nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
- nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
- nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
+ systime_snapshot->cycles = now;
+ systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
+ systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
+}
+EXPORT_SYMBOL_GPL(ktime_get_snapshot);
- } while (read_seqcount_retry(&tk_core.seq, seq));
+/* Scale base by mult/div checking for overflow */
+static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
+{
+ u64 tmp, rem;
+
+ tmp = div64_u64_rem(*base, div, &rem);
+
+ if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) ||
+ ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
+ return -EOVERFLOW;
+ tmp *= mult;
+ rem *= mult;
+
+ do_div(rem, div);
+ *base = tmp + rem;
+ return 0;
+}
+
+/**
+ * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
+ * @history: Snapshot representing start of history
+ * @partial_history_cycles: Cycle offset into history (fractional part)
+ * @total_history_cycles: Total history length in cycles
+ * @discontinuity: True indicates clock was set on history period
+ * @ts: Cross timestamp that should be adjusted using
+ * partial/total ratio
+ *
+ * Helper function used by get_device_system_crosststamp() to correct the
+ * crosstimestamp corresponding to the start of the current interval to the
+ * system counter value (timestamp point) provided by the driver. The
+ * total_history_* quantities are the total history starting at the provided
+ * reference point and ending at the start of the current interval. The cycle
+ * count between the driver timestamp point and the start of the current
+ * interval is partial_history_cycles.
+ */
+static int adjust_historical_crosststamp(struct system_time_snapshot *history,
+ cycle_t partial_history_cycles,
+ cycle_t total_history_cycles,
+ bool discontinuity,
+ struct system_device_crosststamp *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ u64 corr_raw, corr_real;
+ bool interp_forward;
+ int ret;
+
+ if (total_history_cycles == 0 || partial_history_cycles == 0)
+ return 0;
+
+ /* Interpolate shortest distance from beginning or end of history */
+ interp_forward = partial_history_cycles > total_history_cycles/2 ?
+ true : false;
+ partial_history_cycles = interp_forward ?
+ total_history_cycles - partial_history_cycles :
+ partial_history_cycles;
+
+ /*
+ * Scale the monotonic raw time delta by:
+ * partial_history_cycles / total_history_cycles
+ */
+ corr_raw = (u64)ktime_to_ns(
+ ktime_sub(ts->sys_monoraw, history->raw));
+ ret = scale64_check_overflow(partial_history_cycles,
+ total_history_cycles, &corr_raw);
+ if (ret)
+ return ret;
+
+ /*
+ * If there is a discontinuity in the history, scale monotonic raw
+ * correction by:
+ * mult(real)/mult(raw) yielding the realtime correction
+ * Otherwise, calculate the realtime correction similar to monotonic
+ * raw calculation
+ */
+ if (discontinuity) {
+ corr_real = mul_u64_u32_div
+ (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
+ } else {
+ corr_real = (u64)ktime_to_ns(
+ ktime_sub(ts->sys_realtime, history->real));
+ ret = scale64_check_overflow(partial_history_cycles,
+ total_history_cycles, &corr_real);
+ if (ret)
+ return ret;
+ }
- timespec64_add_ns(ts_raw, nsecs_raw);
- timespec64_add_ns(ts_real, nsecs_real);
+ /* Fixup monotonic raw and real time time values */
+ if (interp_forward) {
+ ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
+ ts->sys_realtime = ktime_add_ns(history->real, corr_real);
+ } else {
+ ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
+ ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
+ }
+
+ return 0;
}
-EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
-#endif /* CONFIG_NTP_PPS */
+/*
+ * cycle_between - true if test occurs chronologically between before and after
+ */
+static bool cycle_between(cycle_t before, cycle_t test, cycle_t after)
+{
+ if (test > before && test < after)
+ return true;
+ if (test < before && before > after)
+ return true;
+ return false;
+}
+
+/**
+ * get_device_system_crosststamp - Synchronously capture system/device timestamp
+ * @get_time_fn: Callback to get simultaneous device time and
+ * system counter from the device driver
+ * @ctx: Context passed to get_time_fn()
+ * @history_begin: Historical reference point used to interpolate system
+ * time when counter provided by the driver is before the current interval
+ * @xtstamp: Receives simultaneously captured system and device time
+ *
+ * Reads a timestamp from a device and correlates it to system time
+ */
+int get_device_system_crosststamp(int (*get_time_fn)
+ (ktime_t *device_time,
+ struct system_counterval_t *sys_counterval,
+ void *ctx),
+ void *ctx,
+ struct system_time_snapshot *history_begin,
+ struct system_device_crosststamp *xtstamp)
+{
+ struct system_counterval_t system_counterval;
+ struct timekeeper *tk = &tk_core.timekeeper;
+ cycle_t cycles, now, interval_start;
+ unsigned int clock_was_set_seq = 0;
+ ktime_t base_real, base_raw;
+ s64 nsec_real, nsec_raw;
+ u8 cs_was_changed_seq;
+ unsigned long seq;
+ bool do_interp;
+ int ret;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ /*
+ * Try to synchronously capture device time and a system
+ * counter value calling back into the device driver
+ */
+ ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
+ if (ret)
+ return ret;
+
+ /*
+ * Verify that the clocksource associated with the captured
+ * system counter value is the same as the currently installed
+ * timekeeper clocksource
+ */
+ if (tk->tkr_mono.clock != system_counterval.cs)
+ return -ENODEV;
+ cycles = system_counterval.cycles;
+
+ /*
+ * Check whether the system counter value provided by the
+ * device driver is on the current timekeeping interval.
+ */
+ now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ interval_start = tk->tkr_mono.cycle_last;
+ if (!cycle_between(interval_start, cycles, now)) {
+ clock_was_set_seq = tk->clock_was_set_seq;
+ cs_was_changed_seq = tk->cs_was_changed_seq;
+ cycles = interval_start;
+ do_interp = true;
+ } else {
+ do_interp = false;
+ }
+
+ base_real = ktime_add(tk->tkr_mono.base,
+ tk_core.timekeeper.offs_real);
+ base_raw = tk->tkr_raw.base;
+
+ nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono,
+ system_counterval.cycles);
+ nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw,
+ system_counterval.cycles);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
+ xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
+
+ /*
+ * Interpolate if necessary, adjusting back from the start of the
+ * current interval
+ */
+ if (do_interp) {
+ cycle_t partial_history_cycles, total_history_cycles;
+ bool discontinuity;
+
+ /*
+ * Check that the counter value occurs after the provided
+ * history reference and that the history doesn't cross a
+ * clocksource change
+ */
+ if (!history_begin ||
+ !cycle_between(history_begin->cycles,
+ system_counterval.cycles, cycles) ||
+ history_begin->cs_was_changed_seq != cs_was_changed_seq)
+ return -EINVAL;
+ partial_history_cycles = cycles - system_counterval.cycles;
+ total_history_cycles = cycles - history_begin->cycles;
+ discontinuity =
+ history_begin->clock_was_set_seq != clock_was_set_seq;
+
+ ret = adjust_historical_crosststamp(history_begin,
+ partial_history_cycles,
+ total_history_cycles,
+ discontinuity, xtstamp);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
/**
* do_gettimeofday - Returns the time of day in a timeval
@@ -959,7 +1207,7 @@ int timekeeping_inject_offset(struct timespec *ts)
struct timespec64 ts64, tmp;
int ret = 0;
- if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ if (!timespec_inject_offset_valid(ts))
return -EINVAL;
ts64 = timespec_to_timespec64(*ts);
@@ -1592,9 +1840,12 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
{
s64 interval = tk->cycle_interval;
s64 xinterval = tk->xtime_interval;
+ u32 base = tk->tkr_mono.clock->mult;
+ u32 max = tk->tkr_mono.clock->maxadj;
+ u32 cur_adj = tk->tkr_mono.mult;
s64 tick_error;
bool negative;
- u32 adj;
+ u32 adj_scale;
/* Remove any current error adj from freq calculation */
if (tk->ntp_err_mult)
@@ -1613,13 +1864,33 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
/* preserve the direction of correction */
negative = (tick_error < 0);
- /* Sort out the magnitude of the correction */
+ /* If any adjustment would pass the max, just return */
+ if (negative && (cur_adj - 1) <= (base - max))
+ return;
+ if (!negative && (cur_adj + 1) >= (base + max))
+ return;
+ /*
+ * Sort out the magnitude of the correction, but
+ * avoid making so large a correction that we go
+ * over the max adjustment.
+ */
+ adj_scale = 0;
tick_error = abs(tick_error);
- for (adj = 0; tick_error > interval; adj++)
+ while (tick_error > interval) {
+ u32 adj = 1 << (adj_scale + 1);
+
+ /* Check if adjustment gets us within 1 unit from the max */
+ if (negative && (cur_adj - adj) <= (base - max))
+ break;
+ if (!negative && (cur_adj + adj) >= (base + max))
+ break;
+
+ adj_scale++;
tick_error >>= 1;
+ }
/* scale the corrections */
- timekeeping_apply_adjustment(tk, offset, negative, adj);
+ timekeeping_apply_adjustment(tk, offset, negative, adj_scale);
}
/*
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ea005a7f9da..5be76270ec4a 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -17,7 +17,11 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
cycle_t ret = (now - last) & mask;
- return (s64) ret > 0 ? ret : 0;
+ /*
+ * Prevent time going backwards by checking the MSB of mask in
+ * the result. If set, return 0.
+ */
+ return ret & ~(mask >> 1) ? 0 : ret;
}
#else
static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
@@ -26,4 +30,6 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
}
#endif
+extern time64_t __ktime_get_real_seconds(void);
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index bbc5d1114583..d1798fa0c743 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible);
static void __sched do_usleep_range(unsigned long min, unsigned long max)
{
ktime_t kmin;
- unsigned long delta;
+ u64 delta;
kmin = ktime_set(0, min * NSEC_PER_USEC);
- delta = (max - min) * NSEC_PER_USEC;
+ delta = (u64)(max - min) * NSEC_PER_USEC;
schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index f75e35b60149..ba7d8b288bb3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
print_name_offset(m, taddr);
SEQ_printf(m, ", ");
print_name_offset(m, timer->function);
- SEQ_printf(m, ", S:%02lx", timer->state);
+ SEQ_printf(m, ", S:%02x", timer->state);
#ifdef CONFIG_TIMER_STATS
SEQ_printf(m, ", ");
print_name_offset(m, timer->start_site);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a990824c8604..2aeb6ffc0a1e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
if (count >= BLK_TN_MAX_MSG)
return -EINVAL;
- msg = kmalloc(count + 1, GFP_KERNEL);
- if (msg == NULL)
- return -ENOMEM;
-
- if (copy_from_user(msg, buffer, count)) {
- kfree(msg);
- return -EFAULT;
- }
+ msg = memdup_user_nul(buffer, count);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
- msg[count] = '\0';
bt = filp->private_data;
__trace_note_message(bt, "%s", msg);
kfree(msg);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4228fd3682c3..3e4ffb3ace5f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,8 +13,6 @@
#include <linux/ctype.h>
#include "trace.h"
-static DEFINE_PER_CPU(int, bpf_prog_active);
-
/**
* trace_call_bpf - invoke BPF program
* @prog: BPF program
@@ -191,14 +189,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct perf_event *event;
+ struct file *file;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (!event)
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
/* make sure event is local and doesn't have pmu::count */
if (event->oncpu != smp_processor_id() ||
event->pmu->count)
@@ -228,6 +229,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
struct perf_event *event;
+ struct file *file;
struct perf_raw_record raw = {
.size = size,
.data = data,
@@ -236,10 +238,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (unlikely(!event))
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
@@ -293,6 +297,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_perf_event_read_proto;
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto;
+ case BPF_FUNC_get_stackid:
+ return &bpf_get_stackid_proto;
default:
return NULL;
}
@@ -316,7 +322,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return true;
}
-static struct bpf_verifier_ops kprobe_prog_ops = {
+static const struct bpf_verifier_ops kprobe_prog_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3f743b147247..57a6eea84694 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,8 +62,6 @@
#define FTRACE_HASH_DEFAULT_BITS 10
#define FTRACE_HASH_MAX_BITS 12
-#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
-
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
@@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
-static struct ftrace_ops control_ops;
-
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs);
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -203,7 +196,7 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}
-static void control_ops_disable_all(struct ftrace_ops *ops)
+static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
{
int cpu;
@@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops)
*per_cpu_ptr(ops->disabled, cpu) = 1;
}
-static int control_ops_alloc(struct ftrace_ops *ops)
+static int per_cpu_ops_alloc(struct ftrace_ops *ops)
{
int __percpu *disabled;
+ if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
+ return -EINVAL;
+
disabled = alloc_percpu(int);
if (!disabled)
return -ENOMEM;
ops->disabled = disabled;
- control_ops_disable_all(ops);
+ per_cpu_ops_disable_all(ops);
return 0;
}
@@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { }
static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
{
/*
- * If this is a dynamic ops or we force list func,
+ * If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
+ FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;
return ftrace_ops_get_func(ops);
@@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
return 0;
}
-static void add_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int first = *list == &ftrace_list_end;
- add_ftrace_ops(list, ops);
- if (first)
- add_ftrace_ops(&ftrace_ops_list, main_ops);
-}
-
-static int remove_ftrace_list_ops(struct ftrace_ops **list,
- struct ftrace_ops *main_ops,
- struct ftrace_ops *ops)
-{
- int ret = remove_ftrace_ops(list, ops);
- if (!ret && *list == &ftrace_list_end)
- ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
- return ret;
-}
-
static void ftrace_update_trampoline(struct ftrace_ops *ops);
static int __register_ftrace_function(struct ftrace_ops *ops)
@@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- if (control_ops_alloc(ops))
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
+ if (per_cpu_ops_alloc(ops))
return -ENOMEM;
- add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
- /* The control_ops needs the trampoline update */
- ops = &control_ops;
- } else
- add_ftrace_ops(&ftrace_ops_list, ops);
+ }
+
+ add_ftrace_ops(&ftrace_ops_list, ops);
/* Always save the function, and reset at unregistering */
ops->saved_func = ops->func;
@@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
- if (ops->flags & FTRACE_OPS_FL_CONTROL) {
- ret = remove_ftrace_list_ops(&ftrace_control_list,
- &control_ops, ops);
- } else
- ret = remove_ftrace_ops(&ftrace_ops_list, ops);
+ ret = remove_ftrace_ops(&ftrace_ops_list, ops);
if (ret < 0)
return ret;
@@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int in_hash = 0;
int match = 0;
+ if (rec->flags & FTRACE_FL_DISABLED)
+ continue;
+
if (all) {
/*
* Only the filter_hash affects all records.
@@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
}
-static void print_ip_ins(const char *fmt, unsigned char *p)
+static void print_ip_ins(const char *fmt, const unsigned char *p)
{
int i;
@@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
static struct ftrace_ops *
ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
+
+enum ftrace_bug_type ftrace_bug_type;
+const void *ftrace_expected;
+
+static void print_bug_type(void)
+{
+ switch (ftrace_bug_type) {
+ case FTRACE_BUG_UNKNOWN:
+ break;
+ case FTRACE_BUG_INIT:
+ pr_info("Initializing ftrace call sites\n");
+ break;
+ case FTRACE_BUG_NOP:
+ pr_info("Setting ftrace call site to NOP\n");
+ break;
+ case FTRACE_BUG_CALL:
+ pr_info("Setting ftrace call site to call ftrace function\n");
+ break;
+ case FTRACE_BUG_UPDATE:
+ pr_info("Updating ftrace call site to call a different ftrace function\n");
+ break;
+ }
+}
/**
* ftrace_bug - report and shutdown function tracer
@@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
- print_ip_ins(" actual: ", (unsigned char *)ip);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
+ if (ftrace_expected) {
+ print_ip_ins(" expected: ", ftrace_expected);
+ pr_cont("\n");
+ }
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ print_bug_type();
if (rec) {
struct ftrace_ops *ops = NULL;
@@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
rec->flags & FTRACE_FL_REGS ? " R" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- pr_cont("\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ pr_cont("\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
pr_cont("\ttramp: ERROR!");
}
ip = ftrace_get_addr_curr(rec);
- pr_cont(" expected tramp: %lx\n", ip);
+ pr_cont("\n expected tramp: %lx\n", ip);
}
}
@@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
unsigned long flag = 0UL;
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
+ if (rec->flags & FTRACE_FL_DISABLED)
+ return FTRACE_UPDATE_IGNORE;
+
/*
* If we are updating calls:
*
@@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* from the save regs, to a non-save regs function or
* vice versa, or from a trampoline call.
*/
- if (flag & FTRACE_FL_ENABLED)
+ if (flag & FTRACE_FL_ENABLED) {
+ ftrace_bug_type = FTRACE_BUG_CALL;
return FTRACE_UPDATE_MAKE_CALL;
+ }
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return FTRACE_UPDATE_MODIFY_CALL;
}
@@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
FTRACE_FL_REGS_EN);
}
+ ftrace_bug_type = FTRACE_BUG_NOP;
return FTRACE_UPDATE_MAKE_NOP;
}
@@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
+ struct ftrace_ops *op)
+{
+ unsigned long ip = rec->ip;
+
+ while_for_each_ftrace_op(op) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ }
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
@@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
+ ftrace_bug_type = FTRACE_BUG_UNKNOWN;
+
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
+ ftrace_bug_type = FTRACE_BUG_CALL;
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
+ ftrace_bug_type = FTRACE_BUG_NOP;
return ftrace_make_nop(NULL, rec, ftrace_old_addr);
case FTRACE_UPDATE_MODIFY_CALL:
+ ftrace_bug_type = FTRACE_BUG_UPDATE;
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
@@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
+ ftrace_bug_type = FTRACE_BUG_INIT;
ftrace_bug(ret, rec);
return 0;
}
@@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
-static void control_ops_free(struct ftrace_ops *ops)
+static void per_cpu_ops_free(struct ftrace_ops *ops)
{
free_percpu(ops->disabled);
}
@@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are control ops, they still need their
+ * If these are per_cpu ops, they still need their
* per_cpu field freed. Since, function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
return 0;
}
@@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
- * The same goes for freeing the per_cpu data of the control
+ * The same goes for freeing the per_cpu data of the per_cpu
* ops.
*
* Again, normal synchronize_sched() is not good enough.
@@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
schedule_on_each_cpu(ftrace_sync);
arch_ftrace_trampoline_free(ops);
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & FTRACE_OPS_FL_PER_CPU)
+ per_cpu_ops_free(ops);
}
return 0;
@@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return 0;
- /* If ops traces all mods, we already accounted for it */
+ /* If ops traces all then it includes this function */
if (ops_traces_mod(ops))
- return 0;
+ return 1;
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
@@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
return 1;
}
-static int referenced_filters(struct dyn_ftrace *rec)
-{
- struct ftrace_ops *ops;
- int cnt = 0;
-
- for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
- }
-
- return cnt;
-}
-
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long update_cnt = 0;
- unsigned long ref = 0;
- bool test = false;
+ unsigned long rec_flags = 0;
int i;
+ start = ftrace_now(raw_smp_processor_id());
+
/*
- * When adding a module, we need to check if tracers are
- * currently enabled and if they are set to trace all functions.
- * If they are, we need to enable the module functions as well
- * as update the reference counts for those function records.
+ * When a module is loaded, this function is called to convert
+ * the calls to mcount in its text to nops, and also to create
+ * an entry in the ftrace data. Now, if ftrace is activated
+ * after this call, but before the module sets its text to
+ * read-only, the modification of enabling ftrace can fail if
+ * the read-only is done while ftrace is converting the calls.
+ * To prevent this, the module's records are set as disabled
+ * and will be enabled after the call to set the module's text
+ * to read-only.
*/
- if (mod) {
- struct ftrace_ops *ops;
-
- for (ops = ftrace_ops_list;
- ops != &ftrace_list_end; ops = ops->next) {
- if (ops->flags & FTRACE_OPS_FL_ENABLED) {
- if (ops_traces_mod(ops))
- ref++;
- else
- test = true;
- }
- }
- }
-
- start = ftrace_now(raw_smp_processor_id());
+ if (mod)
+ rec_flags |= FTRACE_FL_DISABLED;
for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
- int cnt = ref;
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
- if (test)
- cnt += referenced_filters(p);
- p->flags = cnt;
+ p->flags = rec_flags;
/*
* Do the initial record conversion from mcount jump
@@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
break;
update_cnt++;
-
- /*
- * If the tracing is enabled, go ahead and enable the record.
- *
- * The reason not to enable the record immediatelly is the
- * inherent check of ftrace_make_nop/ftrace_make_call for
- * correct previous instructions. Making first the NOP
- * conversion puts the module to the correct state, thus
- * passing the ftrace_make_call check.
- */
- if (ftrace_start_up && cnt) {
- int failed = __ftrace_replace_code(p, 1);
- if (failed)
- ftrace_bug(failed, p);
- }
}
}
@@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED) {
- struct ftrace_ops *ops = NULL;
+ struct ftrace_ops *ops;
seq_printf(m, " (%ld)%s%s",
ftrace_rec_count(rec),
@@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v)
rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_any(rec);
- if (ops)
- seq_printf(m, "\ttramp: %pS",
- (void *)ops->trampoline);
- else
+ if (ops) {
+ do {
+ seq_printf(m, "\ttramp: %pS (%pS)",
+ (void *)ops->trampoline,
+ (void *)ops->func);
+ add_trampoline_func(m, ops, rec);
+ ops = ftrace_find_tramp_ops_next(rec, ops);
+ } while (ops);
+ } else
seq_puts(m, "\ttramp: ERROR!");
-
+ } else {
+ add_trampoline_func(m, NULL, rec);
}
- add_trampoline_func(m, ops, rec);
}
seq_putc(m, '\n');
@@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod,
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
+}
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
@@ -4940,44 +4961,85 @@ void ftrace_release_mod(struct module *mod)
mutex_unlock(&ftrace_lock);
}
-static void ftrace_init_module(struct module *mod,
- unsigned long *start, unsigned long *end)
+void ftrace_module_enable(struct module *mod)
{
- if (ftrace_disabled || start == end)
- return;
- ftrace_process_locs(mod, start, end);
-}
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
-void ftrace_module_init(struct module *mod)
-{
- ftrace_init_module(mod, mod->ftrace_callsites,
- mod->ftrace_callsites +
- mod->num_ftrace_callsites);
-}
+ mutex_lock(&ftrace_lock);
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
-{
- struct module *mod = data;
+ if (ftrace_disabled)
+ goto out_unlock;
- if (val == MODULE_STATE_GOING)
- ftrace_release_mod(mod);
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediatelly is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ *
+ * We also delay this to after the module code already set the
+ * text to read-only, as we now need to set it back to read-write
+ * so that we can modify the text.
+ */
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_prepare();
- return 0;
+ do_for_each_ftrace_rec(pg, rec) {
+ int cnt;
+ /*
+ * do_for_each_ftrace_rec() is a double loop.
+ * module text shares the pg. If a record is
+ * not part of this module, then skip this pg,
+ * which the "break" will do.
+ */
+ if (!within_module_core(rec->ip, mod))
+ break;
+
+ cnt = 0;
+
+ /*
+ * When adding a module, we need to check if tracers are
+ * currently enabled and if they are, and can trace this record,
+ * we need to enable the module functions as well as update the
+ * reference counts for those function records.
+ */
+ if (ftrace_start_up)
+ cnt += referenced_filters(rec);
+
+ /* This clears FTRACE_FL_DISABLED */
+ rec->flags = cnt;
+
+ if (ftrace_start_up && cnt) {
+ int failed = __ftrace_replace_code(rec, 1);
+ if (failed) {
+ ftrace_bug(failed, rec);
+ goto out_loop;
+ }
+ }
+
+ } while_for_each_ftrace_rec();
+
+ out_loop:
+ if (ftrace_start_up)
+ ftrace_arch_code_modify_post_process();
+
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
}
-#else
-static int ftrace_module_notify_exit(struct notifier_block *self,
- unsigned long val, void *data)
+
+void ftrace_module_init(struct module *mod)
{
- return 0;
+ if (ftrace_disabled || !mod->num_ftrace_callsites)
+ return;
+
+ ftrace_process_locs(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites + mod->num_ftrace_callsites);
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_exit_nb = {
- .notifier_call = ftrace_module_notify_exit,
- .priority = INT_MIN, /* Run after anything that can remove kprobes */
-};
-
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
@@ -5006,10 +5068,6 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_exit_nb);
- if (ret)
- pr_warning("Failed to register trace ftrace module exit notifier\n");
-
set_ftrace_early_filters();
return;
@@ -5116,44 +5174,6 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static void
-ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *op, struct pt_regs *regs)
-{
- if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
- return;
-
- /*
- * Some of the ops may be dynamically allocated,
- * they must be freed after a synchronize_sched().
- */
- preempt_disable_notrace();
- trace_recursion_set(TRACE_CONTROL_BIT);
-
- /*
- * Control funcs (perf) uses RCU. Only trace if
- * RCU is currently active.
- */
- if (!rcu_is_watching())
- goto out;
-
- do_for_each_ftrace_op(op, ftrace_control_list) {
- if (!(op->flags & FTRACE_OPS_FL_STUB) &&
- !ftrace_function_local_disabled(op) &&
- ftrace_ops_test(op, ip, regs))
- op->func(ip, parent_ip, op, regs);
- } while_for_each_ftrace_op(op);
- out:
- trace_recursion_clear(TRACE_CONTROL_BIT);
- preempt_enable_notrace();
-}
-
-static struct ftrace_ops control_ops = {
- .func = ftrace_ops_control_func,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(control_ops)
-};
-
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
@@ -5170,8 +5190,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
+
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (ftrace_ops_test(op, ip, regs)) {
+ /*
+ * Check the following for each ops before calling their func:
+ * if RCU flag is set, then rcu_is_watching() must be true
+ * if PER_CPU is set, then ftrace_function_local_disable()
+ * must be false
+ * Otherwise test if the ip matches the ops filter
+ *
+ * If any of the above fails then the op->func() is not executed.
+ */
+ if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
+ (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) &&
+ ftrace_ops_test(op, ip, regs)) {
+
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -5195,7 +5229,7 @@ out:
* being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
* Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
* An architecture can pass partial regs with ftrace_ops and still
- * set the ARCH_SUPPORT_FTARCE_OPS.
+ * set the ARCH_SUPPORTS_FTRACE_OPS.
*/
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
@@ -5212,20 +5246,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
/*
* If there's only one function registered but it does not support
- * recursion, this function will be called by the mcount trampoline.
- * This function will handle recursion protection.
+ * recursion, needs RCU protection and/or requires per cpu handling, then
+ * this function will be called by the mcount trampoline.
*/
-static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
int bit;
+ if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
+ return;
+
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
- op->func(ip, parent_ip, op, regs);
+ preempt_disable_notrace();
+ if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
+ !ftrace_function_local_disabled(op)) {
+ op->func(ip, parent_ip, op, regs);
+ }
+
+ preempt_enable_notrace();
trace_clear_recursion(bit);
}
@@ -5243,12 +5286,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
/*
- * If the func handles its own recursion, call it directly.
- * Otherwise call the recursion protected function that
- * will call the ftrace ops function.
+ * If the function does not handle recursion, needs to be RCU safe,
+ * or does per cpu logic, then we need to call the assist handler.
*/
- if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
- return ftrace_ops_recurs_func;
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
+ ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ return ftrace_ops_assist_func;
return ops->func;
}
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index eb4220a132ec..81b87451c0ea 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -15,4 +15,5 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
+EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c6045a27ba3..95181e36891a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old,
/*
* rb_tail_page_update - move the tail page forward
- *
- * Returns 1 if moved tail page, 0 if someone else did.
*/
-static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *tail_page,
struct buffer_page *next_page)
{
- struct buffer_page *old_tail;
unsigned long old_entries;
unsigned long old_write;
- int ret = 0;
/*
* The tail page now needs to be moved forward.
@@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
* it is, then it is up to us to update the tail
* pointer.
*/
- if (tail_page == cpu_buffer->tail_page) {
+ if (tail_page == READ_ONCE(cpu_buffer->tail_page)) {
/* Zero the write counter */
unsigned long val = old_write & ~RB_WRITE_MASK;
unsigned long eval = old_entries & ~RB_WRITE_MASK;
@@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_set(&next_page->page->commit, 0);
- old_tail = cmpxchg(&cpu_buffer->tail_page,
- tail_page, next_page);
-
- if (old_tail == tail_page)
- ret = 1;
+ /* Again, either we update tail_page or an interrupt does */
+ (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
}
-
- return ret;
}
static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the tail page would have moved.
*/
if (ret == RB_PAGE_NORMAL) {
+ struct buffer_page *buffer_tail_page;
+
+ buffer_tail_page = READ_ONCE(cpu_buffer->tail_page);
/*
* If the tail had moved passed next, then we need
* to reset the pointer.
*/
- if (cpu_buffer->tail_page != tail_page &&
- cpu_buffer->tail_page != next_page)
+ if (buffer_tail_page != tail_page &&
+ buffer_tail_page != next_page)
rb_head_page_set_normal(cpu_buffer, new_head,
next_page,
RB_PAGE_HEAD);
@@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write);
}
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
+
/*
* This is the slow path, force gcc not to inline it.
*/
@@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
- u64 ts;
next_page = tail_page;
@@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
}
- ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
- if (ret) {
- /*
- * Nested commits always have zero deltas, so
- * just reread the time stamp
- */
- ts = rb_time_stamp(buffer);
- next_page->page->time_stamp = ts;
- }
+ rb_tail_page_update(cpu_buffer, tail_page, next_page);
out_again:
rb_reset_tail(cpu_buffer, tail, info);
+ /* Commit what we have for now. */
+ rb_end_commit(cpu_buffer);
+ /* rb_end_commit() decs committing */
+ local_inc(&cpu_buffer->committing);
+
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
@@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
addr = (unsigned long)event;
addr &= PAGE_MASK;
- bpage = cpu_buffer->tail_page;
+ bpage = READ_ONCE(cpu_buffer->tail_page);
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
@@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
again:
max_count = cpu_buffer->nr_pages * 100;
- while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
return;
if (RB_WARN_ON(cpu_buffer,
@@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
+ /* Only update the write stamp if the page has an event */
+ if (rb_page_write(cpu_buffer->commit_page))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
* and pushed the tail page forward, we will be left with
* a dangling commit that will never go forward.
*/
- if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)))
goto again;
}
@@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp))
info->length += RB_LEN_TIME_EXTEND;
- tail_page = info->tail_page = cpu_buffer->tail_page;
+ /* Don't let the compiler play games with cpu_buffer->tail_page */
+ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
write = local_add_return(info->length, &tail_page->write);
/* set write to only the index of the write */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 87fb9801bd9e..d9293402ee68 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1751,7 +1751,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, flags, 6, pc, regs);
+ ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 919d9d07686f..8414fa40bf27 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -363,8 +363,8 @@ struct trace_option_dentry {
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
- * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
- * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @start: called when tracing is unpaused (echo 1 > tracing_on)
+ * @stop: called when tracing is paused (echo 0 > tracing_on)
* @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
@@ -467,8 +467,6 @@ enum {
TRACE_INTERNAL_IRQ_BIT,
TRACE_INTERNAL_SIRQ_BIT,
- TRACE_CONTROL_BIT,
-
TRACE_BRANCH_BIT,
/*
* Abuse of the trace_recursion.
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index cc9f7a9319be..00df25fd86ef 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags |= FTRACE_OPS_FL_CONTROL;
+ ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
ops->func = perf_ftrace_function_call;
return register_ftrace_function(ops);
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4f6ef6912e00..05ddc0820771 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name)
struct ftrace_event_field *field;
struct list_head *head;
- field = __find_event_field(&ftrace_generic_fields, name);
+ head = trace_get_fields(call);
+ field = __find_event_field(head, name);
if (field)
return field;
- field = __find_event_field(&ftrace_common_fields, name);
+ field = __find_event_field(&ftrace_generic_fields, name);
if (field)
return field;
- head = trace_get_fields(call);
- return __find_event_field(head, name);
+ return __find_event_field(&ftrace_common_fields, name);
}
static int __trace_define_field(struct list_head *head, const char *type,
@@ -171,8 +171,10 @@ static int trace_define_generic_fields(void)
{
int ret;
- __generic_field(int, cpu, FILTER_OTHER);
- __generic_field(char *, comm, FILTER_PTR_STRING);
+ __generic_field(int, CPU, FILTER_CPU);
+ __generic_field(int, cpu, FILTER_CPU);
+ __generic_field(char *, COMM, FILTER_COMM);
+ __generic_field(char *, comm, FILTER_COMM);
return ret;
}
@@ -869,7 +871,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
- if (call->class && call->class->reg)
+ if (call->class && call->class->reg &&
+ !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
return file;
}
@@ -1340,15 +1343,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long) buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
mutex_lock(&event_mutex);
file = event_file_data(filp);
@@ -1356,7 +1353,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
err = apply_event_filter(file, buf);
mutex_unlock(&event_mutex);
- free_page((unsigned long) buf);
+ kfree(buf);
if (err < 0)
return err;
@@ -1507,18 +1504,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
-
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long) buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
err = apply_subsystem_event_filter(dir, buf);
- free_page((unsigned long) buf);
+ kfree(buf);
if (err < 0)
return err;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f93a219b18da..6816302542b2 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps,
return -EINVAL;
}
- if (is_string_field(field)) {
+ if (field->filter_type == FILTER_COMM) {
+ filter_build_regex(pred);
+ fn = filter_pred_comm;
+ pred->regex.field_len = TASK_COMM_LEN;
+ } else if (is_string_field(field)) {
filter_build_regex(pred);
- if (!strcmp(field->name, "comm")) {
- fn = filter_pred_comm;
- pred->regex.field_len = TASK_COMM_LEN;
- } else if (field->filter_type == FILTER_STATIC_STRING) {
+ if (field->filter_type == FILTER_STATIC_STRING) {
fn = filter_pred_string;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING)
@@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps,
}
pred->val = val;
- if (!strcmp(field->name, "cpu"))
+ if (field->filter_type == FILTER_CPU)
fn = filter_pred_cpu;
else
fn = select_comparison_fn(pred->op, field->size,
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 42a4009fd75a..b38f617b6181 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -237,28 +237,23 @@ static ssize_t event_trigger_regex_write(struct file *file,
if (cnt >= PAGE_SIZE)
return -EINVAL;
- buf = (char *)__get_free_page(GFP_TEMPORARY);
- if (!buf)
- return -ENOMEM;
+ buf = memdup_user_nul(ubuf, cnt);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
- if (copy_from_user(buf, ubuf, cnt)) {
- free_page((unsigned long)buf);
- return -EFAULT;
- }
- buf[cnt] = '\0';
strim(buf);
mutex_lock(&event_mutex);
event_file = event_file_data(file);
if (unlikely(!event_file)) {
mutex_unlock(&event_mutex);
- free_page((unsigned long)buf);
+ kfree(buf);
return -ENODEV;
}
ret = trigger_process_regex(event_file, buf);
mutex_unlock(&event_mutex);
- free_page((unsigned long)buf);
+ kfree(buf);
if (ret < 0)
goto out;
@@ -543,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -575,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
@@ -1319,11 +1315,12 @@ static int event_enable_register_trigger(char *glob,
list_add_rcu(&data->list, &file->triggers);
ret++;
+ update_cond_flag(file);
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
+ update_cond_flag(file);
ret--;
}
- update_cond_flag(file);
out:
return ret;
}
@@ -1344,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob,
(enable_data->file == test_enable_data->file)) {
unregistered = true;
list_del_rcu(&data->list);
- update_cond_flag(file);
trace_event_trigger_enable_disable(file, 0);
+ update_cond_flag(file);
break;
}
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c9956440d0e6..21b81a41dae5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -30,7 +30,7 @@
struct trace_kprobe {
struct list_head list;
struct kretprobe rp; /* Use rp.kp for kprobe use */
- unsigned long nhit;
+ unsigned long __percpu *nhit;
const char *symbol; /* symbol name */
struct trace_probe tp;
};
@@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
if (!tk)
return ERR_PTR(ret);
+ tk->nhit = alloc_percpu(unsigned long);
+ if (!tk->nhit)
+ goto error;
+
if (symbol) {
tk->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tk->symbol)
@@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
error:
kfree(tk->tp.call.name);
kfree(tk->symbol);
+ free_percpu(tk->nhit);
kfree(tk);
return ERR_PTR(ret);
}
@@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
kfree(tk->tp.call.class->system);
kfree(tk->tp.call.name);
kfree(tk->symbol);
+ free_percpu(tk->nhit);
kfree(tk);
}
@@ -874,9 +880,14 @@ static const struct file_operations kprobe_events_ops = {
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct trace_kprobe *tk = v;
+ unsigned long nhit = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ nhit += *per_cpu_ptr(tk->nhit, cpu);
seq_printf(m, " %-44s %15lu %15lu\n",
- trace_event_name(&tk->tp.call), tk->nhit,
+ trace_event_name(&tk->tp.call), nhit,
tk->rp.kp.nmissed);
return 0;
@@ -1225,7 +1236,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
- tk->nhit++;
+ raw_cpu_inc(*tk->nhit);
if (tk->tp.flags & TP_FLAG_TRACE)
kprobe_trace_func(tk, regs);
@@ -1242,7 +1253,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
- tk->nhit++;
+ raw_cpu_inc(*tk->nhit);
if (tk->tp.flags & TP_FLAG_TRACE)
kretprobe_trace_func(tk, ri, regs);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index dda9e6742950..2a1abbaca10e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -126,6 +126,13 @@ check_stack(unsigned long ip, unsigned long *stack)
}
/*
+ * Some archs may not have the passed in ip in the dump.
+ * If that happens, we need to show everything.
+ */
+ if (i == stack_trace_max.nr_entries)
+ i = 0;
+
+ /*
* Now find where in the stack these are.
*/
x = 0;
@@ -149,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack)
for (; p < top && i < stack_trace_max.nr_entries; p++) {
if (stack_dump_trace[i] == ULONG_MAX)
break;
- if (*p == stack_dump_trace[i]) {
+ /*
+ * The READ_ONCE_NOCHECK is used to let KASAN know that
+ * this is not a stack-out-of-bounds error.
+ */
+ if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) {
stack_dump_trace[x] = stack_dump_trace[i++];
this_size = stack_trace_index[x++] =
(top - p) * sizeof(unsigned long);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 0655afbea83f..d1663083d903 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -186,11 +186,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
extern char *__bad_type_size(void);
-#define SYSCALL_FIELD(type, name) \
- sizeof(type) != sizeof(trace.name) ? \
+#define SYSCALL_FIELD(type, field, name) \
+ sizeof(type) != sizeof(trace.field) ? \
__bad_type_size() : \
- #type, #name, offsetof(typeof(trace), name), \
- sizeof(trace.name), is_signed_type(type)
+ #type, #name, offsetof(typeof(trace), field), \
+ sizeof(trace.field), is_signed_type(type)
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
@@ -261,7 +261,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
int i;
int offset = offsetof(typeof(trace), args);
- ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+ ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
+ FILTER_OTHER);
if (ret)
return ret;
@@ -281,11 +282,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call)
struct syscall_trace_exit trace;
int ret;
- ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+ ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
+ FILTER_OTHER);
if (ret)
return ret;
- ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
+ ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret),
FILTER_OTHER);
return ret;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 975cb49e32bf..f8e26ab963ed 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{
struct mm_struct *mm;
- /* convert pages-usec to Mbyte-usec */
- stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
- stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+ /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
+ stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
+ do_div(stats->coremem, 1000 * KB);
+ stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
+ do_div(stats->virtmem, 1000 * KB);
mm = get_task_mm(p);
if (mm) {
/* adjust to KB unit */
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
static void __acct_update_integrals(struct task_struct *tsk,
cputime_t utime, cputime_t stime)
{
- if (likely(tsk->mm)) {
- cputime_t time, dtime;
- struct timeval value;
- unsigned long flags;
- u64 delta;
-
- local_irq_save(flags);
- time = stime + utime;
- dtime = time - tsk->acct_timexpd;
- jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
- delta = value.tv_sec;
- delta = delta * USEC_PER_SEC + value.tv_usec;
-
- if (delta == 0)
- goto out;
- tsk->acct_timexpd = time;
- tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
- tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
- out:
- local_irq_restore(flags);
- }
+ cputime_t time, dtime;
+ u64 delta;
+
+ if (!likely(tsk->mm))
+ return;
+
+ time = stime + utime;
+ dtime = time - tsk->acct_timexpd;
+ /* Avoid division: cputime_t is often in nanoseconds already. */
+ delta = cputime_to_nsecs(dtime);
+
+ if (delta < TICK_NSEC)
+ return;
+
+ tsk->acct_timexpd = time;
+ /*
+ * Divide by 1024 to avoid overflow, and to avoid division.
+ * The final unit reported to userspace is Mbyte-usecs,
+ * the rest of the math is done in xacct_add_tsk.
+ */
+ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
+ tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
}
/**
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
void acct_update_integrals(struct task_struct *tsk)
{
cputime_t utime, stime;
+ unsigned long flags;
+ local_irq_save(flags);
task_cputime(tsk, &utime, &stime);
__acct_update_integrals(tsk, utime, stime);
+ local_irq_restore(flags);
}
/**
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 88fefa68c516..9bafc211930c 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
struct uid_gid_map new_map;
unsigned idx;
struct uid_gid_extent *extent = NULL;
- unsigned long page = 0;
- char *kbuf, *pos, *next_line;
+ char *kbuf = NULL, *pos, *next_line;
ssize_t ret = -EINVAL;
/*
@@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
- /* Get a buffer */
- ret = -ENOMEM;
- page = __get_free_page(GFP_TEMPORARY);
- kbuf = (char *) page;
- if (!page)
- goto out;
-
/* Only allow < page size writes at the beginning of the file */
ret = -EINVAL;
if ((*ppos != 0) || (count >= PAGE_SIZE))
goto out;
/* Slurp in the user data */
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, count))
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf)) {
+ ret = PTR_ERR(kbuf);
+ kbuf = NULL;
goto out;
- kbuf[count] = '\0';
+ }
/* Parse the user data */
ret = -EINVAL;
@@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = count;
out:
mutex_unlock(&userns_state_mutex);
- if (page)
- free_page(page);
+ kfree(kbuf);
return ret;
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18f34cf75f74..9acb29f280ec 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -20,6 +20,7 @@
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
#include <linux/tick.h>
+#include <linux/workqueue.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -225,7 +226,15 @@ static void __touch_watchdog(void)
__this_cpu_write(watchdog_touch_ts, get_timestamp());
}
-void touch_softlockup_watchdog(void)
+/**
+ * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
+ *
+ * Call when the scheduler may have stalled for legitimate reasons
+ * preventing the watchdog task from executing - e.g. the scheduler
+ * entering idle state. This should only be used for scheduler events.
+ * Use touch_softlockup_watchdog() for everything else.
+ */
+void touch_softlockup_watchdog_sched(void)
{
/*
* Preemption can be enabled. It doesn't matter which CPU's timestamp
@@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void)
*/
raw_cpu_write(watchdog_touch_ts, 0);
}
+
+void touch_softlockup_watchdog(void)
+{
+ touch_softlockup_watchdog_sched();
+ wq_watchdog_touch(raw_smp_processor_id());
+}
EXPORT_SYMBOL(touch_softlockup_watchdog);
void touch_all_softlockup_watchdogs(void)
@@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void)
*/
for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0;
+ wq_watchdog_touch(-1);
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -351,7 +367,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
trigger_allbutself_cpu_backtrace();
if (hardlockup_panic)
- panic("Hard LOCKUP");
+ nmi_panic(regs, "Hard LOCKUP");
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -907,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* both lockup detectors are disabled if proc_watchdog_update()
* returns an error.
*/
+ if (old == new)
+ goto out;
+
err = proc_watchdog_update();
}
out:
@@ -951,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
int proc_watchdog_thresh(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old;
+ int err, old, new;
get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
@@ -971,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
/*
* Update the sample period. Restore on failure.
*/
+ new = ACCESS_ONCE(watchdog_thresh);
+ if (old == new)
+ goto out;
+
set_sample_period();
err = proc_watchdog_update();
if (err) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c579dbab2e36..2232ae3e3ad6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -148,6 +148,8 @@ struct worker_pool {
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
+ unsigned long watchdog_ts; /* L: watchdog timestamp */
+
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
@@ -299,11 +301,26 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
-static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+/* PL: allowable cpus for unbound wqs and work items */
+static cpumask_var_t wq_unbound_cpumask;
+
+/* CPU where unbound work was last round robin scheduled from this CPU */
+static DEFINE_PER_CPU(int, wq_rr_cpu_last);
+
+/*
+ * Local execution of unbound work items is no longer guaranteed. The
+ * following always forces round-robin CPU selection on unbound work items
+ * to uncover usages which depend on it.
+ */
+#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
+static bool wq_debug_force_rr_cpu = true;
+#else
+static bool wq_debug_force_rr_cpu = false;
+#endif
+module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
/* the per-cpu worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
- cpu_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
@@ -568,6 +585,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
int node)
{
assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+ /*
+ * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+ * delayed item is pending. The plan is to keep CPU -> NODE
+ * mapping valid and stable across CPU on/offlines. Once that
+ * happens, this workaround can be removed.
+ */
+ if (unlikely(node == NUMA_NO_NODE))
+ return wq->dfl_pwq;
+
return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}
@@ -830,7 +857,6 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
/**
* wq_worker_sleeping - a worker is going to sleep
* @task: task going to sleep
- * @cpu: CPU in question, must be the current CPU number
*
* This function is called during schedule() when a busy worker is
* going to sleep. Worker on the same cpu can be woken up by
@@ -842,7 +868,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
* Return:
* Worker task on @cpu to wake up, %NULL if none.
*/
-struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
+struct task_struct *wq_worker_sleeping(struct task_struct *task)
{
struct worker *worker = kthread_data(task), *to_wakeup = NULL;
struct worker_pool *pool;
@@ -858,7 +884,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
pool = worker->pool;
/* this can only happen on the local cpu */
- if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
+ if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
return NULL;
/*
@@ -1083,6 +1109,8 @@ static void pwq_activate_delayed_work(struct work_struct *work)
struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
+ if (list_empty(&pwq->pool->worklist))
+ pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
pwq->nr_active++;
@@ -1294,6 +1322,39 @@ static bool is_chained_work(struct workqueue_struct *wq)
return worker && worker->current_pwq->wq == wq;
}
+/*
+ * When queueing an unbound work item to a wq, prefer local CPU if allowed
+ * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to
+ * avoid perturbing sensitive tasks.
+ */
+static int wq_select_unbound_cpu(int cpu)
+{
+ static bool printed_dbg_warning;
+ int new_cpu;
+
+ if (likely(!wq_debug_force_rr_cpu)) {
+ if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+ return cpu;
+ } else if (!printed_dbg_warning) {
+ pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
+ printed_dbg_warning = true;
+ }
+
+ if (cpumask_empty(wq_unbound_cpumask))
+ return cpu;
+
+ new_cpu = __this_cpu_read(wq_rr_cpu_last);
+ new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids)) {
+ new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids))
+ return cpu;
+ }
+ __this_cpu_write(wq_rr_cpu_last, new_cpu);
+
+ return new_cpu;
+}
+
static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
@@ -1319,7 +1380,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
return;
retry:
if (req_cpu == WORK_CPU_UNBOUND)
- cpu = raw_smp_processor_id();
+ cpu = wq_select_unbound_cpu(raw_smp_processor_id());
/* pwq which will be used unless @work is executing elsewhere */
if (!(wq->flags & WQ_UNBOUND))
@@ -1385,6 +1446,8 @@ retry:
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
+ if (list_empty(worklist))
+ pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
@@ -1458,13 +1521,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
timer_stats_timer_set_start_info(&dwork->timer);
dwork->wq = wq;
- /* timer isn't guaranteed to run in this cpu, record earlier */
- if (cpu == WORK_CPU_UNBOUND)
- cpu = raw_smp_processor_id();
dwork->cpu = cpu;
timer->expires = jiffies + delay;
- add_timer_on(timer, cpu);
+ if (unlikely(cpu != WORK_CPU_UNBOUND))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
}
/**
@@ -2157,6 +2220,8 @@ recheck:
list_first_entry(&pool->worklist,
struct work_struct, entry);
+ pool->watchdog_ts = jiffies;
+
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
@@ -2240,6 +2305,7 @@ repeat:
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
+ bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
@@ -2256,9 +2322,14 @@ repeat:
* process'em.
*/
WARN_ON_ONCE(!list_empty(scheduled));
- list_for_each_entry_safe(work, n, &pool->worklist, entry)
- if (get_work_pwq(work) == pwq)
+ list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ if (first)
+ pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n);
+ }
+ first = false;
+ }
if (!list_empty(scheduled)) {
process_scheduled_works(rescuer);
@@ -2316,6 +2387,38 @@ repeat:
goto repeat;
}
+/**
+ * check_flush_dependency - check for flush dependency sanity
+ * @target_wq: workqueue being flushed
+ * @target_work: work item being flushed (NULL for workqueue flushes)
+ *
+ * %current is trying to flush the whole @target_wq or @target_work on it.
+ * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
+ * reclaiming memory or running on a workqueue which doesn't have
+ * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
+ * a deadlock.
+ */
+static void check_flush_dependency(struct workqueue_struct *target_wq,
+ struct work_struct *target_work)
+{
+ work_func_t target_func = target_work ? target_work->func : NULL;
+ struct worker *worker;
+
+ if (target_wq->flags & WQ_MEM_RECLAIM)
+ return;
+
+ worker = current_wq_worker();
+
+ WARN_ONCE(current->flags & PF_MEMALLOC,
+ "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
+ current->pid, current->comm, target_wq->name, target_func);
+ WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
+ (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
+ "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
+ worker->current_pwq->wq->name, worker->current_func,
+ target_wq->name, target_func);
+}
+
struct wq_barrier {
struct work_struct work;
struct completion done;
@@ -2525,6 +2628,8 @@ void flush_workqueue(struct workqueue_struct *wq)
list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
+ check_flush_dependency(wq, NULL);
+
mutex_unlock(&wq->mutex);
wait_for_completion(&this_flusher.done);
@@ -2697,6 +2802,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
pwq = worker->current_pwq;
}
+ check_flush_dependency(pwq->wq, work);
+
insert_wq_barrier(pwq, barr, work, worker);
spin_unlock_irq(&pool->lock);
@@ -3069,6 +3176,7 @@ static int init_worker_pool(struct worker_pool *pool)
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
+ pool->watchdog_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
@@ -3601,7 +3709,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
struct apply_wqattrs_ctx *ctx;
- int ret = -ENOMEM;
/* only unbound workqueues can change attributes */
if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
@@ -3612,16 +3719,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
ctx = apply_wqattrs_prepare(wq, attrs);
+ if (!ctx)
+ return -ENOMEM;
/* the ctx has been prepared successfully, let's commit it */
- if (ctx) {
- apply_wqattrs_commit(ctx);
- ret = 0;
- }
-
+ apply_wqattrs_commit(ctx);
apply_wqattrs_cleanup(ctx);
- return ret;
+ return 0;
}
/**
@@ -4308,7 +4413,9 @@ void show_workqueue_state(void)
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" workers=%d", pool->nr_workers);
+ pr_cont(" hung=%us workers=%d",
+ jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+ pool->nr_workers);
if (pool->manager)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
@@ -4587,7 +4694,7 @@ static void work_for_cpu_fn(struct work_struct *work)
}
/**
- * work_on_cpu - run a function in user context on a particular cpu
+ * work_on_cpu - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function arg
@@ -5113,8 +5220,8 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
wq_dev->wq = wq;
wq_dev->dev.bus = &wq_subsys;
- wq_dev->dev.init_name = wq->name;
wq_dev->dev.release = wq_device_release;
+ dev_set_name(&wq_dev->dev, "%s", wq->name);
/*
* unbound_attrs are created separately. Suppress uevent until
@@ -5167,6 +5274,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
#endif /* CONFIG_SYSFS */
+/*
+ * Workqueue watchdog.
+ *
+ * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
+ * flush dependency, a concurrency managed work item which stays RUNNING
+ * indefinitely. Workqueue stalls can be very difficult to debug as the
+ * usual warning mechanisms don't trigger and internal workqueue state is
+ * largely opaque.
+ *
+ * Workqueue watchdog monitors all worker pools periodically and dumps
+ * state if some pools failed to make forward progress for a while where
+ * forward progress is defined as the first item on ->worklist changing.
+ *
+ * This mechanism is controlled through the kernel parameter
+ * "workqueue.watchdog_thresh" which can be updated at runtime through the
+ * corresponding sysfs parameter file.
+ */
+#ifdef CONFIG_WQ_WATCHDOG
+
+static void wq_watchdog_timer_fn(unsigned long data);
+
+static unsigned long wq_watchdog_thresh = 30;
+static struct timer_list wq_watchdog_timer =
+ TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+
+static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
+static void wq_watchdog_reset_touched(void)
+{
+ int cpu;
+
+ wq_watchdog_touched = jiffies;
+ for_each_possible_cpu(cpu)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+}
+
+static void wq_watchdog_timer_fn(unsigned long data)
+{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ bool lockup_detected = false;
+ struct worker_pool *pool;
+ int pi;
+
+ if (!thresh)
+ return;
+
+ rcu_read_lock();
+
+ for_each_pool(pool, pi) {
+ unsigned long pool_ts, touched, ts;
+
+ if (list_empty(&pool->worklist))
+ continue;
+
+ /* get the latest of pool and touched timestamps */
+ pool_ts = READ_ONCE(pool->watchdog_ts);
+ touched = READ_ONCE(wq_watchdog_touched);
+
+ if (time_after(pool_ts, touched))
+ ts = pool_ts;
+ else
+ ts = touched;
+
+ if (pool->cpu >= 0) {
+ unsigned long cpu_touched =
+ READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
+ pool->cpu));
+ if (time_after(cpu_touched, ts))
+ ts = cpu_touched;
+ }
+
+ /* did we stall? */
+ if (time_after(jiffies, ts + thresh)) {
+ lockup_detected = true;
+ pr_emerg("BUG: workqueue lockup - pool");
+ pr_cont_pool_info(pool);
+ pr_cont(" stuck for %us!\n",
+ jiffies_to_msecs(jiffies - pool_ts) / 1000);
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (lockup_detected)
+ show_workqueue_state();
+
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh);
+}
+
+void wq_watchdog_touch(int cpu)
+{
+ if (cpu >= 0)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ else
+ wq_watchdog_touched = jiffies;
+}
+
+static void wq_watchdog_set_thresh(unsigned long thresh)
+{
+ wq_watchdog_thresh = 0;
+ del_timer_sync(&wq_watchdog_timer);
+
+ if (thresh) {
+ wq_watchdog_thresh = thresh;
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
+ }
+}
+
+static int wq_watchdog_param_set_thresh(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long thresh;
+ int ret;
+
+ ret = kstrtoul(val, 0, &thresh);
+ if (ret)
+ return ret;
+
+ if (system_wq)
+ wq_watchdog_set_thresh(thresh);
+ else
+ wq_watchdog_thresh = thresh;
+
+ return 0;
+}
+
+static const struct kernel_param_ops wq_watchdog_thresh_ops = {
+ .set = wq_watchdog_param_set_thresh,
+ .get = param_get_ulong,
+};
+
+module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
+ 0644);
+
+static void wq_watchdog_init(void)
+{
+ wq_watchdog_set_thresh(wq_watchdog_thresh);
+}
+
+#else /* CONFIG_WQ_WATCHDOG */
+
+static inline void wq_watchdog_init(void) { }
+
+#endif /* CONFIG_WQ_WATCHDOG */
+
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
@@ -5290,6 +5545,9 @@ static int __init init_workqueues(void)
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
+
+ wq_watchdog_init();
+
return 0;
}
early_initcall(init_workqueues);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 45215870ac6c..8635417c587b 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -69,6 +69,6 @@ static inline struct worker *current_wq_worker(void)
* sched/core.c and workqueue.c.
*/
void wq_worker_waking_up(struct task_struct *task, int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */