summaryrefslogtreecommitdiffstats
path: root/kernel/events/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--kernel/events/core.c436
1 files changed, 293 insertions, 143 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6c4e523dc1e2..10cdb9c26b5d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
+static cpumask_var_t perf_online_mask;
/*
* perf event paranoia level:
@@ -661,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
- if (cgrp == event->cgrp)
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
@@ -900,9 +901,11 @@ list_update_cgroup_event(struct perf_event *event,
cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
/* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
if (add) {
+ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
+
list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
- if (perf_cgroup_from_task(current, ctx) == event->cgrp)
- cpuctx->cgrp = event->cgrp;
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
+ cpuctx->cgrp = cgrp;
} else {
list_del(cpuctx_entry);
cpuctx->cgrp = NULL;
@@ -925,11 +928,6 @@ static inline int is_cgroup_event(struct perf_event *event)
return 0;
}
-static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
-{
- return 0;
-}
-
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
@@ -1253,26 +1251,31 @@ unclone_ctx(struct perf_event_context *ctx)
return parent_ctx;
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+ enum pid_type type)
{
+ u32 nr;
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
- return task_tgid_nr_ns(p, event->ns);
+ nr = __task_pid_nr_ns(p, type, event->ns);
+ /* avoid -1 if it is idle thread or runs in another ns */
+ if (!nr && !pid_alive(p))
+ nr = -1;
+ return nr;
}
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
+ return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
- return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ return perf_event_pid_type(event, p, PIDTYPE_PID);
}
/*
@@ -1456,6 +1459,13 @@ static enum event_type_t get_event_type(struct perf_event *event)
lockdep_assert_held(&ctx->lock);
+ /*
+ * It's 'group type', really, because if our group leader is
+ * pinned, so are we.
+ */
+ if (event->group_leader != event)
+ event = event->group_leader;
+
event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
if (!ctx->task)
event_type |= EVENT_CPU;
@@ -1567,6 +1577,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ size += sizeof(data->phys_addr);
+
event->header_size = size;
}
@@ -2214,6 +2227,33 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
+/*
+ * Complement to update_event_times(). This computes the tstamp_* values to
+ * continue 'enabled' state from @now, and effectively discards the time
+ * between the prior tstamp_stopped and now (as we were in the OFF state, or
+ * just switched (context) time base).
+ *
+ * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+ * cannot have been scheduled in yet. And going into INACTIVE state means
+ * '@event->tstamp_stopped = @now'.
+ *
+ * Thus given the rules of update_event_times():
+ *
+ * total_time_enabled = tstamp_stopped - tstamp_enabled
+ * total_time_running = tstamp_stopped - tstamp_running
+ *
+ * We can insert 'tstamp_stopped == now' and reverse them to compute new
+ * tstamp_* values.
+ */
+static void __perf_event_enable_time(struct perf_event *event, u64 now)
+{
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+
+ event->tstamp_stopped = now;
+ event->tstamp_enabled = now - event->total_time_enabled;
+ event->tstamp_running = now - event->total_time_running;
+}
+
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
@@ -2221,9 +2261,12 @@ static void add_event_to_ctx(struct perf_event *event,
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ /*
+ * We can be called with event->state == STATE_OFF when we create with
+ * .disabled = 1. In that case the IOC_ENABLE will call this function.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2468,10 +2511,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ __perf_event_enable_time(event, tstamp);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ /* XXX should not be > INACTIVE if event isn't */
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ __perf_event_enable_time(sub, tstamp);
}
}
@@ -3177,6 +3221,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
return;
perf_ctx_lock(cpuctx, ctx);
+ /*
+ * We must check ctx->nr_events while holding ctx->lock, such
+ * that we serialize against perf_install_in_context().
+ */
+ if (!ctx->nr_events)
+ goto unlock;
+
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
@@ -3190,6 +3241,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
+
+unlock:
perf_ctx_unlock(cpuctx, ctx);
}
@@ -3622,10 +3675,7 @@ unlock:
static inline u64 perf_event_count(struct perf_event *event)
{
- if (event->pmu->count)
- return event->pmu->count(event);
-
- return __perf_event_count(event);
+ return local64_read(&event->count) + atomic64_read(&event->child_count);
}
/*
@@ -3636,10 +3686,10 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-u64 perf_event_read_local(struct perf_event *event)
+int perf_event_read_local(struct perf_event *event, u64 *value)
{
unsigned long flags;
- u64 val;
+ int ret = 0;
/*
* Disabling interrupts avoids all counter scheduling (context
@@ -3647,25 +3697,28 @@ u64 perf_event_read_local(struct perf_event *event)
*/
local_irq_save(flags);
- /* If this is a per-task event, it must be for current */
- WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
- event->hw.target != current);
-
- /* If this is a per-CPU event, it must be for this CPU */
- WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id());
-
/*
* It must not be an event with inherit set, we cannot read
* all child counters from atomic context.
*/
- WARN_ON_ONCE(event->attr.inherit);
+ if (event->attr.inherit) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
- /*
- * It must not have a pmu::count method, those are not
- * NMI safe.
- */
- WARN_ON_ONCE(event->pmu->count);
+ /* If this is a per-task event, it must be for current */
+ if ((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ if (!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id()) {
+ ret = -EINVAL;
+ goto out;
+ }
/*
* If the event is currently on this CPU, its either a per-task event,
@@ -3675,10 +3728,11 @@ u64 perf_event_read_local(struct perf_event *event)
if (event->oncpu == smp_processor_id())
event->pmu->read(event);
- val = local64_read(&event->count);
+ *value = local64_read(&event->count);
+out:
local_irq_restore(flags);
- return val;
+ return ret;
}
static int perf_event_read(struct perf_event *event, bool group)
@@ -3812,14 +3866,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
- /*
- * We could be clever and allow to attach a event to an
- * offline CPU and activate it when the CPU comes up, but
- * that's for later.
- */
- if (!cpu_online(cpu))
- return ERR_PTR(-ENODEV);
-
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
@@ -4377,7 +4423,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
+ struct perf_event_context *ctx = leader->ctx;
struct perf_event *sub;
+ unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -4407,12 +4455,15 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
}
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return 0;
}
@@ -5077,7 +5128,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -5100,7 +5151,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event);
+ event->pmu->event_unmapped(event, vma->vm_mm);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
@@ -5398,7 +5449,7 @@ aux_unlock:
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
return ret;
}
@@ -5729,9 +5780,6 @@ static void perf_output_read_one(struct perf_output_handle *handle,
__output_copy(handle, values, n * sizeof(u64));
}
-/*
- * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
- */
static void perf_output_read_group(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
@@ -5776,6 +5824,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
PERF_FORMAT_TOTAL_TIME_RUNNING)
+/*
+ * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
+ *
+ * The problem is that its both hard and excessively expensive to iterate the
+ * child list, not to mention that its impossible to IPI the children running
+ * on another CPU, from interrupt/NMI context.
+ */
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
@@ -5955,6 +6010,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ perf_output_put(handle, data->phys_addr);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5970,6 +6028,38 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+static u64 perf_virt_to_phys(u64 virt)
+{
+ u64 phys_addr = 0;
+ struct page *p = NULL;
+
+ if (!virt)
+ return 0;
+
+ if (virt >= TASK_SIZE) {
+ /* If it's vmalloc()d memory, leave phys_addr as 0 */
+ if (virt_addr_valid((void *)(uintptr_t)virt) &&
+ !(virt >= VMALLOC_START && virt < VMALLOC_END))
+ phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+ } else {
+ /*
+ * Walking the pages tables for user address.
+ * Interrupts are disabled, so it prevents any tear down
+ * of the page tables.
+ * Try IRQ-safe __get_user_pages_fast first.
+ * If failed, leave phys_addr as 0.
+ */
+ if ((current->mm != NULL) &&
+ (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+ if (p)
+ put_page(p);
+ }
+
+ return phys_addr;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6088,6 +6178,9 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ data->phys_addr = perf_virt_to_phys(data->addr);
}
static void __always_inline
@@ -7239,6 +7332,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+void perf_event_itrace_started(struct perf_event *event)
+{
+ event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
static void perf_log_itrace_start(struct perf_event *event)
{
struct perf_output_handle handle;
@@ -7254,7 +7352,7 @@ static void perf_log_itrace_start(struct perf_event *event)
event = event->parent;
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
- event->hw.itrace_started)
+ event->attach_state & PERF_ATTACH_ITRACE)
return;
rec.header.type = PERF_RECORD_ITRACE_START;
@@ -7316,21 +7414,6 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
-static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
-{
- /*
- * Due to interrupt latency (AKA "skid"), we may enter the
- * kernel before taking an overflow, even if the PMU is only
- * counting user events.
- * To avoid leaking information to userspace, we must always
- * reject kernel samples when exclude_kernel is set.
- */
- if (event->attr.exclude_kernel && !user_mode(regs))
- return false;
-
- return true;
-}
-
/*
* Generic event overflow handling, sampling.
*/
@@ -7352,12 +7435,6 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
/*
- * For security, drop the skid kernel samples if necessary.
- */
- if (!sample_is_allowed(event, regs))
- return ret;
-
- /*
* XXX event_limit might not quite work as expected on inherited
* events
*/
@@ -7724,7 +7801,8 @@ static int swevent_hlist_get_cpu(int cpu)
int err = 0;
mutex_lock(&swhash->hlist_mutex);
- if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+ if (!swevent_hlist_deref(swhash) &&
+ cpumask_test_cpu(cpu, perf_online_mask)) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -7745,7 +7823,7 @@ static int swevent_hlist_get(void)
{
int err, cpu, failed_cpu;
- get_online_cpus();
+ mutex_lock(&pmus_lock);
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(cpu);
if (err) {
@@ -7753,8 +7831,7 @@ static int swevent_hlist_get(void)
goto fail;
}
}
- put_online_cpus();
-
+ mutex_unlock(&pmus_lock);
return 0;
fail:
for_each_possible_cpu(cpu) {
@@ -7762,8 +7839,7 @@ fail:
break;
swevent_hlist_put_cpu(cpu);
}
-
- put_online_cpus();
+ mutex_unlock(&pmus_lock);
return err;
}
@@ -7880,16 +7956,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task);
+ rctx, task, NULL);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task)
+ struct task_struct *task, struct perf_event *event)
{
struct perf_sample_data data;
- struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7903,9 +7978,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ /* Use the given event instead of the hlist */
+ if (event) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
+ } else {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
}
/*
@@ -8055,22 +8136,19 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
- bool is_kprobe, is_tracepoint;
+ bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
- if (event->attr.type == PERF_TYPE_HARDWARE ||
- event->attr.type == PERF_TYPE_SOFTWARE)
- return perf_event_set_bpf_handler(event, prog_fd);
-
if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
+ return perf_event_set_bpf_handler(event, prog_fd);
if (event->tp_event->prog)
return -EEXIST;
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
- if (!is_kprobe && !is_tracepoint)
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
@@ -8079,13 +8157,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
- (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
- if (is_tracepoint) {
+ if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
@@ -8094,6 +8173,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
}
}
event->tp_event->prog = prog;
+ event->tp_event->bpf_prog_owner = event;
return 0;
}
@@ -8108,7 +8188,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
return;
prog = event->tp_event->prog;
- if (prog) {
+ if (prog && event->tp_event->bpf_prog_owner == event) {
event->tp_event->prog = NULL;
bpf_prog_put(prog);
}
@@ -8877,6 +8957,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
static void free_pmu_context(struct pmu *pmu)
{
+ /*
+ * Static contexts such as perf_sw_context have a global lifetime
+ * and may be shared between different PMUs. Avoid freeing them
+ * when a single PMU is going away.
+ */
+ if (pmu->task_ctx_nr > perf_invalid_context)
+ return;
+
mutex_lock(&pmus_lock);
free_percpu(pmu->pmu_cpu_context);
mutex_unlock(&pmus_lock);
@@ -8941,7 +9029,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
pmu->hrtimer_interval_ms = timer;
/* update all cpuctx for this PMU */
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
@@ -8950,7 +9038,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
cpu_function_call(cpu,
(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
}
- put_online_cpus();
+ cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
return count;
@@ -9080,6 +9168,7 @@ skip_type:
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
__perf_mux_hrtimer_init(cpuctx, cpu);
}
@@ -9193,7 +9282,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
static struct pmu *perf_init_event(struct perf_event *event)
{
- struct pmu *pmu = NULL;
+ struct pmu *pmu;
int idx;
int ret;
@@ -9462,9 +9551,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * we currently do not support PERF_FORMAT_GROUP on inherited events
+ * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * See perf_output_read().
*/
- if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+ if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
goto err_ns;
if (!has_branch_stack(event))
@@ -9477,9 +9567,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
pmu = perf_init_event(event);
- if (!pmu)
- goto err_ns;
- else if (IS_ERR(pmu)) {
+ if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
goto err_ns;
}
@@ -9492,8 +9580,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
sizeof(unsigned long),
GFP_KERNEL);
- if (!event->addr_filters_offs)
+ if (!event->addr_filters_offs) {
+ err = -ENOMEM;
goto err_per_task;
+ }
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
@@ -9587,6 +9677,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
+ attr->size = size;
+
if (attr->__reserved_1)
return -EINVAL;
@@ -9859,6 +9951,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* Only privileged users can get physical addresses */
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+ perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
if (!attr.sample_max_stack)
attr.sample_max_stack = sysctl_perf_event_max_stack;
@@ -9903,12 +10000,10 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
- get_online_cpus();
-
if (task) {
err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
if (err)
- goto err_cpus;
+ goto err_task;
/*
* Reuse ptrace permission checks for now.
@@ -10010,28 +10105,27 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow to attach to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
@@ -10094,6 +10188,23 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_locked;
}
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx =
+ container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
+ }
+
+
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
* because we need to serialize with concurrent event creation.
@@ -10183,8 +10294,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_task_struct(task);
}
- put_online_cpus();
-
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
@@ -10218,8 +10327,6 @@ err_alloc:
err_cred:
if (task)
mutex_unlock(&task->signal->cred_guard_mutex);
-err_cpus:
- put_online_cpus();
err_task:
if (task)
put_task_struct(task);
@@ -10274,6 +10381,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx =
+ container_of(ctx, struct perf_cpu_context, ctx);
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_unlock;
+ }
+ }
+
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
goto err_unlock;
@@ -10941,6 +11063,8 @@ static void __init perf_event_init_all_cpus(void)
struct swevent_htable *swhash;
int cpu;
+ zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
@@ -10956,7 +11080,7 @@ static void __init perf_event_init_all_cpus(void)
}
}
-int perf_event_init_cpu(unsigned int cpu)
+void perf_swevent_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -10969,7 +11093,6 @@ int perf_event_init_cpu(unsigned int cpu)
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
mutex_unlock(&swhash->hlist_mutex);
- return 0;
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10987,19 +11110,22 @@ static void __perf_event_exit_context(void *__info)
static void perf_event_exit_cpu_context(int cpu)
{
+ struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
struct pmu *pmu;
- int idx;
- idx = srcu_read_lock(&pmus_srcu);
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ mutex_lock(&pmus_lock);
+ list_for_each_entry(pmu, &pmus, entry) {
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
mutex_lock(&ctx->mutex);
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
}
- srcu_read_unlock(&pmus_srcu, idx);
+ cpumask_clear_cpu(cpu, perf_online_mask);
+ mutex_unlock(&pmus_lock);
}
#else
@@ -11007,6 +11133,29 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif
+int perf_event_init_cpu(unsigned int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+
+ perf_swevent_init_cpu(cpu);
+
+ mutex_lock(&pmus_lock);
+ cpumask_set_cpu(cpu, perf_online_mask);
+ list_for_each_entry(pmu, &pmus, entry) {
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
+ }
+ mutex_unlock(&pmus_lock);
+
+ return 0;
+}
+
int perf_event_exit_cpu(unsigned int cpu)
{
perf_event_exit_cpu_context(cpu);
@@ -11155,5 +11304,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
* controller is not mounted on a legacy hierarchy.
*/
.implicit_on_dfl = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */