Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 211 |
1 file changed, 152 insertions, 59 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 426c2ffba16d..3e691b75b2db 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx)
 	return parent_ctx;
 }
 
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+				enum pid_type type)
 {
+	u32 nr;
 	/*
 	 * only top level events have the pid namespace they were created in
 	 */
 	if (event->parent)
 		event = event->parent;
 
-	return task_tgid_nr_ns(p, event->ns);
+	nr = __task_pid_nr_ns(p, type, event->ns);
+	/* avoid -1 if it is idle thread or runs in another ns */
+	if (!nr && !pid_alive(p))
+		nr = -1;
+	return nr;
 }
 
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 {
-	/*
-	 * only top level events have the pid namespace they were created in
-	 */
-	if (event->parent)
-		event = event->parent;
+	return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
 
-	return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+	return perf_event_pid_type(event, p, PIDTYPE_PID);
 }
 
 /*
@@ -1570,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		size += sizeof(data->phys_addr);
+
 	event->header_size = size;
 }
 
@@ -2217,6 +2225,33 @@ static int group_can_go_on(struct perf_event *event,
 	return can_add_hw;
 }
 
+/*
+ * Complement to update_event_times(). This computes the tstamp_* values to
+ * continue 'enabled' state from @now, and effectively discards the time
+ * between the prior tstamp_stopped and now (as we were in the OFF state, or
+ * just switched (context) time base).
+ *
+ * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+ * cannot have been scheduled in yet. And going into INACTIVE state means
+ * '@event->tstamp_stopped = @now'.
+ *
+ * Thus given the rules of update_event_times():
+ *
+ *   total_time_enabled = tstamp_stopped - tstamp_enabled
+ *   total_time_running = tstamp_stopped - tstamp_running
+ *
+ * We can insert 'tstamp_stopped == now' and reverse them to compute new
+ * tstamp_* values.
+ */
+static void __perf_event_enable_time(struct perf_event *event, u64 now)
+{
+	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+
+	event->tstamp_stopped = now;
+	event->tstamp_enabled = now - event->total_time_enabled;
+	event->tstamp_running = now - event->total_time_running;
+}
+
 static void add_event_to_ctx(struct perf_event *event,
 			     struct perf_event_context *ctx)
 {
@@ -2224,9 +2259,12 @@ static void add_event_to_ctx(struct perf_event *event,
 
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	event->tstamp_enabled = tstamp;
-	event->tstamp_running = tstamp;
-	event->tstamp_stopped = tstamp;
+	/*
+	 * We can be called with event->state == STATE_OFF when we create with
+	 * .disabled = 1. In that case the IOC_ENABLE will call this function.
+	 */
+	if (event->state == PERF_EVENT_STATE_INACTIVE)
+		__perf_event_enable_time(event, tstamp);
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2471,10 +2509,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)
 	u64 tstamp = perf_event_time(event);
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
-	event->tstamp_enabled = tstamp - event->total_time_enabled;
+	__perf_event_enable_time(event, tstamp);
 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		/* XXX should not be > INACTIVE if event isn't */
 		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+			__perf_event_enable_time(sub, tstamp);
 	}
 }
 
@@ -3180,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		return;
 
 	perf_ctx_lock(cpuctx, ctx);
+	/*
+	 * We must check ctx->nr_events while holding ctx->lock, such
+	 * that we serialize against perf_install_in_context().
+	 */
+	if (!ctx->nr_events)
+		goto unlock;
+
 	perf_pmu_disable(ctx->pmu);
 	/*
 	 * We want to keep the following priority order:
@@ -3193,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 	perf_pmu_enable(ctx->pmu);
+
+unlock:
 	perf_ctx_unlock(cpuctx, ctx);
 }
 
@@ -3625,10 +3673,7 @@ unlock:
 
 static inline u64 perf_event_count(struct perf_event *event)
 {
-	if (event->pmu->count)
-		return event->pmu->count(event);
-
-	return __perf_event_count(event);
+	return local64_read(&event->count) + atomic64_read(&event->child_count);
 }
 
 /*
@@ -3659,15 +3704,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
 		goto out;
 	}
 
-	/*
-	 * It must not have a pmu::count method, those are not
-	 * NMI safe.
-	 */
-	if (event->pmu->count) {
-		ret = -EOPNOTSUPP;
-		goto out;
-	}
-
 	/* If this is a per-task event, it must be for current */
 	if ((event->attach_state & PERF_ATTACH_TASK) &&
 	    event->hw.target != current) {
@@ -5090,7 +5126,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 		atomic_inc(&event->rb->aux_mmap_count);
 
 	if (event->pmu->event_mapped)
-		event->pmu->event_mapped(event);
+		event->pmu->event_mapped(event, vma->vm_mm);
 }
 
 static void perf_pmu_output_stop(struct perf_event *event);
@@ -5113,7 +5149,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	unsigned long size = perf_data_size(rb);
 
 	if (event->pmu->event_unmapped)
-		event->pmu->event_unmapped(event);
+		event->pmu->event_unmapped(event, vma->vm_mm);
 
 	/*
 	 * rb->aux_mmap_count will always drop before rb->mmap_count and
@@ -5411,7 +5447,7 @@ aux_unlock:
 	vma->vm_ops = &perf_mmap_vmops;
 
 	if (event->pmu->event_mapped)
-		event->pmu->event_mapped(event);
+		event->pmu->event_mapped(event, vma->vm_mm);
 
 	return ret;
 }
@@ -5972,6 +6008,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		perf_output_put(handle, data->phys_addr);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -5987,6 +6026,38 @@ void perf_output_sample(struct perf_output_handle *handle,
 	}
 }
 
+static u64 perf_virt_to_phys(u64 virt)
+{
+	u64 phys_addr = 0;
+	struct page *p = NULL;
+
+	if (!virt)
+		return 0;
+
+	if (virt >= TASK_SIZE) {
+		/* If it's vmalloc()d memory, leave phys_addr as 0 */
+		if (virt_addr_valid((void *)(uintptr_t)virt) &&
+		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
+			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+	} else {
+		/*
+		 * Walking the pages tables for user address.
+		 * Interrupts are disabled, so it prevents any tear down
+		 * of the page tables.
+		 * Try IRQ-safe __get_user_pages_fast first.
+		 * If failed, leave phys_addr as 0.
+		 */
+		if ((current->mm != NULL) &&
+		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+		if (p)
+			put_page(p);
+	}
+
+	return phys_addr;
+}
+
 void perf_prepare_sample(struct perf_event_header *header,
 			 struct perf_sample_data *data,
 			 struct perf_event *event,
@@ -6105,6 +6176,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		data->phys_addr = perf_virt_to_phys(data->addr);
 }
 
 static void __always_inline
@@ -7256,6 +7330,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+void perf_event_itrace_started(struct perf_event *event)
+{
+	event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
 static void perf_log_itrace_start(struct perf_event *event)
 {
 	struct perf_output_handle handle;
@@ -7271,7 +7350,7 @@ static void perf_log_itrace_start(struct perf_event *event)
 		event = event->parent;
 
 	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
-	    event->hw.itrace_started)
+	    event->attach_state & PERF_ATTACH_ITRACE)
 		return;
 
 	rec.header.type = PERF_RECORD_ITRACE_START;
@@ -7875,16 +7954,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 		}
 	}
 	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
-		      rctx, task);
+		      rctx, task, NULL);
 }
 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
 
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
-		   struct task_struct *task)
+		   struct task_struct *task, struct perf_event *event)
 {
 	struct perf_sample_data data;
-	struct perf_event *event;
 
 	struct perf_raw_record raw = {
 		.frag = {
@@ -7898,9 +7976,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 
 	perf_trace_buf_update(record, event_type);
 
-	hlist_for_each_entry_rcu(event, head, hlist_entry) {
+	/* Use the given event instead of the hlist */
+	if (event) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
+	} else {
+		hlist_for_each_entry_rcu(event, head, hlist_entry) {
+			if (perf_tp_event_match(event, &data, regs))
+				perf_swevent_event(event, count, &data, regs);
+		}
 	}
 
 	/*
@@ -8050,7 +8134,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
 
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
-	bool is_kprobe, is_tracepoint;
+	bool is_kprobe, is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -8061,7 +8145,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 
 	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
-	if (!is_kprobe && !is_tracepoint)
+	is_syscall_tp = is_syscall_trace_event(event->tp_event);
+	if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
 		/* bpf programs can only be attached to u/kprobe or tracepoint */
 		return -EINVAL;
 
@@ -8070,13 +8155,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return PTR_ERR(prog);
 
 	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
-	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+	    (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 		/* valid fd, but invalid bpf program type */
 		bpf_prog_put(prog);
 		return -EINVAL;
 	}
 
-	if (is_tracepoint) {
+	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
 		if (prog->aux->max_ctx_offset > off) {
@@ -9580,6 +9666,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	if (ret)
 		return -EFAULT;
 
+	attr->size = size;
+
 	if (attr->__reserved_1)
 		return -EINVAL;
 
@@ -9852,6 +9940,11 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* Only privileged users can get physical addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	if (!attr.sample_max_stack)
 		attr.sample_max_stack = sysctl_perf_event_max_stack;
 
@@ -10001,28 +10094,27 @@ SYSCALL_DEFINE5(perf_event_open,
 			goto err_context;
 
 		/*
-		 * Do not allow to attach to a group in a different
-		 * task or CPU context:
+		 * Make sure we're both events for the same CPU;
+		 * grouping events for different CPUs is broken; since
+		 * you can never concurrently schedule them anyhow.
 		 */
-		if (move_group) {
-			/*
-			 * Make sure we're both on the same task, or both
-			 * per-cpu events.
-			 */
-			if (group_leader->ctx->task != ctx->task)
-				goto err_context;
+		if (group_leader->cpu != event->cpu)
+			goto err_context;
 
-			/*
-			 * Make sure we're both events for the same CPU;
-			 * grouping events for different CPUs is broken; since
-			 * you can never concurrently schedule them anyhow.
-			 */
-			if (group_leader->cpu != event->cpu)
-				goto err_context;
-		} else {
-			if (group_leader->ctx != ctx)
-				goto err_context;
-		}
+		/*
+		 * Make sure we're both on the same task, or both
+		 * per-CPU events.
+		 */
+		if (group_leader->ctx->task != ctx->task)
+			goto err_context;
+
+		/*
+		 * Do not allow to attach to a group in a different task
+		 * or CPU context. If we're moving SW events, we'll fix
+		 * this up later, so allow that.
+		 */
+		if (!move_group && group_leader->ctx != ctx)
+			goto err_context;
 
 		/*
 		 * Only a group leader can be exclusive or pinned
@@ -11201,5 +11293,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
 	 * controller is not mounted on a legacy hierarchy.
	 */
 	.implicit_on_dfl = true,
+	.threaded	= true,
 };
 #endif /* CONFIG_CGROUP_PERF */
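
Note: the PERF_SAMPLE_PHYS_ADDR plumbing above is driven from userspace through perf_event_attr.sample_type. The snippet below is a minimal, illustrative sketch only, not part of this commit; it assumes uapi headers new enough to define PERF_SAMPLE_PHYS_ADDR, and the hardware event chosen is a placeholder (meaningful addresses normally come from an arch-specific memory-sampling event).

/*
 * Sketch: open an event whose samples carry data->addr and data->phys_addr.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;		/* placeholder event */
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
			   PERF_SAMPLE_PHYS_ADDR;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	/*
	 * Per the new check in perf_event_open(): with perf_event_paranoid
	 * above 1, PERF_SAMPLE_PHYS_ADDR additionally requires CAP_SYS_ADMIN
	 * and the call fails with EACCES.
	 */
	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}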
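
Note: the invariant stated in the __perf_event_enable_time() comment can be checked in isolation. The following is a standalone model of just that arithmetic (hypothetical struct and function names, not kernel code): picking tstamp_enabled = now - total_time_enabled on re-enable makes a later update_event_times()-style recomputation reproduce the accumulated totals, so the OFF period is effectively discarded.

/* Standalone model of the tstamp bookkeeping described above. */
#include <assert.h>
#include <stdint.h>

struct model_event {
	uint64_t tstamp_enabled;
	uint64_t tstamp_running;
	uint64_t tstamp_stopped;
	uint64_t total_time_enabled;
	uint64_t total_time_running;
};

/* Same computation as __perf_event_enable_time() in the patch. */
static void enable_time(struct model_event *e, uint64_t now)
{
	e->tstamp_stopped = now;
	e->tstamp_enabled = now - e->total_time_enabled;
	e->tstamp_running = now - e->total_time_running;
}

/* The rules quoted from update_event_times() in the comment. */
static void update_times(struct model_event *e, uint64_t now)
{
	e->tstamp_stopped = now;
	e->total_time_enabled = e->tstamp_stopped - e->tstamp_enabled;
	e->total_time_running = e->tstamp_stopped - e->tstamp_running;
}

int main(void)
{
	struct model_event e = {
		.total_time_enabled = 100,	/* accumulated before going OFF */
		.total_time_running = 60,
	};

	enable_time(&e, 1000);			/* re-enabled at t=1000 */

	/* Recomputing immediately reproduces the old totals. */
	update_times(&e, 1000);
	assert(e.total_time_enabled == 100);
	assert(e.total_time_running == 60);

	/* 250 more time units, assuming the event stayed scheduled in. */
	update_times(&e, 1250);
	assert(e.total_time_enabled == 350);
	assert(e.total_time_running == 310);
	return 0;
}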
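
Note: with the perf_event_set_bpf_prog() change above, a BPF_PROG_TYPE_TRACEPOINT program can also be attached to a syscall tracepoint; the userspace attach path is the existing PERF_EVENT_IOC_SET_BPF ioctl. A hedged fragment for illustration: prog_fd is assumed to come from an earlier bpf(BPF_PROG_LOAD, ...), and the tracepoint id is assumed to be read from the usual tracefs 'id' file (e.g. events/syscalls/sys_enter_openat/id).

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static int open_syscall_tp_event(int tp_id)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.config = tp_id;		/* id read from tracefs */
	attr.sample_period = 1;
	attr.wakeup_events = 1;

	return syscall(__NR_perf_event_open, &attr, -1 /* all tasks */,
		       0 /* cpu 0 */, -1, 0);
}

static int attach_bpf(int event_fd, int prog_fd)
{
	/*
	 * With this patch the kernel accepts TRACEPOINT-type programs on
	 * syscall tracepoints too; previously this returned -EINVAL.
	 */
	if (ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) {
		perror("PERF_EVENT_IOC_SET_BPF");
		return -1;
	}
	return ioctl(event_fd, PERF_EVENT_IOC_ENABLE, 0);
}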