summary refs log tree commit diff stats
path: root/kernel/events
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events')
-rw-r--r-- kernel/events/callchain.c 38
-rw-r--r-- kernel/events/core.c 278
-rw-r--r-- kernel/events/hw_breakpoint.c 11
-rw-r--r-- kernel/events/internal.h 82
-rw-r--r-- kernel/events/ring_buffer.c 10
-rw-r--r-- kernel/events/uprobes.c 248
6 files changed, 469 insertions, 198 deletions
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 98d4597f43d6..c77206184b8b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
int rctx;
struct perf_callchain_entry *entry;
+ int kernel = !event->attr.exclude_callchain_kernel;
+ int user = !event->attr.exclude_callchain_user;
+
+ if (!kernel && !user)
+ return NULL;
entry = get_callchain_entry(&rctx);
if (rctx == -1)
@@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
entry->nr = 0;
- if (!user_mode(regs)) {
+ if (kernel && !user_mode(regs)) {
perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
perf_callchain_kernel(entry, regs);
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
}
- if (regs) {
- /*
- * Disallow cross-task user callchains.
- */
- if (event->ctx->task && event->ctx->task != current)
- goto exit_put;
-
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_user(entry, regs);
+ if (user) {
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ /*
+ * Disallow cross-task user callchains.
+ */
+ if (event->ctx->task && event->ctx->task != current)
+ goto exit_put;
+
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
}
exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f18a0a56e5aa..deec4e50eb30 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,7 @@
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/mm_types.h>
#include "internal.h"
@@ -1253,7 +1254,7 @@ retry:
/*
* Cross CPU call to disable a performance event
*/
-static int __perf_event_disable(void *info)
+int __perf_event_disable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -2935,12 +2936,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
* Called when the last reference to the file is gone.
*/
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
{
- struct perf_event *event = file->private_data;
struct task_struct *owner;
- file->private_data = NULL;
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
@@ -2975,7 +2976,13 @@ static int perf_release(struct inode *inode, struct file *file)
put_task_struct(owner);
}
- return perf_event_release_kernel(event);
+ perf_event_release_kernel(event);
+}
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+ put_event(file->private_data);
+ return 0;
}
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3227,7 +3234,7 @@ unlock:
static const struct file_operations perf_fops;
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static struct file *perf_fget_light(int fd, int *fput_needed)
{
struct file *file;
@@ -3241,7 +3248,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed)
return ERR_PTR(-EBADF);
}
- return file->private_data;
+ return file;
}
static int perf_event_set_output(struct perf_event *event,
@@ -3273,19 +3280,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_SET_OUTPUT:
{
+ struct file *output_file = NULL;
struct perf_event *output_event = NULL;
int fput_needed = 0;
int ret;
if (arg != -1) {
- output_event = perf_fget_light(arg, &fput_needed);
- if (IS_ERR(output_event))
- return PTR_ERR(output_event);
+ output_file = perf_fget_light(arg, &fput_needed);
+ if (IS_ERR(output_file))
+ return PTR_ERR(output_file);
+ output_event = output_file->private_data;
}
ret = perf_event_set_output(event, output_event);
if (output_event)
- fput_light(output_event->filp, fput_needed);
+ fput_light(output_file, fput_needed);
return ret;
}
@@ -3756,6 +3765,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+static void
+perf_output_sample_regs(struct perf_output_handle *handle,
+ struct pt_regs *regs, u64 mask)
+{
+ int bit;
+
+ for_each_set_bit(bit, (const unsigned long *) &mask,
+ sizeof(mask) * BITS_PER_BYTE) {
+ u64 val;
+
+ val = perf_reg_value(regs, bit);
+ perf_output_put(handle, val);
+ }
+}
+
+static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+ struct pt_regs *regs)
+{
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ regs_user->regs = regs;
+ regs_user->abi = perf_reg_abi(current);
+ }
+}
+
+/*
+ * Get remaining task size from user stack pointer.
+ *
+ * It'd be better to take stack vma map and limit this more
+ * precisely, but there's no way to get it safely under interrupt,
+ * so using TASK_SIZE as limit.
+ */
+static u64 perf_ustack_task_size(struct pt_regs *regs)
+{
+ unsigned long addr = perf_user_stack_pointer(regs);
+
+ if (!addr || addr >= TASK_SIZE)
+ return 0;
+
+ return TASK_SIZE - addr;
+}
+
+static u16
+perf_sample_ustack_size(u16 stack_size, u16 header_size,
+ struct pt_regs *regs)
+{
+ u64 task_size;
+
+ /* No regs, no stack pointer, no dump. */
+ if (!regs)
+ return 0;
+
+ /*
+ * Check if we fit in with the requested stack size into the:
+ * - TASK_SIZE
+ * If we don't, we limit the size to the TASK_SIZE.
+ *
+ * - remaining sample size
+ * If we don't, we customize the stack size to
+ * fit in to the remaining sample size.
+ */
+
+ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
+ stack_size = min(stack_size, (u16) task_size);
+
+ /* Current header size plus static size and dynamic size. */
+ header_size += 2 * sizeof(u64);
+
+ /* Do we fit in with the current stack dump size? */
+ if ((u16) (header_size + stack_size) < header_size) {
+ /*
+ * If we overflow the maximum size for the sample,
+ * we customize the stack dump size to fit in.
+ */
+ stack_size = USHRT_MAX - header_size - sizeof(u64);
+ stack_size = round_up(stack_size, sizeof(u64));
+ }
+
+ return stack_size;
+}
+
+static void
+perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
+ struct pt_regs *regs)
+{
+ /* Case of a kernel thread, nothing to dump */
+ if (!regs) {
+ u64 size = 0;
+ perf_output_put(handle, size);
+ } else {
+ unsigned long sp;
+ unsigned int rem;
+ u64 dyn_size;
+
+ /*
+ * We dump:
+ * static size
+ * - the size requested by user or the best one we can fit
+ * in to the sample max size
+ * data
+ * - user stack dump data
+ * dynamic size
+ * - the actual dumped size
+ */
+
+ /* Static size. */
+ perf_output_put(handle, dump_size);
+
+ /* Data. */
+ sp = perf_user_stack_pointer(regs);
+ rem = __output_copy_user(handle, (void *) sp, dump_size);
+ dyn_size = dump_size - rem;
+
+ perf_output_skip(handle, rem);
+
+ /* Dynamic size. */
+ perf_output_put(handle, dyn_size);
+ }
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -4016,6 +4151,28 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, nr);
}
}
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ u64 abi = data->regs_user.abi;
+
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_user;
+ perf_output_sample_regs(handle,
+ data->regs_user.regs,
+ mask);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER)
+ perf_output_sample_ustack(handle,
+ data->stack_user_size,
+ data->regs_user.regs);
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -4067,6 +4224,49 @@ void perf_prepare_sample(struct perf_event_header *header,
}
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_user(&data->regs_user, regs);
+
+ if (data->regs_user.regs) {
+ u64 mask = event->attr.sample_regs_user;
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ /*
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
+ * processed as the last one or have additional check added
+ * in case new sample type is added, because we could eat
+ * up the rest of the sample size.
+ */
+ struct perf_regs_user *uregs = &data->regs_user;
+ u16 stack_size = event->attr.sample_stack_user;
+ u16 size = sizeof(u64);
+
+ if (!uregs->abi)
+ perf_sample_regs_user(uregs, regs);
+
+ stack_size = perf_sample_ustack_size(stack_size, header->size,
+ uregs->regs);
+
+ /*
+ * If there is something to dump, add space for the dump
+ * itself and for the field that tells the dynamic size,
+ * which is how many have been actually dumped.
+ */
+ if (stack_size)
+ size += sizeof(u64) + stack_size;
+
+ data->stack_user_size = stack_size;
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -5950,6 +6150,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
mutex_init(&event->mmap_mutex);
+ atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
event->attr = *attr;
event->group_leader = group_leader;
@@ -6142,6 +6343,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->branch_sample_type = mask;
}
}
+
+ if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
+ ret = perf_reg_validate(attr->sample_regs_user);
+ if (ret)
+ return ret;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
+ if (!arch_perf_have_user_stack_dump())
+ return -ENOSYS;
+
+ /*
+ * We have __u32 type for the size, but so far
+ * we can only use __u16 as maximum due to the
+ * __u16 sample size limit.
+ */
+ if (attr->sample_stack_user >= USHRT_MAX)
+ ret = -EINVAL;
+ else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
+ ret = -EINVAL;
+ }
+
out:
return ret;
@@ -6260,12 +6483,12 @@ SYSCALL_DEFINE5(perf_event_open,
return event_fd;
if (group_fd != -1) {
- group_leader = perf_fget_light(group_fd, &fput_needed);
- if (IS_ERR(group_leader)) {
- err = PTR_ERR(group_leader);
+ group_file = perf_fget_light(group_fd, &fput_needed);
+ if (IS_ERR(group_file)) {
+ err = PTR_ERR(group_file);
goto err_fd;
}
- group_file = group_leader->filp;
+ group_leader = group_file->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6402,7 +6625,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_ctx(gctx);
}
- event->filp = event_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
@@ -6496,7 +6718,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_free;
}
- event->filp = NULL;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
@@ -6578,7 +6799,7 @@ static void sync_child_event(struct perf_event *child_event,
* Release the parent event, if this was the last
* reference to it.
*/
- fput(parent_event->filp);
+ put_event(parent_event);
}
static void
@@ -6654,9 +6875,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*
* __perf_event_exit_task()
* sync_child_event()
- * fput(parent_event->filp)
- * perf_release()
- * mutex_lock(&ctx->mutex)
+ * put_event()
+ * mutex_lock(&ctx->mutex)
*
* But since its the parent context it won't be the same instance.
*/
@@ -6724,7 +6944,7 @@ static void perf_free_event(struct perf_event *event,
list_del_init(&event->child_list);
mutex_unlock(&parent->child_mutex);
- fput(parent->filp);
+ put_event(parent);
perf_group_detach(event);
list_del_event(event, ctx);
@@ -6804,6 +7024,12 @@ inherit_event(struct perf_event *parent_event,
NULL, NULL);
if (IS_ERR(child_event))
return child_event;
+
+ if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ free_event(child_event);
+ return NULL;
+ }
+
get_ctx(child_ctx);
/*
@@ -6845,14 +7071,6 @@ inherit_event(struct perf_event *parent_event,
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
- * Get a reference to the parent filp - we will fput it
- * when the child event exits. This is safe to do because
- * we are in the parent and we know that the filp still
- * exists and has a nonzero count:
- */
- atomic_long_inc(&parent_event->filp->f_count);
-
- /*
* Link this into the parent event's child list
*/
WARN_ON_ONCE(parent_event->ctx->parent_ctx);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bb38c4d3ee12..9a7b487c6fe2 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
int old_type = bp->attr.bp_type;
int err = 0;
- perf_event_disable(bp);
+ /*
+ * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+ * will not be possible to raise IPIs that invoke __perf_event_disable.
+ * So call the function directly after making sure we are targeting the
+ * current task.
+ */
+ if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+ __perf_event_disable(bp);
+ else
+ perf_event_disable(bp);
bp->attr.bp_addr = attr->bp_addr;
bp->attr.bp_type = attr->bp_type;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index a096c19f2c2a..d56a64c99a8b 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -2,6 +2,7 @@
#define _KERNEL_EVENTS_INTERNAL_H
#include <linux/hardirq.h>
+#include <linux/uaccess.h>
/* Buffer handling */
@@ -76,30 +77,53 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
-static inline void
-__output_copy(struct perf_output_handle *handle,
- const void *buf, unsigned int len)
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned int \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned int len) \
+{ \
+ unsigned long size, written; \
+ \
+ do { \
+ size = min_t(unsigned long, handle->size, len); \
+ \
+ written = memcpy_func(handle->addr, buf, size); \
+ \
+ len -= written; \
+ handle->addr += written; \
+ buf += written; \
+ handle->size -= written; \
+ if (!handle->size) { \
+ struct ring_buffer *rb = handle->rb; \
+ \
+ handle->page++; \
+ handle->page &= rb->nr_pages - 1; \
+ handle->addr = rb->data_pages[handle->page]; \
+ handle->size = PAGE_SIZE << page_order(rb); \
+ } \
+ } while (len && written == size); \
+ \
+ return len; \
+}
+
+static inline int memcpy_common(void *dst, const void *src, size_t n)
{
- do {
- unsigned long size = min_t(unsigned long, handle->size, len);
-
- memcpy(handle->addr, buf, size);
-
- len -= size;
- handle->addr += size;
- buf += size;
- handle->size -= size;
- if (!handle->size) {
- struct ring_buffer *rb = handle->rb;
-
- handle->page++;
- handle->page &= rb->nr_pages - 1;
- handle->addr = rb->data_pages[handle->page];
- handle->size = PAGE_SIZE << page_order(rb);
- }
- } while (len);
+ memcpy(dst, src, n);
+ return n;
}
+DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
+
+#define MEMCPY_SKIP(dst, src, n) (n)
+
+DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+
+#ifndef arch_perf_out_copy_user
+#define arch_perf_out_copy_user __copy_from_user_inatomic
+#endif
+
+DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
+
/* Callchain handling */
extern struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs);
@@ -134,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx)
recursion[rctx]--;
}
+#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return true;
+}
+
+#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
+#else
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return false;
+}
+
+#define perf_user_stack_pointer(regs) 0
+#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6ddaba43fb7a..23cb34ff3973 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -182,10 +182,16 @@ out:
return -ENOSPC;
}
-void perf_output_copy(struct perf_output_handle *handle,
+unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
- __output_copy(handle, buf, len);
+ return __output_copy(handle, buf, len);
+}
+
+unsigned int perf_output_skip(struct perf_output_handle *handle,
+ unsigned int len)
+{
+ return __output_skip(handle, NULL, len);
}
void perf_output_end(struct perf_output_handle *handle)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c08a22d02f72..912ef48d28ab 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -280,12 +280,10 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
if (ret <= 0)
return ret;
- lock_page(page);
vaddr_new = kmap_atomic(page);
vaddr &= ~PAGE_MASK;
memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
kunmap_atomic(vaddr_new);
- unlock_page(page);
put_page(page);
@@ -334,7 +332,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
*/
result = is_swbp_at_addr(mm, vaddr);
if (result == 1)
- return -EEXIST;
+ return 0;
if (result)
return result;
@@ -347,24 +345,22 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
* @mm: the probed process address space.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
- * @verify: if true, verify existance of breakpoint instruction.
*
* For mm @mm, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify)
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- if (verify) {
- int result;
+ int result;
+
+ result = is_swbp_at_addr(mm, vaddr);
+ if (!result)
+ return -EINVAL;
- result = is_swbp_at_addr(mm, vaddr);
- if (!result)
- return -EINVAL;
+ if (result != 1)
+ return result;
- if (result != 1)
- return result;
- }
return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
}
@@ -415,11 +411,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
struct uprobe *uprobe;
- unsigned long flags;
- spin_lock_irqsave(&uprobes_treelock, flags);
+ spin_lock(&uprobes_treelock);
uprobe = __find_uprobe(inode, offset);
- spin_unlock_irqrestore(&uprobes_treelock, flags);
+ spin_unlock(&uprobes_treelock);
return uprobe;
}
@@ -466,12 +461,11 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
- unsigned long flags;
struct uprobe *u;
- spin_lock_irqsave(&uprobes_treelock, flags);
+ spin_lock(&uprobes_treelock);
u = __insert_uprobe(uprobe);
- spin_unlock_irqrestore(&uprobes_treelock, flags);
+ spin_unlock(&uprobes_treelock);
/* For now assume that the instruction need not be single-stepped */
uprobe->flags |= UPROBE_SKIP_SSTEP;
@@ -649,6 +643,7 @@ static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long vaddr)
{
+ bool first_uprobe;
int ret;
/*
@@ -659,7 +654,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
* Hence behave as if probe already existed.
*/
if (!uprobe->consumers)
- return -EEXIST;
+ return 0;
if (!(uprobe->flags & UPROBE_COPY_INSN)) {
ret = copy_insn(uprobe, vma->vm_file);
@@ -681,17 +676,18 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
}
/*
- * Ideally, should be updating the probe count after the breakpoint
- * has been successfully inserted. However a thread could hit the
- * breakpoint we just inserted even before the probe count is
- * incremented. If this is the first breakpoint placed, breakpoint
- * notifier might ignore uprobes and pass the trap to the thread.
- * Hence increment before and decrement on failure.
+ * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
+ * the task can hit this breakpoint right after __replace_page().
*/
- atomic_inc(&mm->uprobes_state.count);
+ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ if (first_uprobe)
+ set_bit(MMF_HAS_UPROBES, &mm->flags);
+
ret = set_swbp(&uprobe->arch, mm, vaddr);
- if (ret)
- atomic_dec(&mm->uprobes_state.count);
+ if (!ret)
+ clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ else if (first_uprobe)
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
return ret;
}
@@ -699,8 +695,12 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
static void
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
- if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
- atomic_dec(&mm->uprobes_state.count);
+ /* can happen if uprobe_register() fails */
+ if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
+ return;
+
+ set_bit(MMF_RECALC_UPROBES, &mm->flags);
+ set_orig_insn(&uprobe->arch, mm, vaddr);
}
/*
@@ -710,11 +710,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
*/
static void delete_uprobe(struct uprobe *uprobe)
{
- unsigned long flags;
-
- spin_lock_irqsave(&uprobes_treelock, flags);
+ spin_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
- spin_unlock_irqrestore(&uprobes_treelock, flags);
+ spin_unlock(&uprobes_treelock);
iput(uprobe->inode);
put_uprobe(uprobe);
atomic_dec(&uprobe_events);
@@ -831,17 +829,11 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
- if (is_register) {
+ if (is_register)
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
- /*
- * We can race against uprobe_mmap(), see the
- * comment near uprobe_hash().
- */
- if (err == -EEXIST)
- err = 0;
- } else {
+ else
remove_breakpoint(uprobe, mm, info->vaddr);
- }
+
unlock:
up_write(&mm->mmap_sem);
free:
@@ -908,7 +900,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
}
mutex_unlock(uprobes_hash(inode));
- put_uprobe(uprobe);
+ if (uprobe)
+ put_uprobe(uprobe);
return ret;
}
@@ -978,7 +971,6 @@ static void build_probe_list(struct inode *inode,
struct list_head *head)
{
loff_t min, max;
- unsigned long flags;
struct rb_node *n, *t;
struct uprobe *u;
@@ -986,7 +978,7 @@ static void build_probe_list(struct inode *inode,
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
- spin_lock_irqsave(&uprobes_treelock, flags);
+ spin_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
if (n) {
for (t = n; t; t = rb_prev(t)) {
@@ -1004,27 +996,20 @@ static void build_probe_list(struct inode *inode,
atomic_inc(&u->ref);
}
}
- spin_unlock_irqrestore(&uprobes_treelock, flags);
+ spin_unlock(&uprobes_treelock);
}
/*
- * Called from mmap_region.
- * called with mm->mmap_sem acquired.
+ * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
*
- * Return -ve no if we fail to insert probes and we cannot
- * bail-out.
- * Return 0 otherwise. i.e:
- *
- * - successful insertion of probes
- * - (or) no possible probes to be inserted.
- * - (or) insertion of probes failed but we can bail-out.
+ * Currently we ignore all errors and always return 0, the callers
+ * can't handle the failure anyway.
*/
int uprobe_mmap(struct vm_area_struct *vma)
{
struct list_head tmp_list;
struct uprobe *uprobe, *u;
struct inode *inode;
- int ret, count;
if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
return 0;
@@ -1036,44 +1021,35 @@ int uprobe_mmap(struct vm_area_struct *vma)
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
- ret = 0;
- count = 0;
-
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- if (!ret) {
+ if (!fatal_signal_pending(current)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
-
- ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
- /*
- * We can race against uprobe_register(), see the
- * comment near uprobe_hash().
- */
- if (ret == -EEXIST) {
- ret = 0;
-
- if (!is_swbp_at_addr(vma->vm_mm, vaddr))
- continue;
-
- /*
- * Unable to insert a breakpoint, but
- * breakpoint lies underneath. Increment the
- * probe count.
- */
- atomic_inc(&vma->vm_mm->uprobes_state.count);
- }
-
- if (!ret)
- count++;
+ install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
put_uprobe(uprobe);
}
-
mutex_unlock(uprobes_mmap_hash(inode));
- if (ret)
- atomic_sub(count, &vma->vm_mm->uprobes_state.count);
+ return 0;
+}
- return ret;
+static bool
+vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ loff_t min, max;
+ struct inode *inode;
+ struct rb_node *n;
+
+ inode = vma->vm_file->f_mapping->host;
+
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ spin_unlock(&uprobes_treelock);
+
+ return !!n;
}
/*
@@ -1081,37 +1057,18 @@ int uprobe_mmap(struct vm_area_struct *vma)
*/
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
- struct list_head tmp_list;
- struct uprobe *uprobe, *u;
- struct inode *inode;
-
if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
return;
- if (!atomic_read(&vma->vm_mm->uprobes_state.count))
- return;
-
- inode = vma->vm_file->f_mapping->host;
- if (!inode)
+ if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
+ test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
return;
- mutex_lock(uprobes_mmap_hash(inode));
- build_probe_list(inode, vma, start, end, &tmp_list);
-
- list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
- /*
- * An unregister could have removed the probe before
- * unmap. So check before we decrement the count.
- */
- if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
- atomic_dec(&vma->vm_mm->uprobes_state.count);
- put_uprobe(uprobe);
- }
- mutex_unlock(uprobes_mmap_hash(inode));
+ if (vma_has_uprobes(vma, start, end))
+ set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
/* Slot allocation for XOL */
@@ -1213,13 +1170,15 @@ void uprobe_clear_state(struct mm_struct *mm)
kfree(area);
}
-/*
- * uprobe_reset_state - Free the area allocated for slots.
- */
-void uprobe_reset_state(struct mm_struct *mm)
+void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- mm->uprobes_state.xol_area = NULL;
- atomic_set(&mm->uprobes_state.count, 0);
+ newmm->uprobes_state.xol_area = NULL;
+
+ if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
+ set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
+ set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ }
}
/*
@@ -1437,6 +1396,25 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
return false;
}
+static void mmf_recalc_uprobes(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!valid_vma(vma, false))
+ continue;
+ /*
+ * This is not strictly accurate, we can race with
+ * uprobe_unregister() and see the already removed
+ * uprobe if delete_uprobe() was not yet called.
+ */
+ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
+ return;
+ }
+
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+}
+
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
@@ -1458,11 +1436,24 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
} else {
*is_swbp = -EFAULT;
}
+
+ if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ mmf_recalc_uprobes(mm);
up_read(&mm->mmap_sem);
return uprobe;
}
+void __weak arch_uprobe_enable_step(struct arch_uprobe *arch)
+{
+ user_enable_single_step(current);
+}
+
+void __weak arch_uprobe_disable_step(struct arch_uprobe *arch)
+{
+ user_disable_single_step(current);
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1509,7 +1500,7 @@ static void handle_swbp(struct pt_regs *regs)
utask->state = UTASK_SSTEP;
if (!pre_ssout(uprobe, regs, bp_vaddr)) {
- user_enable_single_step(current);
+ arch_uprobe_enable_step(&uprobe->arch);
return;
}
@@ -1518,17 +1509,15 @@ cleanup_ret:
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
}
- if (uprobe) {
- if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
+ if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
- /*
- * cannot singlestep; cannot skip instruction;
- * re-execute the instruction.
- */
- instruction_pointer_set(regs, bp_vaddr);
+ /*
+ * cannot singlestep; cannot skip instruction;
+ * re-execute the instruction.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
- put_uprobe(uprobe);
- }
+ put_uprobe(uprobe);
}
/*
@@ -1547,10 +1536,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
else
WARN_ON_ONCE(1);
+ arch_uprobe_disable_step(&uprobe->arch);
put_uprobe(uprobe);
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
- user_disable_single_step(current);
xol_free_insn_slot(current);
spin_lock_irq(&current->sighand->siglock);
@@ -1589,8 +1578,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
struct uprobe_task *utask;
- if (!current->mm || !atomic_read(&current->mm->uprobes_state.count))
- /* task is currently not uprobed */
+ if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
return 0;
utask = current->utask;