diff options
author | Daniel Borkmann | 2019-07-08 15:35:44 +0200 |
---|---|---|
committer | Daniel Borkmann | 2019-07-08 15:35:45 +0200 |
commit | d2850ce0bdd772a85043b1f07af99205b96e6e2a (patch) | |
tree | c17fdb0f9b71b4c169e4c252a6dd40ee5e38bc62 | |
parent | selftests/bpf: add test_tcp_rtt to .gitignore (diff) | |
parent | libbpf: add perf_buffer_ prefix to README (diff) | |
download | kernel-qcow2-linux-d2850ce0bdd772a85043b1f07af99205b96e6e2a.tar.gz kernel-qcow2-linux-d2850ce0bdd772a85043b1f07af99205b96e6e2a.tar.xz kernel-qcow2-linux-d2850ce0bdd772a85043b1f07af99205b96e6e2a.zip |
Merge branch 'bpf-libbpf-perf-rb-api'
Andrii Nakryiko says:
====================
This patchset adds a high-level API for setting up and polling perf buffers
associated with BPF_MAP_TYPE_PERF_EVENT_ARRAY map. Details of APIs are
described in corresponding commit.
Patch #1 adds a set of APIs to set up and work with perf buffer.
Patch #2 enhances libbpf to support auto-setting PERF_EVENT_ARRAY map size.
Patch #3 adds test.
Patch #4 converts bpftool map event_pipe to new API.
Patch #5 updates README to mention perf_buffer_ prefix.
v6->v7:
- __x64_ syscall prefix (Yonghong);
v5->v6:
- fix C99 for loop variable initialization usage (Yonghong);
v4->v5:
- initialize perf_buffer_raw_opts in bpftool map event_pipe (Jakub);
- add perf_buffer_ to README;
v3->v4:
- fixed bpftool event_pipe cmd error handling (Jakub);
v2->v3:
- added perf_buffer__new_raw for more low-level control;
- converted bpftool map event_pipe to new API (Daniel);
- fixed bug with error handling in create_maps (Song);
v1->v2:
- add auto-sizing of PERF_EVENT_ARRAY maps;
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r-- | tools/bpf/bpftool/map_perf_ring.c | 201 | ||||
-rw-r--r-- | tools/lib/bpf/README.rst | 3 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf.c | 397 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf.h | 49 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf.map | 4 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/prog_tests/perf_buffer.c | 100 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/progs/test_perf_buffer.c | 25 |
7 files changed, 634 insertions, 145 deletions
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c index 0507dfaf7a8f..3f108ab17797 100644 --- a/tools/bpf/bpftool/map_perf_ring.c +++ b/tools/bpf/bpftool/map_perf_ring.c @@ -28,7 +28,7 @@ #define MMAP_PAGE_CNT 16 -static bool stop; +static volatile bool stop; struct event_ring_info { int fd; @@ -44,32 +44,44 @@ struct perf_event_sample { unsigned char data[]; }; +struct perf_event_lost { + struct perf_event_header header; + __u64 id; + __u64 lost; +}; + static void int_exit(int signo) { fprintf(stderr, "Stopping...\n"); stop = true; } +struct event_pipe_ctx { + bool all_cpus; + int cpu; + int idx; +}; + static enum bpf_perf_event_ret -print_bpf_output(struct perf_event_header *event, void *private_data) +print_bpf_output(void *private_data, int cpu, struct perf_event_header *event) { - struct perf_event_sample *e = container_of(event, struct perf_event_sample, + struct perf_event_sample *e = container_of(event, + struct perf_event_sample, header); - struct event_ring_info *ring = private_data; - struct { - struct perf_event_header header; - __u64 id; - __u64 lost; - } *lost = (typeof(lost))event; + struct perf_event_lost *lost = container_of(event, + struct perf_event_lost, + header); + struct event_pipe_ctx *ctx = private_data; + int idx = ctx->all_cpus ? cpu : ctx->idx; if (json_output) { jsonw_start_object(json_wtr); jsonw_name(json_wtr, "type"); jsonw_uint(json_wtr, e->header.type); jsonw_name(json_wtr, "cpu"); - jsonw_uint(json_wtr, ring->cpu); + jsonw_uint(json_wtr, cpu); jsonw_name(json_wtr, "index"); - jsonw_uint(json_wtr, ring->key); + jsonw_uint(json_wtr, idx); if (e->header.type == PERF_RECORD_SAMPLE) { jsonw_name(json_wtr, "timestamp"); jsonw_uint(json_wtr, e->time); @@ -89,7 +101,7 @@ print_bpf_output(struct perf_event_header *event, void *private_data) if (e->header.type == PERF_RECORD_SAMPLE) { printf("== @%lld.%09lld CPU: %d index: %d =====\n", e->time / 1000000000ULL, e->time % 1000000000ULL, - ring->cpu, ring->key); + cpu, idx); fprint_hex(stdout, e->data, e->size, " "); printf("\n"); } else if (e->header.type == PERF_RECORD_LOST) { @@ -103,87 +115,25 @@ print_bpf_output(struct perf_event_header *event, void *private_data) return LIBBPF_PERF_EVENT_CONT; } -static void -perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len) -{ - enum bpf_perf_event_ret ret; - - ret = bpf_perf_event_read_simple(ring->mem, - MMAP_PAGE_CNT * get_page_size(), - get_page_size(), buf, buf_len, - print_bpf_output, ring); - if (ret != LIBBPF_PERF_EVENT_CONT) { - fprintf(stderr, "perf read loop failed with %d\n", ret); - stop = true; - } -} - -static int perf_mmap_size(void) -{ - return get_page_size() * (MMAP_PAGE_CNT + 1); -} - -static void *perf_event_mmap(int fd) -{ - int mmap_size = perf_mmap_size(); - void *base; - - base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (base == MAP_FAILED) { - p_err("event mmap failed: %s\n", strerror(errno)); - return NULL; - } - - return base; -} - -static void perf_event_unmap(void *mem) -{ - if (munmap(mem, perf_mmap_size())) - fprintf(stderr, "Can't unmap ring memory!\n"); -} - -static int bpf_perf_event_open(int map_fd, int key, int cpu) +int do_event_pipe(int argc, char **argv) { - struct perf_event_attr attr = { + struct perf_event_attr perf_attr = { .sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_TIME, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_BPF_OUTPUT, + .sample_period = 1, + .wakeup_events = 1, }; - int pmu_fd; - - pmu_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0); - if (pmu_fd < 0) { - p_err("failed to open perf event %d for CPU %d", key, cpu); - return -1; - } - - if (bpf_map_update_elem(map_fd, &key, &pmu_fd, BPF_ANY)) { - p_err("failed to update map for event %d for CPU %d", key, cpu); - goto err_close; - } - if (ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) { - p_err("failed to enable event %d for CPU %d", key, cpu); - goto err_close; - } - - return pmu_fd; - -err_close: - close(pmu_fd); - return -1; -} - -int do_event_pipe(int argc, char **argv) -{ - int i, nfds, map_fd, index = -1, cpu = -1; struct bpf_map_info map_info = {}; - struct event_ring_info *rings; - size_t tmp_buf_sz = 0; - void *tmp_buf = NULL; - struct pollfd *pfds; + struct perf_buffer_raw_opts opts = {}; + struct event_pipe_ctx ctx = { + .all_cpus = true, + .cpu = -1, + .idx = -1, + }; + struct perf_buffer *pb; __u32 map_info_len; - bool do_all = true; + int err, map_fd; map_info_len = sizeof(map_info); map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len); @@ -205,7 +155,7 @@ int do_event_pipe(int argc, char **argv) char *endptr; NEXT_ARG(); - cpu = strtoul(*argv, &endptr, 0); + ctx.cpu = strtoul(*argv, &endptr, 0); if (*endptr) { p_err("can't parse %s as CPU ID", **argv); goto err_close_map; @@ -216,7 +166,7 @@ int do_event_pipe(int argc, char **argv) char *endptr; NEXT_ARG(); - index = strtoul(*argv, &endptr, 0); + ctx.idx = strtoul(*argv, &endptr, 0); if (*endptr) { p_err("can't parse %s as index", **argv); goto err_close_map; @@ -228,45 +178,32 @@ int do_event_pipe(int argc, char **argv) goto err_close_map; } - do_all = false; + ctx.all_cpus = false; } - if (!do_all) { - if (index == -1 || cpu == -1) { + if (!ctx.all_cpus) { + if (ctx.idx == -1 || ctx.cpu == -1) { p_err("cpu and index must be specified together"); goto err_close_map; } - - nfds = 1; } else { - nfds = min(get_possible_cpus(), map_info.max_entries); - cpu = 0; - index = 0; + ctx.cpu = 0; + ctx.idx = 0; } - rings = calloc(nfds, sizeof(rings[0])); - if (!rings) + opts.attr = &perf_attr; + opts.event_cb = print_bpf_output; + opts.ctx = &ctx; + opts.cpu_cnt = ctx.all_cpus ? 0 : 1; + opts.cpus = &ctx.cpu; + opts.map_keys = &ctx.idx; + + pb = perf_buffer__new_raw(map_fd, MMAP_PAGE_CNT, &opts); + err = libbpf_get_error(pb); + if (err) { + p_err("failed to create perf buffer: %s (%d)", + strerror(err), err); goto err_close_map; - - pfds = calloc(nfds, sizeof(pfds[0])); - if (!pfds) - goto err_free_rings; - - for (i = 0; i < nfds; i++) { - rings[i].cpu = cpu + i; - rings[i].key = index + i; - - rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key, - rings[i].cpu); - if (rings[i].fd < 0) - goto err_close_fds_prev; - - rings[i].mem = perf_event_mmap(rings[i].fd); - if (!rings[i].mem) - goto err_close_fds_current; - - pfds[i].fd = rings[i].fd; - pfds[i].events = POLLIN; } signal(SIGINT, int_exit); @@ -277,34 +214,24 @@ int do_event_pipe(int argc, char **argv) jsonw_start_array(json_wtr); while (!stop) { - poll(pfds, nfds, 200); - for (i = 0; i < nfds; i++) - perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz); + err = perf_buffer__poll(pb, 200); + if (err < 0 && err != -EINTR) { + p_err("perf buffer polling failed: %s (%d)", + strerror(err), err); + goto err_close_pb; + } } - free(tmp_buf); if (json_output) jsonw_end_array(json_wtr); - for (i = 0; i < nfds; i++) { - perf_event_unmap(rings[i].mem); - close(rings[i].fd); - } - free(pfds); - free(rings); + perf_buffer__free(pb); close(map_fd); return 0; -err_close_fds_prev: - while (i--) { - perf_event_unmap(rings[i].mem); -err_close_fds_current: - close(rings[i].fd); - } - free(pfds); -err_free_rings: - free(rings); +err_close_pb: + perf_buffer__free(pb); err_close_map: close(map_fd); return -1; diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst index cef7b77eab69..8928f7787f2d 100644 --- a/tools/lib/bpf/README.rst +++ b/tools/lib/bpf/README.rst @@ -9,7 +9,8 @@ described here. It's recommended to follow these conventions whenever a new function or type is added to keep libbpf API clean and consistent. All types and functions provided by libbpf API should have one of the -following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``. +following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``, +``perf_buffer_``. System call wrappers -------------------- diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2a08eb106221..ed07789b3e62 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -32,7 +32,9 @@ #include <linux/limits.h> #include <linux/perf_event.h> #include <linux/ring_buffer.h> +#include <sys/epoll.h> #include <sys/ioctl.h> +#include <sys/mman.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/vfs.h> @@ -2114,6 +2116,7 @@ static int bpf_object__create_maps(struct bpf_object *obj) { struct bpf_create_map_attr create_attr = {}; + int nr_cpus = 0; unsigned int i; int err; @@ -2136,7 +2139,22 @@ bpf_object__create_maps(struct bpf_object *obj) create_attr.map_flags = def->map_flags; create_attr.key_size = def->key_size; create_attr.value_size = def->value_size; - create_attr.max_entries = def->max_entries; + if (def->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && + !def->max_entries) { + if (!nr_cpus) + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + pr_warning("failed to determine number of system CPUs: %d\n", + nr_cpus); + err = nr_cpus; + goto err_out; + } + pr_debug("map '%s': setting size to %d\n", + map->name, nr_cpus); + create_attr.max_entries = nr_cpus; + } else { + create_attr.max_entries = def->max_entries; + } create_attr.btf_fd = 0; create_attr.btf_key_type_id = 0; create_attr.btf_value_type_id = 0; @@ -2153,9 +2171,10 @@ bpf_object__create_maps(struct bpf_object *obj) *pfd = bpf_create_map_xattr(&create_attr); if (*pfd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", - map->name, cp, errno); + map->name, cp, err); create_attr.btf_fd = 0; create_attr.btf_key_type_id = 0; create_attr.btf_value_type_id = 0; @@ -2167,11 +2186,11 @@ bpf_object__create_maps(struct bpf_object *obj) if (*pfd < 0) { size_t j; - err = *pfd; + err = -errno; err_out: - cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); - pr_warning("failed to create map (name: '%s'): %s\n", - map->name, cp); + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warning("failed to create map (name: '%s'): %s(%d)\n", + map->name, cp, err); for (j = 0; j < i; j++) zclose(obj->maps[j].fd); return err; @@ -4354,6 +4373,370 @@ bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, return ret; } +struct perf_buffer; + +struct perf_buffer_params { + struct perf_event_attr *attr; + /* if event_cb is specified, it takes precendence */ + perf_buffer_event_fn event_cb; + /* sample_cb and lost_cb are higher-level common-case callbacks */ + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; + int cpu_cnt; + int *cpus; + int *map_keys; +}; + +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void perf_buffer__free_cpu_buf(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + if (!cpu_buf) + return; + if (cpu_buf->base && + munmap(cpu_buf->base, pb->mmap_size + pb->page_size)) + pr_warning("failed to munmap cpu_buf #%d\n", cpu_buf->cpu); + if (cpu_buf->fd >= 0) { + ioctl(cpu_buf->fd, PERF_EVENT_IOC_DISABLE, 0); + close(cpu_buf->fd); + } + free(cpu_buf->buf); + free(cpu_buf); +} + +void perf_buffer__free(struct perf_buffer *pb) +{ + int i; + + if (!pb) + return; + if (pb->cpu_bufs) { + for (i = 0; i < pb->cpu_cnt && pb->cpu_bufs[i]; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + bpf_map_delete_elem(pb->map_fd, &cpu_buf->map_key); + perf_buffer__free_cpu_buf(pb, cpu_buf); + } + free(pb->cpu_bufs); + } + if (pb->epoll_fd >= 0) + close(pb->epoll_fd); + free(pb->events); + free(pb); +} + +static struct perf_cpu_buf * +perf_buffer__open_cpu_buf(struct perf_buffer *pb, struct perf_event_attr *attr, + int cpu, int map_key) +{ + struct perf_cpu_buf *cpu_buf; + char msg[STRERR_BUFSIZE]; + int err; + + cpu_buf = calloc(1, sizeof(*cpu_buf)); + if (!cpu_buf) + return ERR_PTR(-ENOMEM); + + cpu_buf->pb = pb; + cpu_buf->cpu = cpu; + cpu_buf->map_key = map_key; + + cpu_buf->fd = syscall(__NR_perf_event_open, attr, -1 /* pid */, cpu, + -1, PERF_FLAG_FD_CLOEXEC); + if (cpu_buf->fd < 0) { + err = -errno; + pr_warning("failed to open perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + cpu_buf->base = mmap(NULL, pb->mmap_size + pb->page_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + cpu_buf->fd, 0); + if (cpu_buf->base == MAP_FAILED) { + cpu_buf->base = NULL; + err = -errno; + pr_warning("failed to mmap perf buffer on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (ioctl(cpu_buf->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + err = -errno; + pr_warning("failed to enable perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + return cpu_buf; + +error: + perf_buffer__free_cpu_buf(pb, cpu_buf); + return (struct perf_cpu_buf *)ERR_PTR(err); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p); + +struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, + const struct perf_buffer_opts *opts) +{ + struct perf_buffer_params p = {}; + struct perf_event_attr attr = { + .config = PERF_COUNT_SW_BPF_OUTPUT, + .type = PERF_TYPE_SOFTWARE, + .sample_type = PERF_SAMPLE_RAW, + .sample_period = 1, + .wakeup_events = 1, + }; + + p.attr = &attr; + p.sample_cb = opts ? opts->sample_cb : NULL; + p.lost_cb = opts ? opts->lost_cb : NULL; + p.ctx = opts ? opts->ctx : NULL; + + return __perf_buffer__new(map_fd, page_cnt, &p); +} + +struct perf_buffer * +perf_buffer__new_raw(int map_fd, size_t page_cnt, + const struct perf_buffer_raw_opts *opts) +{ + struct perf_buffer_params p = {}; + + p.attr = opts->attr; + p.event_cb = opts->event_cb; + p.ctx = opts->ctx; + p.cpu_cnt = opts->cpu_cnt; + p.cpus = opts->cpus; + p.map_keys = opts->map_keys; + + return __perf_buffer__new(map_fd, page_cnt, &p); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p) +{ + struct bpf_map_info map = {}; + char msg[STRERR_BUFSIZE]; + struct perf_buffer *pb; + __u32 map_info_len; + int err, i; + + if (page_cnt & (page_cnt - 1)) { + pr_warning("page count should be power of two, but is %zu\n", + page_cnt); + return ERR_PTR(-EINVAL); + } + + map_info_len = sizeof(map); + err = bpf_obj_get_info_by_fd(map_fd, &map, &map_info_len); + if (err) { + err = -errno; + pr_warning("failed to get map info for map FD %d: %s\n", + map_fd, libbpf_strerror_r(err, msg, sizeof(msg))); + return ERR_PTR(err); + } + + if (map.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) { + pr_warning("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n", + map.name); + return ERR_PTR(-EINVAL); + } + + pb = calloc(1, sizeof(*pb)); + if (!pb) + return ERR_PTR(-ENOMEM); + + pb->event_cb = p->event_cb; + pb->sample_cb = p->sample_cb; + pb->lost_cb = p->lost_cb; + pb->ctx = p->ctx; + + pb->page_size = getpagesize(); + pb->mmap_size = pb->page_size * page_cnt; + pb->map_fd = map_fd; + + pb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (pb->epoll_fd < 0) { + err = -errno; + pr_warning("failed to create epoll instance: %s\n", + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (p->cpu_cnt > 0) { + pb->cpu_cnt = p->cpu_cnt; + } else { + pb->cpu_cnt = libbpf_num_possible_cpus(); + if (pb->cpu_cnt < 0) { + err = pb->cpu_cnt; + goto error; + } + if (map.max_entries < pb->cpu_cnt) + pb->cpu_cnt = map.max_entries; + } + + pb->events = calloc(pb->cpu_cnt, sizeof(*pb->events)); + if (!pb->events) { + err = -ENOMEM; + pr_warning("failed to allocate events: out of memory\n"); + goto error; + } + pb->cpu_bufs = calloc(pb->cpu_cnt, sizeof(*pb->cpu_bufs)); + if (!pb->cpu_bufs) { + err = -ENOMEM; + pr_warning("failed to allocate buffers: out of memory\n"); + goto error; + } + + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf; + int cpu, map_key; + + cpu = p->cpu_cnt > 0 ? p->cpus[i] : i; + map_key = p->cpu_cnt > 0 ? p->map_keys[i] : i; + + cpu_buf = perf_buffer__open_cpu_buf(pb, p->attr, cpu, map_key); + if (IS_ERR(cpu_buf)) { + err = PTR_ERR(cpu_buf); + goto error; + } + + pb->cpu_bufs[i] = cpu_buf; + + err = bpf_map_update_elem(pb->map_fd, &map_key, + &cpu_buf->fd, 0); + if (err) { + err = -errno; + pr_warning("failed to set cpu #%d, key %d -> perf FD %d: %s\n", + cpu, map_key, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + pb->events[i].events = EPOLLIN; + pb->events[i].data.ptr = cpu_buf; + if (epoll_ctl(pb->epoll_fd, EPOLL_CTL_ADD, cpu_buf->fd, + &pb->events[i]) < 0) { + err = -errno; + pr_warning("failed to epoll_ctl cpu #%d perf FD %d: %s\n", + cpu, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + } + + return pb; + +error: + if (pb) + perf_buffer__free(pb); + return ERR_PTR(err); +} + +struct perf_sample_raw { + struct perf_event_header header; + uint32_t size; + char data[0]; +}; + +struct perf_sample_lost { + struct perf_event_header header; + uint64_t id; + uint64_t lost; + uint64_t sample_id; +}; + +static enum bpf_perf_event_ret +perf_buffer__process_record(struct perf_event_header *e, void *ctx) +{ + struct perf_cpu_buf *cpu_buf = ctx; + struct perf_buffer *pb = cpu_buf->pb; + void *data = e; + + /* user wants full control over parsing perf event */ + if (pb->event_cb) + return pb->event_cb(pb->ctx, cpu_buf->cpu, e); + + switch (e->type) { + case PERF_RECORD_SAMPLE: { + struct perf_sample_raw *s = data; + + if (pb->sample_cb) + pb->sample_cb(pb->ctx, cpu_buf->cpu, s->data, s->size); + break; + } + case PERF_RECORD_LOST: { + struct perf_sample_lost *s = data; + + if (pb->lost_cb) + pb->lost_cb(pb->ctx, cpu_buf->cpu, s->lost); + break; + } + default: + pr_warning("unknown perf sample type %d\n", e->type); + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static int perf_buffer__process_records(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + enum bpf_perf_event_ret ret; + + ret = bpf_perf_event_read_simple(cpu_buf->base, pb->mmap_size, + pb->page_size, &cpu_buf->buf, + &cpu_buf->buf_size, + perf_buffer__process_record, cpu_buf); + if (ret != LIBBPF_PERF_EVENT_CONT) + return ret; + return 0; +} + +int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms) +{ + int i, cnt, err; + + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms); + for (i = 0; i < cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warning("error while processing records: %d\n", err); + return err; + } + } + return cnt < 0 ? -errno : cnt; +} + struct bpf_prog_info_array_desc { int array_offset; /* e.g. offset of jited_prog_insns */ int count_offset; /* e.g. offset of jited_prog_len */ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f55933784f95..5cbf459ece0b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -358,6 +358,26 @@ LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type, LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); +struct perf_buffer; + +typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu, + void *data, __u32 size); +typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); + +/* common use perf buffer options */ +struct perf_buffer_opts { + /* if specified, sample_cb is called for each sample */ + perf_buffer_sample_fn sample_cb; + /* if specified, lost_cb is called for each batch of lost samples */ + perf_buffer_lost_fn lost_cb; + /* ctx is provided to sample_cb and lost_cb */ + void *ctx; +}; + +LIBBPF_API struct perf_buffer * +perf_buffer__new(int map_fd, size_t page_cnt, + const struct perf_buffer_opts *opts); + enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_DONE = 0, LIBBPF_PERF_EVENT_ERROR = -1, @@ -365,6 +385,35 @@ enum bpf_perf_event_ret { }; struct perf_event_header; + +typedef enum bpf_perf_event_ret +(*perf_buffer_event_fn)(void *ctx, int cpu, struct perf_event_header *event); + +/* raw perf buffer options, giving most power and control */ +struct perf_buffer_raw_opts { + /* perf event attrs passed directly into perf_event_open() */ + struct perf_event_attr *attr; + /* raw event callback */ + perf_buffer_event_fn event_cb; + /* ctx is provided to event_cb */ + void *ctx; + /* if cpu_cnt == 0, open all on all possible CPUs (up to the number of + * max_entries of given PERF_EVENT_ARRAY map) + */ + int cpu_cnt; + /* if cpu_cnt > 0, cpus is an array of CPUs to open ring buffers on */ + int *cpus; + /* if cpu_cnt > 0, map_keys specify map keys to set per-CPU FDs for */ + int *map_keys; +}; + +LIBBPF_API struct perf_buffer * +perf_buffer__new_raw(int map_fd, size_t page_cnt, + const struct perf_buffer_raw_opts *opts); + +LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); +LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); + typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, void *private_data); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index e6b7d4edbc93..f9d316e873d8 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -179,4 +179,8 @@ LIBBPF_0.0.4 { btf_dump__new; btf__parse_elf; libbpf_num_possible_cpus; + perf_buffer__free; + perf_buffer__new; + perf_buffer__new_raw; + perf_buffer__poll; } LIBBPF_0.0.3; diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c new file mode 100644 index 000000000000..3f1ef95865ff --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <pthread.h> +#include <sched.h> +#include <sys/socket.h> +#include <test_progs.h> + +#ifdef __x86_64__ +#define SYS_KPROBE_NAME "__x64_sys_nanosleep" +#else +#define SYS_KPROBE_NAME "sys_nanosleep" +#endif + +static void on_sample(void *ctx, int cpu, void *data, __u32 size) +{ + int cpu_data = *(int *)data, duration = 0; + cpu_set_t *cpu_seen = ctx; + + if (cpu_data != cpu) + CHECK(cpu_data != cpu, "check_cpu_data", + "cpu_data %d != cpu %d\n", cpu_data, cpu); + + CPU_SET(cpu, cpu_seen); +} + +void test_perf_buffer(void) +{ + int err, prog_fd, nr_cpus, i, duration = 0; + const char *prog_name = "kprobe/sys_nanosleep"; + const char *file = "./test_perf_buffer.o"; + struct perf_buffer_opts pb_opts = {}; + struct bpf_map *perf_buf_map; + cpu_set_t cpu_set, cpu_seen; + struct bpf_program *prog; + struct bpf_object *obj; + struct perf_buffer *pb; + struct bpf_link *link; + + nr_cpus = libbpf_num_possible_cpus(); + if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus)) + return; + + /* load program */ + err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd); + if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) + return; + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name)) + goto out_close; + + /* load map */ + perf_buf_map = bpf_object__find_map_by_name(obj, "perf_buf_map"); + if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n")) + goto out_close; + + /* attach kprobe */ + link = bpf_program__attach_kprobe(prog, false /* retprobe */, + SYS_KPROBE_NAME); + if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link))) + goto out_close; + + /* set up perf buffer */ + pb_opts.sample_cb = on_sample; + pb_opts.ctx = &cpu_seen; + pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts); + if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb))) + goto out_detach; + + /* trigger kprobe on every CPU */ + CPU_ZERO(&cpu_seen); + for (i = 0; i < nr_cpus; i++) { + CPU_ZERO(&cpu_set); + CPU_SET(i, &cpu_set); + + err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), + &cpu_set); + if (err && CHECK(err, "set_affinity", "cpu #%d, err %d\n", + i, err)) + goto out_detach; + + usleep(1); + } + + /* read perf buffer */ + err = perf_buffer__poll(pb, 100); + if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err)) + goto out_free_pb; + + if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt", + "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen))) + goto out_free_pb; + +out_free_pb: + perf_buffer__free(pb); +out_detach: + bpf_link__destroy(link); +out_close: + bpf_object__close(obj); +} diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c new file mode 100644 index 000000000000..876c27deb65a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Facebook + +#include <linux/ptrace.h> +#include <linux/bpf.h> +#include "bpf_helpers.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); +} perf_buf_map SEC(".maps"); + +SEC("kprobe/sys_nanosleep") +int handle_sys_nanosleep_entry(struct pt_regs *ctx) +{ + int cpu = bpf_get_smp_processor_id(); + + bpf_perf_event_output(ctx, &perf_buf_map, BPF_F_CURRENT_CPU, + &cpu, sizeof(cpu)); + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; |