summaryrefslogtreecommitdiffstats
path: root/tools/perf/util/machine.c
diff options
context:
space:
mode:
authorKan Liang2015-01-05 19:23:05 +0100
committerIngo Molnar2015-02-18 17:16:18 +0100
commit384b60557b5522fcb99646f0eb6e7a344cdb94c6 (patch)
tree20da511149037cf1066c4f76ff9d84963bec12d0 /tools/perf/util/machine.c
parentperf tools: Enable LBR call stack support (diff)
downloadkernel-qcow2-linux-384b60557b5522fcb99646f0eb6e7a344cdb94c6.tar.gz
kernel-qcow2-linux-384b60557b5522fcb99646f0eb6e7a344cdb94c6.tar.xz
kernel-qcow2-linux-384b60557b5522fcb99646f0eb6e7a344cdb94c6.zip
perf tools: Construct LBR call chain
LBR call stack only has user-space callchains. It is output in the PERF_SAMPLE_BRANCH_STACK data format. For kernel callchains, it's still in the form of PERF_SAMPLE_CALLCHAIN. The perf tool has to handle both data sources to construct a complete callstack. For the "perf report -D" option, both lbr and fp information will be displayed. A new call chain recording option "lbr" is introduced into the perf tool for LBR call stack. The user can use --call-graph lbr to get the call stack information from hardware. Here are some examples. When profiling bc(1) on Fedora 19: echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph lbr bc -l < cmd If enabling LBR, perf report output looks like: 50.36% bc bc [.] bc_divide | --- bc_divide execute run_code yyparse main __libc_start_main _start 33.66% bc bc [.] _one_mult | --- _one_mult bc_divide execute run_code yyparse main __libc_start_main _start 7.62% bc bc [.] _bc_do_add | --- _bc_do_add | |--99.89%-- 0x2000186a8 --0.11%-- [...] 6.83% bc bc [.] _bc_do_sub | --- _bc_do_sub | |--99.94%-- bc_add | execute | run_code | yyparse | main | __libc_start_main | _start --0.06%-- [...] 0.46% bc libc-2.17.so [.] __memset_sse2 | --- __memset_sse2 | |--54.13%-- bc_new_num | | | |--51.00%-- bc_divide | | execute | | run_code | | yyparse | | main | | __libc_start_main | | _start | | | |--30.46%-- _bc_do_sub | | bc_add | | execute | | run_code | | yyparse | | main | | __libc_start_main | | _start | | | --18.55%-- _bc_do_add | bc_add | execute | run_code | yyparse | main | __libc_start_main | _start | --45.87%-- bc_divide execute run_code yyparse main __libc_start_main _start If using FP, perf report output looks like: echo 'scale=2000; 4*a(1)' > cmd; perf record --call-graph fp bc -l < cmd 50.49% bc bc [.] bc_divide | --- bc_divide 33.57% bc bc [.] _one_mult | --- _one_mult 7.61% bc bc [.] _bc_do_add | --- _bc_do_add 0x2000186a8 6.88% bc bc [.] _bc_do_sub | --- _bc_do_sub 0.42% bc libc-2.17.so [.] __memcpy_ssse3_back | --- __memcpy_ssse3_back If using LBR, perf report -D output looks like: 3458145275743 0x2fd750 [0xd8]: PERF_RECORD_SAMPLE(IP, 0x2): 9748/9748: 0x408ea8 period: 609644 addr: 0 ... LBR call chain: nr:8 ..... 0: fffffffffffffe00 ..... 1: 0000000000408e50 ..... 2: 000000000040a458 ..... 3: 000000000040562e ..... 4: 0000000000408590 ..... 5: 00000000004022c0 ..... 6: 00000000004015dd ..... 7: 0000003d1cc21b43 ... FP chain: nr:2 ..... 0: fffffffffffffe00 ..... 1: 0000000000408ea8 ... thread: bc:9748 ...... dso: /usr/bin/bc The LBR call stack has the following known limitations: - Zero length calls are not filtered out by the hardware - Exception handing such as setjmp/longjmp will have calls/returns not match - Pushing different return address onto the stack will have calls/returns not match - If callstack is deeper than the LBR, only the last entries are captured Tested-by: Jiri Olsa <jolsa@kernel.org> Signed-off-by: Kan Liang <kan.liang@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Borislav Petkov <bp@suse.de> Cc: David Ahern <dsahern@gmail.com> Cc: Don Zickus <dzickus@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Simon Que <sque@chromium.org> Cc: Stephane Eranian <eranian@google.com> Link: http://lkml.kernel.org/r/1420482185-29830-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'tools/perf/util/machine.c')
-rw-r--r--tools/perf/util/machine.c102
1 files changed, 91 insertions, 11 deletions
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 1bca3a9f2b16..9e0f60a7e7b3 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1502,18 +1502,100 @@ static int remove_loops(struct branch_entry *l, int nr)
return nr;
}
-static int thread__resolve_callchain_sample(struct thread *thread,
- struct ip_callchain *chain,
- struct branch_stack *branch,
- struct symbol **parent,
- struct addr_location *root_al,
- int max_stack)
+/*
+ * Recolve LBR callstack chain sample
+ * Return:
+ * 1 on success get LBR callchain information
+ * 0 no available LBR callchain information, should try fp
+ * negative error code on other errors.
+ */
+static int resolve_lbr_callchain_sample(struct thread *thread,
+ struct perf_sample *sample,
+ struct symbol **parent,
+ struct addr_location *root_al,
+ int max_stack)
{
+ struct ip_callchain *chain = sample->callchain;
+ int chain_nr = min(max_stack, (int)chain->nr);
+ int i, j, err;
+ u64 ip;
+
+ for (i = 0; i < chain_nr; i++) {
+ if (chain->ips[i] == PERF_CONTEXT_USER)
+ break;
+ }
+
+ /* LBR only affects the user callchain */
+ if (i != chain_nr) {
+ struct branch_stack *lbr_stack = sample->branch_stack;
+ int lbr_nr = lbr_stack->nr;
+ /*
+ * LBR callstack can only get user call chain.
+ * The mix_chain_nr is kernel call chain
+ * number plus LBR user call chain number.
+ * i is kernel call chain number,
+ * 1 is PERF_CONTEXT_USER,
+ * lbr_nr + 1 is the user call chain number.
+ * For details, please refer to the comments
+ * in callchain__printf
+ */
+ int mix_chain_nr = i + 1 + lbr_nr + 1;
+
+ if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) {
+ pr_warning("corrupted callchain. skipping...\n");
+ return 0;
+ }
+
+ for (j = 0; j < mix_chain_nr; j++) {
+ if (callchain_param.order == ORDER_CALLEE) {
+ if (j < i + 1)
+ ip = chain->ips[j];
+ else if (j > i + 1)
+ ip = lbr_stack->entries[j - i - 2].from;
+ else
+ ip = lbr_stack->entries[0].to;
+ } else {
+ if (j < lbr_nr)
+ ip = lbr_stack->entries[lbr_nr - j - 1].from;
+ else if (j > lbr_nr)
+ ip = chain->ips[i + 1 - (j - lbr_nr)];
+ else
+ ip = lbr_stack->entries[0].to;
+ }
+
+ err = add_callchain_ip(thread, parent, root_al, false, ip);
+ if (err)
+ return (err < 0) ? err : 0;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
+static int thread__resolve_callchain_sample(struct thread *thread,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct symbol **parent,
+ struct addr_location *root_al,
+ int max_stack)
+{
+ struct branch_stack *branch = sample->branch_stack;
+ struct ip_callchain *chain = sample->callchain;
int chain_nr = min(max_stack, (int)chain->nr);
int i, j, err;
int skip_idx = -1;
int first_call = 0;
+ callchain_cursor_reset(&callchain_cursor);
+
+ if (has_branch_callstack(evsel)) {
+ err = resolve_lbr_callchain_sample(thread, sample, parent,
+ root_al, max_stack);
+ if (err)
+ return (err < 0) ? err : 0;
+ }
+
/*
* Based on DWARF debug information, some architectures skip
* a callchain entry saved by the kernel.
@@ -1521,8 +1603,6 @@ static int thread__resolve_callchain_sample(struct thread *thread,
if (chain->nr < PERF_MAX_STACK_DEPTH)
skip_idx = arch_skip_callchain_idx(thread, chain);
- callchain_cursor_reset(&callchain_cursor);
-
/*
* Add branches to call stack for easier browsing. This gives
* more context for a sample than just the callers.
@@ -1623,9 +1703,9 @@ int thread__resolve_callchain(struct thread *thread,
struct addr_location *root_al,
int max_stack)
{
- int ret = thread__resolve_callchain_sample(thread, sample->callchain,
- sample->branch_stack,
- parent, root_al, max_stack);
+ int ret = thread__resolve_callchain_sample(thread, evsel,
+ sample, parent,
+ root_al, max_stack);
if (ret)
return ret;