linux/tools/perf/util/annotate.c

1976 lines
46 KiB
C
Raw Normal View History

/*
* Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
*
* Parts came from builtin-annotate.c, see those files for further
* copyright notes.
*
* Released under the GPL v2. (and only v2, not any later version)
*/
#include <errno.h>
#include <inttypes.h>
#include "util.h"
#include "ui/ui.h"
#include "sort.h"
#include "build-id.h"
#include "color.h"
#include "cache.h"
#include "symbol.h"
#include "debug.h"
#include "annotate.h"
#include "evsel.h"
perf annotate: Add branch stack / basic block I wanted to know the hottest path through a function and figured the branch-stack (LBR) information should be able to help out with that. The below uses the branch-stack to create basic blocks and generate statistics from them. from to branch_i * ----> * | | block v * ----> * from to branch_i+1 The blocks are broken down into non-overlapping ranges, while tracking if the start of each range is an entry point and/or the end of a range is a branch. Each block iterates all ranges it covers (while splitting where required to exactly match the block) and increments the 'coverage' count. For the range including the branch we increment the taken counter, as well as the pred counter if flags.predicted. Using these number we can find if an instruction: - had coverage; given by: br->coverage / br->sym->max_coverage This metric ensures each symbol has a 100% spot, which reflects the observation that each symbol must have a most covered/hottest block. - is a branch target: br->is_target && br->start == add - for targets, how much of a branch's coverages comes from it: target->entry / branch->coverage - is a branch: br->is_branch && br->end == addr - for branches, how often it was taken: br->taken / br->coverage after all, all execution that didn't take the branch would have incremented the coverage and continued onward to a later branch. - for branches, how often it was predicted: br->pred / br->taken The coverage percentage is used to color the address and asm sections; for low (<1%) coverage we use NORMAL (uncolored), indicating that these instructions are not 'important'. For high coverage (>75%) we color the address RED. For each branch, we add an asm comment after the instruction with information on how often it was taken and predicted. Output looks like (sans color, which does loose a lot of the information :/) $ perf record --branch-filter u,any -e cycles:p ./branches 27 $ perf annotate branches Percent | Source code & Disassembly of branches for cycles:pu (217 samples) --------------------------------------------------------------------------------- : branches(): 0.00 : 40057a: push %rbp 0.00 : 40057b: mov %rsp,%rbp 0.00 : 40057e: sub $0x20,%rsp 0.00 : 400582: mov %rdi,-0x18(%rbp) 0.00 : 400586: mov %rsi,-0x20(%rbp) 0.00 : 40058a: mov -0x18(%rbp),%rax 0.00 : 40058e: mov %rax,-0x10(%rbp) 0.00 : 400592: movq $0x0,-0x8(%rbp) 0.00 : 40059a: jmpq 400656 <branches+0xdc> 1.84 : 40059f: mov -0x10(%rbp),%rax # +100.00% 3.23 : 4005a3: and $0x1,%eax 1.84 : 4005a6: test %rax,%rax 0.00 : 4005a9: je 4005bf <branches+0x45> # -54.50% (p:42.00%) 0.46 : 4005ab: mov 0x200bbe(%rip),%rax # 601170 <acc> 12.90 : 4005b2: add $0x1,%rax 2.30 : 4005b6: mov %rax,0x200bb3(%rip) # 601170 <acc> 0.46 : 4005bd: jmp 4005d1 <branches+0x57> # -100.00% (p:100.00%) 0.92 : 4005bf: mov 0x200baa(%rip),%rax # 601170 <acc> # +49.54% 13.82 : 4005c6: sub $0x1,%rax 0.46 : 4005ca: mov %rax,0x200b9f(%rip) # 601170 <acc> 2.30 : 4005d1: mov -0x10(%rbp),%rax # +50.46% 0.46 : 4005d5: mov %rax,%rdi 0.46 : 4005d8: callq 400526 <lfsr> # -100.00% (p:100.00%) 0.00 : 4005dd: mov %rax,-0x10(%rbp) # +100.00% 0.92 : 4005e1: mov -0x18(%rbp),%rax 0.00 : 4005e5: and $0x1,%eax 0.00 : 4005e8: test %rax,%rax 0.00 : 4005eb: je 4005ff <branches+0x85> # -100.00% (p:100.00%) 0.00 : 4005ed: mov 0x200b7c(%rip),%rax # 601170 <acc> 0.00 : 4005f4: shr $0x2,%rax 0.00 : 4005f8: mov %rax,0x200b71(%rip) # 601170 <acc> 0.00 : 4005ff: mov -0x10(%rbp),%rax # +100.00% 7.37 : 400603: and $0x1,%eax 3.69 : 400606: test %rax,%rax 0.00 : 400609: jne 400612 <branches+0x98> # -59.25% (p:42.99%) 1.84 : 40060b: mov $0x1,%eax 14.29 : 400610: jmp 400617 <branches+0x9d> # -100.00% (p:100.00%) 1.38 : 400612: mov $0x0,%eax # +57.65% 10.14 : 400617: test %al,%al # +42.35% 0.00 : 400619: je 40062f <branches+0xb5> # -57.65% (p:100.00%) 0.46 : 40061b: mov 0x200b4e(%rip),%rax # 601170 <acc> 2.76 : 400622: sub $0x1,%rax 0.00 : 400626: mov %rax,0x200b43(%rip) # 601170 <acc> 0.46 : 40062d: jmp 400641 <branches+0xc7> # -100.00% (p:100.00%) 0.92 : 40062f: mov 0x200b3a(%rip),%rax # 601170 <acc> # +56.13% 2.30 : 400636: add $0x1,%rax 0.92 : 40063a: mov %rax,0x200b2f(%rip) # 601170 <acc> 0.92 : 400641: mov -0x10(%rbp),%rax # +43.87% 2.30 : 400645: mov %rax,%rdi 0.00 : 400648: callq 400526 <lfsr> # -100.00% (p:100.00%) 0.00 : 40064d: mov %rax,-0x10(%rbp) # +100.00% 1.84 : 400651: addq $0x1,-0x8(%rbp) 0.92 : 400656: mov -0x8(%rbp),%rax 5.07 : 40065a: cmp -0x20(%rbp),%rax 0.00 : 40065e: jb 40059f <branches+0x25> # -100.00% (p:100.00%) 0.00 : 400664: nop 0.00 : 400665: leaveq 0.00 : 400666: retq (Note: the --branch-filter u,any was used to avoid spurious target and branch points due to interrupts/faults, they show up as very small -/+ annotations on 'weird' locations) Committer note: Please take a look at: http://vger.kernel.org/~acme/perf/annotate_basic_blocks.png To see the colors. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: David Carrillo-Cisneros <davidcc@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Stephane Eranian <eranian@google.com> [ Moved sym->max_coverage to 'struct annotate', aka symbol__annotate(sym) ] Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-09-06 03:08:12 +08:00
#include "block-range.h"
#include "string2.h"
#include "arch/common.h"
#include <regex.h>
#include <pthread.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <sys/utsname.h>
#include "sane_ctype.h"
const char *disassembler_style;
const char *objdump_path;
static regex_t file_lineno;
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
static struct ins_ops *ins__find(struct arch *arch, const char *name);
perf annotate: Introduce alternative method of keeping instructions table Some arches may want to dynamically populate the table using regular expressions on the instruction names to associate them with a set of parsing/formatting/etc functions (struct ins_ops), so provide a fallback for when the ins__find() method fails. That fall back will be able to resize the arch->instructions, setting arch->nr_instructions appropriately, helper functions to associate an ins_ops to an instruction name, growing the arch->instructions if needed and resorting it are provided, all the arch specific callback needs to do is to decide if the missing instruction should be added to arch->instructions with a ins_ops association. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-auu13yradxf7g5dgtpnzt97a@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:37:08 +08:00
static void ins__sort(struct arch *arch);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
static int disasm_line__parse(char *line, const char **namep, char **rawp);
struct arch {
const char *name;
struct ins *instructions;
size_t nr_instructions;
perf annotate: Introduce alternative method of keeping instructions table Some arches may want to dynamically populate the table using regular expressions on the instruction names to associate them with a set of parsing/formatting/etc functions (struct ins_ops), so provide a fallback for when the ins__find() method fails. That fall back will be able to resize the arch->instructions, setting arch->nr_instructions appropriately, helper functions to associate an ins_ops to an instruction name, growing the arch->instructions if needed and resorting it are provided, all the arch specific callback needs to do is to decide if the missing instruction should be added to arch->instructions with a ins_ops association. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-auu13yradxf7g5dgtpnzt97a@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:37:08 +08:00
size_t nr_instructions_allocated;
struct ins_ops *(*associate_instruction_ops)(struct arch *arch, const char *name);
bool sorted_instructions;
bool initialized;
void *priv;
perf annotate: Check for fused instructions Macro fusion merges two instructions to a single micro-op. Intel core platform performs this hardware optimization under limited circumstances. For example, CMP + JCC can be "fused" and executed /retired together. While with sampling this can result in the sample sometimes being on the JCC and sometimes on the CMP. So for the fused instruction pair, they could be considered together. On Nehalem, fused instruction pairs: cmp/test + jcc. On other new CPU: cmp/test/add/sub/and/inc/dec + jcc. This patch adds an x86-specific function which checks if 2 instructions are in a "fused" pair. For non-x86 arch, the function is just NULL. Changelog: v4: Move the CPU model checking to symbol__disassemble and save the CPU family/model in arch structure. It avoids checking every time when jump arrow printed. v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC). v2: Remove the original weak function. Arnaldo points out that doing it as a weak function that will be overridden by the host arch doesn't work. So now it's implemented as an arch-specific function. Committer fix: Do not access evsel->evlist->env->cpuid, ->env can be null, introduce perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in this function call. The original patch was segfaulting 'perf top' + annotation. But this essentially disables this fused instructions augmentation in 'perf top', the right thing is to get the cpuid from the running kernel, left for a later patch tho. Signed-off-by: Yao Jin <yao.jin@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
unsigned int model;
unsigned int family;
int (*init)(struct arch *arch);
perf annotate: Check for fused instructions Macro fusion merges two instructions to a single micro-op. Intel core platform performs this hardware optimization under limited circumstances. For example, CMP + JCC can be "fused" and executed /retired together. While with sampling this can result in the sample sometimes being on the JCC and sometimes on the CMP. So for the fused instruction pair, they could be considered together. On Nehalem, fused instruction pairs: cmp/test + jcc. On other new CPU: cmp/test/add/sub/and/inc/dec + jcc. This patch adds an x86-specific function which checks if 2 instructions are in a "fused" pair. For non-x86 arch, the function is just NULL. Changelog: v4: Move the CPU model checking to symbol__disassemble and save the CPU family/model in arch structure. It avoids checking every time when jump arrow printed. v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC). v2: Remove the original weak function. Arnaldo points out that doing it as a weak function that will be overridden by the host arch doesn't work. So now it's implemented as an arch-specific function. Committer fix: Do not access evsel->evlist->env->cpuid, ->env can be null, introduce perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in this function call. The original patch was segfaulting 'perf top' + annotation. But this essentially disables this fused instructions augmentation in 'perf top', the right thing is to get the cpuid from the running kernel, left for a later patch tho. Signed-off-by: Yao Jin <yao.jin@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
bool (*ins_is_fused)(struct arch *arch, const char *ins1,
const char *ins2);
int (*cpuid_parse)(struct arch *arch, char *cpuid);
struct {
char comment_char;
char skip_functions_char;
} objdump;
};
static struct ins_ops call_ops;
static struct ins_ops dec_ops;
static struct ins_ops jump_ops;
static struct ins_ops mov_ops;
static struct ins_ops nop_ops;
static struct ins_ops lock_ops;
static struct ins_ops ret_ops;
perf annotate: Introduce alternative method of keeping instructions table Some arches may want to dynamically populate the table using regular expressions on the instruction names to associate them with a set of parsing/formatting/etc functions (struct ins_ops), so provide a fallback for when the ins__find() method fails. That fall back will be able to resize the arch->instructions, setting arch->nr_instructions appropriately, helper functions to associate an ins_ops to an instruction name, growing the arch->instructions if needed and resorting it are provided, all the arch specific callback needs to do is to decide if the missing instruction should be added to arch->instructions with a ins_ops association. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-auu13yradxf7g5dgtpnzt97a@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:37:08 +08:00
static int arch__grow_instructions(struct arch *arch)
{
struct ins *new_instructions;
size_t new_nr_allocated;
if (arch->nr_instructions_allocated == 0 && arch->instructions)
goto grow_from_non_allocated_table;
new_nr_allocated = arch->nr_instructions_allocated + 128;
new_instructions = realloc(arch->instructions, new_nr_allocated * sizeof(struct ins));
if (new_instructions == NULL)
return -1;
out_update_instructions:
arch->instructions = new_instructions;
arch->nr_instructions_allocated = new_nr_allocated;
return 0;
grow_from_non_allocated_table:
new_nr_allocated = arch->nr_instructions + 128;
new_instructions = calloc(new_nr_allocated, sizeof(struct ins));
if (new_instructions == NULL)
return -1;
memcpy(new_instructions, arch->instructions, arch->nr_instructions);
goto out_update_instructions;
}
static int arch__associate_ins_ops(struct arch* arch, const char *name, struct ins_ops *ops)
perf annotate: Introduce alternative method of keeping instructions table Some arches may want to dynamically populate the table using regular expressions on the instruction names to associate them with a set of parsing/formatting/etc functions (struct ins_ops), so provide a fallback for when the ins__find() method fails. That fall back will be able to resize the arch->instructions, setting arch->nr_instructions appropriately, helper functions to associate an ins_ops to an instruction name, growing the arch->instructions if needed and resorting it are provided, all the arch specific callback needs to do is to decide if the missing instruction should be added to arch->instructions with a ins_ops association. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-auu13yradxf7g5dgtpnzt97a@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:37:08 +08:00
{
struct ins *ins;
if (arch->nr_instructions == arch->nr_instructions_allocated &&
arch__grow_instructions(arch))
return -1;
ins = &arch->instructions[arch->nr_instructions];
ins->name = strdup(name);
if (!ins->name)
return -1;
ins->ops = ops;
arch->nr_instructions++;
ins__sort(arch);
return 0;
}
#include "arch/arm/annotate/instructions.c"
perf annotate: AArch64 support This is a regex converted version from the original: https://lkml.org/lkml/2016/5/19/461 Add basic support to recognise AArch64 assembly. This allows perf to identify AArch64 instructions that branch to other parts within the same function, thereby properly annotating them. Rebased onto new cross-arch annotation bits: https://lkml.org/lkml/2016/11/25/546 Sample output: security_file_permission vmlinux 5.80 │ ← ret ▒ │70: ldr w0, [x21,#68] ▒ 4.44 │ ↓ tbnz d0 ▒ │ mov w0, #0x24 // #36 ▒ 1.37 │ ands w0, w22, w0 ▒ │ ↑ b.eq 60 ▒ 1.37 │ ↓ tbnz e4 ▒ │ mov w19, #0x20000 // #131072 ▒ 1.02 │ ↓ tbz ec ▒ │90:┌─→ldr x3, [x21,#24] ▒ 1.37 │ │ add x21, x21, #0x10 ▒ │ │ mov w2, w19 ▒ 1.02 │ │ mov x0, x21 ▒ │ │ mov x1, x3 ▒ 1.71 │ │ ldr x20, [x3,#48] ▒ │ │→ bl __fsnotify_parent ▒ 0.68 │ │↑ cbnz 60 ▒ │ │ mov x2, x21 ▒ 1.37 │ │ mov w1, w19 ▒ │ │ mov x0, x20 ▒ 0.68 │ │ mov w5, #0x0 // #0 ▒ │ │ mov x4, #0x0 // #0 ▒ 1.71 │ │ mov w3, #0x1 // #1 ▒ │ │→ bl fsnotify ▒ 1.37 │ │↑ b 60 ▒ │d0:│ mov w0, #0x0 // #0 ▒ │ │ ldp x19, x20, [sp,#16] ▒ │ │ ldp x21, x22, [sp,#32] ▒ │ │ ldp x29, x30, [sp],#48 ▒ │ │← ret ▒ │e4:│ mov w19, #0x10000 // #65536 ▒ │ └──b 90 ◆ │ec: brk #0x800 ▒ Press 'h' for help on key bindings Signed-off-by: Kim Phillips <kim.phillips@arm.com> Signed-off-by: Chris Ryder <chris.ryder@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will.deacon@arm.com> Link: http://lkml.kernel.org/r/20161130092344.012e18e3e623bea395162f95@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-30 23:23:44 +08:00
#include "arch/arm64/annotate/instructions.c"
#include "arch/x86/annotate/instructions.c"
perf annotate: Initial PowerPC support Support the PowerPC architecture using the ins_ops association method. Committer notes: Testing it with a perf.data file collected on a PowerPC machine and cross-annotated on a x86_64 workstation, using the associated vmlinux file: $ perf report -i perf.data.f22vm.powerdev --vmlinux vmlinux.powerpc .ktime_get vmlinux.powerpc │ clrldi r9,r28,63 8.57 │ ┌──bne e0 <- TUI cursor positioned here │54:│ lwsync 2.86 │ │ std r2,40(r1) │ │ ld r9,144(r31) │ │ ld r3,136(r31) │ │ ld r30,184(r31) │ │ ld r10,0(r9) │ │ mtctr r10 │ │ ld r2,8(r9) 8.57 │ │→ bctrl │ │ ld r2,40(r1) │ │ ld r10,160(r31) │ │ ld r5,152(r31) │ │ lwz r7,168(r31) │ │ ld r9,176(r31) 8.57 │ │ lwz r6,172(r31) │ │ lwsync 2.86 │ │ lwz r8,128(r31) │ │ cmpw cr7,r8,r28 2.86 │ │↑ bne 48 │ │ subf r10,r10,r3 │ │ mr r3,r29 │ │ and r10,r10,r5 2.86 │ │ mulld r10,r10,r7 │ │ add r9,r10,r9 │ │ srd r9,r9,r6 │ │ add r9,r9,r30 │ │ std r9,0(r29) │ │ addi r1,r1,144 │ │ ld r0,16(r1) │ │ ld r28,-32(r1) │ │ ld r29,-24(r1) │ │ ld r30,-16(r1) │ │ mtlr r0 │ │ ld r31,-8(r1) │ │← blr 5.71 │e0:└─→mr r1,r1 11.43 │ mr r2,r2 11.43 │ lwz r28,128(r31) Press 'h' for help on key bindings $ perf report -i perf.data.f22vm.powerdev --header-only # ======== # captured on: Thu Nov 24 12:40:38 2016 # hostname : pdev-f22-qemu # os release : 4.4.10-200.fc22.ppc64 # perf version : 4.9.rc1.g6298ce # arch : ppc64 # nrcpus online : 48 # nrcpus avail : 48 # cpudesc : POWER7 (architected), altivec supported # cpuid : 74,513 # total memory : 4158976 kB # cmdline : /home/ravi/Workspace/linux/tools/perf/perf record -a # event : name = cycles:ppp, , size = 112, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|CPU|PERIOD, disabled = 1, inherit = 1, mmap = 1, comm = 1, freq = 1, task = 1, precise_ip = 3, sample_id_all = 1, exclude_guest = 1, mmap2 = 1, comm_exec = 1 # HEADER_CPU_TOPOLOGY info available, use -I to display # HEADER_NUMA_TOPOLOGY info available, use -I to display # pmu mappings: cpu = 4, software = 1, tracepoint = 2, breakpoint = 5 # missing features: HEADER_TRACING_DATA HEADER_BRANCH_STACK HEADER_GROUP_DESC HEADER_AUXTRACE HEADER_STAT HEADER_CACHE # ======== # $ Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Kim Phillips <kim.phillips@arm.com> Link: http://lkml.kernel.org/n/tip-tbjnp40ddoxxl474uvhwi6g4@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 00:03:46 +08:00
#include "arch/powerpc/annotate/instructions.c"
#include "arch/s390/annotate/instructions.c"
static struct arch architectures[] = {
{
.name = "arm",
.init = arm__annotate_init,
},
perf annotate: AArch64 support This is a regex converted version from the original: https://lkml.org/lkml/2016/5/19/461 Add basic support to recognise AArch64 assembly. This allows perf to identify AArch64 instructions that branch to other parts within the same function, thereby properly annotating them. Rebased onto new cross-arch annotation bits: https://lkml.org/lkml/2016/11/25/546 Sample output: security_file_permission vmlinux 5.80 │ ← ret ▒ │70: ldr w0, [x21,#68] ▒ 4.44 │ ↓ tbnz d0 ▒ │ mov w0, #0x24 // #36 ▒ 1.37 │ ands w0, w22, w0 ▒ │ ↑ b.eq 60 ▒ 1.37 │ ↓ tbnz e4 ▒ │ mov w19, #0x20000 // #131072 ▒ 1.02 │ ↓ tbz ec ▒ │90:┌─→ldr x3, [x21,#24] ▒ 1.37 │ │ add x21, x21, #0x10 ▒ │ │ mov w2, w19 ▒ 1.02 │ │ mov x0, x21 ▒ │ │ mov x1, x3 ▒ 1.71 │ │ ldr x20, [x3,#48] ▒ │ │→ bl __fsnotify_parent ▒ 0.68 │ │↑ cbnz 60 ▒ │ │ mov x2, x21 ▒ 1.37 │ │ mov w1, w19 ▒ │ │ mov x0, x20 ▒ 0.68 │ │ mov w5, #0x0 // #0 ▒ │ │ mov x4, #0x0 // #0 ▒ 1.71 │ │ mov w3, #0x1 // #1 ▒ │ │→ bl fsnotify ▒ 1.37 │ │↑ b 60 ▒ │d0:│ mov w0, #0x0 // #0 ▒ │ │ ldp x19, x20, [sp,#16] ▒ │ │ ldp x21, x22, [sp,#32] ▒ │ │ ldp x29, x30, [sp],#48 ▒ │ │← ret ▒ │e4:│ mov w19, #0x10000 // #65536 ▒ │ └──b 90 ◆ │ec: brk #0x800 ▒ Press 'h' for help on key bindings Signed-off-by: Kim Phillips <kim.phillips@arm.com> Signed-off-by: Chris Ryder <chris.ryder@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Will Deacon <will.deacon@arm.com> Link: http://lkml.kernel.org/r/20161130092344.012e18e3e623bea395162f95@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-30 23:23:44 +08:00
{
.name = "arm64",
.init = arm64__annotate_init,
},
{
.name = "x86",
.instructions = x86__instructions,
.nr_instructions = ARRAY_SIZE(x86__instructions),
perf annotate: Check for fused instructions Macro fusion merges two instructions to a single micro-op. Intel core platform performs this hardware optimization under limited circumstances. For example, CMP + JCC can be "fused" and executed /retired together. While with sampling this can result in the sample sometimes being on the JCC and sometimes on the CMP. So for the fused instruction pair, they could be considered together. On Nehalem, fused instruction pairs: cmp/test + jcc. On other new CPU: cmp/test/add/sub/and/inc/dec + jcc. This patch adds an x86-specific function which checks if 2 instructions are in a "fused" pair. For non-x86 arch, the function is just NULL. Changelog: v4: Move the CPU model checking to symbol__disassemble and save the CPU family/model in arch structure. It avoids checking every time when jump arrow printed. v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC). v2: Remove the original weak function. Arnaldo points out that doing it as a weak function that will be overridden by the host arch doesn't work. So now it's implemented as an arch-specific function. Committer fix: Do not access evsel->evlist->env->cpuid, ->env can be null, introduce perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in this function call. The original patch was segfaulting 'perf top' + annotation. But this essentially disables this fused instructions augmentation in 'perf top', the right thing is to get the cpuid from the running kernel, left for a later patch tho. Signed-off-by: Yao Jin <yao.jin@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
.ins_is_fused = x86__ins_is_fused,
.cpuid_parse = x86__cpuid_parse,
.objdump = {
.comment_char = '#',
},
},
perf annotate: Initial PowerPC support Support the PowerPC architecture using the ins_ops association method. Committer notes: Testing it with a perf.data file collected on a PowerPC machine and cross-annotated on a x86_64 workstation, using the associated vmlinux file: $ perf report -i perf.data.f22vm.powerdev --vmlinux vmlinux.powerpc .ktime_get vmlinux.powerpc │ clrldi r9,r28,63 8.57 │ ┌──bne e0 <- TUI cursor positioned here │54:│ lwsync 2.86 │ │ std r2,40(r1) │ │ ld r9,144(r31) │ │ ld r3,136(r31) │ │ ld r30,184(r31) │ │ ld r10,0(r9) │ │ mtctr r10 │ │ ld r2,8(r9) 8.57 │ │→ bctrl │ │ ld r2,40(r1) │ │ ld r10,160(r31) │ │ ld r5,152(r31) │ │ lwz r7,168(r31) │ │ ld r9,176(r31) 8.57 │ │ lwz r6,172(r31) │ │ lwsync 2.86 │ │ lwz r8,128(r31) │ │ cmpw cr7,r8,r28 2.86 │ │↑ bne 48 │ │ subf r10,r10,r3 │ │ mr r3,r29 │ │ and r10,r10,r5 2.86 │ │ mulld r10,r10,r7 │ │ add r9,r10,r9 │ │ srd r9,r9,r6 │ │ add r9,r9,r30 │ │ std r9,0(r29) │ │ addi r1,r1,144 │ │ ld r0,16(r1) │ │ ld r28,-32(r1) │ │ ld r29,-24(r1) │ │ ld r30,-16(r1) │ │ mtlr r0 │ │ ld r31,-8(r1) │ │← blr 5.71 │e0:└─→mr r1,r1 11.43 │ mr r2,r2 11.43 │ lwz r28,128(r31) Press 'h' for help on key bindings $ perf report -i perf.data.f22vm.powerdev --header-only # ======== # captured on: Thu Nov 24 12:40:38 2016 # hostname : pdev-f22-qemu # os release : 4.4.10-200.fc22.ppc64 # perf version : 4.9.rc1.g6298ce # arch : ppc64 # nrcpus online : 48 # nrcpus avail : 48 # cpudesc : POWER7 (architected), altivec supported # cpuid : 74,513 # total memory : 4158976 kB # cmdline : /home/ravi/Workspace/linux/tools/perf/perf record -a # event : name = cycles:ppp, , size = 112, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|CPU|PERIOD, disabled = 1, inherit = 1, mmap = 1, comm = 1, freq = 1, task = 1, precise_ip = 3, sample_id_all = 1, exclude_guest = 1, mmap2 = 1, comm_exec = 1 # HEADER_CPU_TOPOLOGY info available, use -I to display # HEADER_NUMA_TOPOLOGY info available, use -I to display # pmu mappings: cpu = 4, software = 1, tracepoint = 2, breakpoint = 5 # missing features: HEADER_TRACING_DATA HEADER_BRANCH_STACK HEADER_GROUP_DESC HEADER_AUXTRACE HEADER_STAT HEADER_CACHE # ======== # $ Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Kim Phillips <kim.phillips@arm.com> Link: http://lkml.kernel.org/n/tip-tbjnp40ddoxxl474uvhwi6g4@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 00:03:46 +08:00
{
.name = "powerpc",
.init = powerpc__annotate_init,
},
{
.name = "s390",
.init = s390__annotate_init,
.objdump = {
.comment_char = '#',
},
},
};
static void ins__delete(struct ins_operands *ops)
{
2015-03-06 02:27:28 +08:00
if (ops == NULL)
return;
zfree(&ops->source.raw);
zfree(&ops->source.name);
zfree(&ops->target.raw);
zfree(&ops->target.name);
}
static int ins__raw_scnprintf(struct ins *ins, char *bf, size_t size,
struct ins_operands *ops)
{
return scnprintf(bf, size, "%-6.6s %s", ins->name, ops->raw);
}
int ins__scnprintf(struct ins *ins, char *bf, size_t size,
struct ins_operands *ops)
{
if (ins->ops->scnprintf)
return ins->ops->scnprintf(ins, bf, size, ops);
return ins__raw_scnprintf(ins, bf, size, ops);
}
perf annotate: Check for fused instructions Macro fusion merges two instructions to a single micro-op. Intel core platform performs this hardware optimization under limited circumstances. For example, CMP + JCC can be "fused" and executed /retired together. While with sampling this can result in the sample sometimes being on the JCC and sometimes on the CMP. So for the fused instruction pair, they could be considered together. On Nehalem, fused instruction pairs: cmp/test + jcc. On other new CPU: cmp/test/add/sub/and/inc/dec + jcc. This patch adds an x86-specific function which checks if 2 instructions are in a "fused" pair. For non-x86 arch, the function is just NULL. Changelog: v4: Move the CPU model checking to symbol__disassemble and save the CPU family/model in arch structure. It avoids checking every time when jump arrow printed. v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC). v2: Remove the original weak function. Arnaldo points out that doing it as a weak function that will be overridden by the host arch doesn't work. So now it's implemented as an arch-specific function. Committer fix: Do not access evsel->evlist->env->cpuid, ->env can be null, introduce perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in this function call. The original patch was segfaulting 'perf top' + annotation. But this essentially disables this fused instructions augmentation in 'perf top', the right thing is to get the cpuid from the running kernel, left for a later patch tho. Signed-off-by: Yao Jin <yao.jin@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2)
{
if (!arch || !arch->ins_is_fused)
return false;
return arch->ins_is_fused(arch, ins1, ins2);
}
static int call__parse(struct arch *arch, struct ins_operands *ops, struct map *map)
{
char *endptr, *tok, *name;
ops->target.addr = strtoull(ops->raw, &endptr, 16);
name = strchr(endptr, '<');
if (name == NULL)
goto indirect_call;
name++;
if (arch->objdump.skip_functions_char &&
strchr(name, arch->objdump.skip_functions_char))
return -1;
tok = strchr(name, '>');
if (tok == NULL)
return -1;
*tok = '\0';
ops->target.name = strdup(name);
*tok = '>';
return ops->target.name == NULL ? -1 : 0;
indirect_call:
perf annotate: Do not ignore call instruction with indirect target Do not ignore call instruction with indirect target when its already identified as a call. This is an extension of commit e8ea1561952b ("perf annotate: Use raw form for register indirect call instructions") to generalize annotation for all instructions with indirect calls. This is needed for certain powerpc call instructions that use address in a register (such as bctrl, btarl, ...). Apart from that, when kcore is used to disassemble function, all call instructions were ignored. This patch will fix it as a side effect by not ignoring them. For example, Before (with kcore): mov %r13,%rdi callq 0xffffffff811a7e70 ^ jmpq 64 mov %gs:0x7ef41a6e(%rip),%al After (with kcore): mov %r13,%rdi > callq 0xffffffff811a7e70 ^ jmpq 64 mov %gs:0x7ef41a6e(%rip),%al Suggested-by: Michael Ellerman <mpe@ellerman.id.au> [Suggested about 'bctrl' instruction] Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: Hemant Kumar <hemant@linux.vnet.ibm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Link: http://lkml.kernel.org/r/1471611578-11255-5-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-08-19 20:59:35 +08:00
tok = strchr(endptr, '*');
if (tok == NULL) {
perf annotate: Resolve 'call' operands to function names Before this patch the '_raw_spin_lock_irqsave' and 'update_rq_clock' operands were appearing just as hexadecimal numbers: update_blocked_averages /proc/kcore │ push %r12 │ push %rbx │ and $0xfffffffffffffff0,%rsp │ sub $0x40,%rsp │ add -0x662cac00(,%rdi,8),%rax │ mov %rax,%rbx │ mov %rax,%rdi │ mov %rax,0x38(%rsp) │ → callq _raw_spin_lock_irqsave │ mov %rbx,%rdi │ mov %rax,0x30(%rsp) │ → callq update_rq_clock │ mov 0x8d0(%rbx),%rax │ lea 0x8d0(%rbx),%r11 To check that all is right one can always use the 'o' hotkey and see the original objdump -dS output, that for this case is: update_blocked_averages /proc/kcore │ffffffff990d5489: push %r12 │ffffffff990d548b: push %rbx │ffffffff990d548c: and $0xfffffffffffffff0,%rsp │ffffffff990d5490: sub $0x40,%rsp │ffffffff990d5494: add -0x662cac00(,%rdi,8),%rax │ffffffff990d549c: mov %rax,%rbx │ffffffff990d549f: mov %rax,%rdi │ffffffff990d54a2: mov %rax,0x38(%rsp) │ffffffff990d54a7: → callq 0xffffffff997eb7a0 │ffffffff990d54ac: mov %rbx,%rdi │ffffffff990d54af: mov %rax,0x30(%rsp) │ffffffff990d54b4: → callq 0xffffffff990c7720 │ffffffff990d54b9: mov 0x8d0(%rbx),%rax │ffffffff990d54c0: lea 0x8d0(%rbx),%r11 Use the 'h' hotkey to see a list of available hotkeys. More work needed to cover operands for other instructions, such as 'mov', that can resolve variable names, etc. Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Hemant Kumar <hemant@linux.vnet.ibm.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-xqgtw9mzmzcjgwkis9kiiv1p@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-09-20 04:26:11 +08:00
struct symbol *sym = map__find_symbol(map, map->map_ip(map, ops->target.addr));
if (sym != NULL)
ops->target.name = strdup(sym->name);
else
ops->target.addr = 0;
return 0;
}
ops->target.addr = strtoull(tok + 1, NULL, 16);
return 0;
}
static int call__scnprintf(struct ins *ins, char *bf, size_t size,
struct ins_operands *ops)
{
if (ops->target.name)
return scnprintf(bf, size, "%-6.6s %s", ins->name, ops->target.name);
if (ops->target.addr == 0)
return ins__raw_scnprintf(ins, bf, size, ops);
return scnprintf(bf, size, "%-6.6s *%" PRIx64, ins->name, ops->target.addr);
}
static struct ins_ops call_ops = {
.parse = call__parse,
.scnprintf = call__scnprintf,
};
bool ins__is_call(const struct ins *ins)
{
return ins->ops == &call_ops;
}
static int jump__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map *map __maybe_unused)
{
const char *s = strchr(ops->raw, '+');
const char *c = strchr(ops->raw, ',');
perf annotate: Fix branch instruction with multiple operands 'perf annotate' is dropping the cr* fields from branch instructions. Fix it by adding support to display branch instructions having multiple operands. Power Arch objdump of int_sqrt: 20.36 | c0000000004d2694: subf r10,r10,r3 | c0000000004d2698: v bgt cr6,c0000000004d26a0 <int_sqrt+0x40> 1.82 | c0000000004d269c: mr r3,r10 29.18 | c0000000004d26a0: mr r10,r8 | c0000000004d26a4: v bgt cr7,c0000000004d26ac <int_sqrt+0x4c> | c0000000004d26a8: mr r10,r7 Power Arch Before Patch: 20.36 | subf r10,r10,r3 | v bgt 40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt 4c | mr r10,r7 Power Arch After patch: 20.36 | subf r10,r10,r3 | v bgt cr6,40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt cr7,4c | mr r10,r7 Also support AArch64 conditional branch instructions, which can have up to three operands: Aarch64 Non-simplified (raw objdump) view: │ffff0000083cd11c: ↑ cbz w0, ffff0000083cd100 <security_fil▒ ... 4.44 │ffff000│083cd134: ↓ tbnz w0, #26, ffff0000083cd190 <securit▒ ... 1.37 │ffff000│083cd144: ↓ tbnz w22, #5, ffff0000083cd1a4 <securit▒ │ffff000│083cd148: mov w19, #0x20000 //▒ 1.02 │ffff000│083cd14c: ↓ tbz w22, #2, ffff0000083cd1ac <securit▒ ... 0.68 │ffff000└──3cd16c: ↑ cbnz w0, ffff0000083cd120 <security_fil▒ Aarch64 Simplified, before this patch: │ ↑ cbz 40 ... 4.44 │ │↓ tbnz w0, #26, ffff0000083cd190 <security_file_permiss▒ ... 1.37 │ │↓ tbnz w22, #5, ffff0000083cd1a4 <security_file_permiss▒ │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ffff0000083cd1ac <security_file_permiss▒ ... 0.68 │ └──cbnz 60 the cbz operand is missing, and the tbz doesn't get simplified processing at all because the parsing function failed to match an address. Aarch64 Simplified, After this patch applied: │ ↑ cbz w0, 40 ... 4.44 │ │↓ tbnz w0, #26, d0 ... 1.37 │ │↓ tbnz w22, #5, e4 │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ec ... 0.68 │ └──cbnz w0, 60 Originally-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Tested-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Reported-by: Anton Blanchard <anton@samba.org> Reported-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Kim Phillips <kim.phillips@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Link: http://lkml.kernel.org/r/20170601092959.f60d98912e8a1b66fd1e4c0e@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-06-01 22:29:59 +08:00
/*
* skip over possible up to 2 operands to get to address, e.g.:
* tbnz w0, #26, ffff0000083cd190 <security_file_permission+0xd0>
*/
if (c++ != NULL) {
ops->target.addr = strtoull(c, NULL, 16);
perf annotate: Fix branch instruction with multiple operands 'perf annotate' is dropping the cr* fields from branch instructions. Fix it by adding support to display branch instructions having multiple operands. Power Arch objdump of int_sqrt: 20.36 | c0000000004d2694: subf r10,r10,r3 | c0000000004d2698: v bgt cr6,c0000000004d26a0 <int_sqrt+0x40> 1.82 | c0000000004d269c: mr r3,r10 29.18 | c0000000004d26a0: mr r10,r8 | c0000000004d26a4: v bgt cr7,c0000000004d26ac <int_sqrt+0x4c> | c0000000004d26a8: mr r10,r7 Power Arch Before Patch: 20.36 | subf r10,r10,r3 | v bgt 40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt 4c | mr r10,r7 Power Arch After patch: 20.36 | subf r10,r10,r3 | v bgt cr6,40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt cr7,4c | mr r10,r7 Also support AArch64 conditional branch instructions, which can have up to three operands: Aarch64 Non-simplified (raw objdump) view: │ffff0000083cd11c: ↑ cbz w0, ffff0000083cd100 <security_fil▒ ... 4.44 │ffff000│083cd134: ↓ tbnz w0, #26, ffff0000083cd190 <securit▒ ... 1.37 │ffff000│083cd144: ↓ tbnz w22, #5, ffff0000083cd1a4 <securit▒ │ffff000│083cd148: mov w19, #0x20000 //▒ 1.02 │ffff000│083cd14c: ↓ tbz w22, #2, ffff0000083cd1ac <securit▒ ... 0.68 │ffff000└──3cd16c: ↑ cbnz w0, ffff0000083cd120 <security_fil▒ Aarch64 Simplified, before this patch: │ ↑ cbz 40 ... 4.44 │ │↓ tbnz w0, #26, ffff0000083cd190 <security_file_permiss▒ ... 1.37 │ │↓ tbnz w22, #5, ffff0000083cd1a4 <security_file_permiss▒ │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ffff0000083cd1ac <security_file_permiss▒ ... 0.68 │ └──cbnz 60 the cbz operand is missing, and the tbz doesn't get simplified processing at all because the parsing function failed to match an address. Aarch64 Simplified, After this patch applied: │ ↑ cbz w0, 40 ... 4.44 │ │↓ tbnz w0, #26, d0 ... 1.37 │ │↓ tbnz w22, #5, e4 │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ec ... 0.68 │ └──cbnz w0, 60 Originally-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Tested-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Reported-by: Anton Blanchard <anton@samba.org> Reported-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Kim Phillips <kim.phillips@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Link: http://lkml.kernel.org/r/20170601092959.f60d98912e8a1b66fd1e4c0e@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-06-01 22:29:59 +08:00
if (!ops->target.addr) {
c = strchr(c, ',');
if (c++ != NULL)
ops->target.addr = strtoull(c, NULL, 16);
}
} else {
ops->target.addr = strtoull(ops->raw, NULL, 16);
perf annotate: Fix branch instruction with multiple operands 'perf annotate' is dropping the cr* fields from branch instructions. Fix it by adding support to display branch instructions having multiple operands. Power Arch objdump of int_sqrt: 20.36 | c0000000004d2694: subf r10,r10,r3 | c0000000004d2698: v bgt cr6,c0000000004d26a0 <int_sqrt+0x40> 1.82 | c0000000004d269c: mr r3,r10 29.18 | c0000000004d26a0: mr r10,r8 | c0000000004d26a4: v bgt cr7,c0000000004d26ac <int_sqrt+0x4c> | c0000000004d26a8: mr r10,r7 Power Arch Before Patch: 20.36 | subf r10,r10,r3 | v bgt 40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt 4c | mr r10,r7 Power Arch After patch: 20.36 | subf r10,r10,r3 | v bgt cr6,40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt cr7,4c | mr r10,r7 Also support AArch64 conditional branch instructions, which can have up to three operands: Aarch64 Non-simplified (raw objdump) view: │ffff0000083cd11c: ↑ cbz w0, ffff0000083cd100 <security_fil▒ ... 4.44 │ffff000│083cd134: ↓ tbnz w0, #26, ffff0000083cd190 <securit▒ ... 1.37 │ffff000│083cd144: ↓ tbnz w22, #5, ffff0000083cd1a4 <securit▒ │ffff000│083cd148: mov w19, #0x20000 //▒ 1.02 │ffff000│083cd14c: ↓ tbz w22, #2, ffff0000083cd1ac <securit▒ ... 0.68 │ffff000└──3cd16c: ↑ cbnz w0, ffff0000083cd120 <security_fil▒ Aarch64 Simplified, before this patch: │ ↑ cbz 40 ... 4.44 │ │↓ tbnz w0, #26, ffff0000083cd190 <security_file_permiss▒ ... 1.37 │ │↓ tbnz w22, #5, ffff0000083cd1a4 <security_file_permiss▒ │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ffff0000083cd1ac <security_file_permiss▒ ... 0.68 │ └──cbnz 60 the cbz operand is missing, and the tbz doesn't get simplified processing at all because the parsing function failed to match an address. Aarch64 Simplified, After this patch applied: │ ↑ cbz w0, 40 ... 4.44 │ │↓ tbnz w0, #26, d0 ... 1.37 │ │↓ tbnz w22, #5, e4 │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ec ... 0.68 │ └──cbnz w0, 60 Originally-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Tested-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Reported-by: Anton Blanchard <anton@samba.org> Reported-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Kim Phillips <kim.phillips@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Link: http://lkml.kernel.org/r/20170601092959.f60d98912e8a1b66fd1e4c0e@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-06-01 22:29:59 +08:00
}
perf annotate: Fix jump target outside of function address range If jump target is outside of function range, perf is not handling it correctly. Especially when target address is lesser than function start address, target offset will be negative. But, target address declared to be unsigned, converts negative number into 2's complement. See below example. Here target of 'jumpq' instruction at 34cf8 is 34ac0 which is lesser than function start address(34cf0). 34ac0 - 34cf0 = -0x230 = 0xfffffffffffffdd0 Objdump output: 0000000000034cf0 <__sigaction>: __GI___sigaction(): 34cf0: lea -0x20(%rdi),%eax 34cf3: cmp -bashx1,%eax 34cf6: jbe 34d00 <__sigaction+0x10> 34cf8: jmpq 34ac0 <__GI___libc_sigaction> 34cfd: nopl (%rax) 34d00: mov 0x386161(%rip),%rax # 3bae68 <_DYNAMIC+0x2e8> 34d07: movl -bashx16,%fs:(%rax) 34d0e: mov -bashxffffffff,%eax 34d13: retq perf annotate before applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 v jmpq fffffffffffffdd0 nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq perf annotate after applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 ^ jmpq 34ac0 <__GI___libc_sigaction> nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1480953407-7605-3-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-05 23:56:47 +08:00
if (s++ != NULL) {
ops->target.offset = strtoull(s, NULL, 16);
perf annotate: Fix jump target outside of function address range If jump target is outside of function range, perf is not handling it correctly. Especially when target address is lesser than function start address, target offset will be negative. But, target address declared to be unsigned, converts negative number into 2's complement. See below example. Here target of 'jumpq' instruction at 34cf8 is 34ac0 which is lesser than function start address(34cf0). 34ac0 - 34cf0 = -0x230 = 0xfffffffffffffdd0 Objdump output: 0000000000034cf0 <__sigaction>: __GI___sigaction(): 34cf0: lea -0x20(%rdi),%eax 34cf3: cmp -bashx1,%eax 34cf6: jbe 34d00 <__sigaction+0x10> 34cf8: jmpq 34ac0 <__GI___libc_sigaction> 34cfd: nopl (%rax) 34d00: mov 0x386161(%rip),%rax # 3bae68 <_DYNAMIC+0x2e8> 34d07: movl -bashx16,%fs:(%rax) 34d0e: mov -bashxffffffff,%eax 34d13: retq perf annotate before applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 v jmpq fffffffffffffdd0 nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq perf annotate after applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 ^ jmpq 34ac0 <__GI___libc_sigaction> nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1480953407-7605-3-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-05 23:56:47 +08:00
ops->target.offset_avail = true;
} else {
ops->target.offset_avail = false;
}
return 0;
}
static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
struct ins_operands *ops)
{
perf annotate: Fix branch instruction with multiple operands 'perf annotate' is dropping the cr* fields from branch instructions. Fix it by adding support to display branch instructions having multiple operands. Power Arch objdump of int_sqrt: 20.36 | c0000000004d2694: subf r10,r10,r3 | c0000000004d2698: v bgt cr6,c0000000004d26a0 <int_sqrt+0x40> 1.82 | c0000000004d269c: mr r3,r10 29.18 | c0000000004d26a0: mr r10,r8 | c0000000004d26a4: v bgt cr7,c0000000004d26ac <int_sqrt+0x4c> | c0000000004d26a8: mr r10,r7 Power Arch Before Patch: 20.36 | subf r10,r10,r3 | v bgt 40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt 4c | mr r10,r7 Power Arch After patch: 20.36 | subf r10,r10,r3 | v bgt cr6,40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt cr7,4c | mr r10,r7 Also support AArch64 conditional branch instructions, which can have up to three operands: Aarch64 Non-simplified (raw objdump) view: │ffff0000083cd11c: ↑ cbz w0, ffff0000083cd100 <security_fil▒ ... 4.44 │ffff000│083cd134: ↓ tbnz w0, #26, ffff0000083cd190 <securit▒ ... 1.37 │ffff000│083cd144: ↓ tbnz w22, #5, ffff0000083cd1a4 <securit▒ │ffff000│083cd148: mov w19, #0x20000 //▒ 1.02 │ffff000│083cd14c: ↓ tbz w22, #2, ffff0000083cd1ac <securit▒ ... 0.68 │ffff000└──3cd16c: ↑ cbnz w0, ffff0000083cd120 <security_fil▒ Aarch64 Simplified, before this patch: │ ↑ cbz 40 ... 4.44 │ │↓ tbnz w0, #26, ffff0000083cd190 <security_file_permiss▒ ... 1.37 │ │↓ tbnz w22, #5, ffff0000083cd1a4 <security_file_permiss▒ │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ffff0000083cd1ac <security_file_permiss▒ ... 0.68 │ └──cbnz 60 the cbz operand is missing, and the tbz doesn't get simplified processing at all because the parsing function failed to match an address. Aarch64 Simplified, After this patch applied: │ ↑ cbz w0, 40 ... 4.44 │ │↓ tbnz w0, #26, d0 ... 1.37 │ │↓ tbnz w22, #5, e4 │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ec ... 0.68 │ └──cbnz w0, 60 Originally-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Tested-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Reported-by: Anton Blanchard <anton@samba.org> Reported-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Kim Phillips <kim.phillips@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Link: http://lkml.kernel.org/r/20170601092959.f60d98912e8a1b66fd1e4c0e@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-06-01 22:29:59 +08:00
const char *c = strchr(ops->raw, ',');
perf annotate: Fix jump target outside of function address range If jump target is outside of function range, perf is not handling it correctly. Especially when target address is lesser than function start address, target offset will be negative. But, target address declared to be unsigned, converts negative number into 2's complement. See below example. Here target of 'jumpq' instruction at 34cf8 is 34ac0 which is lesser than function start address(34cf0). 34ac0 - 34cf0 = -0x230 = 0xfffffffffffffdd0 Objdump output: 0000000000034cf0 <__sigaction>: __GI___sigaction(): 34cf0: lea -0x20(%rdi),%eax 34cf3: cmp -bashx1,%eax 34cf6: jbe 34d00 <__sigaction+0x10> 34cf8: jmpq 34ac0 <__GI___libc_sigaction> 34cfd: nopl (%rax) 34d00: mov 0x386161(%rip),%rax # 3bae68 <_DYNAMIC+0x2e8> 34d07: movl -bashx16,%fs:(%rax) 34d0e: mov -bashxffffffff,%eax 34d13: retq perf annotate before applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 v jmpq fffffffffffffdd0 nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq perf annotate after applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 ^ jmpq 34ac0 <__GI___libc_sigaction> nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1480953407-7605-3-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-05 23:56:47 +08:00
if (!ops->target.addr || ops->target.offset < 0)
perf annotate: Show raw form for jump instruction with indirect target For jump instructions that does not include target address as direct operand, show the original disassembled line for them. This is needed for certain powerpc jump instructions that use target address in a register (such as bctr, btar, ...). Before: ld r12,32088(r12) mtctr r12 v bctr ffffffffffffca2c std r2,24(r1) addis r12,r2,-1 After: ld r12,32088(r12) mtctr r12 v bctr std r2,24(r1) addis r12,r2,-1 Committer notes: Testing it using a perf.data file and vmlinux for powerpc64, cross-annotating it on a x86_64 workstation: Before: .__bpf_prog_run vmlinux.powerpc │ std r10,512(r9) ▒ │ lbz r9,0(r31) ▒ │ rldicr r9,r9,3,60 ▒ │ ldx r9,r30,r9 ▒ │ mtctr r9 ▒ 100.00 │ ↓ bctr 3fffffffffe01510 ▒ │ lwa r10,4(r31) ▒ │ lwz r9,0(r31) ▒ <SNIP> Invalid jump offset: 3fffffffffe01510 After: .__bpf_prog_run vmlinux.powerpc │ std r10,512(r9) ▒ │ lbz r9,0(r31) ▒ │ rldicr r9,r9,3,60 ▒ │ ldx r9,r30,r9 ▒ │ mtctr r9 ▒ 100.00 │ ↓ bctr ▒ │ lwa r10,4(r31) ▒ │ lwz r9,0(r31) ▒ <SNIP> Invalid jump offset: 3fffffffffe01510 This, in turn, uncovers another problem with jumps without operands, the ENTER/-> operation, to jump to the target, still continues using the bogus target :-) BTW, this was the file used for the above tests: [acme@jouet ravi_bangoria]$ perf report --header-only -i perf.data.f22vm.powerdev # ======== # captured on: Thu Nov 24 12:40:38 2016 # hostname : pdev-f22-qemu # os release : 4.4.10-200.fc22.ppc64 # perf version : 4.9.rc1.g6298ce # arch : ppc64 # nrcpus online : 48 # nrcpus avail : 48 # cpudesc : POWER7 (architected), altivec supported # cpuid : 74,513 # total memory : 4158976 kB # cmdline : /home/ravi/Workspace/linux/tools/perf/perf record -a # event : name = cycles:ppp, , size = 112, { sample_period, sample_freq } = 4000, sample_type = IP|TID|TIME|CPU|PERIOD, disabled = 1, inherit = 1, mmap = 1, c # HEADER_CPU_TOPOLOGY info available, use -I to display # HEADER_NUMA_TOPOLOGY info available, use -I to display # pmu mappings: cpu = 4, software = 1, tracepoint = 2, breakpoint = 5 # missing features: HEADER_TRACING_DATA HEADER_BRANCH_STACK HEADER_GROUP_DESC HEADER_AUXTRACE HEADER_STAT HEADER_CACHE # ======== # [acme@jouet ravi_bangoria]$ Suggested-by: Michael Ellerman <mpe@ellerman.id.au> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1480953407-7605-1-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-05 23:56:45 +08:00
return ins__raw_scnprintf(ins, bf, size, ops);
perf annotate: Fix branch instruction with multiple operands 'perf annotate' is dropping the cr* fields from branch instructions. Fix it by adding support to display branch instructions having multiple operands. Power Arch objdump of int_sqrt: 20.36 | c0000000004d2694: subf r10,r10,r3 | c0000000004d2698: v bgt cr6,c0000000004d26a0 <int_sqrt+0x40> 1.82 | c0000000004d269c: mr r3,r10 29.18 | c0000000004d26a0: mr r10,r8 | c0000000004d26a4: v bgt cr7,c0000000004d26ac <int_sqrt+0x4c> | c0000000004d26a8: mr r10,r7 Power Arch Before Patch: 20.36 | subf r10,r10,r3 | v bgt 40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt 4c | mr r10,r7 Power Arch After patch: 20.36 | subf r10,r10,r3 | v bgt cr6,40 1.82 | mr r3,r10 29.18 | 40: mr r10,r8 | v bgt cr7,4c | mr r10,r7 Also support AArch64 conditional branch instructions, which can have up to three operands: Aarch64 Non-simplified (raw objdump) view: │ffff0000083cd11c: ↑ cbz w0, ffff0000083cd100 <security_fil▒ ... 4.44 │ffff000│083cd134: ↓ tbnz w0, #26, ffff0000083cd190 <securit▒ ... 1.37 │ffff000│083cd144: ↓ tbnz w22, #5, ffff0000083cd1a4 <securit▒ │ffff000│083cd148: mov w19, #0x20000 //▒ 1.02 │ffff000│083cd14c: ↓ tbz w22, #2, ffff0000083cd1ac <securit▒ ... 0.68 │ffff000└──3cd16c: ↑ cbnz w0, ffff0000083cd120 <security_fil▒ Aarch64 Simplified, before this patch: │ ↑ cbz 40 ... 4.44 │ │↓ tbnz w0, #26, ffff0000083cd190 <security_file_permiss▒ ... 1.37 │ │↓ tbnz w22, #5, ffff0000083cd1a4 <security_file_permiss▒ │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ffff0000083cd1ac <security_file_permiss▒ ... 0.68 │ └──cbnz 60 the cbz operand is missing, and the tbz doesn't get simplified processing at all because the parsing function failed to match an address. Aarch64 Simplified, After this patch applied: │ ↑ cbz w0, 40 ... 4.44 │ │↓ tbnz w0, #26, d0 ... 1.37 │ │↓ tbnz w22, #5, e4 │ │ mov w19, #0x20000 // #131072 1.02 │ │↓ tbz w22, #2, ec ... 0.68 │ └──cbnz w0, 60 Originally-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Tested-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Reported-by: Anton Blanchard <anton@samba.org> Reported-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Kim Phillips <kim.phillips@arm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Link: http://lkml.kernel.org/r/20170601092959.f60d98912e8a1b66fd1e4c0e@arm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-06-01 22:29:59 +08:00
if (c != NULL) {
const char *c2 = strchr(c + 1, ',');
/* check for 3-op insn */
if (c2 != NULL)
c = c2;
c++;
/* mirror arch objdump's space-after-comma style */
if (*c == ' ')
c++;
}
return scnprintf(bf, size, "%-6.6s %.*s%" PRIx64,
ins->name, c ? c - ops->raw : 0, ops->raw,
ops->target.offset);
}
static struct ins_ops jump_ops = {
.parse = jump__parse,
.scnprintf = jump__scnprintf,
};
bool ins__is_jump(const struct ins *ins)
{
return ins->ops == &jump_ops;
}
static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep)
{
char *endptr, *name, *t;
if (strstr(raw, "(%rip)") == NULL)
return 0;
*addrp = strtoull(comment, &endptr, 16);
name = strchr(endptr, '<');
if (name == NULL)
return -1;
name++;
t = strchr(name, '>');
if (t == NULL)
return 0;
*t = '\0';
*namep = strdup(name);
*t = '>';
return 0;
}
static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map *map)
{
ops->locked.ops = zalloc(sizeof(*ops->locked.ops));
if (ops->locked.ops == NULL)
return 0;
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (disasm_line__parse(ops->raw, &ops->locked.ins.name, &ops->locked.ops->raw) < 0)
goto out_free_ops;
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
ops->locked.ins.ops = ins__find(arch, ops->locked.ins.name);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (ops->locked.ins.ops == NULL)
goto out_free_ops;
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (ops->locked.ins.ops->parse &&
ops->locked.ins.ops->parse(arch, ops->locked.ops, map) < 0)
goto out_free_ops;
return 0;
out_free_ops:
zfree(&ops->locked.ops);
return 0;
}
static int lock__scnprintf(struct ins *ins, char *bf, size_t size,
struct ins_operands *ops)
{
int printed;
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (ops->locked.ins.ops == NULL)
return ins__raw_scnprintf(ins, bf, size, ops);
printed = scnprintf(bf, size, "%-6.6s ", ins->name);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
return printed + ins__scnprintf(&ops->locked.ins, bf + printed,
size - printed, ops->locked.ops);
}
static void lock__delete(struct ins_operands *ops)
{
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
struct ins *ins = &ops->locked.ins;
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (ins->ops && ins->ops->free)
ins->ops->free(ops->locked.ops);
else
ins__delete(ops->locked.ops);
zfree(&ops->locked.ops);
zfree(&ops->target.raw);
zfree(&ops->target.name);
}
static struct ins_ops lock_ops = {
.free = lock__delete,
.parse = lock__parse,
.scnprintf = lock__scnprintf,
};
static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map *map __maybe_unused)
{
char *s = strchr(ops->raw, ','), *target, *comment, prev;
if (s == NULL)
return -1;
*s = '\0';
ops->source.raw = strdup(ops->raw);
*s = ',';
if (ops->source.raw == NULL)
return -1;
target = ++s;
comment = strchr(s, arch->objdump.comment_char);
if (comment != NULL)
s = comment - 1;
else
s = strchr(s, '\0') - 1;
while (s > target && isspace(s[0]))
--s;
s++;
prev = *s;
*s = '\0';
ops->target.raw = strdup(target);
*s = prev;
if (ops->target.raw == NULL)
goto out_free_source;
if (comment == NULL)
return 0;
comment = ltrim(comment);
comment__symbol(ops->source.raw, comment, &ops->source.addr, &ops->source.name);
comment__symbol(ops->target.raw, comment, &ops->target.addr, &ops->target.name);
return 0;
out_free_source:
zfree(&ops->source.raw);
return -1;
}
static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
struct ins_operands *ops)
{
return scnprintf(bf, size, "%-6.6s %s,%s", ins->name,
ops->source.name ?: ops->source.raw,
ops->target.name ?: ops->target.raw);
}
static struct ins_ops mov_ops = {
.parse = mov__parse,
.scnprintf = mov__scnprintf,
};
static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map *map __maybe_unused)
{
char *target, *comment, *s, prev;
target = s = ops->raw;
while (s[0] != '\0' && !isspace(s[0]))
++s;
prev = *s;
*s = '\0';
ops->target.raw = strdup(target);
*s = prev;
if (ops->target.raw == NULL)
return -1;
comment = strchr(s, arch->objdump.comment_char);
if (comment == NULL)
return 0;
comment = ltrim(comment);
comment__symbol(ops->target.raw, comment, &ops->target.addr, &ops->target.name);
return 0;
}
static int dec__scnprintf(struct ins *ins, char *bf, size_t size,
struct ins_operands *ops)
{
return scnprintf(bf, size, "%-6.6s %s", ins->name,
ops->target.name ?: ops->target.raw);
}
static struct ins_ops dec_ops = {
.parse = dec__parse,
.scnprintf = dec__scnprintf,
};
perf tools: Use __maybe_used for unused variables perf defines both __used and __unused variables to use for marking unused variables. The variable __used is defined to __attribute__((__unused__)), which contradicts the kernel definition to __attribute__((__used__)) for new gcc versions. On Android, __used is also defined in system headers and this leads to warnings like: warning: '__used__' attribute ignored __unused is not defined in the kernel and is not a standard definition. If __unused is included everywhere instead of __used, this leads to conflicts with glibc headers, since glibc has a variables with this name in its headers. The best approach is to use __maybe_unused, the definition used in the kernel for __attribute__((unused)). In this way there is only one definition in perf sources (instead of 2 definitions that point to the same thing: __used and __unused) and it works on both Linux and Android. This patch simply replaces all instances of __used and __unused with __maybe_unused. Signed-off-by: Irina Tirdea <irina.tirdea@intel.com> Acked-by: Pekka Enberg <penberg@kernel.org> Cc: David Ahern <dsahern@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/1347315303-29906-7-git-send-email-irina.tirdea@intel.com [ committer note: fixed up conflict with a116e05 in builtin-sched.c ] Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2012-09-11 06:15:03 +08:00
static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size,
struct ins_operands *ops __maybe_unused)
{
return scnprintf(bf, size, "%-6.6s", "nop");
}
static struct ins_ops nop_ops = {
.scnprintf = nop__scnprintf,
};
static struct ins_ops ret_ops = {
.scnprintf = ins__raw_scnprintf,
};
bool ins__is_ret(const struct ins *ins)
{
return ins->ops == &ret_ops;
}
bool ins__is_lock(const struct ins *ins)
{
return ins->ops == &lock_ops;
}
static int ins__key_cmp(const void *name, const void *insp)
{
const struct ins *ins = insp;
return strcmp(name, ins->name);
}
static int ins__cmp(const void *a, const void *b)
{
const struct ins *ia = a;
const struct ins *ib = b;
return strcmp(ia->name, ib->name);
}
static void ins__sort(struct arch *arch)
{
const int nmemb = arch->nr_instructions;
qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp);
}
perf annotate: Introduce alternative method of keeping instructions table Some arches may want to dynamically populate the table using regular expressions on the instruction names to associate them with a set of parsing/formatting/etc functions (struct ins_ops), so provide a fallback for when the ins__find() method fails. That fall back will be able to resize the arch->instructions, setting arch->nr_instructions appropriately, helper functions to associate an ins_ops to an instruction name, growing the arch->instructions if needed and resorting it are provided, all the arch specific callback needs to do is to decide if the missing instruction should be added to arch->instructions with a ins_ops association. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-auu13yradxf7g5dgtpnzt97a@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:37:08 +08:00
static struct ins_ops *__ins__find(struct arch *arch, const char *name)
{
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
struct ins *ins;
const int nmemb = arch->nr_instructions;
if (!arch->sorted_instructions) {
ins__sort(arch);
arch->sorted_instructions = true;
}
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp);
return ins ? ins->ops : NULL;
}
perf annotate: Introduce alternative method of keeping instructions table Some arches may want to dynamically populate the table using regular expressions on the instruction names to associate them with a set of parsing/formatting/etc functions (struct ins_ops), so provide a fallback for when the ins__find() method fails. That fall back will be able to resize the arch->instructions, setting arch->nr_instructions appropriately, helper functions to associate an ins_ops to an instruction name, growing the arch->instructions if needed and resorting it are provided, all the arch specific callback needs to do is to decide if the missing instruction should be added to arch->instructions with a ins_ops association. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-auu13yradxf7g5dgtpnzt97a@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:37:08 +08:00
static struct ins_ops *ins__find(struct arch *arch, const char *name)
{
struct ins_ops *ops = __ins__find(arch, name);
if (!ops && arch->associate_instruction_ops)
ops = arch->associate_instruction_ops(arch, name);
return ops;
}
static int arch__key_cmp(const void *name, const void *archp)
{
const struct arch *arch = archp;
return strcmp(name, arch->name);
}
static int arch__cmp(const void *a, const void *b)
{
const struct arch *aa = a;
const struct arch *ab = b;
return strcmp(aa->name, ab->name);
}
static void arch__sort(void)
{
const int nmemb = ARRAY_SIZE(architectures);
qsort(architectures, nmemb, sizeof(struct arch), arch__cmp);
}
static struct arch *arch__find(const char *name)
{
const int nmemb = ARRAY_SIZE(architectures);
static bool sorted;
if (!sorted) {
arch__sort();
sorted = true;
}
return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp);
}
int symbol__alloc_hist(struct symbol *sym)
{
struct annotation *notes = symbol__annotation(sym);
const size_t size = symbol__size(sym);
size_t sizeof_sym_hist;
/* Check for overflow when calculating sizeof_sym_hist */
if (size > (SIZE_MAX - sizeof(struct sym_hist)) / sizeof(struct sym_hist_entry))
return -1;
sizeof_sym_hist = (sizeof(struct sym_hist) + size * sizeof(struct sym_hist_entry));
/* Check for overflow in zalloc argument */
if (sizeof_sym_hist > (SIZE_MAX - sizeof(*notes->src))
/ symbol_conf.nr_events)
return -1;
notes->src = zalloc(sizeof(*notes->src) + symbol_conf.nr_events * sizeof_sym_hist);
if (notes->src == NULL)
return -1;
notes->src->sizeof_sym_hist = sizeof_sym_hist;
notes->src->nr_histograms = symbol_conf.nr_events;
INIT_LIST_HEAD(&notes->src->source);
return 0;
}
/* The cycles histogram is lazily allocated. */
static int symbol__alloc_hist_cycles(struct symbol *sym)
{
struct annotation *notes = symbol__annotation(sym);
const size_t size = symbol__size(sym);
notes->src->cycles_hist = calloc(size, sizeof(struct cyc_hist));
if (notes->src->cycles_hist == NULL)
return -1;
return 0;
}
void symbol__annotate_zero_histograms(struct symbol *sym)
{
struct annotation *notes = symbol__annotation(sym);
pthread_mutex_lock(&notes->lock);
if (notes->src != NULL) {
memset(notes->src->histograms, 0,
notes->src->nr_histograms * notes->src->sizeof_sym_hist);
if (notes->src->cycles_hist)
memset(notes->src->cycles_hist, 0,
symbol__size(sym) * sizeof(struct cyc_hist));
}
pthread_mutex_unlock(&notes->lock);
}
static int __symbol__account_cycles(struct annotation *notes,
u64 start,
unsigned offset, unsigned cycles,
unsigned have_start)
{
struct cyc_hist *ch;
ch = notes->src->cycles_hist;
/*
* For now we can only account one basic block per
* final jump. But multiple could be overlapping.
* Always account the longest one. So when
* a shorter one has been already seen throw it away.
*
* We separately always account the full cycles.
*/
ch[offset].num_aggr++;
ch[offset].cycles_aggr += cycles;
if (!have_start && ch[offset].have_start)
return 0;
if (ch[offset].num) {
if (have_start && (!ch[offset].have_start ||
ch[offset].start > start)) {
ch[offset].have_start = 0;
ch[offset].cycles = 0;
ch[offset].num = 0;
if (ch[offset].reset < 0xffff)
ch[offset].reset++;
} else if (have_start &&
ch[offset].start < start)
return 0;
}
ch[offset].have_start = have_start;
ch[offset].start = start;
ch[offset].cycles += cycles;
ch[offset].num++;
return 0;
}
static int __symbol__inc_addr_samples(struct symbol *sym, struct map *map,
struct annotation *notes, int evidx, u64 addr,
struct perf_sample *sample)
{
unsigned offset;
struct sym_hist *h;
pr_debug3("%s: addr=%#" PRIx64 "\n", __func__, map->unmap_ip(map, addr));
if ((addr < sym->start || addr >= sym->end) &&
(addr != sym->end || sym->start != sym->end)) {
pr_debug("%s(%d): ERANGE! sym->name=%s, start=%#" PRIx64 ", addr=%#" PRIx64 ", end=%#" PRIx64 "\n",
__func__, __LINE__, sym->name, sym->start, addr, sym->end);
return -ERANGE;
}
offset = addr - sym->start;
h = annotation__histogram(notes, evidx);
h->nr_samples++;
h->addr[offset].nr_samples++;
h->period += sample->period;
h->addr[offset].period += sample->period;
pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
", evidx=%d] => nr_samples: %" PRIu64 ", period: %" PRIu64 "\n",
sym->start, sym->name, addr, addr - sym->start, evidx,
h->addr[offset].nr_samples, h->addr[offset].period);
return 0;
}
static struct annotation *symbol__get_annotation(struct symbol *sym, bool cycles)
{
struct annotation *notes = symbol__annotation(sym);
if (notes->src == NULL) {
if (symbol__alloc_hist(sym) < 0)
return NULL;
}
if (!notes->src->cycles_hist && cycles) {
if (symbol__alloc_hist_cycles(sym) < 0)
return NULL;
}
return notes;
}
static int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
int evidx, u64 addr,
struct perf_sample *sample)
{
struct annotation *notes;
if (sym == NULL)
return 0;
notes = symbol__get_annotation(sym, false);
if (notes == NULL)
return -ENOMEM;
return __symbol__inc_addr_samples(sym, map, notes, evidx, addr, sample);
}
static int symbol__account_cycles(u64 addr, u64 start,
struct symbol *sym, unsigned cycles)
{
struct annotation *notes;
unsigned offset;
if (sym == NULL)
return 0;
notes = symbol__get_annotation(sym, true);
if (notes == NULL)
return -ENOMEM;
if (addr < sym->start || addr >= sym->end)
return -ERANGE;
if (start) {
if (start < sym->start || start >= sym->end)
return -ERANGE;
if (start >= addr)
start = 0;
}
offset = addr - sym->start;
return __symbol__account_cycles(notes,
start ? start - sym->start : 0,
offset, cycles,
!!start);
}
int addr_map_symbol__account_cycles(struct addr_map_symbol *ams,
struct addr_map_symbol *start,
unsigned cycles)
{
u64 saddr = 0;
int err;
if (!cycles)
return 0;
/*
* Only set start when IPC can be computed. We can only
* compute it when the basic block is completely in a single
* function.
* Special case the case when the jump is elsewhere, but
* it starts on the function start.
*/
if (start &&
(start->sym == ams->sym ||
(ams->sym &&
start->addr == ams->sym->start + ams->map->start)))
saddr = start->al_addr;
if (saddr == 0)
pr_debug2("BB with bad start: addr %"PRIx64" start %"PRIx64" sym %"PRIx64" saddr %"PRIx64"\n",
ams->addr,
start ? start->addr : 0,
ams->sym ? ams->sym->start + ams->map->start : 0,
saddr);
err = symbol__account_cycles(ams->al_addr, saddr, ams->sym, cycles);
if (err)
pr_debug2("account_cycles failed %d\n", err);
return err;
}
int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample,
int evidx)
{
return symbol__inc_addr_samples(ams->sym, ams->map, evidx, ams->al_addr, sample);
}
int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample,
int evidx, u64 ip)
{
return symbol__inc_addr_samples(he->ms.sym, he->ms.map, evidx, ip, sample);
}
static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map *map)
{
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
dl->ins.ops = ins__find(arch, dl->ins.name);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (!dl->ins.ops)
return;
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (dl->ins.ops->parse && dl->ins.ops->parse(arch, &dl->ops, map) < 0)
dl->ins.ops = NULL;
}
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
static int disasm_line__parse(char *line, const char **namep, char **rawp)
{
char tmp, *name = ltrim(line);
if (name[0] == '\0')
return -1;
*rawp = name + 1;
while ((*rawp)[0] != '\0' && !isspace((*rawp)[0]))
++*rawp;
tmp = (*rawp)[0];
(*rawp)[0] = '\0';
*namep = strdup(name);
if (*namep == NULL)
goto out_free_name;
(*rawp)[0] = tmp;
*rawp = ltrim(*rawp);
return 0;
out_free_name:
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
free((void *)namep);
*namep = NULL;
return -1;
}
static struct disasm_line *disasm_line__new(s64 offset, char *line,
size_t privsize, int line_nr,
struct arch *arch,
struct map *map)
{
struct disasm_line *dl = zalloc(sizeof(*dl) + privsize);
if (dl != NULL) {
dl->offset = offset;
dl->line = strdup(line);
dl->line_nr = line_nr;
if (dl->line == NULL)
goto out_delete;
if (offset != -1) {
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (disasm_line__parse(dl->line, &dl->ins.name, &dl->ops.raw) < 0)
goto out_free_line;
disasm_line__init_ins(dl, arch, map);
}
}
return dl;
out_free_line:
zfree(&dl->line);
out_delete:
free(dl);
return NULL;
}
void disasm_line__free(struct disasm_line *dl)
{
zfree(&dl->line);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (dl->ins.ops && dl->ins.ops->free)
dl->ins.ops->free(&dl->ops);
else
ins__delete(&dl->ops);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
free((void *)dl->ins.name);
dl->ins.name = NULL;
free(dl);
}
int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw)
{
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (raw || !dl->ins.ops)
return scnprintf(bf, size, "%-6.6s %s", dl->ins.name, dl->ops.raw);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
return ins__scnprintf(&dl->ins, bf, size, &dl->ops);
}
static void disasm__add(struct list_head *head, struct disasm_line *line)
{
list_add_tail(&line->node, head);
}
struct disasm_line *disasm__get_next_ip_line(struct list_head *head, struct disasm_line *pos)
{
list_for_each_entry_continue(pos, head, node)
if (pos->offset >= 0)
return pos;
return NULL;
}
double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
s64 end, const char **path, struct sym_hist_entry *sample)
{
struct source_line *src_line = notes->src->lines;
double percent = 0.0;
sample->nr_samples = sample->period = 0;
if (src_line) {
size_t sizeof_src_line = sizeof(*src_line) +
sizeof(src_line->samples) * (src_line->nr_pcnt - 1);
while (offset < end) {
src_line = (void *)notes->src->lines +
(sizeof_src_line * offset);
if (*path == NULL)
*path = src_line->path;
percent += src_line->samples[evidx].percent;
sample->nr_samples += src_line->samples[evidx].nr;
offset++;
}
} else {
struct sym_hist *h = annotation__histogram(notes, evidx);
unsigned int hits = 0;
u64 period = 0;
while (offset < end) {
hits += h->addr[offset].nr_samples;
period += h->addr[offset].period;
++offset;
}
if (h->nr_samples) {
sample->period = period;
sample->nr_samples = hits;
percent = 100.0 * hits / h->nr_samples;
}
}
return percent;
}
perf annotate: Add branch stack / basic block I wanted to know the hottest path through a function and figured the branch-stack (LBR) information should be able to help out with that. The below uses the branch-stack to create basic blocks and generate statistics from them. from to branch_i * ----> * | | block v * ----> * from to branch_i+1 The blocks are broken down into non-overlapping ranges, while tracking if the start of each range is an entry point and/or the end of a range is a branch. Each block iterates all ranges it covers (while splitting where required to exactly match the block) and increments the 'coverage' count. For the range including the branch we increment the taken counter, as well as the pred counter if flags.predicted. Using these number we can find if an instruction: - had coverage; given by: br->coverage / br->sym->max_coverage This metric ensures each symbol has a 100% spot, which reflects the observation that each symbol must have a most covered/hottest block. - is a branch target: br->is_target && br->start == add - for targets, how much of a branch's coverages comes from it: target->entry / branch->coverage - is a branch: br->is_branch && br->end == addr - for branches, how often it was taken: br->taken / br->coverage after all, all execution that didn't take the branch would have incremented the coverage and continued onward to a later branch. - for branches, how often it was predicted: br->pred / br->taken The coverage percentage is used to color the address and asm sections; for low (<1%) coverage we use NORMAL (uncolored), indicating that these instructions are not 'important'. For high coverage (>75%) we color the address RED. For each branch, we add an asm comment after the instruction with information on how often it was taken and predicted. Output looks like (sans color, which does loose a lot of the information :/) $ perf record --branch-filter u,any -e cycles:p ./branches 27 $ perf annotate branches Percent | Source code & Disassembly of branches for cycles:pu (217 samples) --------------------------------------------------------------------------------- : branches(): 0.00 : 40057a: push %rbp 0.00 : 40057b: mov %rsp,%rbp 0.00 : 40057e: sub $0x20,%rsp 0.00 : 400582: mov %rdi,-0x18(%rbp) 0.00 : 400586: mov %rsi,-0x20(%rbp) 0.00 : 40058a: mov -0x18(%rbp),%rax 0.00 : 40058e: mov %rax,-0x10(%rbp) 0.00 : 400592: movq $0x0,-0x8(%rbp) 0.00 : 40059a: jmpq 400656 <branches+0xdc> 1.84 : 40059f: mov -0x10(%rbp),%rax # +100.00% 3.23 : 4005a3: and $0x1,%eax 1.84 : 4005a6: test %rax,%rax 0.00 : 4005a9: je 4005bf <branches+0x45> # -54.50% (p:42.00%) 0.46 : 4005ab: mov 0x200bbe(%rip),%rax # 601170 <acc> 12.90 : 4005b2: add $0x1,%rax 2.30 : 4005b6: mov %rax,0x200bb3(%rip) # 601170 <acc> 0.46 : 4005bd: jmp 4005d1 <branches+0x57> # -100.00% (p:100.00%) 0.92 : 4005bf: mov 0x200baa(%rip),%rax # 601170 <acc> # +49.54% 13.82 : 4005c6: sub $0x1,%rax 0.46 : 4005ca: mov %rax,0x200b9f(%rip) # 601170 <acc> 2.30 : 4005d1: mov -0x10(%rbp),%rax # +50.46% 0.46 : 4005d5: mov %rax,%rdi 0.46 : 4005d8: callq 400526 <lfsr> # -100.00% (p:100.00%) 0.00 : 4005dd: mov %rax,-0x10(%rbp) # +100.00% 0.92 : 4005e1: mov -0x18(%rbp),%rax 0.00 : 4005e5: and $0x1,%eax 0.00 : 4005e8: test %rax,%rax 0.00 : 4005eb: je 4005ff <branches+0x85> # -100.00% (p:100.00%) 0.00 : 4005ed: mov 0x200b7c(%rip),%rax # 601170 <acc> 0.00 : 4005f4: shr $0x2,%rax 0.00 : 4005f8: mov %rax,0x200b71(%rip) # 601170 <acc> 0.00 : 4005ff: mov -0x10(%rbp),%rax # +100.00% 7.37 : 400603: and $0x1,%eax 3.69 : 400606: test %rax,%rax 0.00 : 400609: jne 400612 <branches+0x98> # -59.25% (p:42.99%) 1.84 : 40060b: mov $0x1,%eax 14.29 : 400610: jmp 400617 <branches+0x9d> # -100.00% (p:100.00%) 1.38 : 400612: mov $0x0,%eax # +57.65% 10.14 : 400617: test %al,%al # +42.35% 0.00 : 400619: je 40062f <branches+0xb5> # -57.65% (p:100.00%) 0.46 : 40061b: mov 0x200b4e(%rip),%rax # 601170 <acc> 2.76 : 400622: sub $0x1,%rax 0.00 : 400626: mov %rax,0x200b43(%rip) # 601170 <acc> 0.46 : 40062d: jmp 400641 <branches+0xc7> # -100.00% (p:100.00%) 0.92 : 40062f: mov 0x200b3a(%rip),%rax # 601170 <acc> # +56.13% 2.30 : 400636: add $0x1,%rax 0.92 : 40063a: mov %rax,0x200b2f(%rip) # 601170 <acc> 0.92 : 400641: mov -0x10(%rbp),%rax # +43.87% 2.30 : 400645: mov %rax,%rdi 0.00 : 400648: callq 400526 <lfsr> # -100.00% (p:100.00%) 0.00 : 40064d: mov %rax,-0x10(%rbp) # +100.00% 1.84 : 400651: addq $0x1,-0x8(%rbp) 0.92 : 400656: mov -0x8(%rbp),%rax 5.07 : 40065a: cmp -0x20(%rbp),%rax 0.00 : 40065e: jb 40059f <branches+0x25> # -100.00% (p:100.00%) 0.00 : 400664: nop 0.00 : 400665: leaveq 0.00 : 400666: retq (Note: the --branch-filter u,any was used to avoid spurious target and branch points due to interrupts/faults, they show up as very small -/+ annotations on 'weird' locations) Committer note: Please take a look at: http://vger.kernel.org/~acme/perf/annotate_basic_blocks.png To see the colors. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: David Carrillo-Cisneros <davidcc@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Stephane Eranian <eranian@google.com> [ Moved sym->max_coverage to 'struct annotate', aka symbol__annotate(sym) ] Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-09-06 03:08:12 +08:00
static const char *annotate__address_color(struct block_range *br)
{
double cov = block_range__coverage(br);
if (cov >= 0) {
/* mark red for >75% coverage */
if (cov > 0.75)
return PERF_COLOR_RED;
/* mark dull for <1% coverage */
if (cov < 0.01)
return PERF_COLOR_NORMAL;
}
return PERF_COLOR_MAGENTA;
}
static const char *annotate__asm_color(struct block_range *br)
{
double cov = block_range__coverage(br);
if (cov >= 0) {
/* mark dull for <1% coverage */
if (cov < 0.01)
return PERF_COLOR_NORMAL;
}
return PERF_COLOR_BLUE;
}
static void annotate__branch_printf(struct block_range *br, u64 addr)
{
bool emit_comment = true;
if (!br)
return;
#if 1
if (br->is_target && br->start == addr) {
struct block_range *branch = br;
double p;
/*
* Find matching branch to our target.
*/
while (!branch->is_branch)
branch = block_range__next(branch);
p = 100 *(double)br->entry / branch->coverage;
if (p > 0.1) {
if (emit_comment) {
emit_comment = false;
printf("\t#");
}
/*
* The percentage of coverage joined at this target in relation
* to the next branch.
*/
printf(" +%.2f%%", p);
}
}
#endif
if (br->is_branch && br->end == addr) {
double p = 100*(double)br->taken / br->coverage;
if (p > 0.1) {
if (emit_comment) {
emit_comment = false;
printf("\t#");
}
/*
* The percentage of coverage leaving at this branch, and
* its prediction ratio.
*/
printf(" -%.2f%% (p:%.2f%%)", p, 100*(double)br->pred / br->taken);
}
}
}
static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 start,
struct perf_evsel *evsel, u64 len, int min_pcnt, int printed,
int max_lines, struct disasm_line *queue)
{
static const char *prev_line;
static const char *prev_color;
if (dl->offset != -1) {
const char *path = NULL;
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
double percent, max_percent = 0.0;
double *ppercents = &percent;
struct sym_hist_entry sample;
struct sym_hist_entry *psamples = &sample;
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
int i, nr_percent = 1;
const char *color;
struct annotation *notes = symbol__annotation(sym);
s64 offset = dl->offset;
const u64 addr = start + offset;
struct disasm_line *next;
perf annotate: Add branch stack / basic block I wanted to know the hottest path through a function and figured the branch-stack (LBR) information should be able to help out with that. The below uses the branch-stack to create basic blocks and generate statistics from them. from to branch_i * ----> * | | block v * ----> * from to branch_i+1 The blocks are broken down into non-overlapping ranges, while tracking if the start of each range is an entry point and/or the end of a range is a branch. Each block iterates all ranges it covers (while splitting where required to exactly match the block) and increments the 'coverage' count. For the range including the branch we increment the taken counter, as well as the pred counter if flags.predicted. Using these number we can find if an instruction: - had coverage; given by: br->coverage / br->sym->max_coverage This metric ensures each symbol has a 100% spot, which reflects the observation that each symbol must have a most covered/hottest block. - is a branch target: br->is_target && br->start == add - for targets, how much of a branch's coverages comes from it: target->entry / branch->coverage - is a branch: br->is_branch && br->end == addr - for branches, how often it was taken: br->taken / br->coverage after all, all execution that didn't take the branch would have incremented the coverage and continued onward to a later branch. - for branches, how often it was predicted: br->pred / br->taken The coverage percentage is used to color the address and asm sections; for low (<1%) coverage we use NORMAL (uncolored), indicating that these instructions are not 'important'. For high coverage (>75%) we color the address RED. For each branch, we add an asm comment after the instruction with information on how often it was taken and predicted. Output looks like (sans color, which does loose a lot of the information :/) $ perf record --branch-filter u,any -e cycles:p ./branches 27 $ perf annotate branches Percent | Source code & Disassembly of branches for cycles:pu (217 samples) --------------------------------------------------------------------------------- : branches(): 0.00 : 40057a: push %rbp 0.00 : 40057b: mov %rsp,%rbp 0.00 : 40057e: sub $0x20,%rsp 0.00 : 400582: mov %rdi,-0x18(%rbp) 0.00 : 400586: mov %rsi,-0x20(%rbp) 0.00 : 40058a: mov -0x18(%rbp),%rax 0.00 : 40058e: mov %rax,-0x10(%rbp) 0.00 : 400592: movq $0x0,-0x8(%rbp) 0.00 : 40059a: jmpq 400656 <branches+0xdc> 1.84 : 40059f: mov -0x10(%rbp),%rax # +100.00% 3.23 : 4005a3: and $0x1,%eax 1.84 : 4005a6: test %rax,%rax 0.00 : 4005a9: je 4005bf <branches+0x45> # -54.50% (p:42.00%) 0.46 : 4005ab: mov 0x200bbe(%rip),%rax # 601170 <acc> 12.90 : 4005b2: add $0x1,%rax 2.30 : 4005b6: mov %rax,0x200bb3(%rip) # 601170 <acc> 0.46 : 4005bd: jmp 4005d1 <branches+0x57> # -100.00% (p:100.00%) 0.92 : 4005bf: mov 0x200baa(%rip),%rax # 601170 <acc> # +49.54% 13.82 : 4005c6: sub $0x1,%rax 0.46 : 4005ca: mov %rax,0x200b9f(%rip) # 601170 <acc> 2.30 : 4005d1: mov -0x10(%rbp),%rax # +50.46% 0.46 : 4005d5: mov %rax,%rdi 0.46 : 4005d8: callq 400526 <lfsr> # -100.00% (p:100.00%) 0.00 : 4005dd: mov %rax,-0x10(%rbp) # +100.00% 0.92 : 4005e1: mov -0x18(%rbp),%rax 0.00 : 4005e5: and $0x1,%eax 0.00 : 4005e8: test %rax,%rax 0.00 : 4005eb: je 4005ff <branches+0x85> # -100.00% (p:100.00%) 0.00 : 4005ed: mov 0x200b7c(%rip),%rax # 601170 <acc> 0.00 : 4005f4: shr $0x2,%rax 0.00 : 4005f8: mov %rax,0x200b71(%rip) # 601170 <acc> 0.00 : 4005ff: mov -0x10(%rbp),%rax # +100.00% 7.37 : 400603: and $0x1,%eax 3.69 : 400606: test %rax,%rax 0.00 : 400609: jne 400612 <branches+0x98> # -59.25% (p:42.99%) 1.84 : 40060b: mov $0x1,%eax 14.29 : 400610: jmp 400617 <branches+0x9d> # -100.00% (p:100.00%) 1.38 : 400612: mov $0x0,%eax # +57.65% 10.14 : 400617: test %al,%al # +42.35% 0.00 : 400619: je 40062f <branches+0xb5> # -57.65% (p:100.00%) 0.46 : 40061b: mov 0x200b4e(%rip),%rax # 601170 <acc> 2.76 : 400622: sub $0x1,%rax 0.00 : 400626: mov %rax,0x200b43(%rip) # 601170 <acc> 0.46 : 40062d: jmp 400641 <branches+0xc7> # -100.00% (p:100.00%) 0.92 : 40062f: mov 0x200b3a(%rip),%rax # 601170 <acc> # +56.13% 2.30 : 400636: add $0x1,%rax 0.92 : 40063a: mov %rax,0x200b2f(%rip) # 601170 <acc> 0.92 : 400641: mov -0x10(%rbp),%rax # +43.87% 2.30 : 400645: mov %rax,%rdi 0.00 : 400648: callq 400526 <lfsr> # -100.00% (p:100.00%) 0.00 : 40064d: mov %rax,-0x10(%rbp) # +100.00% 1.84 : 400651: addq $0x1,-0x8(%rbp) 0.92 : 400656: mov -0x8(%rbp),%rax 5.07 : 40065a: cmp -0x20(%rbp),%rax 0.00 : 40065e: jb 40059f <branches+0x25> # -100.00% (p:100.00%) 0.00 : 400664: nop 0.00 : 400665: leaveq 0.00 : 400666: retq (Note: the --branch-filter u,any was used to avoid spurious target and branch points due to interrupts/faults, they show up as very small -/+ annotations on 'weird' locations) Committer note: Please take a look at: http://vger.kernel.org/~acme/perf/annotate_basic_blocks.png To see the colors. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: David Carrillo-Cisneros <davidcc@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Stephane Eranian <eranian@google.com> [ Moved sym->max_coverage to 'struct annotate', aka symbol__annotate(sym) ] Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-09-06 03:08:12 +08:00
struct block_range *br;
next = disasm__get_next_ip_line(&notes->src->source, dl);
if (perf_evsel__is_group_event(evsel)) {
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
nr_percent = evsel->nr_members;
ppercents = calloc(nr_percent, sizeof(double));
psamples = calloc(nr_percent, sizeof(struct sym_hist_entry));
if (ppercents == NULL || psamples == NULL) {
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
return -1;
}
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
}
for (i = 0; i < nr_percent; i++) {
percent = disasm__calc_percent(notes,
notes->src->lines ? i : evsel->idx + i,
offset,
next ? next->offset : (s64) len,
&path, &sample);
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
ppercents[i] = percent;
psamples[i] = sample;
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
if (percent > max_percent)
max_percent = percent;
}
if (max_percent < min_pcnt)
return -1;
if (max_lines && printed >= max_lines)
return 1;
if (queue != NULL) {
list_for_each_entry_from(queue, &notes->src->source, node) {
if (queue == dl)
break;
disasm_line__print(queue, sym, start, evsel, len,
0, 0, 1, NULL);
}
}
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
color = get_percent_color(max_percent);
/*
* Also color the filename and line if needed, with
* the same color than the percentage. Don't print it
* twice for close colored addr with the same filename:line
*/
if (path) {
if (!prev_line || strcmp(prev_line, path)
|| color != prev_color) {
color_fprintf(stdout, color, " %s", path);
prev_line = path;
prev_color = color;
}
}
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
for (i = 0; i < nr_percent; i++) {
percent = ppercents[i];
sample = psamples[i];
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
color = get_percent_color(percent);
if (symbol_conf.show_total_period)
color_fprintf(stdout, color, " %11" PRIu64,
sample.period);
else if (symbol_conf.show_nr_samples)
color_fprintf(stdout, color, " %7" PRIu64,
sample.nr_samples);
else
color_fprintf(stdout, color, " %7.2f", percent);
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
}
printf(" : ");
perf annotate: Add branch stack / basic block I wanted to know the hottest path through a function and figured the branch-stack (LBR) information should be able to help out with that. The below uses the branch-stack to create basic blocks and generate statistics from them. from to branch_i * ----> * | | block v * ----> * from to branch_i+1 The blocks are broken down into non-overlapping ranges, while tracking if the start of each range is an entry point and/or the end of a range is a branch. Each block iterates all ranges it covers (while splitting where required to exactly match the block) and increments the 'coverage' count. For the range including the branch we increment the taken counter, as well as the pred counter if flags.predicted. Using these number we can find if an instruction: - had coverage; given by: br->coverage / br->sym->max_coverage This metric ensures each symbol has a 100% spot, which reflects the observation that each symbol must have a most covered/hottest block. - is a branch target: br->is_target && br->start == add - for targets, how much of a branch's coverages comes from it: target->entry / branch->coverage - is a branch: br->is_branch && br->end == addr - for branches, how often it was taken: br->taken / br->coverage after all, all execution that didn't take the branch would have incremented the coverage and continued onward to a later branch. - for branches, how often it was predicted: br->pred / br->taken The coverage percentage is used to color the address and asm sections; for low (<1%) coverage we use NORMAL (uncolored), indicating that these instructions are not 'important'. For high coverage (>75%) we color the address RED. For each branch, we add an asm comment after the instruction with information on how often it was taken and predicted. Output looks like (sans color, which does loose a lot of the information :/) $ perf record --branch-filter u,any -e cycles:p ./branches 27 $ perf annotate branches Percent | Source code & Disassembly of branches for cycles:pu (217 samples) --------------------------------------------------------------------------------- : branches(): 0.00 : 40057a: push %rbp 0.00 : 40057b: mov %rsp,%rbp 0.00 : 40057e: sub $0x20,%rsp 0.00 : 400582: mov %rdi,-0x18(%rbp) 0.00 : 400586: mov %rsi,-0x20(%rbp) 0.00 : 40058a: mov -0x18(%rbp),%rax 0.00 : 40058e: mov %rax,-0x10(%rbp) 0.00 : 400592: movq $0x0,-0x8(%rbp) 0.00 : 40059a: jmpq 400656 <branches+0xdc> 1.84 : 40059f: mov -0x10(%rbp),%rax # +100.00% 3.23 : 4005a3: and $0x1,%eax 1.84 : 4005a6: test %rax,%rax 0.00 : 4005a9: je 4005bf <branches+0x45> # -54.50% (p:42.00%) 0.46 : 4005ab: mov 0x200bbe(%rip),%rax # 601170 <acc> 12.90 : 4005b2: add $0x1,%rax 2.30 : 4005b6: mov %rax,0x200bb3(%rip) # 601170 <acc> 0.46 : 4005bd: jmp 4005d1 <branches+0x57> # -100.00% (p:100.00%) 0.92 : 4005bf: mov 0x200baa(%rip),%rax # 601170 <acc> # +49.54% 13.82 : 4005c6: sub $0x1,%rax 0.46 : 4005ca: mov %rax,0x200b9f(%rip) # 601170 <acc> 2.30 : 4005d1: mov -0x10(%rbp),%rax # +50.46% 0.46 : 4005d5: mov %rax,%rdi 0.46 : 4005d8: callq 400526 <lfsr> # -100.00% (p:100.00%) 0.00 : 4005dd: mov %rax,-0x10(%rbp) # +100.00% 0.92 : 4005e1: mov -0x18(%rbp),%rax 0.00 : 4005e5: and $0x1,%eax 0.00 : 4005e8: test %rax,%rax 0.00 : 4005eb: je 4005ff <branches+0x85> # -100.00% (p:100.00%) 0.00 : 4005ed: mov 0x200b7c(%rip),%rax # 601170 <acc> 0.00 : 4005f4: shr $0x2,%rax 0.00 : 4005f8: mov %rax,0x200b71(%rip) # 601170 <acc> 0.00 : 4005ff: mov -0x10(%rbp),%rax # +100.00% 7.37 : 400603: and $0x1,%eax 3.69 : 400606: test %rax,%rax 0.00 : 400609: jne 400612 <branches+0x98> # -59.25% (p:42.99%) 1.84 : 40060b: mov $0x1,%eax 14.29 : 400610: jmp 400617 <branches+0x9d> # -100.00% (p:100.00%) 1.38 : 400612: mov $0x0,%eax # +57.65% 10.14 : 400617: test %al,%al # +42.35% 0.00 : 400619: je 40062f <branches+0xb5> # -57.65% (p:100.00%) 0.46 : 40061b: mov 0x200b4e(%rip),%rax # 601170 <acc> 2.76 : 400622: sub $0x1,%rax 0.00 : 400626: mov %rax,0x200b43(%rip) # 601170 <acc> 0.46 : 40062d: jmp 400641 <branches+0xc7> # -100.00% (p:100.00%) 0.92 : 40062f: mov 0x200b3a(%rip),%rax # 601170 <acc> # +56.13% 2.30 : 400636: add $0x1,%rax 0.92 : 40063a: mov %rax,0x200b2f(%rip) # 601170 <acc> 0.92 : 400641: mov -0x10(%rbp),%rax # +43.87% 2.30 : 400645: mov %rax,%rdi 0.00 : 400648: callq 400526 <lfsr> # -100.00% (p:100.00%) 0.00 : 40064d: mov %rax,-0x10(%rbp) # +100.00% 1.84 : 400651: addq $0x1,-0x8(%rbp) 0.92 : 400656: mov -0x8(%rbp),%rax 5.07 : 40065a: cmp -0x20(%rbp),%rax 0.00 : 40065e: jb 40059f <branches+0x25> # -100.00% (p:100.00%) 0.00 : 400664: nop 0.00 : 400665: leaveq 0.00 : 400666: retq (Note: the --branch-filter u,any was used to avoid spurious target and branch points due to interrupts/faults, they show up as very small -/+ annotations on 'weird' locations) Committer note: Please take a look at: http://vger.kernel.org/~acme/perf/annotate_basic_blocks.png To see the colors. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: David Carrillo-Cisneros <davidcc@google.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Stephane Eranian <eranian@google.com> [ Moved sym->max_coverage to 'struct annotate', aka symbol__annotate(sym) ] Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-09-06 03:08:12 +08:00
br = block_range__find(addr);
color_fprintf(stdout, annotate__address_color(br), " %" PRIx64 ":", addr);
color_fprintf(stdout, annotate__asm_color(br), "%s", dl->line);
annotate__branch_printf(br, addr);
printf("\n");
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
if (ppercents != &percent)
free(ppercents);
if (psamples != &sample)
free(psamples);
} else if (max_lines && printed >= max_lines)
return 1;
else {
int width = symbol_conf.show_total_period ? 12 : 8;
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
if (queue)
return -1;
if (perf_evsel__is_group_event(evsel))
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
width *= evsel->nr_members;
if (!*dl->line)
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
printf(" %*s:\n", width, " ");
else
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
printf(" %*s: %s\n", width, " ", dl->line);
}
return 0;
}
/*
* symbol__parse_objdump_line() parses objdump output (with -d --no-show-raw)
* which looks like following
*
* 0000000000415500 <_init>:
* 415500: sub $0x8,%rsp
* 415504: mov 0x2f5ad5(%rip),%rax # 70afe0 <_DYNAMIC+0x2f8>
* 41550b: test %rax,%rax
* 41550e: je 415515 <_init+0x15>
* 415510: callq 416e70 <__gmon_start__@plt>
* 415515: add $0x8,%rsp
* 415519: retq
*
* it will be parsed and saved into struct disasm_line as
* <offset> <name> <ops.raw>
*
* The offset will be a relative offset from the start of the symbol and -1
* means that it's not a disassembly line so should be treated differently.
* The ops.raw part will be parsed further according to type of the instruction.
*/
static int symbol__parse_objdump_line(struct symbol *sym, struct map *map,
struct arch *arch,
FILE *file, size_t privsize,
int *line_nr)
{
struct annotation *notes = symbol__annotation(sym);
struct disasm_line *dl;
char *line = NULL, *parsed_line, *tmp, *tmp2;
size_t line_len;
s64 line_ip, offset = -1;
regmatch_t match[2];
if (getline(&line, &line_len, file) < 0)
return -1;
if (!line)
return -1;
line_ip = -1;
parsed_line = rtrim(line);
/* /filename:linenr ? Save line number and ignore. */
if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) {
*line_nr = atoi(parsed_line + match[1].rm_so);
return 0;
}
tmp = ltrim(parsed_line);
if (*tmp) {
/*
* Parse hexa addresses followed by ':'
*/
line_ip = strtoull(tmp, &tmp2, 16);
if (*tmp2 != ':' || tmp == tmp2 || tmp2[1] == '\0')
line_ip = -1;
}
if (line_ip != -1) {
u64 start = map__rip_2objdump(map, sym->start),
end = map__rip_2objdump(map, sym->end);
offset = line_ip - start;
if ((u64)line_ip < start || (u64)line_ip >= end)
offset = -1;
else
parsed_line = tmp2 + 1;
}
dl = disasm_line__new(offset, parsed_line, privsize, *line_nr, arch, map);
free(line);
(*line_nr)++;
if (dl == NULL)
return -1;
perf annotate: Fix jump target outside of function address range If jump target is outside of function range, perf is not handling it correctly. Especially when target address is lesser than function start address, target offset will be negative. But, target address declared to be unsigned, converts negative number into 2's complement. See below example. Here target of 'jumpq' instruction at 34cf8 is 34ac0 which is lesser than function start address(34cf0). 34ac0 - 34cf0 = -0x230 = 0xfffffffffffffdd0 Objdump output: 0000000000034cf0 <__sigaction>: __GI___sigaction(): 34cf0: lea -0x20(%rdi),%eax 34cf3: cmp -bashx1,%eax 34cf6: jbe 34d00 <__sigaction+0x10> 34cf8: jmpq 34ac0 <__GI___libc_sigaction> 34cfd: nopl (%rax) 34d00: mov 0x386161(%rip),%rax # 3bae68 <_DYNAMIC+0x2e8> 34d07: movl -bashx16,%fs:(%rax) 34d0e: mov -bashxffffffff,%eax 34d13: retq perf annotate before applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 v jmpq fffffffffffffdd0 nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq perf annotate after applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 ^ jmpq 34ac0 <__GI___libc_sigaction> nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1480953407-7605-3-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-05 23:56:47 +08:00
if (!disasm_line__has_offset(dl)) {
dl->ops.target.offset = dl->ops.target.addr -
map__rip_2objdump(map, sym->start);
perf annotate: Fix jump target outside of function address range If jump target is outside of function range, perf is not handling it correctly. Especially when target address is lesser than function start address, target offset will be negative. But, target address declared to be unsigned, converts negative number into 2's complement. See below example. Here target of 'jumpq' instruction at 34cf8 is 34ac0 which is lesser than function start address(34cf0). 34ac0 - 34cf0 = -0x230 = 0xfffffffffffffdd0 Objdump output: 0000000000034cf0 <__sigaction>: __GI___sigaction(): 34cf0: lea -0x20(%rdi),%eax 34cf3: cmp -bashx1,%eax 34cf6: jbe 34d00 <__sigaction+0x10> 34cf8: jmpq 34ac0 <__GI___libc_sigaction> 34cfd: nopl (%rax) 34d00: mov 0x386161(%rip),%rax # 3bae68 <_DYNAMIC+0x2e8> 34d07: movl -bashx16,%fs:(%rax) 34d0e: mov -bashxffffffff,%eax 34d13: retq perf annotate before applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 v jmpq fffffffffffffdd0 nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq perf annotate after applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp -bashx1,%eax v jbe 10 ^ jmpq 34ac0 <__GI___libc_sigaction> nop 10: mov _DYNAMIC+0x2e8,%rax movl -bashx16,%fs:(%rax) mov -bashxffffffff,%eax retq Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1480953407-7605-3-git-send-email-ravi.bangoria@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-12-05 23:56:47 +08:00
dl->ops.target.offset_avail = true;
}
/* kcore has no symbols, so add the call target name */
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (dl->ins.ops && ins__is_call(&dl->ins) && !dl->ops.target.name) {
struct addr_map_symbol target = {
.map = map,
.addr = dl->ops.target.addr,
};
if (!map_groups__find_ams(&target) &&
target.sym->start == target.al_addr)
dl->ops.target.name = strdup(target.sym->name);
}
disasm__add(&notes->src->source, dl);
return 0;
}
static __attribute__((constructor)) void symbol__init_regexpr(void)
{
regcomp(&file_lineno, "^/[^:]+:([0-9]+)", REG_EXTENDED);
}
static void delete_last_nop(struct symbol *sym)
{
struct annotation *notes = symbol__annotation(sym);
struct list_head *list = &notes->src->source;
struct disasm_line *dl;
while (!list_empty(list)) {
dl = list_entry(list->prev, struct disasm_line, node);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
if (dl->ins.ops) {
if (dl->ins.ops != &nop_ops)
return;
} else {
if (!strstr(dl->line, " nop ") &&
!strstr(dl->line, " nopl ") &&
!strstr(dl->line, " nopw "))
return;
}
list_del(&dl->node);
disasm_line__free(dl);
}
}
int symbol__strerror_disassemble(struct symbol *sym __maybe_unused, struct map *map,
int errnum, char *buf, size_t buflen)
{
struct dso *dso = map->dso;
BUG_ON(buflen == 0);
if (errnum >= 0) {
str_error_r(errnum, buf, buflen);
return 0;
}
switch (errnum) {
case SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX: {
char bf[SBUILD_ID_SIZE + 15] = " with build id ";
char *build_id_msg = NULL;
if (dso->has_build_id) {
build_id__sprintf(dso->build_id,
sizeof(dso->build_id), bf + 15);
build_id_msg = bf;
}
scnprintf(buf, buflen,
"No vmlinux file%s\nwas found in the path.\n\n"
"Note that annotation using /proc/kcore requires CAP_SYS_RAWIO capability.\n\n"
"Please use:\n\n"
" perf buildid-cache -vu vmlinux\n\n"
"or:\n\n"
" --vmlinux vmlinux\n", build_id_msg ?: "");
}
break;
default:
scnprintf(buf, buflen, "Internal error: Invalid %d error code\n", errnum);
break;
}
return 0;
}
static int dso__disassemble_filename(struct dso *dso, char *filename, size_t filename_size)
{
char linkname[PATH_MAX];
char *build_id_filename;
char *build_id_path = NULL;
char *pos;
if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS &&
!dso__is_kcore(dso))
return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX;
build_id_filename = dso__build_id_filename(dso, NULL, 0, false);
if (build_id_filename) {
__symbol__join_symfs(filename, filename_size, build_id_filename);
free(build_id_filename);
} else {
if (dso->has_build_id)
return ENOMEM;
goto fallback;
}
build_id_path = strdup(filename);
if (!build_id_path)
return -1;
/*
* old style build-id cache has name of XX/XXXXXXX.. while
* new style has XX/XXXXXXX../{elf,kallsyms,vdso}.
* extract the build-id part of dirname in the new style only.
*/
pos = strrchr(build_id_path, '/');
if (pos && strlen(pos) < SBUILD_ID_SIZE - 2)
dirname(build_id_path);
if (dso__is_kcore(dso) ||
readlink(build_id_path, linkname, sizeof(linkname)) < 0 ||
strstr(linkname, DSO__NAME_KALLSYMS) ||
access(filename, R_OK)) {
fallback:
/*
* If we don't have build-ids or the build-id file isn't in the
* cache, or is just a kallsyms file, well, lets hope that this
* DSO is the same as when 'perf record' ran.
*/
__symbol__join_symfs(filename, filename_size, dso->long_name);
}
free(build_id_path);
return 0;
}
static const char *annotate__norm_arch(const char *arch_name)
{
struct utsname uts;
if (!arch_name) { /* Assume we are annotating locally. */
if (uname(&uts) < 0)
return NULL;
arch_name = uts.machine;
}
return normalize_arch((char *)arch_name);
}
int symbol__disassemble(struct symbol *sym, struct map *map,
const char *arch_name, size_t privsize,
perf annotate: Check for fused instructions Macro fusion merges two instructions to a single micro-op. Intel core platform performs this hardware optimization under limited circumstances. For example, CMP + JCC can be "fused" and executed /retired together. While with sampling this can result in the sample sometimes being on the JCC and sometimes on the CMP. So for the fused instruction pair, they could be considered together. On Nehalem, fused instruction pairs: cmp/test + jcc. On other new CPU: cmp/test/add/sub/and/inc/dec + jcc. This patch adds an x86-specific function which checks if 2 instructions are in a "fused" pair. For non-x86 arch, the function is just NULL. Changelog: v4: Move the CPU model checking to symbol__disassemble and save the CPU family/model in arch structure. It avoids checking every time when jump arrow printed. v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC). v2: Remove the original weak function. Arnaldo points out that doing it as a weak function that will be overridden by the host arch doesn't work. So now it's implemented as an arch-specific function. Committer fix: Do not access evsel->evlist->env->cpuid, ->env can be null, introduce perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in this function call. The original patch was segfaulting 'perf top' + annotation. But this essentially disables this fused instructions augmentation in 'perf top', the right thing is to get the cpuid from the running kernel, left for a later patch tho. Signed-off-by: Yao Jin <yao.jin@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
struct arch **parch, char *cpuid)
{
struct dso *dso = map->dso;
char command[PATH_MAX * 2];
struct arch *arch = NULL;
FILE *file;
char symfs_filename[PATH_MAX];
struct kcore_extract kce;
bool delete_extract = false;
int stdout_fd[2];
int lineno = 0;
int nline;
pid_t pid;
int err = dso__disassemble_filename(dso, symfs_filename, sizeof(symfs_filename));
if (err)
return err;
arch_name = annotate__norm_arch(arch_name);
if (!arch_name)
return -1;
arch = arch__find(arch_name);
if (arch == NULL)
return -ENOTSUP;
if (parch)
*parch = arch;
if (arch->init) {
err = arch->init(arch);
if (err) {
pr_err("%s: failed to initialize %s arch priv area\n", __func__, arch->name);
return err;
}
}
perf annotate: Check for fused instructions Macro fusion merges two instructions to a single micro-op. Intel core platform performs this hardware optimization under limited circumstances. For example, CMP + JCC can be "fused" and executed /retired together. While with sampling this can result in the sample sometimes being on the JCC and sometimes on the CMP. So for the fused instruction pair, they could be considered together. On Nehalem, fused instruction pairs: cmp/test + jcc. On other new CPU: cmp/test/add/sub/and/inc/dec + jcc. This patch adds an x86-specific function which checks if 2 instructions are in a "fused" pair. For non-x86 arch, the function is just NULL. Changelog: v4: Move the CPU model checking to symbol__disassemble and save the CPU family/model in arch structure. It avoids checking every time when jump arrow printed. v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC). v2: Remove the original weak function. Arnaldo points out that doing it as a weak function that will be overridden by the host arch doesn't work. So now it's implemented as an arch-specific function. Committer fix: Do not access evsel->evlist->env->cpuid, ->env can be null, introduce perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in this function call. The original patch was segfaulting 'perf top' + annotation. But this essentially disables this fused instructions augmentation in 'perf top', the right thing is to get the cpuid from the running kernel, left for a later patch tho. Signed-off-by: Yao Jin <yao.jin@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
if (arch->cpuid_parse && cpuid)
arch->cpuid_parse(arch, cpuid);
pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
symfs_filename, sym->name, map->unmap_ip(map, sym->start),
map->unmap_ip(map, sym->end));
pr_debug("annotating [%p] %30s : [%p] %30s\n",
dso, dso->long_name, sym, sym->name);
if (dso__is_kcore(dso)) {
kce.kcore_filename = symfs_filename;
kce.addr = map__rip_2objdump(map, sym->start);
kce.offs = sym->start;
kce.len = sym->end - sym->start;
if (!kcore_extract__create(&kce)) {
delete_extract = true;
strlcpy(symfs_filename, kce.extract_filename,
sizeof(symfs_filename));
}
} else if (dso__needs_decompress(dso)) {
char tmp[KMOD_DECOMP_LEN];
if (dso__decompress_kmodule_path(dso, symfs_filename,
tmp, sizeof(tmp)) < 0)
goto out;
strcpy(symfs_filename, tmp);
}
snprintf(command, sizeof(command),
"%s %s%s --start-address=0x%016" PRIx64
" --stop-address=0x%016" PRIx64
" -l -d %s %s -C \"%s\" 2>/dev/null|grep -v \"%s:\"|expand",
objdump_path ? objdump_path : "objdump",
disassembler_style ? "-M " : "",
disassembler_style ? disassembler_style : "",
map__rip_2objdump(map, sym->start),
map__rip_2objdump(map, sym->end),
symbol_conf.annotate_asm_raw ? "" : "--no-show-raw",
symbol_conf.annotate_src ? "-S" : "",
symfs_filename, symfs_filename);
pr_debug("Executing: %s\n", command);
err = -1;
if (pipe(stdout_fd) < 0) {
pr_err("Failure creating the pipe to run %s\n", command);
goto out_remove_tmp;
}
pid = fork();
if (pid < 0) {
pr_err("Failure forking to run %s\n", command);
goto out_close_stdout;
}
if (pid == 0) {
close(stdout_fd[0]);
dup2(stdout_fd[1], 1);
close(stdout_fd[1]);
execl("/bin/sh", "sh", "-c", command, NULL);
perror(command);
exit(-1);
}
close(stdout_fd[1]);
file = fdopen(stdout_fd[0], "r");
perf annotate: Inform the user about objdump failures in --stdio When the browser fails to annotate it is difficult for users to find out what went wrong. Add some errors for objdump failures that are displayed in the UI. Note it would be even better to handle these errors smarter, like falling back to the binary when the debug info is somehow corrupted. But for now just giving a better error is an improvement. Committer note: This works for --stdio, where errors just scroll by the screen: # perf annotate --stdio intel_idle Failure running objdump --start-address=0xffffffff81418290 --stop-address=0xffffffff814183ae -l -d --no-show-raw -S -C /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1 2>/dev/null|grep -v /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1|expand Percent | Source code & Disassembly of vmlinux for cycles:pp ------------------------------------------------------------------ And with that one can use that command line to try to find out more about what happened instead of getting a blank screen, an improvement. We need tho to improve this further to get it to work with other UIs, like --tui and --gtk, where it continues showing a blank screen, no messages, as the pr_err() used is enough just for --stdio. Signed-off-by: Andi Kleen <ak@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: http://lkml.kernel.org/r/1446779167-18949-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-11-06 11:06:07 +08:00
if (!file) {
pr_err("Failure creating FILE stream for %s\n", command);
perf annotate: Inform the user about objdump failures in --stdio When the browser fails to annotate it is difficult for users to find out what went wrong. Add some errors for objdump failures that are displayed in the UI. Note it would be even better to handle these errors smarter, like falling back to the binary when the debug info is somehow corrupted. But for now just giving a better error is an improvement. Committer note: This works for --stdio, where errors just scroll by the screen: # perf annotate --stdio intel_idle Failure running objdump --start-address=0xffffffff81418290 --stop-address=0xffffffff814183ae -l -d --no-show-raw -S -C /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1 2>/dev/null|grep -v /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1|expand Percent | Source code & Disassembly of vmlinux for cycles:pp ------------------------------------------------------------------ And with that one can use that command line to try to find out more about what happened instead of getting a blank screen, an improvement. We need tho to improve this further to get it to work with other UIs, like --tui and --gtk, where it continues showing a blank screen, no messages, as the pr_err() used is enough just for --stdio. Signed-off-by: Andi Kleen <ak@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: http://lkml.kernel.org/r/1446779167-18949-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-11-06 11:06:07 +08:00
/*
* If we were using debug info should retry with
* original binary.
*/
goto out_remove_tmp;
perf annotate: Inform the user about objdump failures in --stdio When the browser fails to annotate it is difficult for users to find out what went wrong. Add some errors for objdump failures that are displayed in the UI. Note it would be even better to handle these errors smarter, like falling back to the binary when the debug info is somehow corrupted. But for now just giving a better error is an improvement. Committer note: This works for --stdio, where errors just scroll by the screen: # perf annotate --stdio intel_idle Failure running objdump --start-address=0xffffffff81418290 --stop-address=0xffffffff814183ae -l -d --no-show-raw -S -C /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1 2>/dev/null|grep -v /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1|expand Percent | Source code & Disassembly of vmlinux for cycles:pp ------------------------------------------------------------------ And with that one can use that command line to try to find out more about what happened instead of getting a blank screen, an improvement. We need tho to improve this further to get it to work with other UIs, like --tui and --gtk, where it continues showing a blank screen, no messages, as the pr_err() used is enough just for --stdio. Signed-off-by: Andi Kleen <ak@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: http://lkml.kernel.org/r/1446779167-18949-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-11-06 11:06:07 +08:00
}
perf annotate: Inform the user about objdump failures in --stdio When the browser fails to annotate it is difficult for users to find out what went wrong. Add some errors for objdump failures that are displayed in the UI. Note it would be even better to handle these errors smarter, like falling back to the binary when the debug info is somehow corrupted. But for now just giving a better error is an improvement. Committer note: This works for --stdio, where errors just scroll by the screen: # perf annotate --stdio intel_idle Failure running objdump --start-address=0xffffffff81418290 --stop-address=0xffffffff814183ae -l -d --no-show-raw -S -C /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1 2>/dev/null|grep -v /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1|expand Percent | Source code & Disassembly of vmlinux for cycles:pp ------------------------------------------------------------------ And with that one can use that command line to try to find out more about what happened instead of getting a blank screen, an improvement. We need tho to improve this further to get it to work with other UIs, like --tui and --gtk, where it continues showing a blank screen, no messages, as the pr_err() used is enough just for --stdio. Signed-off-by: Andi Kleen <ak@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: http://lkml.kernel.org/r/1446779167-18949-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-11-06 11:06:07 +08:00
nline = 0;
while (!feof(file)) {
/*
* The source code line number (lineno) needs to be kept in
* accross calls to symbol__parse_objdump_line(), so that it
* can associate it with the instructions till the next one.
* See disasm_line__new() and struct disasm_line::line_nr.
*/
if (symbol__parse_objdump_line(sym, map, arch, file, privsize,
&lineno) < 0)
break;
perf annotate: Inform the user about objdump failures in --stdio When the browser fails to annotate it is difficult for users to find out what went wrong. Add some errors for objdump failures that are displayed in the UI. Note it would be even better to handle these errors smarter, like falling back to the binary when the debug info is somehow corrupted. But for now just giving a better error is an improvement. Committer note: This works for --stdio, where errors just scroll by the screen: # perf annotate --stdio intel_idle Failure running objdump --start-address=0xffffffff81418290 --stop-address=0xffffffff814183ae -l -d --no-show-raw -S -C /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1 2>/dev/null|grep -v /root/.debug/.build-id/28/2777c262e6b3c0451375163c9a81c893218ab1|expand Percent | Source code & Disassembly of vmlinux for cycles:pp ------------------------------------------------------------------ And with that one can use that command line to try to find out more about what happened instead of getting a blank screen, an improvement. We need tho to improve this further to get it to work with other UIs, like --tui and --gtk, where it continues showing a blank screen, no messages, as the pr_err() used is enough just for --stdio. Signed-off-by: Andi Kleen <ak@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Link: http://lkml.kernel.org/r/1446779167-18949-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-11-06 11:06:07 +08:00
nline++;
}
if (nline == 0)
pr_err("No output from %s\n", command);
/*
* kallsyms does not have symbol sizes so there may a nop at the end.
* Remove it.
*/
if (dso__is_kcore(dso))
delete_last_nop(sym);
fclose(file);
err = 0;
out_remove_tmp:
close(stdout_fd[0]);
if (dso__needs_decompress(dso))
unlink(symfs_filename);
if (delete_extract)
kcore_extract__delete(&kce);
out:
return err;
out_close_stdout:
close(stdout_fd[1]);
goto out_remove_tmp;
}
static void insert_source_line(struct rb_root *root, struct source_line *src_line)
{
struct source_line *iter;
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
int i, ret;
while (*p != NULL) {
parent = *p;
iter = rb_entry(parent, struct source_line, node);
ret = strcmp(iter->path, src_line->path);
if (ret == 0) {
for (i = 0; i < src_line->nr_pcnt; i++)
iter->samples[i].percent_sum += src_line->samples[i].percent;
return;
}
if (ret < 0)
p = &(*p)->rb_left;
else
p = &(*p)->rb_right;
}
for (i = 0; i < src_line->nr_pcnt; i++)
src_line->samples[i].percent_sum = src_line->samples[i].percent;
rb_link_node(&src_line->node, parent, p);
rb_insert_color(&src_line->node, root);
}
static int cmp_source_line(struct source_line *a, struct source_line *b)
{
int i;
for (i = 0; i < a->nr_pcnt; i++) {
if (a->samples[i].percent_sum == b->samples[i].percent_sum)
continue;
return a->samples[i].percent_sum > b->samples[i].percent_sum;
}
return 0;
}
static void __resort_source_line(struct rb_root *root, struct source_line *src_line)
{
struct source_line *iter;
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
while (*p != NULL) {
parent = *p;
iter = rb_entry(parent, struct source_line, node);
if (cmp_source_line(src_line, iter))
p = &(*p)->rb_left;
else
p = &(*p)->rb_right;
}
rb_link_node(&src_line->node, parent, p);
rb_insert_color(&src_line->node, root);
}
static void resort_source_line(struct rb_root *dest_root, struct rb_root *src_root)
{
struct source_line *src_line;
struct rb_node *node;
node = rb_first(src_root);
while (node) {
struct rb_node *next;
src_line = rb_entry(node, struct source_line, node);
next = rb_next(node);
rb_erase(node, src_root);
__resort_source_line(dest_root, src_line);
node = next;
}
}
static void symbol__free_source_line(struct symbol *sym, int len)
{
struct annotation *notes = symbol__annotation(sym);
struct source_line *src_line = notes->src->lines;
size_t sizeof_src_line;
int i;
sizeof_src_line = sizeof(*src_line) +
(sizeof(src_line->samples) * (src_line->nr_pcnt - 1));
for (i = 0; i < len; i++) {
free_srcline(src_line->path);
src_line = (void *)src_line + sizeof_src_line;
}
zfree(&notes->src->lines);
}
/* Get the filename:line for the colored entries */
static int symbol__get_source_line(struct symbol *sym, struct map *map,
struct perf_evsel *evsel,
struct rb_root *root, int len)
{
u64 start;
int i, k;
int evidx = evsel->idx;
struct source_line *src_line;
struct annotation *notes = symbol__annotation(sym);
struct sym_hist *h = annotation__histogram(notes, evidx);
struct rb_root tmp_root = RB_ROOT;
int nr_pcnt = 1;
u64 nr_samples = h->nr_samples;
size_t sizeof_src_line = sizeof(struct source_line);
if (perf_evsel__is_group_event(evsel)) {
for (i = 1; i < evsel->nr_members; i++) {
h = annotation__histogram(notes, evidx + i);
nr_samples += h->nr_samples;
}
nr_pcnt = evsel->nr_members;
sizeof_src_line += (nr_pcnt - 1) * sizeof(src_line->samples);
}
if (!nr_samples)
return 0;
src_line = notes->src->lines = calloc(len, sizeof_src_line);
if (!notes->src->lines)
return -1;
start = map__rip_2objdump(map, sym->start);
for (i = 0; i < len; i++) {
u64 offset;
double percent_max = 0.0;
src_line->nr_pcnt = nr_pcnt;
for (k = 0; k < nr_pcnt; k++) {
perf annotate: Fix a bug of division by zero when calculating percent Currently perf-annotate with --print-line can print -nan(0x8000000000000) because of division by zero when calculating percent. The division by zero happens when a sum of samples is zero in symbol__get_source_line(), so fix it. For example: After running 'perf record' like below, $ perf record -e "{cycles,page-faults,branch-misses}" ./a.out Before: $ perf annotate --stdio -l Sorted summary for file /home/taeung/workspace/a.out ---------------------------------------------- 32.89 -nan 7.04 a.c:38 25.14 -nan 0.00 a.c:34 16.26 -nan 56.34 a.c:31 15.88 -nan 1.41 a.c:37 5.67 -nan 0.00 a.c:39 1.13 -nan 35.21 a.c:26 0.95 -nan 0.00 a.c:44 0.57 -nan 0.00 a.c:32 Percent | Source code & Disassembly of a.out for cycles (529 samples) ----------------------------------------------------------------------------------------- : ... a.c:26 0.57 -nan 4.23 : 40081a: mov %edi,-0x24(%rbp) a.c:26 0.00 -nan 9.86 : 40081d: mov %rsi,-0x30(%rbp) ... However, if a sum of samples is zero (e.g. 'page-faults'), skip calculating percent. After: $ perf annotate --stdio -l Sorted summary for file /home/taeung/workspace/a.out ---------------------------------------------- 32.89 0.00 7.04 a.c:38 25.14 0.00 0.00 a.c:34 16.26 0.00 56.34 a.c:31 15.88 0.00 1.41 a.c:37 5.67 0.00 0.00 a.c:39 1.13 0.00 35.21 a.c:26 0.95 0.00 0.00 a.c:44 0.57 0.00 0.00 a.c:32 Percent | Source code & Disassembly of old for cycles (529 samples) ----------------------------------------------------------------------------------------- : ... a.c:26 0.57 0.00 4.23 : 40081a: mov %edi,-0x24(%rbp) a.c:26 0.00 0.00 9.86 : 40081d: mov %rsi,-0x30(%rbp) ... Signed-off-by: Taeung Song <treeze.taeung@gmail.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/r/1490598638-13947-3-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-03-27 15:10:37 +08:00
double percent = 0.0;
h = annotation__histogram(notes, evidx + k);
nr_samples = h->addr[i].nr_samples;
if (h->nr_samples)
percent = 100.0 * nr_samples / h->nr_samples;
perf annotate: Fix a bug of division by zero when calculating percent Currently perf-annotate with --print-line can print -nan(0x8000000000000) because of division by zero when calculating percent. The division by zero happens when a sum of samples is zero in symbol__get_source_line(), so fix it. For example: After running 'perf record' like below, $ perf record -e "{cycles,page-faults,branch-misses}" ./a.out Before: $ perf annotate --stdio -l Sorted summary for file /home/taeung/workspace/a.out ---------------------------------------------- 32.89 -nan 7.04 a.c:38 25.14 -nan 0.00 a.c:34 16.26 -nan 56.34 a.c:31 15.88 -nan 1.41 a.c:37 5.67 -nan 0.00 a.c:39 1.13 -nan 35.21 a.c:26 0.95 -nan 0.00 a.c:44 0.57 -nan 0.00 a.c:32 Percent | Source code & Disassembly of a.out for cycles (529 samples) ----------------------------------------------------------------------------------------- : ... a.c:26 0.57 -nan 4.23 : 40081a: mov %edi,-0x24(%rbp) a.c:26 0.00 -nan 9.86 : 40081d: mov %rsi,-0x30(%rbp) ... However, if a sum of samples is zero (e.g. 'page-faults'), skip calculating percent. After: $ perf annotate --stdio -l Sorted summary for file /home/taeung/workspace/a.out ---------------------------------------------- 32.89 0.00 7.04 a.c:38 25.14 0.00 0.00 a.c:34 16.26 0.00 56.34 a.c:31 15.88 0.00 1.41 a.c:37 5.67 0.00 0.00 a.c:39 1.13 0.00 35.21 a.c:26 0.95 0.00 0.00 a.c:44 0.57 0.00 0.00 a.c:32 Percent | Source code & Disassembly of old for cycles (529 samples) ----------------------------------------------------------------------------------------- : ... a.c:26 0.57 0.00 4.23 : 40081a: mov %edi,-0x24(%rbp) a.c:26 0.00 0.00 9.86 : 40081d: mov %rsi,-0x30(%rbp) ... Signed-off-by: Taeung Song <treeze.taeung@gmail.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/r/1490598638-13947-3-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-03-27 15:10:37 +08:00
if (percent > percent_max)
percent_max = percent;
src_line->samples[k].percent = percent;
2017-03-28 20:12:05 +08:00
src_line->samples[k].nr = nr_samples;
}
if (percent_max <= 0.5)
goto next;
offset = start + i;
perf report: Enable sorting by srcline as key Often it is interesting to know how costly a given source line is in total. Previously, one had to build these sums manually based on all addresses that pointed to the same source line. This patch introduces srcline as a sort key, which will do the aggregation for us. Paired with the recent addition of showing inline frames, this makes perf report much more useful for many C++ work loads. The following shows the new feature in action. First, let's show the status quo output when we sort by address. The result contains many hist entries that generate the same output: ~~~~~~~~~~~~~~~~ $ perf report --stdio --inline -g address # Children Self Command Shared Object Symbol # ........ ........ ............ ................... ......................................... # 99.89% 35.34% cpp-inlining cpp-inlining [.] main | |--64.55%--main complex:655 | /home/milian/projects/kdab/rnd/hotspot/tests/test-clients/cpp-inlining/main.cpp:39 (inline) | /usr/include/c++/6.3.1/complex:664 (inline) | | | |--60.31%--hypot +20 | | | | | |--8.52%--__hypot_finite +273 | | | | | |--7.32%--__hypot_finite +411 ... --35.34%--_start +4194346 __libc_start_main +241 | |--6.65%--main random.tcc:3326 | /home/milian/projects/kdab/rnd/hotspot/tests/test-clients/cpp-inlining/main.cpp:39 (inline) | /usr/include/c++/6.3.1/bits/random.h:1809 (inline) | /usr/include/c++/6.3.1/bits/random.h:1818 (inline) | /usr/include/c++/6.3.1/bits/random.h:185 (inline) | |--2.70%--main random.tcc:3326 | /home/milian/projects/kdab/rnd/hotspot/tests/test-clients/cpp-inlining/main.cpp:39 (inline) | /usr/include/c++/6.3.1/bits/random.h:1809 (inline) | /usr/include/c++/6.3.1/bits/random.h:1818 (inline) | /usr/include/c++/6.3.1/bits/random.h:185 (inline) | |--1.69%--main random.tcc:3326 | /home/milian/projects/kdab/rnd/hotspot/tests/test-clients/cpp-inlining/main.cpp:39 (inline) | /usr/include/c++/6.3.1/bits/random.h:1809 (inline) | /usr/include/c++/6.3.1/bits/random.h:1818 (inline) | /usr/include/c++/6.3.1/bits/random.h:185 (inline) ... ~~~~~~~~~~~~~~~~ With this patch and `-g srcline` we instead get the following output: ~~~~~~~~~~~~~~~~ $ perf report --stdio --inline -g srcline # Children Self Command Shared Object Symbol # ........ ........ ............ ................... ......................................... # 99.89% 35.34% cpp-inlining cpp-inlining [.] main | |--64.55%--main complex:655 | /home/milian/projects/kdab/rnd/hotspot/tests/test-clients/cpp-inlining/main.cpp:39 (inline) | /usr/include/c++/6.3.1/complex:664 (inline) | | | |--64.02%--hypot | | | | | --59.81%--__hypot_finite | | | --0.53%--cabs | --35.34%--_start __libc_start_main | |--12.48%--main random.tcc:3326 | /home/milian/projects/kdab/rnd/hotspot/tests/test-clients/cpp-inlining/main.cpp:39 (inline) | /usr/include/c++/6.3.1/bits/random.h:1809 (inline) | /usr/include/c++/6.3.1/bits/random.h:1818 (inline) | /usr/include/c++/6.3.1/bits/random.h:185 (inline) ... ~~~~~~~~~~~~~~~~ Signed-off-by: Milian Wolff <milian.wolff@kdab.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Yao Jin <yao.jin@linux.intel.com> Link: http://lkml.kernel.org/r/20170318214928.9047-1-milian.wolff@kdab.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-03-19 05:49:28 +08:00
src_line->path = get_srcline(map->dso, offset, NULL,
false, true);
insert_source_line(&tmp_root, src_line);
next:
src_line = (void *)src_line + sizeof_src_line;
}
resort_source_line(root, &tmp_root);
return 0;
}
static void print_summary(struct rb_root *root, const char *filename)
{
struct source_line *src_line;
struct rb_node *node;
printf("\nSorted summary for file %s\n", filename);
printf("----------------------------------------------\n\n");
if (RB_EMPTY_ROOT(root)) {
printf(" Nothing higher than %1.1f%%\n", MIN_GREEN);
return;
}
node = rb_first(root);
while (node) {
double percent, percent_max = 0.0;
const char *color;
char *path;
int i;
src_line = rb_entry(node, struct source_line, node);
for (i = 0; i < src_line->nr_pcnt; i++) {
percent = src_line->samples[i].percent_sum;
color = get_percent_color(percent);
color_fprintf(stdout, color, " %7.2f", percent);
if (percent > percent_max)
percent_max = percent;
}
path = src_line->path;
color = get_percent_color(percent_max);
color_fprintf(stdout, color, " %s\n", path);
node = rb_next(node);
}
}
static void symbol__annotate_hits(struct symbol *sym, struct perf_evsel *evsel)
{
struct annotation *notes = symbol__annotation(sym);
struct sym_hist *h = annotation__histogram(notes, evsel->idx);
u64 len = symbol__size(sym), offset;
for (offset = 0; offset < len; ++offset)
if (h->addr[offset].nr_samples != 0)
printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
sym->start + offset, h->addr[offset].nr_samples);
printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->nr_samples", h->nr_samples);
}
int symbol__annotate_printf(struct symbol *sym, struct map *map,
struct perf_evsel *evsel, bool full_paths,
int min_pcnt, int max_lines, int context)
{
struct dso *dso = map->dso;
char *filename;
const char *d_filename;
const char *evsel_name = perf_evsel__name(evsel);
struct annotation *notes = symbol__annotation(sym);
struct sym_hist *h = annotation__histogram(notes, evsel->idx);
struct disasm_line *pos, *queue = NULL;
u64 start = map__rip_2objdump(map, sym->start);
int printed = 2, queue_len = 0;
int more = 0;
u64 len;
int width = symbol_conf.show_total_period ? 12 : 8;
int graph_dotted_len;
filename = strdup(dso->long_name);
if (!filename)
return -ENOMEM;
if (full_paths)
d_filename = filename;
else
d_filename = basename(filename);
len = symbol__size(sym);
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
if (perf_evsel__is_group_event(evsel))
perf annotate: Add basic support to event group view Add --group option to enable event grouping. When enabled, all the group members information will be shown with the leader so skip non-leader events. It only supports --stdio output currently. Later patches will extend additional features. $ perf annotate --group --stdio ... Percent | Source code & Disassembly of libpthread-2.15.so -------------------------------------------------------------------------------- : : : : Disassembly of section .text: : : 000000387dc0aa50 <__pthread_mutex_unlock_usercnt>: 8.08 2.40 5.29 : 387dc0aa50: mov %rdi,%rdx 0.00 0.00 0.00 : 387dc0aa53: mov 0x10(%rdi),%edi 0.00 0.00 0.00 : 387dc0aa56: mov %edi,%eax 0.00 0.80 0.00 : 387dc0aa58: and $0x7f,%eax 3.03 2.40 3.53 : 387dc0aa5b: test $0x7c,%dil 0.00 0.00 0.00 : 387dc0aa5f: jne 387dc0aaa9 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa61: test %eax,%eax 0.00 0.00 0.00 : 387dc0aa63: jne 387dc0aa85 <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa65: and $0x80,%edi 0.00 0.00 0.00 : 387dc0aa6b: test %esi,%esi 3.03 5.60 7.06 : 387dc0aa6d: movl $0x0,0x8(%rdx) 0.00 0.00 0.59 : 387dc0aa74: je 387dc0aa7a <__pthread_mutex_unlock_use 0.00 0.00 0.00 : 387dc0aa76: subl $0x1,0xc(%rdx) 2.02 5.60 1.18 : 387dc0aa7a: mov %edi,%esi 0.00 0.00 0.00 : 387dc0aa7c: lock decl (%rdx) 83.84 83.20 82.35 : 387dc0aa7f: jne 387dc0aada <_L_unlock_586> 0.00 0.00 0.00 : 387dc0aa81: nop 0.00 0.00 0.00 : 387dc0aa82: xor %eax,%eax 0.00 0.00 0.00 : 387dc0aa84: retq ... Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Andi Kleen <andi@firstfloor.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1362462812-30885-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2013-03-05 13:53:25 +08:00
width *= evsel->nr_members;
graph_dotted_len = printf(" %-*.*s| Source code & Disassembly of %s for %s (%" PRIu64 " samples)\n",
width, width, symbol_conf.show_total_period ? "Period" :
symbol_conf.show_nr_samples ? "Samples" : "Percent",
d_filename, evsel_name, h->nr_samples);
printf("%-*.*s----\n",
graph_dotted_len, graph_dotted_len, graph_dotted_line);
if (verbose > 0)
symbol__annotate_hits(sym, evsel);
list_for_each_entry(pos, &notes->src->source, node) {
if (context && queue == NULL) {
queue = pos;
queue_len = 0;
}
switch (disasm_line__print(pos, sym, start, evsel, len,
min_pcnt, printed, max_lines,
queue)) {
case 0:
++printed;
if (context) {
printed += queue_len;
queue = NULL;
queue_len = 0;
}
break;
case 1:
/* filtered by max_lines */
++more;
break;
case -1:
default:
/*
* Filtered by min_pcnt or non IP lines when
* context != 0
*/
if (!context)
break;
if (queue_len == context)
queue = list_entry(queue->node.next, typeof(*queue), node);
else
++queue_len;
break;
}
}
free(filename);
return more;
}
void symbol__annotate_zero_histogram(struct symbol *sym, int evidx)
{
struct annotation *notes = symbol__annotation(sym);
struct sym_hist *h = annotation__histogram(notes, evidx);
memset(h, 0, notes->src->sizeof_sym_hist);
}
void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
{
struct annotation *notes = symbol__annotation(sym);
struct sym_hist *h = annotation__histogram(notes, evidx);
int len = symbol__size(sym), offset;
h->nr_samples = 0;
for (offset = 0; offset < len; ++offset) {
h->addr[offset].nr_samples = h->addr[offset].nr_samples * 7 / 8;
h->nr_samples += h->addr[offset].nr_samples;
}
}
void disasm__purge(struct list_head *head)
{
struct disasm_line *pos, *n;
list_for_each_entry_safe(pos, n, head, node) {
list_del(&pos->node);
disasm_line__free(pos);
}
}
static size_t disasm_line__fprintf(struct disasm_line *dl, FILE *fp)
{
size_t printed;
if (dl->offset == -1)
return fprintf(fp, "%s\n", dl->line);
perf annotate: Remove duplicate 'name' field from disasm_line The disasm_line::name field is always equal to ins::name, being used just to locate the instruction's ins_ops from the per-arch instructions table. Eliminate this duplication, nuking that field and instead make ins__find() return an ins_ops, store it in disasm_line::ins.ops, and keep just in disasm_line::ins.name what was in disasm_line::name, this way we end up not keeping a reference to entries in the per-arch instructions table. This in turn will help supporting multiple ways to manage the per-arch instructions table, allowing resorting that array, for instance, when the entries will move after references to its addresses were made. The same problem is avoided when one grows the array with realloc. So architectures simply keeping a constant array will work as well as architectures building the table using regular expressions or other logic that involves resorting the table. Reviewed-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Chris Riyder <chris.ryder@arm.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kim Phillips <kim.phillips@arm.com> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Cc: Pawel Moll <pawel.moll@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Russell King <rmk+kernel@arm.linux.org.uk> Cc: Taeung Song <treeze.taeung@gmail.com> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-vr899azvabnw9gtuepuqfd9t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-11-24 22:16:06 +08:00
printed = fprintf(fp, "%#" PRIx64 " %s", dl->offset, dl->ins.name);
if (dl->ops.raw[0] != '\0') {
printed += fprintf(fp, "%.*s %s\n", 6 - (int)printed, " ",
dl->ops.raw);
}
return printed + fprintf(fp, "\n");
}
size_t disasm__fprintf(struct list_head *head, FILE *fp)
{
struct disasm_line *pos;
size_t printed = 0;
list_for_each_entry(pos, head, node)
printed += disasm_line__fprintf(pos, fp);
return printed;
}
int symbol__tty_annotate(struct symbol *sym, struct map *map,
struct perf_evsel *evsel, bool print_lines,
bool full_paths, int min_pcnt, int max_lines)
{
struct dso *dso = map->dso;
struct rb_root source_line = RB_ROOT;
u64 len;
if (symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
perf annotate: Check for fused instructions Macro fusion merges two instructions to a single micro-op. Intel core platform performs this hardware optimization under limited circumstances. For example, CMP + JCC can be "fused" and executed /retired together. While with sampling this can result in the sample sometimes being on the JCC and sometimes on the CMP. So for the fused instruction pair, they could be considered together. On Nehalem, fused instruction pairs: cmp/test + jcc. On other new CPU: cmp/test/add/sub/and/inc/dec + jcc. This patch adds an x86-specific function which checks if 2 instructions are in a "fused" pair. For non-x86 arch, the function is just NULL. Changelog: v4: Move the CPU model checking to symbol__disassemble and save the CPU family/model in arch structure. It avoids checking every time when jump arrow printed. v3: Add checking for Nehalem (CMP, TEST). For other newer Intel CPUs just check it by default (CMP, TEST, ADD, SUB, AND, INC, DEC). v2: Remove the original weak function. Arnaldo points out that doing it as a weak function that will be overridden by the host arch doesn't work. So now it's implemented as an arch-specific function. Committer fix: Do not access evsel->evlist->env->cpuid, ->env can be null, introduce perf_evsel__env_cpuid(), just like perf_evsel__env_arch(), also used in this function call. The original patch was segfaulting 'perf top' + annotation. But this essentially disables this fused instructions augmentation in 'perf top', the right thing is to get the cpuid from the running kernel, left for a later patch tho. Signed-off-by: Yao Jin <yao.jin@linux.intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1499403995-19857-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2017-07-07 13:06:34 +08:00
0, NULL, NULL) < 0)
return -1;
len = symbol__size(sym);
if (print_lines) {
perf annotate: Support full source file paths for srcline fix The --full-paths option did not show the full source file paths in the 'perf annotate' tool, because the value of the option was not propagated into the related functions. With this patch the value of the --full-paths option is known to the function that composes the srcline string, so it prints the full path when necessary. Committer Note: This affects annotate when the --print-line option is used: # perf annotate -h 2>&1 | grep print-line -l, --print-line print matching source lines (may be slow) Looking just at the lines that should be affected by this change: Before: # perf annotate --print-line --full-paths --stdio fput | grep '\.[ch]:[0-9]\+' 94.44 atomic64_64.h:114 5.56 file_table.c:265 file_table.c:265 5.56 : ffffffff81219a00: callq ffffffff81769360 <__fentry__> atomic64_64.h:114 94.44 : ffffffff81219a05: lock decq 0x38(%rdi) After: # perf annotate --print-line --full-paths --stdio fput | grep '\.[ch]:[0-9]\+' 94.44 /home/git/linux/arch/x86/include/asm/atomic64_64.h:114 5.56 /home/git/linux/fs/file_table.c:265 /home/git/linux/fs/file_table.c:265 5.56 : ffffffff81219a00: callq ffffffff81769360 <__fentry__> /home/git/linux/arch/x86/include/asm/atomic64_64.h:114 94.44 : ffffffff81219a05: lock decq 0x38(%rdi) # Signed-off-by: Michael Petlan <mpetlan@redhat.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Link: http://permalink.gmane.org/gmane.linux.kernel.perf.user/2365 Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2015-11-09 23:33:31 +08:00
srcline_full_filename = full_paths;
symbol__get_source_line(sym, map, evsel, &source_line, len);
print_summary(&source_line, dso->long_name);
}
symbol__annotate_printf(sym, map, evsel, full_paths,
min_pcnt, max_lines, 0);
if (print_lines)
symbol__free_source_line(sym, len);
disasm__purge(&symbol__annotation(sym)->src->source);
return 0;
}
bool ui__has_annotation(void)
{
return use_browser == 1 && perf_hpp_list.sym;
}