From 8ea92ceb748535799e3e9f35afb85bdc23bf6d7c Mon Sep 17 00:00:00 2001 From: He Kuang Date: Tue, 7 Apr 2015 17:31:10 +0800 Subject: [PATCH 01/19] perf evlist: Fix inverted logic in perf_mmap__empty perf_evlist__mmap_consume() uses perf_mmap__empty() to judge whether a perf_mmap is empty and can be released, but the result is inverted. Fix it. Signed-off-by: He Kuang Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1428399071-7141-1-git-send-email-hekuang@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 82bf224bbee9..76ef7ee62640 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -695,7 +695,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) static bool perf_mmap__empty(struct perf_mmap *md) { - return perf_mmap__read_head(md) != md->prev; + return perf_mmap__read_head(md) == md->prev; } static void perf_evlist__mmap_get(struct perf_evlist *evlist, int idx) From ba92732e9808df679ddf75c5ea1c0caae6d7dce2 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 7 Apr 2015 08:22:45 +0000 Subject: [PATCH 02/19] perf kmaps: Check kmaps to make code more robust This patch adds checks in the places where map__kmap() is used to get a kmap from a struct map. Error messages are added to map__kmap() to warn about invalid accesses to the kmap (when !map->dso->kernel, kmap(map) does not exist at all). It also introduces map__kmaps(), which warns about uninitialized kmaps. Reviewed-by: Ingo Molnar Signed-off-by: Wang Nan Cc: pi3orama@163.com Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Link: http://lkml.kernel.org/r/1428394966-131044-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 5 ++++- tools/perf/util/map.c | 20 ++++++++++++++++++++ tools/perf/util/map.h | 6 ++---- tools/perf/util/probe-event.c | 2 ++ tools/perf/util/session.c | 3 +++ tools/perf/util/symbol-elf.c | 16 +++++++++++----- tools/perf/util/symbol.c | 34 ++++++++++++++++++++++++++++------ 7 files changed, 70 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index e45c8f33a8fd..9c380a2caa54 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -679,6 +679,9 @@ int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) machine->vmlinux_maps[type]->unmap_ip = identity__map_ip; kmap = map__kmap(machine->vmlinux_maps[type]); + if (!kmap) + return -1; + kmap->kmaps = &machine->kmaps; map_groups__insert(&machine->kmaps, machine->vmlinux_maps[type]); @@ -700,7 +703,7 @@ void machine__destroy_kernel_maps(struct machine *machine) kmap = map__kmap(machine->vmlinux_maps[type]); map_groups__remove(&machine->kmaps, machine->vmlinux_maps[type]); - if (kmap->ref_reloc_sym) { + if (kmap && kmap->ref_reloc_sym) { /* * ref_reloc_sym is shared among all maps, so free just * on one of them. 
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 62ca9f2607d5..a14f08f41686 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -778,3 +778,23 @@ struct map *maps__next(struct map *map) return rb_entry(next, struct map, rb_node); return NULL; } + +struct kmap *map__kmap(struct map *map) +{ + if (!map->dso || !map->dso->kernel) { + pr_err("Internal error: map__kmap with a non-kernel map\n"); + return NULL; + } + return (struct kmap *)(map + 1); +} + +struct map_groups *map__kmaps(struct map *map) +{ + struct kmap *kmap = map__kmap(map); + + if (!kmap || !kmap->kmaps) { + pr_err("Internal error: map__kmaps with a non-kernel map\n"); + return NULL; + } + return kmap->kmaps; +} diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 0e42438b1e59..ec19c59ca38e 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -76,10 +76,8 @@ static inline struct map_groups *map_groups__get(struct map_groups *mg) void map_groups__put(struct map_groups *mg); -static inline struct kmap *map__kmap(struct map *map) -{ - return (struct kmap *)(map + 1); -} +struct kmap *map__kmap(struct map *map); +struct map_groups *map__kmaps(struct map *map); static inline u64 map__map_ip(struct map *map, u64 ip) { diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 8feac0774c41..4fd49f021073 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -135,6 +135,8 @@ static struct ref_reloc_sym *kernel_get_ref_reloc_sym(void) return NULL; kmap = map__kmap(host_machine->vmlinux_maps[MAP__FUNCTION]); + if (!kmap) + return NULL; return kmap->ref_reloc_sym; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index dfacf1d50162..0c74012575ac 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1466,6 +1466,9 @@ int maps__set_kallsyms_ref_reloc_sym(struct map **maps, for (i = 0; i < MAP__NR_TYPES; ++i) { struct kmap *kmap = map__kmap(maps[i]); + + if (!kmap) + continue; kmap->ref_reloc_sym = ref; } diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 476268c99431..a7ab6063e038 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -776,6 +776,7 @@ int dso__load_sym(struct dso *dso, struct map *map, symbol_filter_t filter, int kmodule) { struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL; + struct map_groups *kmaps = kmap ? 
map__kmaps(map) : NULL; struct map *curr_map = map; struct dso *curr_dso = dso; Elf_Data *symstrs, *secstrs; @@ -791,6 +792,9 @@ int dso__load_sym(struct dso *dso, struct map *map, int nr = 0; bool remap_kernel = false, adjust_kernel_syms = false; + if (kmap && !kmaps) + return -1; + dso->symtab_type = syms_ss->type; dso->is_64_bit = syms_ss->is_64_bit; dso->rel = syms_ss->ehdr.e_type == ET_REL; @@ -958,8 +962,10 @@ int dso__load_sym(struct dso *dso, struct map *map, map->map_ip = map__map_ip; map->unmap_ip = map__unmap_ip; /* Ensure maps are correctly ordered */ - map_groups__remove(kmap->kmaps, map); - map_groups__insert(kmap->kmaps, map); + if (kmaps) { + map_groups__remove(kmaps, map); + map_groups__insert(kmaps, map); + } } /* @@ -983,7 +989,7 @@ int dso__load_sym(struct dso *dso, struct map *map, snprintf(dso_name, sizeof(dso_name), "%s%s", dso->short_name, section_name); - curr_map = map_groups__find_by_name(kmap->kmaps, map->type, dso_name); + curr_map = map_groups__find_by_name(kmaps, map->type, dso_name); if (curr_map == NULL) { u64 start = sym.st_value; @@ -1013,7 +1019,7 @@ int dso__load_sym(struct dso *dso, struct map *map, curr_map->unmap_ip = identity__map_ip; } curr_dso->symtab_type = dso->symtab_type; - map_groups__insert(kmap->kmaps, curr_map); + map_groups__insert(kmaps, curr_map); /* * The new DSO should go to the kernel DSOS */ @@ -1075,7 +1081,7 @@ int dso__load_sym(struct dso *dso, struct map *map, * We need to fixup this here too because we create new * maps here, for things like vsyscall sections. */ - __map_groups__fixup_end(kmap->kmaps, map->type); + __map_groups__fixup_end(kmaps, map->type); } } err = nr; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index fddeb9073039..201f6c4ca738 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -630,13 +630,16 @@ static int dso__load_all_kallsyms(struct dso *dso, const char *filename, static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map, symbol_filter_t filter) { - struct map_groups *kmaps = map__kmap(map)->kmaps; + struct map_groups *kmaps = map__kmaps(map); struct map *curr_map; struct symbol *pos; int count = 0, moved = 0; struct rb_root *root = &dso->symbols[map->type]; struct rb_node *next = rb_first(root); + if (!kmaps) + return -1; + while (next) { char *module; @@ -682,8 +685,8 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map, static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta, symbol_filter_t filter) { - struct map_groups *kmaps = map__kmap(map)->kmaps; - struct machine *machine = kmaps->machine; + struct map_groups *kmaps = map__kmaps(map); + struct machine *machine; struct map *curr_map = map; struct symbol *pos; int count = 0, moved = 0; @@ -691,6 +694,11 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta, struct rb_node *next = rb_first(root); int kernel_range = 0; + if (!kmaps) + return -1; + + machine = kmaps->machine; + while (next) { char *module; @@ -1025,9 +1033,12 @@ static bool filename_from_kallsyms_filename(char *filename, static int validate_kcore_modules(const char *kallsyms_filename, struct map *map) { - struct map_groups *kmaps = map__kmap(map)->kmaps; + struct map_groups *kmaps = map__kmaps(map); char modules_filename[PATH_MAX]; + if (!kmaps) + return -EINVAL; + if (!filename_from_kallsyms_filename(modules_filename, "modules", kallsyms_filename)) return -EINVAL; @@ -1043,6 +1054,9 @@ static int validate_kcore_addresses(const char *kallsyms_filename, { struct 
kmap *kmap = map__kmap(map); + if (!kmap) + return -EINVAL; + if (kmap->ref_reloc_sym && kmap->ref_reloc_sym->name) { u64 start; @@ -1081,8 +1095,8 @@ static int kcore_mapfn(u64 start, u64 len, u64 pgoff, void *data) static int dso__load_kcore(struct dso *dso, struct map *map, const char *kallsyms_filename) { - struct map_groups *kmaps = map__kmap(map)->kmaps; - struct machine *machine = kmaps->machine; + struct map_groups *kmaps = map__kmaps(map); + struct machine *machine; struct kcore_mapfn_data md; struct map *old_map, *new_map, *replacement_map = NULL; bool is_64_bit; @@ -1090,6 +1104,11 @@ static int dso__load_kcore(struct dso *dso, struct map *map, char kcore_filename[PATH_MAX]; struct symbol *sym; + if (!kmaps) + return -EINVAL; + + machine = kmaps->machine; + /* This function requires that the map is the kernel map */ if (map != machine->vmlinux_maps[map->type]) return -EINVAL; @@ -1202,6 +1221,9 @@ static int kallsyms__delta(struct map *map, const char *filename, u64 *delta) struct kmap *kmap = map__kmap(map); u64 addr; + if (!kmap) + return -1; + if (!kmap->ref_reloc_sym || !kmap->ref_reloc_sym->name) return 0; From 3201f0dc42f7fad9387afc4692cea3d0c730cba2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 6 Apr 2015 14:36:16 +0900 Subject: [PATCH 03/19] tools lib traceevent: Honor operator priority Currently operator priority is ignored and the processed arg is simply set as the right operand. This can result in priority inversion when the right operand is also an operator arg whose priority is lower. For example, the following print format is from the new kmem events. "page=%p", REC->pfn != -1UL ? (((struct page *)(0xffffea0000000000UL)) + (REC->pfn)) : ((void *)0) But this was treated as below: REC->pfn != ((null - 1UL) ? ((struct page *)0xffffea0000000000UL + REC->pfn) : (void *) 0) In this case, the right arg was the '?' operator, which has lower priority, but it was set as the whole right operand, making the output confusing - page was always 0 or 1, since that's the result of the logical operation. With this patch, it is handled properly, like the following: ((REC->pfn != (null - 1UL)) ? ((struct page *)0xffffea0000000000UL + REC->pfn) : (void *) 0)
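The rotation itself is compact. As a rough standalone sketch of what the new process_op() logic does - types and names here are illustrative, simplified from libtraceevent's struct print_arg, with 'prio' following get_op_prio(), where a lower value binds tighter:

struct op_arg {
	int prio;			/* lower value = binds tighter */
	struct op_arg *left, *right;
};

static void rotate_for_priority(struct op_arg *arg, struct op_arg *right)
{
	if (arg->prio < right->prio) {
		struct op_arg tmp;

		/* '!=' takes over the left child of '?' ... */
		arg->right = right->left;

		/* ... the two operator nodes swap in place, so the node
		 * the caller holds now contains the lower-priority op ... */
		tmp = *arg;
		*arg = *right;
		*right = tmp;

		/* ... and the higher-priority op becomes its left child. */
		arg->left = right;
	} else {
		arg->right = right;
	}
}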
Signed-off-by: Namhyung Kim Acked-by: Steven Rostedt Cc: David Ahern Cc: Jiri Olsa Cc: Joonsoo Kim Cc: Minchan Kim Cc: Peter Zijlstra Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1428298576-9785-10-git-send-email-namhyung@kernel.org [ Replaced 'swap' with 'rotate' in a comment as requested by Steve and agreed by Namhyung ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 6d31b6419d37..12a7e2a40c89 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -1939,7 +1939,22 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) goto out_warn_free; type = process_arg_token(event, right, tok, type); - arg->op.right = right; + + if (right->type == PRINT_OP && + get_op_prio(arg->op.op) < get_op_prio(right->op.op)) { + struct print_arg tmp; + + /* rotate ops according to the priority */ + arg->op.right = right->op.left; + + tmp = *arg; + *arg = *right; + *right = tmp; + + arg->op.left = right; + } else { + arg->op.right = right; + } } else if (strcmp(token, "[") == 0) { From 28939e1a1f6d198239d86b1d77fa9fd55773189a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 Apr 2015 14:36:08 +0900 Subject: [PATCH 04/19] perf kmem: Respect -i option Currently 'perf kmem' does not respect the -i option, because file.path is initialized before the options are parsed. Initialize it properly after option parsing. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Jiri Olsa Cc: Joonsoo Kim Cc: Minchan Kim Cc: Peter Zijlstra Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1428298576-9785-2-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index ac303ef9f2f0..4ebf65c79434 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -663,7 +663,6 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) { const char * const default_sort_order = "frag,hit,bytes"; struct perf_data_file file = { - .path = input_name, .mode = PERF_DATA_MODE_READ, }; const struct option kmem_options[] = { @@ -701,6 +700,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) return __cmd_record(argc, argv); } + file.path = input_name; + session = perf_session__new(&file, false, &perf_kmem); if (session == NULL) return -1; From 0755bc4dc77a876aa60d4b3d33b5f6506f21f91b Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:28 +0800 Subject: [PATCH 05/19] perf sched replay: Use struct task_desc instead of struct task_task for correct meaning There is no struct task_task at all; it is a typo left over from the old commits. Fix it to what it should be, to avoid unnecessary misunderstanding. (Both are pointer types of the same size, so the allocated size was unaffected; only the meaning is fixed.) 
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-2-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 3b3a5bb97059..a1893e8dfe17 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -346,7 +346,7 @@ static struct task_desc *register_pid(struct perf_sched *sched, sched->pid_to_task[pid] = task; sched->nr_tasks++; - sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_task *)); + sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *)); BUG_ON(!sched->tasks); sched->tasks[task->nr] = task; From a35e27d0e5d801ff75481a8f639bb4d59ea1aafa Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:29 +0800 Subject: [PATCH 06/19] perf sched replay: Increase the MAX_PID value to fix assertion failure problem The current MAX_PID is only 65536, which causes an assertion failure when there are more than 64 CPU cores on x86_64. This is because the pid_max value on x86_64 is at least PIDS_PER_CPU_DEFAULT * num_possible_cpus() (see pidmap_init() in kernel/pid.c), where PIDS_PER_CPU_DEFAULT is 1024 (defined in include/linux/threads.h). Thus MAX_PID = 65536 corresponds to 65536/1024 = 64 CPU cores. This is obviously not enough for x86_64 and triggers the assertion failure at BUG_ON(pid >= MAX_PID) in the code. Increase the MAX_PID value from 65536 to 1024*1000, which covers x86_64 machines with up to 1000 cores. This number was chosen according to the stack size limit of the calling process: 'ulimit -a' shows that the stack size of a process is 8192 Kbytes, as defined in include/uapi/linux/resource.h (#define _STK_LIM (8*1024*1024)). Thus we choose a value for MAX_PID that is large enough yet keeps the perf process within the 8192 Kbyte stack limit (a quick check of this arithmetic is sketched after the example below). We have calculated and tested that 1024*1000 is OK for MAX_PID. This means perf sched replay can now be used with up to 1000 cores on x86_64 without any assertion failure. Example: Test environment: x86_64 with 160 cores $ cat /proc/sys/kernel/pid_max 163840 Before this patch: $ perf sched replay run measurement overhead: 240 nsecs sleep measurement overhead: 55379 nsecs the run test took 1000004 nsecs the sleep test took 1059424 nsecs perf: builtin-sched.c:330: register_pid: Assertion `!(pid >= 65536)' failed. Aborted After this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55397 nsecs the run test took 999920 nsecs the sleep test took 1053313 nsecs nr_run_events: 10 nr_sleep_events: 1562 nr_wakeup_events: 5 task 0 ( :1: 1), nr_events: 1 task 1 ( :2: 2), nr_events: 1 task 2 ( :3: 3), nr_events: 1 task 3 ( :5: 5), nr_events: 1 ... 
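A quick standalone check of the arithmetic above (a sketch, not perf code; it assumes 8-byte pointers on x86_64 and the default 8192 Kbyte stack limit quoted in the message):

#include <stdio.h>

#define MAX_PID (1024 * 1000)

int main(void)
{
	/* struct task_desc *pid_to_task[MAX_PID] is a member of
	 * struct perf_sched, which sits on the stack of the perf process. */
	unsigned long bytes = MAX_PID * sizeof(void *);

	/* 8192000 bytes < 8388608 bytes (_STK_LIM), so the array fits. */
	printf("pid_to_task: %lu bytes of the %d byte stack limit\n",
	       bytes, 8 * 1024 * 1024);
	return 0;
}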
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-3-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index a1893e8dfe17..c46610447ede 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -28,7 +28,7 @@ #define MAX_CPUS 4096 #define COMM_LEN 20 #define SYM_LEN 129 -#define MAX_PID 65536 +#define MAX_PID 1024000 struct sched_atom; From cb06ac256a16fc1a5ab063107c2b35b3b9e95102 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:30 +0800 Subject: [PATCH 07/19] perf sched replay: Alloc the memory of pid_to_task dynamically to adapt to the unexpected change of pid_max The memory for struct task_desc *pid_to_task[MAX_PID] is currently allocated statically, with a preset size, which has two problems: Problem 1: If pid_max, the maximum number of pids in the system, is much smaller than MAX_PID (1024*1000), stack memory is wasted. This may happen when the number of cpu cores is much smaller than 1000. Problem 2: If pid_max is changed from the default value to a value larger than MAX_PID, an assertion failure occurs. The maximum value of pid_max can be set to pid_max_max (see pidmap_init() in kernel/pid.c), which equals PID_MAX_LIMIT. On x86_64, PID_MAX_LIMIT is 4*1024*1024 (defined in include/linux/threads.h). This is much larger than MAX_PID, and pid_to_task would then take up 32768 Kbytes (4*1024*1024*8/1024), far more than the default 8192 Kbyte stack size of the calling process. Due to these two problems, use calloc to allocate the memory of pid_to_task dynamically (the shape of the allocation is sketched after the example below). Example: Test environment: x86_64 with 160 cores $ cat /proc/sys/kernel/pid_max 163840 $ echo 1025000 > /proc/sys/kernel/pid_max $ cat /proc/sys/kernel/pid_max 1025000 Run some applications until the pid of some process is greater than the value of MAX_PID (1024*1000). Before this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55480 nsecs the run test took 1000008 nsecs the sleep test took 1063151 nsecs perf: builtin-sched.c:330: register_pid: Assertion `!(pid >= 1024000)' failed. Aborted After this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55435 nsecs the run test took 1000004 nsecs the sleep test took 1059312 nsecs nr_run_events: 10 nr_sleep_events: 1562 nr_wakeup_events: 5 task 0 ( :1: 1), nr_events: 1 task 1 ( :2: 2), nr_events: 1 task 2 ( :3: 3), nr_events: 1 task 3 ( :5: 5), nr_events: 1 ... 
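A minimal sketch of the dynamic allocation (plain stdio is used here to read procfs; the patch itself goes through perf's sysctl__read_int() helper, and the table element type is simplified to void *; the helper name is illustrative):

#include <stdio.h>
#include <stdlib.h>

static void **alloc_pid_table(int fallback)
{
	FILE *f = fopen("/proc/sys/kernel/pid_max", "r");
	int pid_max = fallback;		/* MAX_PID in the patch */

	if (f) {
		if (fscanf(f, "%d", &pid_max) != 1)
			pid_max = fallback;
		fclose(f);
	}
	/* one pointer slot per possible pid, zero-initialized */
	return calloc(pid_max, sizeof(void *));
}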
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-4-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index c46610447ede..20d887b222e4 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -23,6 +23,7 @@ #include #include #include +#include #define PR_SET_NAME 15 /* Set process name */ #define MAX_CPUS 4096 @@ -124,7 +125,7 @@ struct perf_sched { struct perf_tool tool; const char *sort_order; unsigned long nr_tasks; - struct task_desc *pid_to_task[MAX_PID]; + struct task_desc **pid_to_task; struct task_desc **tasks; const struct trace_sched_handler *tp_handler; pthread_mutex_t start_work_mutex; @@ -326,8 +327,14 @@ static struct task_desc *register_pid(struct perf_sched *sched, unsigned long pid, const char *comm) { struct task_desc *task; + static int pid_max; - BUG_ON(pid >= MAX_PID); + if (sched->pid_to_task == NULL) { + if (sysctl__read_int("kernel/pid_max", &pid_max) < 0) + pid_max = MAX_PID; + BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); + } + BUG_ON(pid >= (unsigned long)pid_max); task = sched->pid_to_task[pid]; From 3a423a5c36d1a28a258beaa7db855568b82d07ab Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:31 +0800 Subject: [PATCH 08/19] perf sched replay: Realloc the memory of pid_to_task stepwise to adapt to the different pid_max configurations Although the memory for pid_to_task can be allocated via calloc according to the value of /proc/sys/kernel/pid_max, this cannot handle the case where pid_max is changed after 'perf sched record' has created its perf.data. If the pid_max configured when 'perf sched replay' runs is smaller than the pid_max that was configured when 'perf sched record' ran, an assertion failure results. To solve this problem, realloc the memory of pid_to_task stepwise whenever the pid passed to register_pid() is larger than the current pid_max (a sketch of the growth follows the example below). Example: Test environment: x86_64 with 160 cores $ cat /proc/sys/kernel/pid_max 163840 $ perf sched record ls $ echo 5000 > /proc/sys/kernel/pid_max $ cat /proc/sys/kernel/pid_max 5000 Before this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55356 nsecs the run test took 1000011 nsecs the sleep test took 1060940 nsecs perf: builtin-sched.c:337: register_pid: Assertion `!(pid >= (unsigned long)pid_max)' failed. Aborted After this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55611 nsecs the run test took 1000026 nsecs the sleep test took 1060486 nsecs nr_run_events: 10 nr_sleep_events: 1562 nr_wakeup_events: 5 task 0 ( :1: 1), nr_events: 1 task 1 ( :2: 2), nr_events: 1 task 2 ( :3: 3), nr_events: 1 task 3 ( :5: 5), nr_events: 1 ... 
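A minimal sketch of the stepwise growth in register_pid() (element type simplified to void *, helper name illustrative; the real code uses BUG_ON() instead of abort() on allocation failure):

#include <stdlib.h>

static void **grow_pid_table(void **table, int *pid_max, unsigned long pid)
{
	if (pid >= (unsigned long)*pid_max) {
		/* grow just enough to cover the new pid ... */
		table = realloc(table, (pid + 1) * sizeof(void *));
		if (table == NULL)
			abort();
		/* ... and NULL-fill the slots between old and new size */
		while (pid >= (unsigned long)*pid_max)
			table[(*pid_max)++] = NULL;
	}
	return table;
}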
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-5-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 20d887b222e4..dd714818fa4d 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -334,7 +334,12 @@ static struct task_desc *register_pid(struct perf_sched *sched, pid_max = MAX_PID; BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); } - BUG_ON(pid >= (unsigned long)pid_max); + if (pid >= (unsigned long)pid_max) { + BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) * + sizeof(struct task_desc *))) == NULL); + while (pid >= (unsigned long)pid_max) + sched->pid_to_task[pid_max++] = NULL; + } task = sched->pid_to_task[pid]; From 08097abc11bcee21355dd857852a807b2a30b79f Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:32 +0800 Subject: [PATCH 09/19] perf sched replay: Fix the segmentation fault problem caused by pr_err in threads The pr_err in self_open_counters() prints its error message to stderr. Unlike stdout, stderr uses a memory buffer on the stack of the calling process. The pr_err in self_open_counters() runs inside thread_func, the thread created in create_tasks(), and create_tasks() creates sched->nr_tasks of these threads concurrently. If the error happens and pr_err prints the error message in each of these threads, the stack space of the perf process (default 8192 kbytes) quickly runs out, and a segmentation fault follows. To solve this problem, the call to self_open_counters(), and thus the pr_err, is moved from the newly created threads to the main thread of the perf process (sketched after the example below). The pr_err then works in a stable situation, without the strange segmentation faults. Example: Test environment: x86_64 with 160 cores Before this patch: $ perf sched replay ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 Segmentation fault After this patch: $ perf sched replay ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 ... As shown above, the result continues without any segmentation fault. 
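The shape of the fix, sketched: the counter fd is opened by the main thread in create_tasks() and handed to each worker through its parameter block, so the error path (and its pr_err) only ever runs on the main thread's stack. The names mirror the patch; the bodies are stubs:

#include <stdlib.h>

struct sched_thread_parms {
	int fd;			/* opened by the main thread */
};

static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	int fd = parms->fd;

	free(parms);
	if (fd < 0)		/* failure already reported by the main thread */
		return NULL;
	/* ... per-task replay work using fd ... */
	return NULL;
}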
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-6-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index dd714818fa4d..7fe3b3cb4cc8 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -472,6 +472,7 @@ static u64 get_cpu_usage_nsec_self(int fd) struct sched_thread_parms { struct task_desc *task; struct perf_sched *sched; + int fd; }; static void *thread_func(void *ctx) @@ -482,13 +483,12 @@ static void *thread_func(void *ctx) u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; - int fd; + int fd = parms->fd; zfree(&parms); sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); - fd = self_open_counters(); if (fd < 0) return NULL; again: @@ -540,6 +540,7 @@ static void create_tasks(struct perf_sched *sched) BUG_ON(parms == NULL); parms->task = task = sched->tasks[i]; parms->sched = sched; + parms->fd = self_open_counters(); sem_init(&task->sleep_sem, 0, 0); sem_init(&task->ready_for_work, 0, 0); sem_init(&task->work_done_sem, 0, 0); From 1aff59be53ef37aa9943fb5f772f03148f789bb6 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:33 +0800 Subject: [PATCH 10/19] perf sched replay: Handle the dead halt of sem_wait when create_tasks() fails for any task wait_for_tasks() does a sem_wait for each task, e.g. sem_wait(&task->work_done_sem). A sem_wait can only continue when its work_done_sem is greater than 0; otherwise it blocks. In perf sched replay, one task may sem_post the work_done_sem of another task, so each work_done_sem is posted and waited on in a well-defined sequence, e.g. sem_post, sem_wait, sem_wait, sem_post... This sequence simulates the scheduling of the running tasks at the time when 'perf sched record' ran. As a result, all of the tasks are required, and all of their threads must be created successfully. If any one of the tasks (task A) fails to create its thread, then another task (task B), whose work_done_sem needs a sem_post from the failed task A, will block in sem_wait. This is a dead halt, since task B's thread_func cannot continue at all. To solve this problem, perf sched replay should exit as soon as any task fails to create its thread (illustrated after the example below). Example: Test environment: x86_64 with 160 cores Before this patch: $ perf sched replay ... Error: sys_perf_event_open() syscall returned with -1 (Too many open files) ------------------------------------------------------------ <- dead halt After this patch: $ perf sched replay ... task 1551 ( : 0), nr_events: 10 Error: sys_perf_event_open() syscall returned with -1 (Too many open files) $ As shown above, perf sched replay finishes after printing an error message and does not block itself. 
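A standalone illustration of the failure mode (a toy program, not perf code): if the thread that was supposed to post the semaphore was never created, the wait below blocks forever, which is exactly the dead halt the early exit avoids:

#include <semaphore.h>
#include <stdio.h>

int main(void)
{
	sem_t work_done_sem;

	sem_init(&work_done_sem, 0, 0);
	/* Nobody will ever call sem_post(&work_done_sem),
	 * so this wait never returns. */
	sem_wait(&work_done_sem);
	puts("unreachable without a matching sem_post()");
	return 0;
}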
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-7-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 7fe3b3cb4cc8..3261300c08f0 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -451,10 +451,12 @@ static int self_open_counters(void) fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); - if (fd < 0) + if (fd < 0) { pr_err("Error: sys_perf_event_open() syscall returned " "with %d (%s)\n", fd, strerror_r(errno, sbuf, sizeof(sbuf))); + exit(EXIT_FAILURE); + } return fd; } From 939cda521a24ae4dbf3beec983abd519bce56231 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:34 +0800 Subject: [PATCH 11/19] perf sched replay: Fix the EMFILE error caused by the limitation of the maximum open files The soft maximum number of open files for a calling process is 1024, defined as INR_OPEN_CUR in include/uapi/linux/fs.h, and the hard maximum is 4096, defined as INR_OPEN_MAX in the same file. Both INR_OPEN_CUR and INR_OPEN_MAX are used to limit the value of RLIMIT_NOFILE in include/asm-generic/resource.h, and the soft maximum ultimately decides how many files may be opened. That is to say, a process can use at most 1024 file descriptors for its opened files, or an EMFILE error happens. This error can be avoided by increasing the soft maximum, under the constraint that the soft maximum does not exceed the hard maximum, or by increasing both simultaneously with privilege. perf sched replay uses sys_perf_event_open to create a file descriptor for each of the tasks in order to handle their perf event information; that is to say, each task needs a unique file descriptor. On x86_64 there may be more than 1024 or 4096 tasks corresponding to the record in perf.data, so there are not enough file descriptors. As a result, an EMFILE error happens and stops the replay. To solve this problem, adaptively increase the soft and hard maximum number of open files when the '-f' option is given. Example: Test environment: x86_64 with 160 cores $ cat /proc/sys/kernel/pid_max 163840 $ cat /proc/sys/fs/file-max 6815744 $ ulimit -Sn 1024 $ ulimit -Hn 4096 Before this patch: $ perf sched replay ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 Error: sys_perf_event_open() syscall returned with -1 (Too many open files) After this patch: $ perf sched replay ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 Error: sys_perf_event_open() syscall returned with -1 (Too many open files) Have a try with -f option $ perf sched replay -f ... 
task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 ------------------------------------------------------------ #1 : 54.401, ravg: 54.40, cpu: 3285.21 / 3285.21 #2 : 199.548, ravg: 68.92, cpu: 4999.65 / 3456.66 #3 : 170.483, ravg: 79.07, cpu: 1349.94 / 3245.99 #4 : 192.034, ravg: 90.37, cpu: 1322.88 / 3053.67 #5 : 182.929, ravg: 99.62, cpu: 1406.51 / 2888.96 #6 : 152.974, ravg: 104.96, cpu: 1167.54 / 2716.82 #7 : 155.579, ravg: 110.02, cpu: 2992.53 / 2744.39 #8 : 130.557, ravg: 112.08, cpu: 1126.43 / 2582.59 #9 : 138.520, ravg: 114.72, cpu: 1253.22 / 2449.65 #10 : 134.328, ravg: 116.68, cpu: 1587.95 / 2363.48 Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-8-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 3261300c08f0..5ab58c6d2467 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -170,6 +170,7 @@ struct perf_sched { u64 cpu_last_switched[MAX_CPUS]; struct rb_root atom_root, sorted_atom_root; struct list_head sort_list, cmp_pid; + bool force; }; static u64 get_nsecs(void) @@ -437,24 +438,43 @@ static u64 get_cpu_usage_nsec_parent(void) return sum; } -static int self_open_counters(void) +static int self_open_counters(struct perf_sched *sched, unsigned long cur_task) { struct perf_event_attr attr; - char sbuf[STRERR_BUFSIZE]; + char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE]; int fd; + struct rlimit limit; + bool need_privilege = false; memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_TASK_CLOCK; +force_again: fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); if (fd < 0) { + if (errno == EMFILE) { + if (sched->force) { + BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1); + limit.rlim_cur += sched->nr_tasks - cur_task; + if (limit.rlim_cur > limit.rlim_max) { + limit.rlim_max = limit.rlim_cur; + need_privilege = true; + } + if (setrlimit(RLIMIT_NOFILE, &limit) == -1) { + if (need_privilege && errno == EPERM) + strcpy(info, "Need privilege\n"); + } else + goto force_again; + } else + strcpy(info, "Have a try with -f option\n"); + } pr_err("Error: sys_perf_event_open() syscall returned " - "with %d (%s)\n", fd, - strerror_r(errno, sbuf, sizeof(sbuf))); + "with %d (%s)\n%s", fd, + strerror_r(errno, sbuf, sizeof(sbuf)), info); exit(EXIT_FAILURE); } return fd; @@ -542,7 +562,7 @@ static void create_tasks(struct perf_sched *sched) BUG_ON(parms == NULL); parms->task = task = sched->tasks[i]; parms->sched = sched; - parms->fd = self_open_counters(); + parms->fd = self_open_counters(sched, i); sem_init(&task->sleep_sem, 0, 0); sem_init(&task->ready_for_work, 0, 0); sem_init(&task->work_done_sem, 0, 0); @@ -1700,6 +1720,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), + OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"), OPT_END() }; const struct option sched_options[] = { From f0dd330fdf07d295ac468660cf60341796d5d501 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:35 +0800 Subject: [PATCH 12/19] perf sched replay: Support using -f to override perf.data file 
ownership Enable using perf.data when it is not owned by the current user or root. Example: $ ls -al perf.data -rw------- 1 Yunlong.Song Yunlong.Song 5321918 Mar 25 15:14 perf.data $ sudo id uid=0(root) gid=0(root) groups=0(root),64(pkcs11) Before this patch: $ sudo perf sched replay -f run measurement overhead: 98 nsecs sleep measurement overhead: 52909 nsecs the run test took 1000015 nsecs the sleep test took 1054253 nsecs File perf.data not owned by current user or root (use -f to override) As shown above, the -f option does not work at all. After this patch: $ sudo perf sched replay -f run measurement overhead: 221 nsecs sleep measurement overhead: 40514 nsecs the run test took 1000003 nsecs the sleep test took 1056098 nsecs nr_run_events: 10 nr_sleep_events: 1562 nr_wakeup_events: 5 task 0 ( :1: 1), nr_events: 1 task 1 ( :2: 2), nr_events: 1 task 2 ( :3: 3), nr_events: 1 ... ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 ------------------------------------------------------------ #1 : 50.198, ravg: 50.20, cpu: 2335.18 / 2335.18 #2 : 219.099, ravg: 67.09, cpu: 2835.11 / 2385.17 #3 : 238.626, ravg: 84.24, cpu: 3278.26 / 2474.48 #4 : 200.364, ravg: 95.85, cpu: 2977.41 / 2524.77 #5 : 176.882, ravg: 103.96, cpu: 2801.35 / 2552.43 #6 : 191.093, ravg: 112.67, cpu: 2813.70 / 2578.56 #7 : 189.448, ravg: 120.35, cpu: 2809.21 / 2601.62 #8 : 200.637, ravg: 128.38, cpu: 2849.91 / 2626.45 #9 : 248.338, ravg: 140.37, cpu: 4380.61 / 2801.87 #10 : 511.139, ravg: 177.45, cpu: 3077.73 / 2829.45 As shown above, the -f option really works now. Besides replay, the -f option also works for latency and map. Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-9-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 5ab58c6d2467..7b7b798b22b2 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1487,6 +1487,7 @@ static int perf_sched__read_events(struct perf_sched *sched) struct perf_data_file file = { .path = input_name, .mode = PERF_DATA_MODE_READ, + .force = sched->force, }; int rc = -1; From ff5f3bbd40bfb8632f826f1f83223d95363f36af Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:36 +0800 Subject: [PATCH 13/19] perf sched replay: Use replay_repeat to calculate the runavg of cpu usage instead of the default value 10 Since sched->replay_repeat is set to 10 by default, sched->run_avg, sched->runavg_cpu_usage, and sched->runavg_parent_cpu_usage all use 10 to calculate their values. However, replay_repeat can be changed to another value with the -r option, so the calculations above should use replay_repeat instead of the hard-coded 10 to get accurate results. 
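The update is a running average over the repeat count n, i.e. avg = (avg * (n - 1) + sample) / n, with the first sample seeding the average. A minimal sketch of the shared shape of the three updates (a hypothetical helper; n corresponds to sched->replay_repeat, and the typedef stands in for perf's kernel-style types):

typedef unsigned long long u64;

static u64 update_runavg(u64 avg, u64 sample, unsigned long n)
{
	if (!avg)	/* first sample seeds the average */
		avg = sample;
	return (avg * (n - 1) + sample) / n;
}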
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-10-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 7b7b798b22b2..5275bab70313 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -607,13 +607,13 @@ static void wait_for_tasks(struct perf_sched *sched) cpu_usage_1 = get_cpu_usage_nsec_parent(); if (!sched->runavg_cpu_usage) sched->runavg_cpu_usage = sched->cpu_usage; - sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10; + sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat; sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0; if (!sched->runavg_parent_cpu_usage) sched->runavg_parent_cpu_usage = sched->parent_cpu_usage; - sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 + - sched->parent_cpu_usage)/10; + sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) + + sched->parent_cpu_usage)/sched->replay_repeat; ret = pthread_mutex_lock(&sched->start_work_mutex); BUG_ON(ret); @@ -645,7 +645,7 @@ static void run_one_test(struct perf_sched *sched) sched->sum_fluct += fluct; if (!sched->run_avg) sched->run_avg = delta; - sched->run_avg = (sched->run_avg * 9 + delta) / 10; + sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat; printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0); From 814c8c38e13c7050259c72f89bb01f3fc903f642 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Mar 2015 00:19:31 +0200 Subject: [PATCH 14/19] perf record: Add clockid parameter Teach perf-record about the new perf_event_attr::{use_clockid, clockid} fields. Add a simple parameter to set the clock (if any) to be used for the events to be recorded into the data file. Since we store the entire perf_event_attr in the EVENT_DESC section we also already store the used clockid in the data file. Signed-off-by: Peter Zijlstra (Intel) Acked-by: David Ahern Cc: "H. Peter Anvin" Cc: Adrian Hunter Cc: Andrew Morton Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Yunlong Song Link: http://lkml.kernel.org/r/20150407154851.GR23123@twins.programming.kicks-ass.net [ Conditionally define CLOCK_BOOTTIME, at least rhel6 doesn't have it - dsahern Ditto for CLOCK_MONOTONIC_RAW, sles11sp2 doesn't have it - yunlong.song ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 7 ++ tools/perf/builtin-record.c | 87 ++++++++++++++++++++++++ tools/perf/perf.h | 2 + tools/perf/util/evsel.c | 59 ++++++++++++++-- tools/perf/util/header.c | 3 + 5 files changed, 154 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 355c4f5569b5..4847a793de65 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -250,6 +250,13 @@ is off by default. --running-time:: Record running and enabled time for read events (:S) +-k:: +--clockid:: +Sets the clock id to use for the various time fields in the perf_event_type +records. See clock_gettime(). 
In particular CLOCK_MONOTONIC and +CLOCK_MONOTONIC_RAW are supported, some events might also allow +CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 18aad239b401..ac610488d2e1 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -711,6 +711,90 @@ static int perf_record_config(const char *var, const char *value, void *cb) return perf_default_config(var, value, cb); } +struct clockid_map { + const char *name; + int clockid; +}; + +#define CLOCKID_MAP(n, c) \ + { .name = n, .clockid = (c), } + +#define CLOCKID_END { .name = NULL, } + + +/* + * Add the missing ones, we need to build on many distros... + */ +#ifndef CLOCK_MONOTONIC_RAW +#define CLOCK_MONOTONIC_RAW 4 +#endif +#ifndef CLOCK_BOOTTIME +#define CLOCK_BOOTTIME 7 +#endif +#ifndef CLOCK_TAI +#define CLOCK_TAI 11 +#endif + +static const struct clockid_map clockids[] = { + /* available for all events, NMI safe */ + CLOCKID_MAP("monotonic", CLOCK_MONOTONIC), + CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW), + + /* available for some events */ + CLOCKID_MAP("realtime", CLOCK_REALTIME), + CLOCKID_MAP("boottime", CLOCK_BOOTTIME), + CLOCKID_MAP("tai", CLOCK_TAI), + + /* available for the lazy */ + CLOCKID_MAP("mono", CLOCK_MONOTONIC), + CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW), + CLOCKID_MAP("real", CLOCK_REALTIME), + CLOCKID_MAP("boot", CLOCK_BOOTTIME), + + CLOCKID_END, +}; + +static int parse_clockid(const struct option *opt, const char *str, int unset) +{ + struct record_opts *opts = (struct record_opts *)opt->value; + const struct clockid_map *cm; + const char *ostr = str; + + if (unset) { + opts->use_clockid = 0; + return 0; + } + + /* no arg passed */ + if (!str) + return 0; + + /* no setting it twice */ + if (opts->use_clockid) + return -1; + + opts->use_clockid = true; + + /* if its a number, we're done */ + if (sscanf(str, "%d", &opts->clockid) == 1) + return 0; + + /* allow a "CLOCK_" prefix to the name */ + if (!strncasecmp(str, "CLOCK_", 6)) + str += 6; + + for (cm = clockids; cm->name; cm++) { + if (!strcasecmp(str, cm->name)) { + opts->clockid = cm->clockid; + return 0; + } + } + + opts->use_clockid = false; + ui__warning("unknown clockid %s, check man page\n", ostr); + return -1; +} + static const char * const __record_usage[] = { "perf record [] []", "perf record [] -- []", @@ -842,6 +926,9 @@ struct option __record_options[] = { "Sample machine registers on interrupt"), OPT_BOOLEAN(0, "running-time", &record.opts.running_time, "Record running/enabled time of read (:S) events"), + OPT_CALLBACK('k', "clockid", &record.opts, + "clockid", "clockid to use for events, see clock_gettime()", + parse_clockid), OPT_END() }; diff --git a/tools/perf/perf.h b/tools/perf/perf.h index c38a085a5571..e14bb637255c 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -62,6 +62,8 @@ struct record_opts { u64 user_interval; bool sample_transaction; unsigned initial_delay; + bool use_clockid; + clockid_t clockid; }; struct option; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 358e5954baa8..d190f99a3a97 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -32,8 +32,12 @@ static struct { bool exclude_guest; bool mmap2; bool cloexec; + bool clockid; + bool clockid_wrong; } perf_missing_features; +static clockid_t clockid; + static int perf_evsel__no_extra_init(struct perf_evsel *evsel __maybe_unused) { return 0; @@ -761,6 +765,12 @@ void 
perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) attr->disabled = 0; attr->enable_on_exec = 0; } + + clockid = opts->clockid; + if (opts->use_clockid) { + attr->use_clockid = 1; + attr->clockid = opts->clockid; + } } static int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) @@ -1036,7 +1046,6 @@ static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp) ret += PRINT_ATTR2(exclude_user, exclude_kernel); ret += PRINT_ATTR2(exclude_hv, exclude_idle); ret += PRINT_ATTR2(mmap, comm); - ret += PRINT_ATTR2(mmap2, comm_exec); ret += PRINT_ATTR2(freq, inherit_stat); ret += PRINT_ATTR2(enable_on_exec, task); ret += PRINT_ATTR2(watermark, precise_ip); @@ -1044,6 +1053,9 @@ static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp) ret += PRINT_ATTR2(exclude_host, exclude_guest); ret += PRINT_ATTR2N("excl.callchain_kern", exclude_callchain_kernel, "excl.callchain_user", exclude_callchain_user); + ret += PRINT_ATTR2(mmap2, comm_exec); + ret += __PRINT_ATTR("%u",,use_clockid); + ret += PRINT_ATTR_U32(wakeup_events); ret += PRINT_ATTR_U32(wakeup_watermark); @@ -1055,6 +1067,7 @@ static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp) ret += PRINT_ATTR_X64(branch_sample_type); ret += PRINT_ATTR_X64(sample_regs_user); ret += PRINT_ATTR_U32(sample_stack_user); + ret += PRINT_ATTR_U32(clockid); ret += PRINT_ATTR_X64(sample_regs_intr); ret += fprintf(fp, "%.60s\n", graph_dotted_line); @@ -1085,6 +1098,12 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, } fallback_missing_features: + if (perf_missing_features.clockid_wrong) + evsel->attr.clockid = CLOCK_MONOTONIC; /* should always work */ + if (perf_missing_features.clockid) { + evsel->attr.use_clockid = 0; + evsel->attr.clockid = 0; + } if (perf_missing_features.cloexec) flags &= ~(unsigned long)PERF_FLAG_FD_CLOEXEC; if (perf_missing_features.mmap2) @@ -1122,6 +1141,17 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, goto try_fallback; } set_rlimit = NO_CHANGE; + + /* + * If we succeeded but had to kill clockid, fail and + * have perf_evsel__open_strerror() print us a nice + * error. + */ + if (perf_missing_features.clockid || + perf_missing_features.clockid_wrong) { + err = -EINVAL; + goto out_close; + } } } @@ -1155,7 +1185,17 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, if (err != -EINVAL || cpu > 0 || thread > 0) goto out_close; - if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) { + /* + * Must probe features in the order they were added to the + * perf_event_attr interface. 
+ */ + if (!perf_missing_features.clockid_wrong && evsel->attr.use_clockid) { + perf_missing_features.clockid_wrong = true; + goto fallback_missing_features; + } else if (!perf_missing_features.clockid && evsel->attr.use_clockid) { + perf_missing_features.clockid = true; + goto fallback_missing_features; + } else if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) { perf_missing_features.cloexec = true; goto fallback_missing_features; } else if (!perf_missing_features.mmap2 && evsel->attr.mmap2) { @@ -2063,9 +2103,7 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, if_print(exclude_hv); if_print(exclude_idle); if_print(mmap); - if_print(mmap2); if_print(comm); - if_print(comm_exec); if_print(freq); if_print(inherit_stat); if_print(enable_on_exec); @@ -2076,10 +2114,17 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, if_print(sample_id_all); if_print(exclude_host); if_print(exclude_guest); + if_print(mmap2); + if_print(comm_exec); + if_print(use_clockid); if_print(__reserved_1); if_print(wakeup_events); if_print(bp_type); if_print(branch_sample_type); + if_print(sample_regs_user); + if_print(sample_stack_user); + if_print(clockid); + if_print(sample_regs_intr); } out: fputc('\n', fp); @@ -2158,6 +2203,12 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, "The PMU counters are busy/taken by another profiler.\n" "We found oprofile daemon running, please stop it and try again."); break; + case EINVAL: + if (perf_missing_features.clockid) + return scnprintf(msg, size, "clockid feature not supported."); + if (perf_missing_features.clockid_wrong) + return scnprintf(msg, size, "wrong clockid (%d).", clockid); + break; default: break; } diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index fb432153e2aa..de5f4669ba5f 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1098,6 +1098,9 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp) } fprintf(fp, " }"); } + if (evsel->attr.use_clockid) + fprintf(fp, ", clockid = %d", evsel->attr.clockid); + fputc('\n', fp); } From 2c5e8c52c6354f77c4019357be8231bcc34456f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Apr 2015 11:09:54 +0200 Subject: [PATCH 15/19] perf tools: Merge all perf_event_attr print functions Currently there are 3 different and incomplete implementations (that I found) of printing perf_event_attr. This is quite silly. Merge the lot. While this patch does not retain the exact form, all the printing I found is debug output, and thus it should not be critical. Also, I cannot find a single print_event_desc() caller. 
Pre: $ perf record -vv -e cycles -- sleep 1 ------------------------------------------------------------ perf_event_attr: type 0 size 104 config 0 sample_period 4000 sample_freq 4000 sample_type 0x107 read_format 0 disabled 1 inherit 1 pinned 0 exclusive 0 exclude_user 0 exclude_kernel 0 exclude_hv 0 exclude_idle 0 mmap 1 comm 1 mmap2 1 comm_exec 1 freq 1 inherit_stat 0 enable_on_exec 1 task 1 watermark 0 precise_ip 0 mmap_data 0 sample_id_all 1 exclude_host 0 exclude_guest 1 excl.callchain_kern 0 excl.callchain_user 0 wakeup_events 0 wakeup_watermark 0 bp_type 0 bp_addr 0 config1 0 bp_len 0 config2 0 branch_sample_type 0 sample_regs_user 0 sample_stack_user 0 sample_regs_intr 0 ------------------------------------------------------------ $ perf evlist -vv cycles: sample_freq=4000, size: 104, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, mmap2: 1, comm: 1, comm_exec: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1 Post: $ ./perf record -vv -e cycles -- sleep 1 ------------------------------------------------------------ perf_event_attr: size 112 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|PERIOD disabled 1 inherit 1 mmap 1 comm 1 freq 1 enable_on_exec 1 task 1 sample_id_all 1 exclude_guest 1 mmap2 1 comm_exec 1 ------------------------------------------------------------ $ ./perf evlist -vv cycles: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1 Signed-off-by: Peter Zijlstra (Intel) Acked-by: Adrian Hunter Acked-by: Ingo Molnar Acked-by: Jiri Olsa Cc: "H. Peter Anvin" Cc: Andrew Morton Cc: David Ahern Cc: John Stultz Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150407091150.644238729@infradead.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 272 +++++++++++++++++---------------------- tools/perf/util/evsel.h | 6 + tools/perf/util/header.c | 29 +---- 3 files changed, 133 insertions(+), 174 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d190f99a3a97..33e3fd8c2e68 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1011,70 +1011,126 @@ static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread) return fd; } -#define __PRINT_ATTR(fmt, cast, field) \ - fprintf(fp, " %-19s "fmt"\n", #field, cast attr->field) +struct bit_names { + int bit; + const char *name; +}; -#define PRINT_ATTR_U32(field) __PRINT_ATTR("%u" , , field) -#define PRINT_ATTR_X32(field) __PRINT_ATTR("%#x", , field) -#define PRINT_ATTR_U64(field) __PRINT_ATTR("%" PRIu64, (uint64_t), field) -#define PRINT_ATTR_X64(field) __PRINT_ATTR("%#"PRIx64, (uint64_t), field) - -#define PRINT_ATTR2N(name1, field1, name2, field2) \ - fprintf(fp, " %-19s %u %-19s %u\n", \ - name1, attr->field1, name2, attr->field2) - -#define PRINT_ATTR2(field1, field2) \ - PRINT_ATTR2N(#field1, field1, #field2, field2) - -static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp) +static void __p_bits(char *buf, size_t size, u64 value, struct bit_names *bits) { - size_t ret = 0; + bool first_bit = true; + int i = 0; - ret += fprintf(fp, "%.60s\n", graph_dotted_line); - ret += fprintf(fp, "perf_event_attr:\n"); + do { + if (value & bits[i].bit) { + buf += scnprintf(buf, size, "%s%s", first_bit ? 
"" : "|", bits[i].name); + first_bit = false; + } + } while (bits[++i].name != NULL); +} - ret += PRINT_ATTR_U32(type); - ret += PRINT_ATTR_U32(size); - ret += PRINT_ATTR_X64(config); - ret += PRINT_ATTR_U64(sample_period); - ret += PRINT_ATTR_U64(sample_freq); - ret += PRINT_ATTR_X64(sample_type); - ret += PRINT_ATTR_X64(read_format); +static void __p_sample_type(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_##n, #n } + struct bit_names bits[] = { + bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR), + bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU), + bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), + bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), + bit_name(IDENTIFIER), bit_name(REGS_INTR), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} - ret += PRINT_ATTR2(disabled, inherit); - ret += PRINT_ATTR2(pinned, exclusive); - ret += PRINT_ATTR2(exclude_user, exclude_kernel); - ret += PRINT_ATTR2(exclude_hv, exclude_idle); - ret += PRINT_ATTR2(mmap, comm); - ret += PRINT_ATTR2(freq, inherit_stat); - ret += PRINT_ATTR2(enable_on_exec, task); - ret += PRINT_ATTR2(watermark, precise_ip); - ret += PRINT_ATTR2(mmap_data, sample_id_all); - ret += PRINT_ATTR2(exclude_host, exclude_guest); - ret += PRINT_ATTR2N("excl.callchain_kern", exclude_callchain_kernel, - "excl.callchain_user", exclude_callchain_user); - ret += PRINT_ATTR2(mmap2, comm_exec); - ret += __PRINT_ATTR("%u",,use_clockid); +static void __p_read_format(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_FORMAT_##n, #n } + struct bit_names bits[] = { + bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), + bit_name(ID), bit_name(GROUP), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} +#define BUF_SIZE 1024 - ret += PRINT_ATTR_U32(wakeup_events); - ret += PRINT_ATTR_U32(wakeup_watermark); - ret += PRINT_ATTR_X32(bp_type); - ret += PRINT_ATTR_X64(bp_addr); - ret += PRINT_ATTR_X64(config1); - ret += PRINT_ATTR_U64(bp_len); - ret += PRINT_ATTR_X64(config2); - ret += PRINT_ATTR_X64(branch_sample_type); - ret += PRINT_ATTR_X64(sample_regs_user); - ret += PRINT_ATTR_U32(sample_stack_user); - ret += PRINT_ATTR_U32(clockid); - ret += PRINT_ATTR_X64(sample_regs_intr); +#define p_hex(val) snprintf(buf, BUF_SIZE, "%"PRIx64, (uint64_t)(val)) +#define p_unsigned(val) snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val)) +#define p_signed(val) snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val)) +#define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val) +#define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) - ret += fprintf(fp, "%.60s\n", graph_dotted_line); +#define PRINT_ATTRn(_n, _f, _p) \ +do { \ + if (attr->_f) { \ + _p(attr->_f); \ + ret += attr__fprintf(fp, _n, buf, priv);\ + } \ +} while (0) + +#define PRINT_ATTRf(_f, _p) PRINT_ATTRn(#_f, _f, _p) + +int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, + attr__fprintf_f attr__fprintf, void *priv) +{ + char buf[BUF_SIZE]; + int ret = 0; + + PRINT_ATTRf(type, p_unsigned); + PRINT_ATTRf(size, p_unsigned); + PRINT_ATTRf(config, p_hex); + PRINT_ATTRn("{ sample_period, sample_freq }", sample_period, p_unsigned); + PRINT_ATTRf(sample_type, p_sample_type); + PRINT_ATTRf(read_format, p_read_format); + + PRINT_ATTRf(disabled, p_unsigned); + PRINT_ATTRf(inherit, p_unsigned); + PRINT_ATTRf(pinned, p_unsigned); + PRINT_ATTRf(exclusive, p_unsigned); + PRINT_ATTRf(exclude_user, p_unsigned); + PRINT_ATTRf(exclude_kernel, 
p_unsigned); + PRINT_ATTRf(exclude_hv, p_unsigned); + PRINT_ATTRf(exclude_idle, p_unsigned); + PRINT_ATTRf(mmap, p_unsigned); + PRINT_ATTRf(comm, p_unsigned); + PRINT_ATTRf(freq, p_unsigned); + PRINT_ATTRf(inherit_stat, p_unsigned); + PRINT_ATTRf(enable_on_exec, p_unsigned); + PRINT_ATTRf(task, p_unsigned); + PRINT_ATTRf(watermark, p_unsigned); + PRINT_ATTRf(precise_ip, p_unsigned); + PRINT_ATTRf(mmap_data, p_unsigned); + PRINT_ATTRf(sample_id_all, p_unsigned); + PRINT_ATTRf(exclude_host, p_unsigned); + PRINT_ATTRf(exclude_guest, p_unsigned); + PRINT_ATTRf(exclude_callchain_kernel, p_unsigned); + PRINT_ATTRf(exclude_callchain_user, p_unsigned); + PRINT_ATTRf(mmap2, p_unsigned); + PRINT_ATTRf(comm_exec, p_unsigned); + PRINT_ATTRf(use_clockid, p_unsigned); + + PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned); + PRINT_ATTRf(bp_type, p_unsigned); + PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex); + PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex); + PRINT_ATTRf(sample_regs_user, p_hex); + PRINT_ATTRf(sample_stack_user, p_unsigned); + PRINT_ATTRf(clockid, p_signed); + PRINT_ATTRf(sample_regs_intr, p_hex); return ret; } +static int __open_attr__fprintf(FILE *fp, const char *name, const char *val, + void *priv __attribute__((unused))) +{ + return fprintf(fp, " %-32s %s\n", name, val); +} + static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads) { @@ -1114,8 +1170,12 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, if (perf_missing_features.sample_id_all) evsel->attr.sample_id_all = 0; - if (verbose >= 2) - perf_event_attr__fprintf(&evsel->attr, stderr); + if (verbose >= 2) { + fprintf(stderr, "%.60s\n", graph_dotted_line); + fprintf(stderr, "perf_event_attr:\n"); + perf_event_attr__fprintf(stderr, &evsel->attr, __open_attr__fprintf, NULL); + fprintf(stderr, "%.60s\n", graph_dotted_line); + } for (cpu = 0; cpu < cpus->nr; cpu++) { @@ -1996,62 +2056,9 @@ static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) return ret; } -static int __if_fprintf(FILE *fp, bool *first, const char *field, u64 value) +static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv) { - if (value == 0) - return 0; - - return comma_fprintf(fp, first, " %s: %" PRIu64, field, value); -} - -#define if_print(field) printed += __if_fprintf(fp, &first, #field, evsel->attr.field) - -struct bit_names { - int bit; - const char *name; -}; - -static int bits__fprintf(FILE *fp, const char *field, u64 value, - struct bit_names *bits, bool *first) -{ - int i = 0, printed = comma_fprintf(fp, first, " %s: ", field); - bool first_bit = true; - - do { - if (value & bits[i].bit) { - printed += fprintf(fp, "%s%s", first_bit ? 
"" : "|", bits[i].name); - first_bit = false; - } - } while (bits[++i].name != NULL); - - return printed; -} - -static int sample_type__fprintf(FILE *fp, bool *first, u64 value) -{ -#define bit_name(n) { PERF_SAMPLE_##n, #n } - struct bit_names bits[] = { - bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR), - bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU), - bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), - bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), - bit_name(IDENTIFIER), bit_name(REGS_INTR), - { .name = NULL, } - }; -#undef bit_name - return bits__fprintf(fp, "sample_type", value, bits, first); -} - -static int read_format__fprintf(FILE *fp, bool *first, u64 value) -{ -#define bit_name(n) { PERF_FORMAT_##n, #n } - struct bit_names bits[] = { - bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), - bit_name(ID), bit_name(GROUP), - { .name = NULL, } - }; -#undef bit_name - return bits__fprintf(fp, "read_format", value, bits, first); + return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val); } int perf_evsel__fprintf(struct perf_evsel *evsel, @@ -2080,52 +2087,13 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, printed += fprintf(fp, "%s", perf_evsel__name(evsel)); - if (details->verbose || details->freq) { + if (details->verbose) { + printed += perf_event_attr__fprintf(fp, &evsel->attr, + __print_attr__fprintf, &first); + } else if (details->freq) { printed += comma_fprintf(fp, &first, " sample_freq=%" PRIu64, (u64)evsel->attr.sample_freq); } - - if (details->verbose) { - if_print(type); - if_print(config); - if_print(config1); - if_print(config2); - if_print(size); - printed += sample_type__fprintf(fp, &first, evsel->attr.sample_type); - if (evsel->attr.read_format) - printed += read_format__fprintf(fp, &first, evsel->attr.read_format); - if_print(disabled); - if_print(inherit); - if_print(pinned); - if_print(exclusive); - if_print(exclude_user); - if_print(exclude_kernel); - if_print(exclude_hv); - if_print(exclude_idle); - if_print(mmap); - if_print(comm); - if_print(freq); - if_print(inherit_stat); - if_print(enable_on_exec); - if_print(task); - if_print(watermark); - if_print(precise_ip); - if_print(mmap_data); - if_print(sample_id_all); - if_print(exclude_host); - if_print(exclude_guest); - if_print(mmap2); - if_print(comm_exec); - if_print(use_clockid); - if_print(__reserved_1); - if_print(wakeup_events); - if_print(bp_type); - if_print(branch_sample_type); - if_print(sample_regs_user); - if_print(sample_stack_user); - if_print(clockid); - if_print(sample_regs_intr); - } out: fputc('\n', fp); return ++printed; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index c5a43d6b13dc..e486151b0308 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -360,4 +360,10 @@ static inline bool has_branch_callstack(struct perf_evsel *evsel) { return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } + +typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *); + +int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, + attr__fprintf_f attr__fprintf, void *priv); + #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index de5f4669ba5f..fff3b2a455ae 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1055,6 +1055,12 @@ read_event_desc(struct perf_header *ph, int fd) goto out; } +static int __desc_attr__fprintf(FILE *fp, const char *name, const char *val, + void *priv __attribute__((unused))) 
+{ + return fprintf(fp, ", %s = %s", name, val); +} + static void print_event_desc(struct perf_header *ph, int fd, FILE *fp) { struct perf_evsel *evsel, *events = read_event_desc(ph, fd); @@ -1069,26 +1075,6 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp) for (evsel = events; evsel->attr.size; evsel++) { fprintf(fp, "# event : name = %s, ", evsel->name); - fprintf(fp, "type = %d, config = 0x%"PRIx64 ", config1 = 0x%"PRIx64", config2 = 0x%"PRIx64, evsel->attr.type, (u64)evsel->attr.config, (u64)evsel->attr.config1, (u64)evsel->attr.config2); - fprintf(fp, ", excl_usr = %d, excl_kern = %d", evsel->attr.exclude_user, evsel->attr.exclude_kernel); - fprintf(fp, ", excl_host = %d, excl_guest = %d", evsel->attr.exclude_host, evsel->attr.exclude_guest); - fprintf(fp, ", precise_ip = %d", evsel->attr.precise_ip); - fprintf(fp, ", attr_mmap2 = %d", evsel->attr.mmap2); - fprintf(fp, ", attr_mmap = %d", evsel->attr.mmap); - fprintf(fp, ", attr_mmap_data = %d", evsel->attr.mmap_data); if (evsel->ids) { fprintf(fp, ", id = {"); for (j = 0, id = evsel->id; j < evsel->ids; j++, id++) { @@ -1098,9 +1084,8 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp) } fprintf(fp, " }"); } - if (evsel->attr.use_clockid) - fprintf(fp, ", clockid = %d", evsel->attr.clockid); + perf_event_attr__fprintf(fp, &evsel->attr, __desc_attr__fprintf, NULL); fputc('\n', fp); } From f6c15621f04b97ce882c66e5055f0ac325fb8eb8 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 8 Apr 2015 02:14:34 +0000 Subject: [PATCH 16/19] perf probe: Fix ARM 32 build error Commit 9b118acae310f57baee770b5db402500d8695e50 ("perf probe: Fix to handle aliased symbols in glibc") uses the non-portable format '%lx' to print a u64 argument, which causes a build error on 32-bit ARM. This patch replaces it with PRIx64. Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1428459274-138470-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 4fd49f021073..b78851732a71 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -322,7 +322,8 @@ static int find_alternative_probe_point(struct debuginfo *dinfo, ret = -ENOENT; goto out; } - pr_debug("Symbol %s address found : %lx\n", pp->function, address); + pr_debug("Symbol %s address found : %" PRIx64 "\n", + pp->function, address); ret = debuginfo__find_probe_point(dinfo, (unsigned long)address, result); From 54a50f93eb13ab5efa7366627534e2b0f7caa8e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Apr 2015 19:17:15 +0200 Subject: [PATCH 17/19] perf tests: Fix attr tests Commit 1a5941312414 ("perf: Add wakeup watermark control to the AUX area") enlarged perf_event_attr, but did not update the attr tests. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Cc: "H. 
Peter Anvin" Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Kaixu Xia Cc: Kan Liang Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Markus T Metzger Cc: Mathieu Poirier Link: http://lkml.kernel.org/n/20150407171715.GA22603@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr/base-record | 2 +- tools/perf/tests/attr/base-stat | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index d3095dafed36..7e6d74946e04 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -5,7 +5,7 @@ group_fd=-1 flags=0|8 cpu=* type=0|1 -size=104 +size=112 config=0 sample_period=4000 sample_type=263 diff --git a/tools/perf/tests/attr/base-stat b/tools/perf/tests/attr/base-stat index 872ed7e24c7c..f4cf148f14cb 100644 --- a/tools/perf/tests/attr/base-stat +++ b/tools/perf/tests/attr/base-stat @@ -5,7 +5,7 @@ group_fd=-1 flags=0|8 cpu=* type=0 -size=104 +size=112 config=0 sample_period=0 sample_type=0 From f6fcc1433a4a9057b2977313f31eadbc1c84268b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 8 Apr 2015 10:59:32 +0000 Subject: [PATCH 18/19] perf report: Don't call map__kmap if map is NULL. report__warn_kptr_restrict() calls map__kmap(kernel_map) before checking kernel_map againest NULL. Which is dangerous, since map__kmap() will return a invalid and not NULL address. It will trigger a warning message in map__kmap() after the patch "perf: kmaps: enforce usage of kmaps to protect futher bugs." was applied. This patch fixes it by adding the missing checking. Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Jiri Olsa Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1428490772-135393-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index b5b2ad4ca9c4..476cdf7afcca 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -347,7 +347,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, static void report__warn_kptr_restrict(const struct report *rep) { struct map *kernel_map = rep->session->machines.host.vmlinux_maps[MAP__FUNCTION]; - struct kmap *kernel_kmap = map__kmap(kernel_map); + struct kmap *kernel_kmap = kernel_map ? map__kmap(kernel_map) : NULL; if (kernel_map == NULL || (kernel_map->dso->hit && From a1e12da4796a4ddd0e911687a290eb396d1c64bf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Apr 2015 23:25:14 +0200 Subject: [PATCH 19/19] perf tools: Add 'I' event modifier for exclude_idle bit Adding 'I' event modifier to have complete set of modifiers for perf_event_attr:exclude_* bits. Any event specified with 'I' modifier will have the perf_event_attr:exclude_idle bit set. $ perf record -e cycles:I -vv ls 2>&1 | grep exclude_idle exclude_hv 0 exclude_idle 1 Adding automated tests. 
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: William Cohen Link: http://lkml.kernel.org/r/1428441919-23099-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 1 + tools/perf/tests/parse-events.c | 40 ++++++++++++++++++++++++++ tools/perf/util/parse-events.c | 8 +++++- tools/perf/util/parse-events.l | 2 +- 4 files changed, 49 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 4692d277980b..bada8933fdd4 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -26,6 +26,7 @@ counted. The following modifiers exist: u - user-space counting k - kernel counting h - hypervisor counting + I - non idle counting G - guest counting (in KVM guests) H - host counting (not in KVM guests) p - precise level diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index ac243ebcb20a..3de744961739 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -295,6 +295,36 @@ static int test__checkevent_genhw_modifier(struct perf_evlist *evlist) return test__checkevent_genhw(evlist); } +static int test__checkevent_exclude_idle_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude idle", evsel->attr.exclude_idle); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + + return test__checkevent_symbolic_name(evlist); +} + +static int test__checkevent_exclude_idle_modifier_1(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude idle", evsel->attr.exclude_idle); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + + return test__checkevent_symbolic_name(evlist); +} + static int test__checkevent_breakpoint_modifier(struct perf_evlist *evlist) { struct perf_evsel *evsel = perf_evlist__first(evlist); @@ -1494,6 +1524,16 @@ static struct evlist_test test__events[] = { .id = 100, }, #endif + { + .name = "instructions:I", + .check = test__checkevent_exclude_idle_modifier, + .id = 45, + }, + { + .name = "instructions:kIG", + .check = test__checkevent_exclude_idle_modifier_1, + .id = 46, + }, }; static struct evlist_test test__events_pmu[] = { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index fe07573d5ed4..be0655388b38 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -709,6 +709,7 @@ struct event_modifier { int eh; int eH; int eG; + int eI; int precise; int exclude_GH; int sample_read; @@ -723,6 +724,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, int eh = evsel ? 
evsel->attr.exclude_hv : 0; int eH = evsel ? evsel->attr.exclude_host : 0; int eG = evsel ? evsel->attr.exclude_guest : 0; + int eI = evsel ? evsel->attr.exclude_idle : 0; int precise = evsel ? evsel->attr.precise_ip : 0; int sample_read = 0; int pinned = evsel ? evsel->attr.pinned : 0; @@ -753,6 +755,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str, if (!exclude_GH) exclude_GH = eG = eH = 1; eH = 0; + } else if (*str == 'I') { + eI = 1; } else if (*str == 'p') { precise++; /* use of precise requires exclude_guest */ @@ -786,6 +790,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, mod->eh = eh; mod->eH = eH; mod->eG = eG; + mod->eI = eI; mod->precise = precise; mod->exclude_GH = exclude_GH; mod->sample_read = sample_read; @@ -803,7 +808,7 @@ static int check_modifier(char *str) char *p = str; /* The sizeof includes 0 byte as well. */ - if (strlen(str) > (sizeof("ukhGHpppSD") - 1)) + if (strlen(str) > (sizeof("ukhGHpppSDI") - 1)) return -1; while (*p) { @@ -839,6 +844,7 @@ int parse_events__modifier_event(struct list_head *list, char *str, bool add) evsel->attr.precise_ip = mod.precise; evsel->attr.exclude_host = mod.eH; evsel->attr.exclude_guest = mod.eG; + evsel->attr.exclude_idle = mod.eI; evsel->exclude_GH = mod.exclude_GH; evsel->sample_read = mod.sample_read; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 94eacb6c1ef7..8895cf3132ab 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -101,7 +101,7 @@ num_raw_hex [a-fA-F0-9]+ name [a-zA-Z_*?][a-zA-Z0-9_*?]* name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?]* /* If you add a modifier you need to update check_modifier() */ -modifier_event [ukhpGHSD]+ +modifier_event [ukhpGHSDI]+ modifier_bp [rwx]{1,3} %%
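As a closing aside on the perf_event_attr__fprintf() rework earlier in this series: its key design choice is separating field traversal from formatting via the attr__fprintf_f callback, so a single field walker serves both the one-field-per-line -vv dump and the comma-separated evlist/header styles. The following standalone sketch shows the same pattern; it is not perf code, and demo_attr, demo_attr__fprintf and the two *_style callbacks are invented for illustration:

	#include <stdbool.h>
	#include <stdio.h>

	/* Same shape as perf's attr__fprintf_f: called once per non-zero field. */
	typedef int (*field_fprintf_f)(FILE *, const char *, const char *, void *);

	struct demo_attr {
		unsigned int size;
		unsigned long long config;
		unsigned int exclude_idle;
	};

	/* Walk the fields once: skip zero fields, format into a scratch buffer,
	 * and delegate the actual output, as PRINT_ATTRf() does in evsel.c. */
	static int demo_attr__fprintf(FILE *fp, struct demo_attr *attr,
				      field_fprintf_f print, void *priv)
	{
		char buf[32];
		int ret = 0;

		if (attr->size) {
			snprintf(buf, sizeof(buf), "%u", attr->size);
			ret += print(fp, "size", buf, priv);
		}
		if (attr->config) {
			snprintf(buf, sizeof(buf), "%#llx", attr->config);
			ret += print(fp, "config", buf, priv);
		}
		if (attr->exclude_idle) {
			snprintf(buf, sizeof(buf), "%u", attr->exclude_idle);
			ret += print(fp, "exclude_idle", buf, priv);
		}
		return ret;
	}

	/* One consumer: a one-field-per-line style, as in the -vv open path. */
	static int open_style(FILE *fp, const char *name, const char *val,
			      void *priv __attribute__((unused)))
	{
		return fprintf(fp, "  %-32s %s\n", name, val);
	}

	/* Another consumer: a comma-separated style, 'first' state via priv. */
	static int list_style(FILE *fp, const char *name, const char *val, void *priv)
	{
		bool *first = priv;
		int ret = fprintf(fp, "%s%s: %s", *first ? "" : ", ", name, val);

		*first = false;
		return ret;
	}

	int main(void)
	{
		struct demo_attr attr = { .size = 112, .config = 0x3c, .exclude_idle = 1 };
		bool first = true;

		demo_attr__fprintf(stdout, &attr, open_style, NULL);
		demo_attr__fprintf(stdout, &attr, list_style, &first);
		fputc('\n', stdout);
		return 0;
	}

Running it prints the same three fields twice, once per style, without duplicating any of the per-field logic.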