From 8ea92ceb748535799e3e9f35afb85bdc23bf6d7c Mon Sep 17 00:00:00 2001 From: He Kuang Date: Tue, 7 Apr 2015 17:31:10 +0800 Subject: [PATCH 01/19] perf evlist: Fix inverted logic in perf_mmap__empty perf_evlist__mmap_consume() uses perf_mmap__empty() to judge whether a perf_mmap is empty and can be released, but the result is inverted. Fix it. Signed-off-by: He Kuang Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1428399071-7141-1-git-send-email-hekuang@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 82bf224bbee9..76ef7ee62640 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -695,7 +695,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) static bool perf_mmap__empty(struct perf_mmap *md) { - return perf_mmap__read_head(md) != md->prev; + return perf_mmap__read_head(md) == md->prev; } static void perf_evlist__mmap_get(struct perf_evlist *evlist, int idx) From ba92732e9808df679ddf75c5ea1c0caae6d7dce2 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 7 Apr 2015 08:22:45 +0000 Subject: [PATCH 02/19] perf kmaps: Check kmaps to make code more robust This patch adds checks in the places where map__kmap() is used to get a kmap from a struct map. Error messages are added to map__kmap() to warn about invalid accesses to the kmap (when !map->dso->kernel, kmap(map) does not exist at all). It also introduces map__kmaps(), which warns about uninitialized kmaps. Reviewed-by: Ingo Molnar Signed-off-by: Wang Nan Cc: pi3orama@163.com Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Link: http://lkml.kernel.org/r/1428394966-131044-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 5 ++++- tools/perf/util/map.c | 20 ++++++++++++++++++++ tools/perf/util/map.h | 6 ++---- tools/perf/util/probe-event.c | 2 ++ tools/perf/util/session.c | 3 +++ tools/perf/util/symbol-elf.c | 16 +++++++++++----- tools/perf/util/symbol.c | 34 ++++++++++++++++++++++++++++------ 7 files changed, 70 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index e45c8f33a8fd..9c380a2caa54 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -679,6 +679,9 @@ int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) machine->vmlinux_maps[type]->unmap_ip = identity__map_ip; kmap = map__kmap(machine->vmlinux_maps[type]); + if (!kmap) + return -1; + kmap->kmaps = &machine->kmaps; map_groups__insert(&machine->kmaps, machine->vmlinux_maps[type]); @@ -700,7 +703,7 @@ void machine__destroy_kernel_maps(struct machine *machine) kmap = map__kmap(machine->vmlinux_maps[type]); map_groups__remove(&machine->kmaps, machine->vmlinux_maps[type]); - if (kmap->ref_reloc_sym) { + if (kmap && kmap->ref_reloc_sym) { /* * ref_reloc_sym is shared among all maps, so free just * on one of them. 
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 62ca9f2607d5..a14f08f41686 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -778,3 +778,23 @@ struct map *maps__next(struct map *map) return rb_entry(next, struct map, rb_node); return NULL; } + +struct kmap *map__kmap(struct map *map) +{ + if (!map->dso || !map->dso->kernel) { + pr_err("Internal error: map__kmap with a non-kernel map\n"); + return NULL; + } + return (struct kmap *)(map + 1); +} + +struct map_groups *map__kmaps(struct map *map) +{ + struct kmap *kmap = map__kmap(map); + + if (!kmap || !kmap->kmaps) { + pr_err("Internal error: map__kmaps with a non-kernel map\n"); + return NULL; + } + return kmap->kmaps; +} diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 0e42438b1e59..ec19c59ca38e 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -76,10 +76,8 @@ static inline struct map_groups *map_groups__get(struct map_groups *mg) void map_groups__put(struct map_groups *mg); -static inline struct kmap *map__kmap(struct map *map) -{ - return (struct kmap *)(map + 1); -} +struct kmap *map__kmap(struct map *map); +struct map_groups *map__kmaps(struct map *map); static inline u64 map__map_ip(struct map *map, u64 ip) { diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 8feac0774c41..4fd49f021073 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -135,6 +135,8 @@ static struct ref_reloc_sym *kernel_get_ref_reloc_sym(void) return NULL; kmap = map__kmap(host_machine->vmlinux_maps[MAP__FUNCTION]); + if (!kmap) + return NULL; return kmap->ref_reloc_sym; } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index dfacf1d50162..0c74012575ac 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1466,6 +1466,9 @@ int maps__set_kallsyms_ref_reloc_sym(struct map **maps, for (i = 0; i < MAP__NR_TYPES; ++i) { struct kmap *kmap = map__kmap(maps[i]); + + if (!kmap) + continue; kmap->ref_reloc_sym = ref; } diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 476268c99431..a7ab6063e038 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -776,6 +776,7 @@ int dso__load_sym(struct dso *dso, struct map *map, symbol_filter_t filter, int kmodule) { struct kmap *kmap = dso->kernel ? map__kmap(map) : NULL; + struct map_groups *kmaps = kmap ? 
map__kmaps(map) : NULL; struct map *curr_map = map; struct dso *curr_dso = dso; Elf_Data *symstrs, *secstrs; @@ -791,6 +792,9 @@ int dso__load_sym(struct dso *dso, struct map *map, int nr = 0; bool remap_kernel = false, adjust_kernel_syms = false; + if (kmap && !kmaps) + return -1; + dso->symtab_type = syms_ss->type; dso->is_64_bit = syms_ss->is_64_bit; dso->rel = syms_ss->ehdr.e_type == ET_REL; @@ -958,8 +962,10 @@ int dso__load_sym(struct dso *dso, struct map *map, map->map_ip = map__map_ip; map->unmap_ip = map__unmap_ip; /* Ensure maps are correctly ordered */ - map_groups__remove(kmap->kmaps, map); - map_groups__insert(kmap->kmaps, map); + if (kmaps) { + map_groups__remove(kmaps, map); + map_groups__insert(kmaps, map); + } } /* @@ -983,7 +989,7 @@ int dso__load_sym(struct dso *dso, struct map *map, snprintf(dso_name, sizeof(dso_name), "%s%s", dso->short_name, section_name); - curr_map = map_groups__find_by_name(kmap->kmaps, map->type, dso_name); + curr_map = map_groups__find_by_name(kmaps, map->type, dso_name); if (curr_map == NULL) { u64 start = sym.st_value; @@ -1013,7 +1019,7 @@ int dso__load_sym(struct dso *dso, struct map *map, curr_map->unmap_ip = identity__map_ip; } curr_dso->symtab_type = dso->symtab_type; - map_groups__insert(kmap->kmaps, curr_map); + map_groups__insert(kmaps, curr_map); /* * The new DSO should go to the kernel DSOS */ @@ -1075,7 +1081,7 @@ int dso__load_sym(struct dso *dso, struct map *map, * We need to fixup this here too because we create new * maps here, for things like vsyscall sections. */ - __map_groups__fixup_end(kmap->kmaps, map->type); + __map_groups__fixup_end(kmaps, map->type); } } err = nr; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index fddeb9073039..201f6c4ca738 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -630,13 +630,16 @@ static int dso__load_all_kallsyms(struct dso *dso, const char *filename, static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map, symbol_filter_t filter) { - struct map_groups *kmaps = map__kmap(map)->kmaps; + struct map_groups *kmaps = map__kmaps(map); struct map *curr_map; struct symbol *pos; int count = 0, moved = 0; struct rb_root *root = &dso->symbols[map->type]; struct rb_node *next = rb_first(root); + if (!kmaps) + return -1; + while (next) { char *module; @@ -682,8 +685,8 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map, static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta, symbol_filter_t filter) { - struct map_groups *kmaps = map__kmap(map)->kmaps; - struct machine *machine = kmaps->machine; + struct map_groups *kmaps = map__kmaps(map); + struct machine *machine; struct map *curr_map = map; struct symbol *pos; int count = 0, moved = 0; @@ -691,6 +694,11 @@ static int dso__split_kallsyms(struct dso *dso, struct map *map, u64 delta, struct rb_node *next = rb_first(root); int kernel_range = 0; + if (!kmaps) + return -1; + + machine = kmaps->machine; + while (next) { char *module; @@ -1025,9 +1033,12 @@ static bool filename_from_kallsyms_filename(char *filename, static int validate_kcore_modules(const char *kallsyms_filename, struct map *map) { - struct map_groups *kmaps = map__kmap(map)->kmaps; + struct map_groups *kmaps = map__kmaps(map); char modules_filename[PATH_MAX]; + if (!kmaps) + return -EINVAL; + if (!filename_from_kallsyms_filename(modules_filename, "modules", kallsyms_filename)) return -EINVAL; @@ -1043,6 +1054,9 @@ static int validate_kcore_addresses(const char *kallsyms_filename, { struct 
kmap *kmap = map__kmap(map); + if (!kmap) + return -EINVAL; + if (kmap->ref_reloc_sym && kmap->ref_reloc_sym->name) { u64 start; @@ -1081,8 +1095,8 @@ static int kcore_mapfn(u64 start, u64 len, u64 pgoff, void *data) static int dso__load_kcore(struct dso *dso, struct map *map, const char *kallsyms_filename) { - struct map_groups *kmaps = map__kmap(map)->kmaps; - struct machine *machine = kmaps->machine; + struct map_groups *kmaps = map__kmaps(map); + struct machine *machine; struct kcore_mapfn_data md; struct map *old_map, *new_map, *replacement_map = NULL; bool is_64_bit; @@ -1090,6 +1104,11 @@ static int dso__load_kcore(struct dso *dso, struct map *map, char kcore_filename[PATH_MAX]; struct symbol *sym; + if (!kmaps) + return -EINVAL; + + machine = kmaps->machine; + /* This function requires that the map is the kernel map */ if (map != machine->vmlinux_maps[map->type]) return -EINVAL; @@ -1202,6 +1221,9 @@ static int kallsyms__delta(struct map *map, const char *filename, u64 *delta) struct kmap *kmap = map__kmap(map); u64 addr; + if (!kmap) + return -1; + if (!kmap->ref_reloc_sym || !kmap->ref_reloc_sym->name) return 0; From 3201f0dc42f7fad9387afc4692cea3d0c730cba2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 6 Apr 2015 14:36:16 +0900 Subject: [PATCH 03/19] tools lib traceevent: Honor operator priority Currently operator priority is ignored and the processed arg is simply set as the right operand. This can result in priority inversion when the right operand is also an operator arg whose priority is lower. For example, the following print format is from the new kmem events. "page=%p", REC->pfn != -1UL ? (((struct page *)(0xffffea0000000000UL)) + (REC->pfn)) : ((void *)0) But this was treated as below: REC->pfn != ((null - 1UL) ? ((struct page *)0xffffea0000000000UL + REC->pfn) : (void *) 0) In this case, the right arg was the '?' operator, which has lower priority, but it was set as the whole right operand, making the output confusing - page was always 0 or 1, since that's the result of the logical operation. With this patch, it is handled properly, like the following: ((REC->pfn != (null - 1UL)) ? ((struct page *)0xffffea0000000000UL + REC->pfn) : (void *) 0)
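The rotation itself is compact. As a rough standalone sketch of what the new process_op() logic does - types and names here are illustrative, simplified from libtraceevent's struct print_arg, with 'prio' following get_op_prio(), where a lower value binds tighter:

struct op_arg {
	int prio;			/* lower value = binds tighter */
	struct op_arg *left, *right;
};

static void rotate_for_priority(struct op_arg *arg, struct op_arg *right)
{
	if (arg->prio < right->prio) {
		struct op_arg tmp;

		/* '!=' takes over the left child of '?' ... */
		arg->right = right->left;

		/* ... the two operator nodes swap in place, so the node
		 * the caller holds now contains the lower-priority op ... */
		tmp = *arg;
		*arg = *right;
		*right = tmp;

		/* ... and the higher-priority op becomes its left child. */
		arg->left = right;
	} else {
		arg->right = right;
	}
}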
Signed-off-by: Namhyung Kim Acked-by: Steven Rostedt Cc: David Ahern Cc: Jiri Olsa Cc: Joonsoo Kim Cc: Minchan Kim Cc: Peter Zijlstra Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1428298576-9785-10-git-send-email-namhyung@kernel.org [ Replaced 'swap' with 'rotate' in a comment as requested by Steve and agreed by Namhyung ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 6d31b6419d37..12a7e2a40c89 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -1939,7 +1939,22 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) goto out_warn_free; type = process_arg_token(event, right, tok, type); - arg->op.right = right; + + if (right->type == PRINT_OP && + get_op_prio(arg->op.op) < get_op_prio(right->op.op)) { + struct print_arg tmp; + + /* rotate ops according to the priority */ + arg->op.right = right->op.left; + + tmp = *arg; + *arg = *right; + *right = tmp; + + arg->op.left = right; + } else { + arg->op.right = right; + } } else if (strcmp(token, "[") == 0) { From 28939e1a1f6d198239d86b1d77fa9fd55773189a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 Apr 2015 14:36:08 +0900 Subject: [PATCH 04/19] perf kmem: Respect -i option Currently 'perf kmem' does not respect the -i option, because file.path is initialized before the options are parsed. Initialize it properly after option parsing. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Jiri Olsa Cc: Joonsoo Kim Cc: Minchan Kim Cc: Peter Zijlstra Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1428298576-9785-2-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index ac303ef9f2f0..4ebf65c79434 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -663,7 +663,6 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) { const char * const default_sort_order = "frag,hit,bytes"; struct perf_data_file file = { - .path = input_name, .mode = PERF_DATA_MODE_READ, }; const struct option kmem_options[] = { @@ -701,6 +700,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) return __cmd_record(argc, argv); } + file.path = input_name; + session = perf_session__new(&file, false, &perf_kmem); if (session == NULL) return -1; From 0755bc4dc77a876aa60d4b3d33b5f6506f21f91b Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:28 +0800 Subject: [PATCH 05/19] perf sched replay: Use struct task_desc instead of struct task_task for correct meaning There is no struct task_task at all; it is a typo left over from the old commits. Fix it to what it should be, to avoid unnecessary misunderstanding. (Both are pointer types of the same size, so the allocated size was unaffected; only the meaning is fixed.) 
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-2-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 3b3a5bb97059..a1893e8dfe17 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -346,7 +346,7 @@ static struct task_desc *register_pid(struct perf_sched *sched, sched->pid_to_task[pid] = task; sched->nr_tasks++; - sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_task *)); + sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *)); BUG_ON(!sched->tasks); sched->tasks[task->nr] = task; From a35e27d0e5d801ff75481a8f639bb4d59ea1aafa Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:29 +0800 Subject: [PATCH 06/19] perf sched replay: Increase the MAX_PID value to fix assertion failure problem The current MAX_PID is only 65536, which causes an assertion failure when there are more than 64 CPU cores on x86_64. This is because the pid_max value on x86_64 is at least PIDS_PER_CPU_DEFAULT * num_possible_cpus() (see pidmap_init() in kernel/pid.c), where PIDS_PER_CPU_DEFAULT is 1024 (defined in include/linux/threads.h). Thus MAX_PID = 65536 corresponds to 65536/1024 = 64 CPU cores. This is obviously not enough for x86_64 and triggers the assertion failure at BUG_ON(pid >= MAX_PID) in the code. Increase the MAX_PID value from 65536 to 1024*1000, which covers x86_64 machines with up to 1000 cores. This number was chosen according to the stack size limit of the calling process: 'ulimit -a' shows that the stack size of a process is 8192 Kbytes, as defined in include/uapi/linux/resource.h (#define _STK_LIM (8*1024*1024)). Thus we choose a value for MAX_PID that is large enough yet keeps the perf process within the 8192 Kbyte stack limit (a quick check of this arithmetic is sketched after the example below). We have calculated and tested that 1024*1000 is OK for MAX_PID. This means perf sched replay can now be used with up to 1000 cores on x86_64 without any assertion failure. Example: Test environment: x86_64 with 160 cores $ cat /proc/sys/kernel/pid_max 163840 Before this patch: $ perf sched replay run measurement overhead: 240 nsecs sleep measurement overhead: 55379 nsecs the run test took 1000004 nsecs the sleep test took 1059424 nsecs perf: builtin-sched.c:330: register_pid: Assertion `!(pid >= 65536)' failed. Aborted After this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55397 nsecs the run test took 999920 nsecs the sleep test took 1053313 nsecs nr_run_events: 10 nr_sleep_events: 1562 nr_wakeup_events: 5 task 0 ( :1: 1), nr_events: 1 task 1 ( :2: 2), nr_events: 1 task 2 ( :3: 3), nr_events: 1 task 3 ( :5: 5), nr_events: 1 ... 
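A quick standalone check of the arithmetic above (a sketch, not perf code; it assumes 8-byte pointers on x86_64 and the default 8192 Kbyte stack limit quoted in the message):

#include <stdio.h>

#define MAX_PID (1024 * 1000)

int main(void)
{
	/* struct task_desc *pid_to_task[MAX_PID] is a member of
	 * struct perf_sched, which sits on the stack of the perf process. */
	unsigned long bytes = MAX_PID * sizeof(void *);

	/* 8192000 bytes < 8388608 bytes (_STK_LIM), so the array fits. */
	printf("pid_to_task: %lu bytes of the %d byte stack limit\n",
	       bytes, 8 * 1024 * 1024);
	return 0;
}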
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-3-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index a1893e8dfe17..c46610447ede 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -28,7 +28,7 @@ #define MAX_CPUS 4096 #define COMM_LEN 20 #define SYM_LEN 129 -#define MAX_PID 65536 +#define MAX_PID 1024000 struct sched_atom; From cb06ac256a16fc1a5ab063107c2b35b3b9e95102 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:30 +0800 Subject: [PATCH 07/19] perf sched replay: Alloc the memory of pid_to_task dynamically to adapt to the unexpected change of pid_max The memory for struct task_desc *pid_to_task[MAX_PID] is currently allocated statically, with a preset size, which has two problems: Problem 1: If pid_max, the maximum number of pids in the system, is much smaller than MAX_PID (1024*1000), stack memory is wasted. This may happen when the number of cpu cores is much smaller than 1000. Problem 2: If pid_max is changed from the default value to a value larger than MAX_PID, an assertion failure occurs. The maximum value of pid_max can be set to pid_max_max (see pidmap_init() in kernel/pid.c), which equals PID_MAX_LIMIT. On x86_64, PID_MAX_LIMIT is 4*1024*1024 (defined in include/linux/threads.h). This is much larger than MAX_PID, and pid_to_task would then take up 32768 Kbytes (4*1024*1024*8/1024), far more than the default 8192 Kbyte stack size of the calling process. Due to these two problems, use calloc to allocate the memory of pid_to_task dynamically (the shape of the allocation is sketched after the example below). Example: Test environment: x86_64 with 160 cores $ cat /proc/sys/kernel/pid_max 163840 $ echo 1025000 > /proc/sys/kernel/pid_max $ cat /proc/sys/kernel/pid_max 1025000 Run some applications until the pid of some process is greater than the value of MAX_PID (1024*1000). Before this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55480 nsecs the run test took 1000008 nsecs the sleep test took 1063151 nsecs perf: builtin-sched.c:330: register_pid: Assertion `!(pid >= 1024000)' failed. Aborted After this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55435 nsecs the run test took 1000004 nsecs the sleep test took 1059312 nsecs nr_run_events: 10 nr_sleep_events: 1562 nr_wakeup_events: 5 task 0 ( :1: 1), nr_events: 1 task 1 ( :2: 2), nr_events: 1 task 2 ( :3: 3), nr_events: 1 task 3 ( :5: 5), nr_events: 1 ... 
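A minimal sketch of the dynamic allocation (plain stdio is used here to read procfs; the patch itself goes through perf's sysctl__read_int() helper, and the table element type is simplified to void *; the helper name is illustrative):

#include <stdio.h>
#include <stdlib.h>

static void **alloc_pid_table(int fallback)
{
	FILE *f = fopen("/proc/sys/kernel/pid_max", "r");
	int pid_max = fallback;		/* MAX_PID in the patch */

	if (f) {
		if (fscanf(f, "%d", &pid_max) != 1)
			pid_max = fallback;
		fclose(f);
	}
	/* one pointer slot per possible pid, zero-initialized */
	return calloc(pid_max, sizeof(void *));
}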
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-4-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index c46610447ede..20d887b222e4 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -23,6 +23,7 @@ #include #include #include +#include #define PR_SET_NAME 15 /* Set process name */ #define MAX_CPUS 4096 @@ -124,7 +125,7 @@ struct perf_sched { struct perf_tool tool; const char *sort_order; unsigned long nr_tasks; - struct task_desc *pid_to_task[MAX_PID]; + struct task_desc **pid_to_task; struct task_desc **tasks; const struct trace_sched_handler *tp_handler; pthread_mutex_t start_work_mutex; @@ -326,8 +327,14 @@ static struct task_desc *register_pid(struct perf_sched *sched, unsigned long pid, const char *comm) { struct task_desc *task; + static int pid_max; - BUG_ON(pid >= MAX_PID); + if (sched->pid_to_task == NULL) { + if (sysctl__read_int("kernel/pid_max", &pid_max) < 0) + pid_max = MAX_PID; + BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); + } + BUG_ON(pid >= (unsigned long)pid_max); task = sched->pid_to_task[pid]; From 3a423a5c36d1a28a258beaa7db855568b82d07ab Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:31 +0800 Subject: [PATCH 08/19] perf sched replay: Realloc the memory of pid_to_task stepwise to adapt to the different pid_max configurations Although the memory for pid_to_task can be allocated via calloc according to the value of /proc/sys/kernel/pid_max, this cannot handle the case where pid_max is changed after 'perf sched record' has created its perf.data. If the pid_max configured when 'perf sched replay' runs is smaller than the pid_max that was configured when 'perf sched record' ran, an assertion failure results. To solve this problem, realloc the memory of pid_to_task stepwise whenever the pid passed to register_pid() is larger than the current pid_max (a sketch of the growth follows the example below). Example: Test environment: x86_64 with 160 cores $ cat /proc/sys/kernel/pid_max 163840 $ perf sched record ls $ echo 5000 > /proc/sys/kernel/pid_max $ cat /proc/sys/kernel/pid_max 5000 Before this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55356 nsecs the run test took 1000011 nsecs the sleep test took 1060940 nsecs perf: builtin-sched.c:337: register_pid: Assertion `!(pid >= (unsigned long)pid_max)' failed. Aborted After this patch: $ perf sched replay run measurement overhead: 221 nsecs sleep measurement overhead: 55611 nsecs the run test took 1000026 nsecs the sleep test took 1060486 nsecs nr_run_events: 10 nr_sleep_events: 1562 nr_wakeup_events: 5 task 0 ( :1: 1), nr_events: 1 task 1 ( :2: 2), nr_events: 1 task 2 ( :3: 3), nr_events: 1 task 3 ( :5: 5), nr_events: 1 ... 
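A minimal sketch of the stepwise growth in register_pid() (element type simplified to void *, helper name illustrative; the real code uses BUG_ON() instead of abort() on allocation failure):

#include <stdlib.h>

static void **grow_pid_table(void **table, int *pid_max, unsigned long pid)
{
	if (pid >= (unsigned long)*pid_max) {
		/* grow just enough to cover the new pid ... */
		table = realloc(table, (pid + 1) * sizeof(void *));
		if (table == NULL)
			abort();
		/* ... and NULL-fill the slots between old and new size */
		while (pid >= (unsigned long)*pid_max)
			table[(*pid_max)++] = NULL;
	}
	return table;
}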
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-5-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 20d887b222e4..dd714818fa4d 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -334,7 +334,12 @@ static struct task_desc *register_pid(struct perf_sched *sched, pid_max = MAX_PID; BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL); } - BUG_ON(pid >= (unsigned long)pid_max); + if (pid >= (unsigned long)pid_max) { + BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + 1) * + sizeof(struct task_desc *))) == NULL); + while (pid >= (unsigned long)pid_max) + sched->pid_to_task[pid_max++] = NULL; + } task = sched->pid_to_task[pid]; From 08097abc11bcee21355dd857852a807b2a30b79f Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:32 +0800 Subject: [PATCH 09/19] perf sched replay: Fix the segmentation fault problem caused by pr_err in threads The pr_err in self_open_counters() prints its error message to stderr. Unlike stdout, stderr uses a memory buffer on the stack of the calling process. The pr_err in self_open_counters() runs inside thread_func, the thread created in create_tasks(), and create_tasks() creates sched->nr_tasks of these threads concurrently. If the error happens and pr_err prints the error message in each of these threads, the stack space of the perf process (default 8192 kbytes) quickly runs out, and a segmentation fault follows. To solve this problem, the call to self_open_counters(), and thus the pr_err, is moved from the newly created threads to the main thread of the perf process (sketched after the example below). The pr_err then works in a stable situation, without the strange segmentation faults. Example: Test environment: x86_64 with 160 cores Before this patch: $ perf sched replay ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 Segmentation fault After this patch: $ perf sched replay ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 ... As shown above, the result continues without any segmentation fault. 
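The shape of the fix, sketched: the counter fd is opened by the main thread in create_tasks() and handed to each worker through its parameter block, so the error path (and its pr_err) only ever runs on the main thread's stack. The names mirror the patch; the bodies are stubs:

#include <stdlib.h>

struct sched_thread_parms {
	int fd;			/* opened by the main thread */
};

static void *thread_func(void *ctx)
{
	struct sched_thread_parms *parms = ctx;
	int fd = parms->fd;

	free(parms);
	if (fd < 0)		/* failure already reported by the main thread */
		return NULL;
	/* ... per-task replay work using fd ... */
	return NULL;
}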
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-6-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index dd714818fa4d..7fe3b3cb4cc8 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -472,6 +472,7 @@ static u64 get_cpu_usage_nsec_self(int fd) struct sched_thread_parms { struct task_desc *task; struct perf_sched *sched; + int fd; }; static void *thread_func(void *ctx) @@ -482,13 +483,12 @@ static void *thread_func(void *ctx) u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; - int fd; + int fd = parms->fd; zfree(&parms); sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); - fd = self_open_counters(); if (fd < 0) return NULL; again: @@ -540,6 +540,7 @@ static void create_tasks(struct perf_sched *sched) BUG_ON(parms == NULL); parms->task = task = sched->tasks[i]; parms->sched = sched; + parms->fd = self_open_counters(); sem_init(&task->sleep_sem, 0, 0); sem_init(&task->ready_for_work, 0, 0); sem_init(&task->work_done_sem, 0, 0); From 1aff59be53ef37aa9943fb5f772f03148f789bb6 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:33 +0800 Subject: [PATCH 10/19] perf sched replay: Handle the dead halt of sem_wait when create_tasks() fails for any task wait_for_tasks() does a sem_wait for each task, e.g. sem_wait(&task->work_done_sem). A sem_wait can only continue when its work_done_sem is greater than 0; otherwise it blocks. In perf sched replay, one task may sem_post the work_done_sem of another task, so each work_done_sem is posted and waited on in a well-defined sequence, e.g. sem_post, sem_wait, sem_wait, sem_post... This sequence simulates the scheduling of the running tasks at the time when 'perf sched record' ran. As a result, all of the tasks are required, and all of their threads must be created successfully. If any one of the tasks (task A) fails to create its thread, then another task (task B), whose work_done_sem needs a sem_post from the failed task A, will block in sem_wait. This is a dead halt, since task B's thread_func cannot continue at all. To solve this problem, perf sched replay should exit as soon as any task fails to create its thread (illustrated after the example below). Example: Test environment: x86_64 with 160 cores Before this patch: $ perf sched replay ... Error: sys_perf_event_open() syscall returned with -1 (Too many open files) ------------------------------------------------------------ <- dead halt After this patch: $ perf sched replay ... task 1551 ( : 0), nr_events: 10 Error: sys_perf_event_open() syscall returned with -1 (Too many open files) $ As shown above, perf sched replay finishes after printing an error message and does not block itself. 
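A standalone illustration of the failure mode (a toy program, not perf code): if the thread that was supposed to post the semaphore was never created, the wait below blocks forever, which is exactly the dead halt the early exit avoids:

#include <semaphore.h>
#include <stdio.h>

int main(void)
{
	sem_t work_done_sem;

	sem_init(&work_done_sem, 0, 0);
	/* Nobody will ever call sem_post(&work_done_sem),
	 * so this wait never returns. */
	sem_wait(&work_done_sem);
	puts("unreachable without a matching sem_post()");
	return 0;
}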
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-7-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 7fe3b3cb4cc8..3261300c08f0 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -451,10 +451,12 @@ static int self_open_counters(void) fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); - if (fd < 0) + if (fd < 0) { pr_err("Error: sys_perf_event_open() syscall returned " "with %d (%s)\n", fd, strerror_r(errno, sbuf, sizeof(sbuf))); + exit(EXIT_FAILURE); + } return fd; } From 939cda521a24ae4dbf3beec983abd519bce56231 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:34 +0800 Subject: [PATCH 11/19] perf sched replay: Fix the EMFILE error caused by the limitation of the maximum open files The soft maximum number of open files for a calling process is 1024, defined as INR_OPEN_CUR in include/uapi/linux/fs.h, and the hard maximum is 4096, defined as INR_OPEN_MAX in the same file. Both INR_OPEN_CUR and INR_OPEN_MAX are used to limit the value of RLIMIT_NOFILE in include/asm-generic/resource.h, and the soft maximum ultimately decides how many files may be opened. That is to say, a process can use at most 1024 file descriptors for its opened files, or an EMFILE error happens. This error can be avoided by increasing the soft maximum, under the constraint that the soft maximum does not exceed the hard maximum, or by increasing both simultaneously with privilege. perf sched replay uses sys_perf_event_open to create a file descriptor for each of the tasks in order to handle their perf event information; that is to say, each task needs a unique file descriptor. On x86_64 there may be more than 1024 or 4096 tasks corresponding to the record in perf.data, so there are not enough file descriptors. As a result, an EMFILE error happens and stops the replay. To solve this problem, adaptively increase the soft and hard maximum number of open files when the '-f' option is given. Example: Test environment: x86_64 with 160 cores $ cat /proc/sys/kernel/pid_max 163840 $ cat /proc/sys/fs/file-max 6815744 $ ulimit -Sn 1024 $ ulimit -Hn 4096 Before this patch: $ perf sched replay ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 Error: sys_perf_event_open() syscall returned with -1 (Too many open files) After this patch: $ perf sched replay ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 Error: sys_perf_event_open() syscall returned with -1 (Too many open files) Have a try with -f option $ perf sched replay -f ... 
task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 ------------------------------------------------------------ #1 : 54.401, ravg: 54.40, cpu: 3285.21 / 3285.21 #2 : 199.548, ravg: 68.92, cpu: 4999.65 / 3456.66 #3 : 170.483, ravg: 79.07, cpu: 1349.94 / 3245.99 #4 : 192.034, ravg: 90.37, cpu: 1322.88 / 3053.67 #5 : 182.929, ravg: 99.62, cpu: 1406.51 / 2888.96 #6 : 152.974, ravg: 104.96, cpu: 1167.54 / 2716.82 #7 : 155.579, ravg: 110.02, cpu: 2992.53 / 2744.39 #8 : 130.557, ravg: 112.08, cpu: 1126.43 / 2582.59 #9 : 138.520, ravg: 114.72, cpu: 1253.22 / 2449.65 #10 : 134.328, ravg: 116.68, cpu: 1587.95 / 2363.48 Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-8-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 3261300c08f0..5ab58c6d2467 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -170,6 +170,7 @@ struct perf_sched { u64 cpu_last_switched[MAX_CPUS]; struct rb_root atom_root, sorted_atom_root; struct list_head sort_list, cmp_pid; + bool force; }; static u64 get_nsecs(void) @@ -437,24 +438,43 @@ static u64 get_cpu_usage_nsec_parent(void) return sum; } -static int self_open_counters(void) +static int self_open_counters(struct perf_sched *sched, unsigned long cur_task) { struct perf_event_attr attr; - char sbuf[STRERR_BUFSIZE]; + char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE]; int fd; + struct rlimit limit; + bool need_privilege = false; memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_TASK_CLOCK; +force_again: fd = sys_perf_event_open(&attr, 0, -1, -1, perf_event_open_cloexec_flag()); if (fd < 0) { + if (errno == EMFILE) { + if (sched->force) { + BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -1); + limit.rlim_cur += sched->nr_tasks - cur_task; + if (limit.rlim_cur > limit.rlim_max) { + limit.rlim_max = limit.rlim_cur; + need_privilege = true; + } + if (setrlimit(RLIMIT_NOFILE, &limit) == -1) { + if (need_privilege && errno == EPERM) + strcpy(info, "Need privilege\n"); + } else + goto force_again; + } else + strcpy(info, "Have a try with -f option\n"); + } pr_err("Error: sys_perf_event_open() syscall returned " - "with %d (%s)\n", fd, - strerror_r(errno, sbuf, sizeof(sbuf))); + "with %d (%s)\n%s", fd, + strerror_r(errno, sbuf, sizeof(sbuf)), info); exit(EXIT_FAILURE); } return fd; @@ -542,7 +562,7 @@ static void create_tasks(struct perf_sched *sched) BUG_ON(parms == NULL); parms->task = task = sched->tasks[i]; parms->sched = sched; - parms->fd = self_open_counters(); + parms->fd = self_open_counters(sched, i); sem_init(&task->sleep_sem, 0, 0); sem_init(&task->ready_for_work, 0, 0); sem_init(&task->work_done_sem, 0, 0); @@ -1700,6 +1720,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), + OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"), OPT_END() }; const struct option sched_options[] = { From f0dd330fdf07d295ac468660cf60341796d5d501 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:35 +0800 Subject: [PATCH 12/19] perf sched replay: Support using -f to override perf.data file 
ownership Enable using perf.data when it is not owned by the current user or root. Example: $ ls -al perf.data -rw------- 1 Yunlong.Song Yunlong.Song 5321918 Mar 25 15:14 perf.data $ sudo id uid=0(root) gid=0(root) groups=0(root),64(pkcs11) Before this patch: $ sudo perf sched replay -f run measurement overhead: 98 nsecs sleep measurement overhead: 52909 nsecs the run test took 1000015 nsecs the sleep test took 1054253 nsecs File perf.data not owned by current user or root (use -f to override) As shown above, the -f option does not work at all. After this patch: $ sudo perf sched replay -f run measurement overhead: 221 nsecs sleep measurement overhead: 40514 nsecs the run test took 1000003 nsecs the sleep test took 1056098 nsecs nr_run_events: 10 nr_sleep_events: 1562 nr_wakeup_events: 5 task 0 ( :1: 1), nr_events: 1 task 1 ( :2: 2), nr_events: 1 task 2 ( :3: 3), nr_events: 1 ... ... task 1549 ( :163132: 163132), nr_events: 1 task 1550 ( :163540: 163540), nr_events: 1 task 1551 ( : 0), nr_events: 10 ------------------------------------------------------------ #1 : 50.198, ravg: 50.20, cpu: 2335.18 / 2335.18 #2 : 219.099, ravg: 67.09, cpu: 2835.11 / 2385.17 #3 : 238.626, ravg: 84.24, cpu: 3278.26 / 2474.48 #4 : 200.364, ravg: 95.85, cpu: 2977.41 / 2524.77 #5 : 176.882, ravg: 103.96, cpu: 2801.35 / 2552.43 #6 : 191.093, ravg: 112.67, cpu: 2813.70 / 2578.56 #7 : 189.448, ravg: 120.35, cpu: 2809.21 / 2601.62 #8 : 200.637, ravg: 128.38, cpu: 2849.91 / 2626.45 #9 : 248.338, ravg: 140.37, cpu: 4380.61 / 2801.87 #10 : 511.139, ravg: 177.45, cpu: 3077.73 / 2829.45 As shown above, the -f option really works now. Besides replay, the -f option also works for latency and map. Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-9-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 5ab58c6d2467..7b7b798b22b2 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1487,6 +1487,7 @@ static int perf_sched__read_events(struct perf_sched *sched) struct perf_data_file file = { .path = input_name, .mode = PERF_DATA_MODE_READ, + .force = sched->force, }; int rc = -1; From ff5f3bbd40bfb8632f826f1f83223d95363f36af Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 31 Mar 2015 21:46:36 +0800 Subject: [PATCH 13/19] perf sched replay: Use replay_repeat to calculate the runavg of cpu usage instead of the default value 10 Since sched->replay_repeat is set to 10 by default, sched->run_avg, sched->runavg_cpu_usage, and sched->runavg_parent_cpu_usage all use 10 to calculate their values. However, replay_repeat can be changed to another value with the -r option, so the calculations above should use replay_repeat instead of the hard-coded 10 to get accurate results. 
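The update is a running average over the repeat count n, i.e. avg = (avg * (n - 1) + sample) / n, with the first sample seeding the average. A minimal sketch of the shared shape of the three updates (a hypothetical helper; n corresponds to sched->replay_repeat, and the typedef stands in for perf's kernel-style types):

typedef unsigned long long u64;

static u64 update_runavg(u64 avg, u64 sample, unsigned long n)
{
	if (!avg)	/* first sample seeds the average */
		avg = sample;
	return (avg * (n - 1) + sample) / n;
}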
Signed-off-by: Yunlong Song Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1427809596-29559-10-git-send-email-yunlong.song@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 7b7b798b22b2..5275bab70313 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -607,13 +607,13 @@ static void wait_for_tasks(struct perf_sched *sched) cpu_usage_1 = get_cpu_usage_nsec_parent(); if (!sched->runavg_cpu_usage) sched->runavg_cpu_usage = sched->cpu_usage; - sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10; + sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - 1) + sched->cpu_usage) / sched->replay_repeat; sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0; if (!sched->runavg_parent_cpu_usage) sched->runavg_parent_cpu_usage = sched->parent_cpu_usage; - sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 + - sched->parent_cpu_usage)/10; + sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) + + sched->parent_cpu_usage)/sched->replay_repeat; ret = pthread_mutex_lock(&sched->start_work_mutex); BUG_ON(ret); @@ -645,7 +645,7 @@ static void run_one_test(struct perf_sched *sched) sched->sum_fluct += fluct; if (!sched->run_avg) sched->run_avg = delta; - sched->run_avg = (sched->run_avg * 9 + delta) / 10; + sched->run_avg = (sched->run_avg * (sched->replay_repeat - 1) + delta) / sched->replay_repeat; printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0); From 814c8c38e13c7050259c72f89bb01f3fc903f642 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Mar 2015 00:19:31 +0200 Subject: [PATCH 14/19] perf record: Add clockid parameter Teach perf-record about the new perf_event_attr::{use_clockid, clockid} fields. Add a simple parameter to set the clock (if any) to be used for the events to be recorded into the data file. Since we store the entire perf_event_attr in the EVENT_DESC section we also already store the used clockid in the data file. Signed-off-by: Peter Zijlstra (Intel) Acked-by: David Ahern Cc: "H. Peter Anvin" Cc: Adrian Hunter Cc: Andrew Morton Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Yunlong Song Link: http://lkml.kernel.org/r/20150407154851.GR23123@twins.programming.kicks-ass.net [ Conditionally define CLOCK_BOOTTIME, at least rhel6 doesn't have it - dsahern Ditto for CLOCK_MONOTONIC_RAW, sles11sp2 doesn't have it - yunlong.song ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 7 ++ tools/perf/builtin-record.c | 87 ++++++++++++++++++++++++ tools/perf/perf.h | 2 + tools/perf/util/evsel.c | 59 ++++++++++++++-- tools/perf/util/header.c | 3 + 5 files changed, 154 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 355c4f5569b5..4847a793de65 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -250,6 +250,13 @@ is off by default. --running-time:: Record running and enabled time for read events (:S) +-k:: +--clockid:: +Sets the clock id to use for the various time fields in the perf_event_type +records. See clock_gettime(). 
In particular CLOCK_MONOTONIC and +CLOCK_MONOTONIC_RAW are supported, some events might also allow +CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 18aad239b401..ac610488d2e1 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -711,6 +711,90 @@ static int perf_record_config(const char *var, const char *value, void *cb) return perf_default_config(var, value, cb); } +struct clockid_map { + const char *name; + int clockid; +}; + +#define CLOCKID_MAP(n, c) \ + { .name = n, .clockid = (c), } + +#define CLOCKID_END { .name = NULL, } + + +/* + * Add the missing ones, we need to build on many distros... + */ +#ifndef CLOCK_MONOTONIC_RAW +#define CLOCK_MONOTONIC_RAW 4 +#endif +#ifndef CLOCK_BOOTTIME +#define CLOCK_BOOTTIME 7 +#endif +#ifndef CLOCK_TAI +#define CLOCK_TAI 11 +#endif + +static const struct clockid_map clockids[] = { + /* available for all events, NMI safe */ + CLOCKID_MAP("monotonic", CLOCK_MONOTONIC), + CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW), + + /* available for some events */ + CLOCKID_MAP("realtime", CLOCK_REALTIME), + CLOCKID_MAP("boottime", CLOCK_BOOTTIME), + CLOCKID_MAP("tai", CLOCK_TAI), + + /* available for the lazy */ + CLOCKID_MAP("mono", CLOCK_MONOTONIC), + CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW), + CLOCKID_MAP("real", CLOCK_REALTIME), + CLOCKID_MAP("boot", CLOCK_BOOTTIME), + + CLOCKID_END, +}; + +static int parse_clockid(const struct option *opt, const char *str, int unset) +{ + struct record_opts *opts = (struct record_opts *)opt->value; + const struct clockid_map *cm; + const char *ostr = str; + + if (unset) { + opts->use_clockid = 0; + return 0; + } + + /* no arg passed */ + if (!str) + return 0; + + /* no setting it twice */ + if (opts->use_clockid) + return -1; + + opts->use_clockid = true; + + /* if its a number, we're done */ + if (sscanf(str, "%d", &opts->clockid) == 1) + return 0; + + /* allow a "CLOCK_" prefix to the name */ + if (!strncasecmp(str, "CLOCK_", 6)) + str += 6; + + for (cm = clockids; cm->name; cm++) { + if (!strcasecmp(str, cm->name)) { + opts->clockid = cm->clockid; + return 0; + } + } + + opts->use_clockid = false; + ui__warning("unknown clockid %s, check man page\n", ostr); + return -1; +} + static const char * const __record_usage[] = { "perf record [] []", "perf record [] -- []", @@ -842,6 +926,9 @@ struct option __record_options[] = { "Sample machine registers on interrupt"), OPT_BOOLEAN(0, "running-time", &record.opts.running_time, "Record running/enabled time of read (:S) events"), + OPT_CALLBACK('k', "clockid", &record.opts, + "clockid", "clockid to use for events, see clock_gettime()", + parse_clockid), OPT_END() }; diff --git a/tools/perf/perf.h b/tools/perf/perf.h index c38a085a5571..e14bb637255c 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -62,6 +62,8 @@ struct record_opts { u64 user_interval; bool sample_transaction; unsigned initial_delay; + bool use_clockid; + clockid_t clockid; }; struct option; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 358e5954baa8..d190f99a3a97 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -32,8 +32,12 @@ static struct { bool exclude_guest; bool mmap2; bool cloexec; + bool clockid; + bool clockid_wrong; } perf_missing_features; +static clockid_t clockid; + static int perf_evsel__no_extra_init(struct perf_evsel *evsel __maybe_unused) { return 0; @@ -761,6 +765,12 @@ void 
perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) attr->disabled = 0; attr->enable_on_exec = 0; } + + clockid = opts->clockid; + if (opts->use_clockid) { + attr->use_clockid = 1; + attr->clockid = opts->clockid; + } } static int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) @@ -1036,7 +1046,6 @@ static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp) ret += PRINT_ATTR2(exclude_user, exclude_kernel); ret += PRINT_ATTR2(exclude_hv, exclude_idle); ret += PRINT_ATTR2(mmap, comm); - ret += PRINT_ATTR2(mmap2, comm_exec); ret += PRINT_ATTR2(freq, inherit_stat); ret += PRINT_ATTR2(enable_on_exec, task); ret += PRINT_ATTR2(watermark, precise_ip); @@ -1044,6 +1053,9 @@ static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp) ret += PRINT_ATTR2(exclude_host, exclude_guest); ret += PRINT_ATTR2N("excl.callchain_kern", exclude_callchain_kernel, "excl.callchain_user", exclude_callchain_user); + ret += PRINT_ATTR2(mmap2, comm_exec); + ret += __PRINT_ATTR("%u",,use_clockid); + ret += PRINT_ATTR_U32(wakeup_events); ret += PRINT_ATTR_U32(wakeup_watermark); @@ -1055,6 +1067,7 @@ static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp) ret += PRINT_ATTR_X64(branch_sample_type); ret += PRINT_ATTR_X64(sample_regs_user); ret += PRINT_ATTR_U32(sample_stack_user); + ret += PRINT_ATTR_U32(clockid); ret += PRINT_ATTR_X64(sample_regs_intr); ret += fprintf(fp, "%.60s\n", graph_dotted_line); @@ -1085,6 +1098,12 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, } fallback_missing_features: + if (perf_missing_features.clockid_wrong) + evsel->attr.clockid = CLOCK_MONOTONIC; /* should always work */ + if (perf_missing_features.clockid) { + evsel->attr.use_clockid = 0; + evsel->attr.clockid = 0; + } if (perf_missing_features.cloexec) flags &= ~(unsigned long)PERF_FLAG_FD_CLOEXEC; if (perf_missing_features.mmap2) @@ -1122,6 +1141,17 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, goto try_fallback; } set_rlimit = NO_CHANGE; + + /* + * If we succeeded but had to kill clockid, fail and + * have perf_evsel__open_strerror() print us a nice + * error. + */ + if (perf_missing_features.clockid || + perf_missing_features.clockid_wrong) { + err = -EINVAL; + goto out_close; + } } } @@ -1155,7 +1185,17 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, if (err != -EINVAL || cpu > 0 || thread > 0) goto out_close; - if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) { + /* + * Must probe features in the order they were added to the + * perf_event_attr interface. 
+ */ + if (!perf_missing_features.clockid_wrong && evsel->attr.use_clockid) { + perf_missing_features.clockid_wrong = true; + goto fallback_missing_features; + } else if (!perf_missing_features.clockid && evsel->attr.use_clockid) { + perf_missing_features.clockid = true; + goto fallback_missing_features; + } else if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) { perf_missing_features.cloexec = true; goto fallback_missing_features; } else if (!perf_missing_features.mmap2 && evsel->attr.mmap2) { @@ -2063,9 +2103,7 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, if_print(exclude_hv); if_print(exclude_idle); if_print(mmap); - if_print(mmap2); if_print(comm); - if_print(comm_exec); if_print(freq); if_print(inherit_stat); if_print(enable_on_exec); @@ -2076,10 +2114,17 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, if_print(sample_id_all); if_print(exclude_host); if_print(exclude_guest); + if_print(mmap2); + if_print(comm_exec); + if_print(use_clockid); if_print(__reserved_1); if_print(wakeup_events); if_print(bp_type); if_print(branch_sample_type); + if_print(sample_regs_user); + if_print(sample_stack_user); + if_print(clockid); + if_print(sample_regs_intr); } out: fputc('\n', fp); @@ -2158,6 +2203,12 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, "The PMU counters are busy/taken by another profiler.\n" "We found oprofile daemon running, please stop it and try again."); break; + case EINVAL: + if (perf_missing_features.clockid) + return scnprintf(msg, size, "clockid feature not supported."); + if (perf_missing_features.clockid_wrong) + return scnprintf(msg, size, "wrong clockid (%d).", clockid); + break; default: break; } diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index fb432153e2aa..de5f4669ba5f 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1098,6 +1098,9 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp) } fprintf(fp, " }"); } + if (evsel->attr.use_clockid) + fprintf(fp, ", clockid = %d", evsel->attr.clockid); + fputc('\n', fp); } From 2c5e8c52c6354f77c4019357be8231bcc34456f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Apr 2015 11:09:54 +0200 Subject: [PATCH 15/19] perf tools: Merge all perf_event_attr print functions Currently there are 3 different and incomplete implementations (that I found) of printing perf_event_attr. This is quite silly. Merge the lot. While this patch does not retain the exact form, all the printing I found is debug output, and thus it should not be critical. Also, I cannot find a single print_event_desc() caller. 
Pre: $ perf record -vv -e cycles -- sleep 1 ------------------------------------------------------------ perf_event_attr: type 0 size 104 config 0 sample_period 4000 sample_freq 4000 sample_type 0x107 read_format 0 disabled 1 inherit 1 pinned 0 exclusive 0 exclude_user 0 exclude_kernel 0 exclude_hv 0 exclude_idle 0 mmap 1 comm 1 mmap2 1 comm_exec 1 freq 1 inherit_stat 0 enable_on_exec 1 task 1 watermark 0 precise_ip 0 mmap_data 0 sample_id_all 1 exclude_host 0 exclude_guest 1 excl.callchain_kern 0 excl.callchain_user 0 wakeup_events 0 wakeup_watermark 0 bp_type 0 bp_addr 0 config1 0 bp_len 0 config2 0 branch_sample_type 0 sample_regs_user 0 sample_stack_user 0 sample_regs_intr 0 ------------------------------------------------------------ $ perf evlist -vv cycles: sample_freq=4000, size: 104, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, mmap2: 1, comm: 1, comm_exec: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1 Post: $ ./perf record -vv -e cycles -- sleep 1 ------------------------------------------------------------ perf_event_attr: size 112 { sample_period, sample_freq } 4000 sample_type IP|TID|TIME|PERIOD disabled 1 inherit 1 mmap 1 comm 1 freq 1 enable_on_exec 1 task 1 sample_id_all 1 exclude_guest 1 mmap2 1 comm_exec 1 ------------------------------------------------------------ $ ./perf evlist -vv cycles: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1 Signed-off-by: Peter Zijlstra (Intel) Acked-by: Adrian Hunter Acked-by: Ingo Molnar Acked-by: Jiri Olsa Cc: "H. Peter Anvin" Cc: Andrew Morton Cc: David Ahern Cc: John Stultz Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150407091150.644238729@infradead.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 272 +++++++++++++++++---------------------- tools/perf/util/evsel.h | 6 + tools/perf/util/header.c | 29 +---- 3 files changed, 133 insertions(+), 174 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d190f99a3a97..33e3fd8c2e68 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1011,70 +1011,126 @@ static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread) return fd; } -#define __PRINT_ATTR(fmt, cast, field) \ - fprintf(fp, " %-19s "fmt"\n", #field, cast attr->field) +struct bit_names { + int bit; + const char *name; +}; -#define PRINT_ATTR_U32(field) __PRINT_ATTR("%u" , , field) -#define PRINT_ATTR_X32(field) __PRINT_ATTR("%#x", , field) -#define PRINT_ATTR_U64(field) __PRINT_ATTR("%" PRIu64, (uint64_t), field) -#define PRINT_ATTR_X64(field) __PRINT_ATTR("%#"PRIx64, (uint64_t), field) - -#define PRINT_ATTR2N(name1, field1, name2, field2) \ - fprintf(fp, " %-19s %u %-19s %u\n", \ - name1, attr->field1, name2, attr->field2) - -#define PRINT_ATTR2(field1, field2) \ - PRINT_ATTR2N(#field1, field1, #field2, field2) - -static size_t perf_event_attr__fprintf(struct perf_event_attr *attr, FILE *fp) +static void __p_bits(char *buf, size_t size, u64 value, struct bit_names *bits) { - size_t ret = 0; + bool first_bit = true; + int i = 0; - ret += fprintf(fp, "%.60s\n", graph_dotted_line); - ret += fprintf(fp, "perf_event_attr:\n"); + do { + if (value & bits[i].bit) { + buf += scnprintf(buf, size, "%s%s", first_bit ? 
"" : "|", bits[i].name); + first_bit = false; + } + } while (bits[++i].name != NULL); +} - ret += PRINT_ATTR_U32(type); - ret += PRINT_ATTR_U32(size); - ret += PRINT_ATTR_X64(config); - ret += PRINT_ATTR_U64(sample_period); - ret += PRINT_ATTR_U64(sample_freq); - ret += PRINT_ATTR_X64(sample_type); - ret += PRINT_ATTR_X64(read_format); +static void __p_sample_type(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_##n, #n } + struct bit_names bits[] = { + bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR), + bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU), + bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), + bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), + bit_name(IDENTIFIER), bit_name(REGS_INTR), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} - ret += PRINT_ATTR2(disabled, inherit); - ret += PRINT_ATTR2(pinned, exclusive); - ret += PRINT_ATTR2(exclude_user, exclude_kernel); - ret += PRINT_ATTR2(exclude_hv, exclude_idle); - ret += PRINT_ATTR2(mmap, comm); - ret += PRINT_ATTR2(freq, inherit_stat); - ret += PRINT_ATTR2(enable_on_exec, task); - ret += PRINT_ATTR2(watermark, precise_ip); - ret += PRINT_ATTR2(mmap_data, sample_id_all); - ret += PRINT_ATTR2(exclude_host, exclude_guest); - ret += PRINT_ATTR2N("excl.callchain_kern", exclude_callchain_kernel, - "excl.callchain_user", exclude_callchain_user); - ret += PRINT_ATTR2(mmap2, comm_exec); - ret += __PRINT_ATTR("%u",,use_clockid); +static void __p_read_format(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_FORMAT_##n, #n } + struct bit_names bits[] = { + bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), + bit_name(ID), bit_name(GROUP), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} +#define BUF_SIZE 1024 - ret += PRINT_ATTR_U32(wakeup_events); - ret += PRINT_ATTR_U32(wakeup_watermark); - ret += PRINT_ATTR_X32(bp_type); - ret += PRINT_ATTR_X64(bp_addr); - ret += PRINT_ATTR_X64(config1); - ret += PRINT_ATTR_U64(bp_len); - ret += PRINT_ATTR_X64(config2); - ret += PRINT_ATTR_X64(branch_sample_type); - ret += PRINT_ATTR_X64(sample_regs_user); - ret += PRINT_ATTR_U32(sample_stack_user); - ret += PRINT_ATTR_U32(clockid); - ret += PRINT_ATTR_X64(sample_regs_intr); +#define p_hex(val) snprintf(buf, BUF_SIZE, "%"PRIx64, (uint64_t)(val)) +#define p_unsigned(val) snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val)) +#define p_signed(val) snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val)) +#define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val) +#define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) - ret += fprintf(fp, "%.60s\n", graph_dotted_line); +#define PRINT_ATTRn(_n, _f, _p) \ +do { \ + if (attr->_f) { \ + _p(attr->_f); \ + ret += attr__fprintf(fp, _n, buf, priv);\ + } \ +} while (0) + +#define PRINT_ATTRf(_f, _p) PRINT_ATTRn(#_f, _f, _p) + +int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, + attr__fprintf_f attr__fprintf, void *priv) +{ + char buf[BUF_SIZE]; + int ret = 0; + + PRINT_ATTRf(type, p_unsigned); + PRINT_ATTRf(size, p_unsigned); + PRINT_ATTRf(config, p_hex); + PRINT_ATTRn("{ sample_period, sample_freq }", sample_period, p_unsigned); + PRINT_ATTRf(sample_type, p_sample_type); + PRINT_ATTRf(read_format, p_read_format); + + PRINT_ATTRf(disabled, p_unsigned); + PRINT_ATTRf(inherit, p_unsigned); + PRINT_ATTRf(pinned, p_unsigned); + PRINT_ATTRf(exclusive, p_unsigned); + PRINT_ATTRf(exclude_user, p_unsigned); + PRINT_ATTRf(exclude_kernel, 
p_unsigned); + PRINT_ATTRf(exclude_hv, p_unsigned); + PRINT_ATTRf(exclude_idle, p_unsigned); + PRINT_ATTRf(mmap, p_unsigned); + PRINT_ATTRf(comm, p_unsigned); + PRINT_ATTRf(freq, p_unsigned); + PRINT_ATTRf(inherit_stat, p_unsigned); + PRINT_ATTRf(enable_on_exec, p_unsigned); + PRINT_ATTRf(task, p_unsigned); + PRINT_ATTRf(watermark, p_unsigned); + PRINT_ATTRf(precise_ip, p_unsigned); + PRINT_ATTRf(mmap_data, p_unsigned); + PRINT_ATTRf(sample_id_all, p_unsigned); + PRINT_ATTRf(exclude_host, p_unsigned); + PRINT_ATTRf(exclude_guest, p_unsigned); + PRINT_ATTRf(exclude_callchain_kernel, p_unsigned); + PRINT_ATTRf(exclude_callchain_user, p_unsigned); + PRINT_ATTRf(mmap2, p_unsigned); + PRINT_ATTRf(comm_exec, p_unsigned); + PRINT_ATTRf(use_clockid, p_unsigned); + + PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned); + PRINT_ATTRf(bp_type, p_unsigned); + PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex); + PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex); + PRINT_ATTRf(sample_regs_user, p_hex); + PRINT_ATTRf(sample_stack_user, p_unsigned); + PRINT_ATTRf(clockid, p_signed); + PRINT_ATTRf(sample_regs_intr, p_hex); return ret; } +static int __open_attr__fprintf(FILE *fp, const char *name, const char *val, + void *priv __attribute__((unused))) +{ + return fprintf(fp, " %-32s %s\n", name, val); +} + static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads) { @@ -1114,8 +1170,12 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, if (perf_missing_features.sample_id_all) evsel->attr.sample_id_all = 0; - if (verbose >= 2) - perf_event_attr__fprintf(&evsel->attr, stderr); + if (verbose >= 2) { + fprintf(stderr, "%.60s\n", graph_dotted_line); + fprintf(stderr, "perf_event_attr:\n"); + perf_event_attr__fprintf(stderr, &evsel->attr, __open_attr__fprintf, NULL); + fprintf(stderr, "%.60s\n", graph_dotted_line); + } for (cpu = 0; cpu < cpus->nr; cpu++) { @@ -1996,62 +2056,9 @@ static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) return ret; } -static int __if_fprintf(FILE *fp, bool *first, const char *field, u64 value) +static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv) { - if (value == 0) - return 0; - - return comma_fprintf(fp, first, " %s: %" PRIu64, field, value); -} - -#define if_print(field) printed += __if_fprintf(fp, &first, #field, evsel->attr.field) - -struct bit_names { - int bit; - const char *name; -}; - -static int bits__fprintf(FILE *fp, const char *field, u64 value, - struct bit_names *bits, bool *first) -{ - int i = 0, printed = comma_fprintf(fp, first, " %s: ", field); - bool first_bit = true; - - do { - if (value & bits[i].bit) { - printed += fprintf(fp, "%s%s", first_bit ? 
"" : "|", bits[i].name); - first_bit = false; - } - } while (bits[++i].name != NULL); - - return printed; -} - -static int sample_type__fprintf(FILE *fp, bool *first, u64 value) -{ -#define bit_name(n) { PERF_SAMPLE_##n, #n } - struct bit_names bits[] = { - bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR), - bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU), - bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), - bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), - bit_name(IDENTIFIER), bit_name(REGS_INTR), - { .name = NULL, } - }; -#undef bit_name - return bits__fprintf(fp, "sample_type", value, bits, first); -} - -static int read_format__fprintf(FILE *fp, bool *first, u64 value) -{ -#define bit_name(n) { PERF_FORMAT_##n, #n } - struct bit_names bits[] = { - bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING), - bit_name(ID), bit_name(GROUP), - { .name = NULL, } - }; -#undef bit_name - return bits__fprintf(fp, "read_format", value, bits, first); + return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val); } int perf_evsel__fprintf(struct perf_evsel *evsel, @@ -2080,52 +2087,13 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, printed += fprintf(fp, "%s", perf_evsel__name(evsel)); - if (details->verbose || details->freq) { + if (details->verbose) { + printed += perf_event_attr__fprintf(fp, &evsel->attr, + __print_attr__fprintf, &first); + } else if (details->freq) { printed += comma_fprintf(fp, &first, " sample_freq=%" PRIu64, (u64)evsel->attr.sample_freq); } - - if (details->verbose) { - if_print(type); - if_print(config); - if_print(config1); - if_print(config2); - if_print(size); - printed += sample_type__fprintf(fp, &first, evsel->attr.sample_type); - if (evsel->attr.read_format) - printed += read_format__fprintf(fp, &first, evsel->attr.read_format); - if_print(disabled); - if_print(inherit); - if_print(pinned); - if_print(exclusive); - if_print(exclude_user); - if_print(exclude_kernel); - if_print(exclude_hv); - if_print(exclude_idle); - if_print(mmap); - if_print(comm); - if_print(freq); - if_print(inherit_stat); - if_print(enable_on_exec); - if_print(task); - if_print(watermark); - if_print(precise_ip); - if_print(mmap_data); - if_print(sample_id_all); - if_print(exclude_host); - if_print(exclude_guest); - if_print(mmap2); - if_print(comm_exec); - if_print(use_clockid); - if_print(__reserved_1); - if_print(wakeup_events); - if_print(bp_type); - if_print(branch_sample_type); - if_print(sample_regs_user); - if_print(sample_stack_user); - if_print(clockid); - if_print(sample_regs_intr); - } out: fputc('\n', fp); return ++printed; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index c5a43d6b13dc..e486151b0308 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -360,4 +360,10 @@ static inline bool has_branch_callstack(struct perf_evsel *evsel) { return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } + +typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *); + +int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, + attr__fprintf_f attr__fprintf, void *priv); + #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index de5f4669ba5f..fff3b2a455ae 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1055,6 +1055,12 @@ read_event_desc(struct perf_header *ph, int fd) goto out; } +static int __desc_attr__fprintf(FILE *fp, const char *name, const char *val, + void *priv __attribute__((unused))) 
+{ + return fprintf(fp, ", %s = %s", name, val); +} + static void print_event_desc(struct perf_header *ph, int fd, FILE *fp) { struct perf_evsel *evsel, *events = read_event_desc(ph, fd); @@ -1069,26 +1075,6 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp) for (evsel = events; evsel->attr.size; evsel++) { fprintf(fp, "# event : name = %s, ", evsel->name); - fprintf(fp, "type = %d, config = 0x%"PRIx64 ", config1 = 0x%"PRIx64", config2 = 0x%"PRIx64, evsel->attr.type, (u64)evsel->attr.config, (u64)evsel->attr.config1, (u64)evsel->attr.config2); - fprintf(fp, ", excl_usr = %d, excl_kern = %d", evsel->attr.exclude_user, evsel->attr.exclude_kernel); - fprintf(fp, ", excl_host = %d, excl_guest = %d", evsel->attr.exclude_host, evsel->attr.exclude_guest); - fprintf(fp, ", precise_ip = %d", evsel->attr.precise_ip); - fprintf(fp, ", attr_mmap2 = %d", evsel->attr.mmap2); - fprintf(fp, ", attr_mmap = %d", evsel->attr.mmap); - fprintf(fp, ", attr_mmap_data = %d", evsel->attr.mmap_data); if (evsel->ids) { fprintf(fp, ", id = {"); for (j = 0, id = evsel->id; j < evsel->ids; j++, id++) { @@ -1098,9 +1084,8 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp) } fprintf(fp, " }"); } - if (evsel->attr.use_clockid) - fprintf(fp, ", clockid = %d", evsel->attr.clockid); + perf_event_attr__fprintf(fp, &evsel->attr, __desc_attr__fprintf, NULL); fputc('\n', fp); } From f6c15621f04b97ce882c66e5055f0ac325fb8eb8 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 8 Apr 2015 02:14:34 +0000 Subject: [PATCH 16/19] perf probe: Fix ARM 32 build error Commit 9b118acae310f57baee770b5db402500d8695e50 ("perf probe: Fix to handle aliased symbols in glibc") uses the non-portable format '%lx' to print a u64 argument, which causes a build error on 32-bit ARM. This patch replaces it with PRIx64. Signed-off-by: Wang Nan Acked-by: Masami Hiramatsu Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1428459274-138470-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 4fd49f021073..b78851732a71 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -322,7 +322,8 @@ static int find_alternative_probe_point(struct debuginfo *dinfo, ret = -ENOENT; goto out; } - pr_debug("Symbol %s address found : %lx\n", pp->function, address); + pr_debug("Symbol %s address found : %" PRIx64 "\n", + pp->function, address); ret = debuginfo__find_probe_point(dinfo, (unsigned long)address, result); From 54a50f93eb13ab5efa7366627534e2b0f7caa8e5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Apr 2015 19:17:15 +0200 Subject: [PATCH 17/19] perf tests: Fix attr tests Commit 1a5941312414 ("perf: Add wakeup watermark control to the AUX area") enlarged perf_event_attr, but did not update the attr tests. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Cc: "H. 
Peter Anvin" Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Kaixu Xia Cc: Kan Liang Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Markus T Metzger Cc: Mathieu Poirier Link: http://lkml.kernel.org/n/20150407171715.GA22603@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/attr/base-record | 2 +- tools/perf/tests/attr/base-stat | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record index d3095dafed36..7e6d74946e04 100644 --- a/tools/perf/tests/attr/base-record +++ b/tools/perf/tests/attr/base-record @@ -5,7 +5,7 @@ group_fd=-1 flags=0|8 cpu=* type=0|1 -size=104 +size=112 config=0 sample_period=4000 sample_type=263 diff --git a/tools/perf/tests/attr/base-stat b/tools/perf/tests/attr/base-stat index 872ed7e24c7c..f4cf148f14cb 100644 --- a/tools/perf/tests/attr/base-stat +++ b/tools/perf/tests/attr/base-stat @@ -5,7 +5,7 @@ group_fd=-1 flags=0|8 cpu=* type=0 -size=104 +size=112 config=0 sample_period=0 sample_type=0 From f6fcc1433a4a9057b2977313f31eadbc1c84268b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 8 Apr 2015 10:59:32 +0000 Subject: [PATCH 18/19] perf report: Don't call map__kmap if map is NULL. report__warn_kptr_restrict() calls map__kmap(kernel_map) before checking kernel_map againest NULL. Which is dangerous, since map__kmap() will return a invalid and not NULL address. It will trigger a warning message in map__kmap() after the patch "perf: kmaps: enforce usage of kmaps to protect futher bugs." was applied. This patch fixes it by adding the missing checking. Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Jiri Olsa Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1428490772-135393-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index b5b2ad4ca9c4..476cdf7afcca 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -347,7 +347,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, static void report__warn_kptr_restrict(const struct report *rep) { struct map *kernel_map = rep->session->machines.host.vmlinux_maps[MAP__FUNCTION]; - struct kmap *kernel_kmap = map__kmap(kernel_map); + struct kmap *kernel_kmap = kernel_map ? map__kmap(kernel_map) : NULL; if (kernel_map == NULL || (kernel_map->dso->hit && From a1e12da4796a4ddd0e911687a290eb396d1c64bf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Apr 2015 23:25:14 +0200 Subject: [PATCH 19/19] perf tools: Add 'I' event modifier for exclude_idle bit Adding 'I' event modifier to have complete set of modifiers for perf_event_attr:exclude_* bits. Any event specified with 'I' modifier will have the perf_event_attr:exclude_idle bit set. $ perf record -e cycles:I -vv ls 2>&1 | grep exclude_idle exclude_hv 0 exclude_idle 1 Adding automated tests. 
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: William Cohen Link: http://lkml.kernel.org/r/1428441919-23099-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-list.txt | 1 + tools/perf/tests/parse-events.c | 40 ++++++++++++++++++++++++++ tools/perf/util/parse-events.c | 8 +++++- tools/perf/util/parse-events.l | 2 +- 4 files changed, 49 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 4692d277980b..bada8933fdd4 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -26,6 +26,7 @@ counted. The following modifiers exist: u - user-space counting k - kernel counting h - hypervisor counting + I - non idle counting G - guest counting (in KVM guests) H - host counting (not in KVM guests) p - precise level diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index ac243ebcb20a..3de744961739 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -295,6 +295,36 @@ static int test__checkevent_genhw_modifier(struct perf_evlist *evlist) return test__checkevent_genhw(evlist); } +static int test__checkevent_exclude_idle_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude idle", evsel->attr.exclude_idle); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", !evsel->attr.exclude_host); + TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + + return test__checkevent_symbolic_name(evlist); +} + +static int test__checkevent_exclude_idle_modifier_1(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("wrong exclude idle", evsel->attr.exclude_idle); + TEST_ASSERT_VAL("wrong exclude guest", !evsel->attr.exclude_guest); + TEST_ASSERT_VAL("wrong exclude host", evsel->attr.exclude_host); + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + + return test__checkevent_symbolic_name(evlist); +} + static int test__checkevent_breakpoint_modifier(struct perf_evlist *evlist) { struct perf_evsel *evsel = perf_evlist__first(evlist); @@ -1494,6 +1524,16 @@ static struct evlist_test test__events[] = { .id = 100, }, #endif + { + .name = "instructions:I", + .check = test__checkevent_exclude_idle_modifier, + .id = 45, + }, + { + .name = "instructions:kIG", + .check = test__checkevent_exclude_idle_modifier_1, + .id = 46, + }, }; static struct evlist_test test__events_pmu[] = { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index fe07573d5ed4..be0655388b38 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -709,6 +709,7 @@ struct event_modifier { int eh; int eH; int eG; + int eI; int precise; int exclude_GH; int sample_read; @@ -723,6 +724,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, int eh = evsel ? 
evsel->attr.exclude_hv : 0; int eH = evsel ? evsel->attr.exclude_host : 0; int eG = evsel ? evsel->attr.exclude_guest : 0; + int eI = evsel ? evsel->attr.exclude_idle : 0; int precise = evsel ? evsel->attr.precise_ip : 0; int sample_read = 0; int pinned = evsel ? evsel->attr.pinned : 0; @@ -753,6 +755,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str, if (!exclude_GH) exclude_GH = eG = eH = 1; eH = 0; + } else if (*str == 'I') { + eI = 1; } else if (*str == 'p') { precise++; /* use of precise requires exclude_guest */ @@ -786,6 +790,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str, mod->eh = eh; mod->eH = eH; mod->eG = eG; + mod->eI = eI; mod->precise = precise; mod->exclude_GH = exclude_GH; mod->sample_read = sample_read; @@ -803,7 +808,7 @@ static int check_modifier(char *str) char *p = str; /* The sizeof includes 0 byte as well. */ - if (strlen(str) > (sizeof("ukhGHpppSD") - 1)) + if (strlen(str) > (sizeof("ukhGHpppSDI") - 1)) return -1; while (*p) { @@ -839,6 +844,7 @@ int parse_events__modifier_event(struct list_head *list, char *str, bool add) evsel->attr.precise_ip = mod.precise; evsel->attr.exclude_host = mod.eH; evsel->attr.exclude_guest = mod.eG; + evsel->attr.exclude_idle = mod.eI; evsel->exclude_GH = mod.exclude_GH; evsel->sample_read = mod.sample_read; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 94eacb6c1ef7..8895cf3132ab 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -101,7 +101,7 @@ num_raw_hex [a-fA-F0-9]+ name [a-zA-Z_*?][a-zA-Z0-9_*?]* name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?]* /* If you add a modifier you need to update check_modifier() */ -modifier_event [ukhpGHSD]+ +modifier_event [ukhpGHSDI]+ modifier_bp [rwx]{1,3} %%
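As a closing aside on the perf_event_attr__fprintf() rework earlier in this series: its key design choice is separating field traversal from formatting via the attr__fprintf_f callback, so a single field walker serves both the one-field-per-line -vv dump and the comma-separated evlist/header styles. The following standalone sketch shows the same pattern; it is not perf code, and demo_attr, demo_attr__fprintf and the two *_style callbacks are invented for illustration:

	#include <stdbool.h>
	#include <stdio.h>

	/* Same shape as perf's attr__fprintf_f: called once per non-zero field. */
	typedef int (*field_fprintf_f)(FILE *, const char *, const char *, void *);

	struct demo_attr {
		unsigned int size;
		unsigned long long config;
		unsigned int exclude_idle;
	};

	/* Walk the fields once: skip zero fields, format into a scratch buffer,
	 * and delegate the actual output, as PRINT_ATTRf() does in evsel.c. */
	static int demo_attr__fprintf(FILE *fp, struct demo_attr *attr,
				      field_fprintf_f print, void *priv)
	{
		char buf[32];
		int ret = 0;

		if (attr->size) {
			snprintf(buf, sizeof(buf), "%u", attr->size);
			ret += print(fp, "size", buf, priv);
		}
		if (attr->config) {
			snprintf(buf, sizeof(buf), "%#llx", attr->config);
			ret += print(fp, "config", buf, priv);
		}
		if (attr->exclude_idle) {
			snprintf(buf, sizeof(buf), "%u", attr->exclude_idle);
			ret += print(fp, "exclude_idle", buf, priv);
		}
		return ret;
	}

	/* One consumer: a one-field-per-line style, as in the -vv open path. */
	static int open_style(FILE *fp, const char *name, const char *val,
			      void *priv __attribute__((unused)))
	{
		return fprintf(fp, "  %-32s %s\n", name, val);
	}

	/* Another consumer: a comma-separated style, 'first' state via priv. */
	static int list_style(FILE *fp, const char *name, const char *val, void *priv)
	{
		bool *first = priv;
		int ret = fprintf(fp, "%s%s: %s", *first ? "" : ", ", name, val);

		*first = false;
		return ret;
	}

	int main(void)
	{
		struct demo_attr attr = { .size = 112, .config = 0x3c, .exclude_idle = 1 };
		bool first = true;

		demo_attr__fprintf(stdout, &attr, open_style, NULL);
		demo_attr__fprintf(stdout, &attr, list_style, &first);
		fputc('\n', stdout);
		return 0;
	}

Running it prints the same three fields twice, once per style, without duplicating any of the per-field logic.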