2009-04-20 21:37:32 +08:00
|
|
|
/*
|
2009-06-03 05:37:05 +08:00
|
|
|
* builtin-stat.c
|
|
|
|
*
|
|
|
|
* Builtin stat command: Give a precise performance counters summary
|
|
|
|
* overview about any workload, CPU or specific PID.
|
|
|
|
*
|
|
|
|
* Sample output:
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2009-06-03 05:37:05 +08:00
|
|
|
$ perf stat ~/hackbench 10
|
|
|
|
Time: 0.104
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2009-06-03 05:37:05 +08:00
|
|
|
Performance counter stats for '/home/mingo/hackbench':
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2009-06-03 05:37:05 +08:00
|
|
|
1255.538611 task clock ticks # 10.143 CPU utilization factor
|
|
|
|
54011 context switches # 0.043 M/sec
|
|
|
|
385 CPU migrations # 0.000 M/sec
|
|
|
|
17755 pagefaults # 0.014 M/sec
|
|
|
|
3808323185 CPU cycles # 3033.219 M/sec
|
|
|
|
1575111190 instructions # 1254.530 M/sec
|
|
|
|
17367895 cache references # 13.833 M/sec
|
|
|
|
7674421 cache misses # 6.112 M/sec
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2009-06-03 05:37:05 +08:00
|
|
|
Wall-clock time elapsed: 123.786620 msecs
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2009-05-26 15:17:18 +08:00
|
|
|
*
|
|
|
|
* Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
|
|
|
|
*
|
|
|
|
* Improvements and fixes by:
|
|
|
|
*
|
|
|
|
* Arjan van de Ven <arjan@linux.intel.com>
|
|
|
|
* Yanmin Zhang <yanmin.zhang@intel.com>
|
|
|
|
* Wu Fengguang <fengguang.wu@intel.com>
|
|
|
|
* Mike Galbraith <efault@gmx.de>
|
|
|
|
* Paul Mackerras <paulus@samba.org>
|
2009-06-27 05:32:07 +08:00
|
|
|
* Jaswinder Singh Rajput <jaswinder@kernel.org>
|
2009-05-26 15:17:18 +08:00
|
|
|
*
|
|
|
|
* Released under the GPL v2. (and only v2, not any later version)
|
2009-04-20 21:37:32 +08:00
|
|
|
*/
|
|
|
|
|
2009-05-24 00:28:58 +08:00
|
|
|
#include "perf.h"
|
2009-05-27 15:10:38 +08:00
|
|
|
#include "builtin.h"
|
2009-04-27 14:02:14 +08:00
|
|
|
#include "util/util.h"
|
2009-05-26 15:17:18 +08:00
|
|
|
#include "util/parse-options.h"
|
|
|
|
#include "util/parse-events.h"
|
2009-08-17 04:05:48 +08:00
|
|
|
#include "util/event.h"
|
|
|
|
#include "util/debug.h"
|
2009-12-31 16:05:50 +08:00
|
|
|
#include "util/header.h"
|
perf tools: Fix sparse CPU numbering related bugs
At present, the perf subcommands that do system-wide monitoring
(perf stat, perf record and perf top) don't work properly unless
the online cpus are numbered 0, 1, ..., N-1. These tools ask
for the number of online cpus with sysconf(_SC_NPROCESSORS_ONLN)
and then try to create events for cpus 0, 1, ..., N-1.
This creates problems for systems where the online cpus are
numbered sparsely. For example, a POWER6 system in
single-threaded mode (i.e. only running 1 hardware thread per
core) will have only even-numbered cpus online.
This fixes the problem by reading the /sys/devices/system/cpu/online
file to find out which cpus are online. The code that does that is in
tools/perf/util/cpumap.[ch], and consists of a read_cpu_map()
function that sets up a cpumap[] array and returns the number of
online cpus. If /sys/devices/system/cpu/online can't be read or
can't be parsed successfully, it falls back to using sysconf to
ask how many cpus are online and sets up an identity map in cpumap[].
The perf record, perf stat and perf top code then calls
read_cpu_map() in the system-wide monitoring case (instead of
sysconf) and uses cpumap[] to get the cpu numbers to pass to
perf_event_open.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Anton Blanchard <anton@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <20100310093609.GA3959@brick.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-10 17:36:09 +08:00
|
|
|
#include "util/cpumap.h"
|
2010-03-18 22:36:05 +08:00
|
|
|
#include "util/thread.h"
|
2009-04-20 21:37:32 +08:00
|
|
|
|
|
|
|
#include <sys/prctl.h>
|
2009-06-13 20:57:28 +08:00
|
|
|
#include <math.h>
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
#include <locale.h>
|
2009-05-05 23:50:27 +08:00
|
|
|
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
static struct perf_event_attr default_attrs[] = {
|
2009-04-20 21:37:32 +08:00
|
|
|
|
perf stat: Re-align the default_attrs[] array
Clean up the array definition to be vertically aligned.
No functional effects.
Cc: Tim Blechmann <tim@klingt.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4ADC3975.8050109@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
tools/perf/builtin-stat.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c373683..95a55ea 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -59,6 +59,8 @@ static struct perf_event_attr default_attrs[] = {
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS},
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
};
2009-10-19 19:27:08 +08:00
|
|
|
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
|
|
|
|
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
|
|
|
|
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
|
|
|
|
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
|
2009-06-11 20:06:28 +08:00
|
|
|
|
perf stat: Re-align the default_attrs[] array
Clean up the array definition to be vertically aligned.
No functional effects.
Cc: Tim Blechmann <tim@klingt.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4ADC3975.8050109@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
tools/perf/builtin-stat.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c373683..95a55ea 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -59,6 +59,8 @@ static struct perf_event_attr default_attrs[] = {
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS},
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
};
2009-10-19 19:27:08 +08:00
|
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
|
|
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
|
|
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
|
|
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
|
perf stat: Count branches first
Count branches first, cache-misses second. The reason is that
on x86 branches are not counted by all counters on all CPUs.
Before:
Performance counter stats for 'ls':
0.756653 task-clock-msecs # 0.802 CPUs
0 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
250 page-faults # 0.330 M/sec
2375725 cycles # 3139.781 M/sec
1628129 instructions # 0.685 IPC
19643 cache-references # 25.960 M/sec
4608 cache-misses # 6.090 M/sec
342532 branches # 452.694 M/sec
<not counted> branch-misses
0.000943356 seconds time elapsed
After:
Performance counter stats for 'ls':
1.056734 task-clock-msecs # 0.859 CPUs
0 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
259 page-faults # 0.245 M/sec
3345932 cycles # 3166.295 M/sec
3074090 instructions # 0.919 IPC
616928 branches # 583.806 M/sec
39279 branch-misses # 6.367 %
21312 cache-references # 20.168 M/sec
3661 cache-misses # 3.464 M/sec
0.001230551 seconds time elapsed
(also prettify the printout of branch misses, in case it's
getting scaled.)
Cc: Tim Blechmann <tim@klingt.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4ADC3975.8050109@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
tools/perf/builtin-stat.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c373683..95a55ea 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -59,6 +59,8 @@ static struct perf_event_attr default_attrs[] = {
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS},
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
};
---
tools/perf/builtin-stat.c | 20 ++++++++++----------
1 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 95a55ea..90e0a26 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -50,17 +50,17 @@
static struct perf_event_attr default_attrs[] = {
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
-
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS},
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
+
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
};
2009-10-19 19:33:03 +08:00
|
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES },
|
|
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
|
2009-06-11 20:06:28 +08:00
|
|
|
|
2009-04-20 21:37:32 +08:00
|
|
|
};
|
2009-05-26 15:17:18 +08:00
|
|
|
|
2010-04-13 16:37:33 +08:00
|
|
|
static bool system_wide = false;
|
2010-05-28 18:00:01 +08:00
|
|
|
static int nr_cpus = 0;
|
2009-06-24 20:49:34 +08:00
|
|
|
static int run_idx = 0;
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2009-06-24 20:49:34 +08:00
|
|
|
static int run_count = 1;
|
2010-05-12 16:40:01 +08:00
|
|
|
static bool no_inherit = false;
|
2010-04-13 16:37:33 +08:00
|
|
|
static bool scale = true;
|
2009-10-04 08:35:01 +08:00
|
|
|
static pid_t target_pid = -1;
|
2010-03-18 22:36:05 +08:00
|
|
|
static pid_t target_tid = -1;
|
|
|
|
static pid_t *all_tids = NULL;
|
|
|
|
static int thread_num = 0;
|
2009-10-04 08:35:01 +08:00
|
|
|
static pid_t child_pid = -1;
|
2010-04-13 16:37:33 +08:00
|
|
|
static bool null_run = false;
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
static bool big_num = false;
|
2010-05-28 18:00:01 +08:00
|
|
|
static const char *cpu_list;
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2010-03-18 22:36:05 +08:00
|
|
|
static int *fd[MAX_NR_CPUS][MAX_COUNTERS];
|
2009-06-13 20:57:28 +08:00
|
|
|
|
2009-09-04 23:03:13 +08:00
|
|
|
static int event_scaled[MAX_COUNTERS];
|
2009-06-24 20:49:34 +08:00
|
|
|
|
2009-12-31 16:05:50 +08:00
|
|
|
static volatile int done = 0;
|
|
|
|
|
2009-09-04 21:36:12 +08:00
|
|
|
struct stats
|
|
|
|
{
|
2009-09-04 23:26:26 +08:00
|
|
|
double n, mean, M2;
|
2009-09-04 21:36:12 +08:00
|
|
|
};
|
2009-06-13 20:57:28 +08:00
|
|
|
|
2009-09-04 21:36:08 +08:00
|
|
|
static void update_stats(struct stats *stats, u64 val)
|
|
|
|
{
|
2009-09-04 23:26:26 +08:00
|
|
|
double delta;
|
2009-09-04 21:36:08 +08:00
|
|
|
|
2009-09-04 23:26:26 +08:00
|
|
|
stats->n++;
|
|
|
|
delta = val - stats->mean;
|
|
|
|
stats->mean += delta / stats->n;
|
|
|
|
stats->M2 += delta*(val - stats->mean);
|
2009-09-04 21:36:08 +08:00
|
|
|
}
|
|
|
|
|
2009-09-04 21:36:12 +08:00
|
|
|
static double avg_stats(struct stats *stats)
|
|
|
|
{
|
2009-09-04 23:26:26 +08:00
|
|
|
return stats->mean;
|
2009-09-04 21:36:12 +08:00
|
|
|
}
|
2009-06-13 20:57:28 +08:00
|
|
|
|
2009-09-04 21:36:12 +08:00
|
|
|
/*
|
2009-09-04 23:03:13 +08:00
|
|
|
* http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
|
|
|
|
*
|
2009-09-04 23:26:26 +08:00
|
|
|
* (\Sum n_i^2) - ((\Sum n_i)^2)/n
|
|
|
|
* s^2 = -------------------------------
|
|
|
|
* n - 1
|
2009-09-04 23:03:13 +08:00
|
|
|
*
|
|
|
|
* http://en.wikipedia.org/wiki/Stddev
|
|
|
|
*
|
|
|
|
* The std dev of the mean is related to the std dev by:
|
|
|
|
*
|
|
|
|
* s
|
|
|
|
* s_mean = -------
|
|
|
|
* sqrt(n)
|
|
|
|
*
|
2009-09-04 21:36:12 +08:00
|
|
|
*/
|
|
|
|
static double stddev_stats(struct stats *stats)
|
|
|
|
{
|
2009-09-04 23:26:26 +08:00
|
|
|
double variance = stats->M2 / (stats->n - 1);
|
|
|
|
double variance_mean = variance / stats->n;
|
2009-06-13 20:57:28 +08:00
|
|
|
|
2009-09-04 23:03:13 +08:00
|
|
|
return sqrt(variance_mean);
|
2009-09-04 21:36:12 +08:00
|
|
|
}
|
2009-06-13 20:57:28 +08:00
|
|
|
|
2009-09-04 21:36:12 +08:00
|
|
|
struct stats event_res_stats[MAX_COUNTERS][3];
|
|
|
|
struct stats runtime_nsecs_stats;
|
|
|
|
struct stats walltime_nsecs_stats;
|
|
|
|
struct stats runtime_cycles_stats;
|
2009-10-18 19:29:23 +08:00
|
|
|
struct stats runtime_branches_stats;
|
2009-05-29 15:10:54 +08:00
|
|
|
|
2009-07-01 17:35:09 +08:00
|
|
|
#define MATCH_EVENT(t, c, counter) \
|
|
|
|
(attrs[counter].type == PERF_TYPE_##t && \
|
|
|
|
attrs[counter].config == PERF_COUNT_##c)
|
|
|
|
|
2009-06-23 19:42:49 +08:00
|
|
|
#define ERR_PERF_OPEN \
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
|
2009-06-23 19:42:49 +08:00
|
|
|
|
2010-03-23 00:10:28 +08:00
|
|
|
static int create_perf_stat_counter(int counter)
|
2009-04-20 21:37:32 +08:00
|
|
|
{
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
struct perf_event_attr *attr = attrs + counter;
|
2010-03-18 22:36:05 +08:00
|
|
|
int thread;
|
2010-03-23 00:10:28 +08:00
|
|
|
int ncreated = 0;
|
2009-05-05 23:50:27 +08:00
|
|
|
|
2009-04-20 21:37:32 +08:00
|
|
|
if (scale)
|
2009-06-06 15:58:57 +08:00
|
|
|
attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
|
|
|
|
PERF_FORMAT_TOTAL_TIME_RUNNING;
|
2009-04-20 21:37:32 +08:00
|
|
|
|
|
|
|
if (system_wide) {
|
2010-05-28 18:00:01 +08:00
|
|
|
int cpu;
|
2009-07-01 18:37:06 +08:00
|
|
|
|
2009-06-23 19:42:49 +08:00
|
|
|
for (cpu = 0; cpu < nr_cpus; cpu++) {
|
2010-03-18 22:36:05 +08:00
|
|
|
fd[cpu][counter][0] = sys_perf_event_open(attr,
|
|
|
|
-1, cpumap[cpu], -1, 0);
|
2010-03-23 00:10:28 +08:00
|
|
|
if (fd[cpu][counter][0] < 0)
|
|
|
|
pr_debug(ERR_PERF_OPEN, counter,
|
|
|
|
fd[cpu][counter][0], strerror(errno));
|
|
|
|
else
|
|
|
|
++ncreated;
|
2009-04-20 21:37:32 +08:00
|
|
|
}
|
|
|
|
} else {
|
2010-05-12 16:40:01 +08:00
|
|
|
attr->inherit = !no_inherit;
|
|
|
|
if (target_pid == -1 && target_tid == -1) {
|
2010-03-18 22:36:03 +08:00
|
|
|
attr->disabled = 1;
|
|
|
|
attr->enable_on_exec = 1;
|
|
|
|
}
|
2010-03-18 22:36:05 +08:00
|
|
|
for (thread = 0; thread < thread_num; thread++) {
|
|
|
|
fd[0][counter][thread] = sys_perf_event_open(attr,
|
|
|
|
all_tids[thread], -1, -1, 0);
|
2010-03-23 00:10:28 +08:00
|
|
|
if (fd[0][counter][thread] < 0)
|
|
|
|
pr_debug(ERR_PERF_OPEN, counter,
|
|
|
|
fd[0][counter][thread],
|
|
|
|
strerror(errno));
|
|
|
|
else
|
|
|
|
++ncreated;
|
2010-03-18 22:36:05 +08:00
|
|
|
}
|
2009-04-20 21:37:32 +08:00
|
|
|
}
|
2010-03-23 00:10:28 +08:00
|
|
|
|
|
|
|
return ncreated;
|
2009-04-20 21:37:32 +08:00
|
|
|
}
|
|
|
|
|
2009-05-29 15:10:54 +08:00
|
|
|
/*
|
|
|
|
* Does the counter have nsecs as a unit?
|
|
|
|
*/
|
|
|
|
static inline int nsec_counter(int counter)
|
|
|
|
{
|
2009-07-01 17:35:09 +08:00
|
|
|
if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) ||
|
|
|
|
MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
|
2009-05-29 15:10:54 +08:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2009-05-29 15:10:54 +08:00
|
|
|
* Read out the results of a single counter:
|
2009-05-29 15:10:54 +08:00
|
|
|
*/
|
2009-05-29 15:10:54 +08:00
|
|
|
static void read_counter(int counter)
|
2009-05-29 15:10:54 +08:00
|
|
|
{
|
2009-09-05 00:23:38 +08:00
|
|
|
u64 count[3], single_count[3];
|
2010-05-28 18:00:01 +08:00
|
|
|
int cpu;
|
2009-07-01 18:37:06 +08:00
|
|
|
size_t res, nv;
|
2009-05-29 15:10:54 +08:00
|
|
|
int scaled;
|
2010-03-18 22:36:05 +08:00
|
|
|
int i, thread;
|
2009-05-29 15:10:54 +08:00
|
|
|
|
|
|
|
count[0] = count[1] = count[2] = 0;
|
2009-05-29 15:10:54 +08:00
|
|
|
|
2009-05-29 15:10:54 +08:00
|
|
|
nv = scale ? 3 : 1;
|
2009-06-23 19:42:49 +08:00
|
|
|
for (cpu = 0; cpu < nr_cpus; cpu++) {
|
2010-03-18 22:36:05 +08:00
|
|
|
for (thread = 0; thread < thread_num; thread++) {
|
|
|
|
if (fd[cpu][counter][thread] < 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
res = read(fd[cpu][counter][thread],
|
|
|
|
single_count, nv * sizeof(u64));
|
|
|
|
assert(res == nv * sizeof(u64));
|
|
|
|
|
|
|
|
close(fd[cpu][counter][thread]);
|
|
|
|
fd[cpu][counter][thread] = -1;
|
|
|
|
|
|
|
|
count[0] += single_count[0];
|
|
|
|
if (scale) {
|
|
|
|
count[1] += single_count[1];
|
|
|
|
count[2] += single_count[2];
|
|
|
|
}
|
2009-05-29 15:10:54 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
scaled = 0;
|
|
|
|
if (scale) {
|
|
|
|
if (count[2] == 0) {
|
2009-09-04 21:36:08 +08:00
|
|
|
event_scaled[counter] = -1;
|
2009-05-29 15:10:54 +08:00
|
|
|
count[0] = 0;
|
2009-05-29 15:10:54 +08:00
|
|
|
return;
|
|
|
|
}
|
2009-05-29 15:10:54 +08:00
|
|
|
|
2009-05-29 15:10:54 +08:00
|
|
|
if (count[2] < count[1]) {
|
2009-09-04 21:36:08 +08:00
|
|
|
event_scaled[counter] = 1;
|
2009-05-29 15:10:54 +08:00
|
|
|
count[0] = (unsigned long long)
|
|
|
|
((double)count[0] * count[1] / count[2] + 0.5);
|
|
|
|
}
|
|
|
|
}
|
2009-09-04 21:36:08 +08:00
|
|
|
|
|
|
|
for (i = 0; i < 3; i++)
|
|
|
|
update_stats(&event_res_stats[counter][i], count[i]);
|
|
|
|
|
|
|
|
if (verbose) {
|
|
|
|
fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter),
|
|
|
|
count[0], count[1], count[2]);
|
|
|
|
}
|
|
|
|
|
2009-05-29 15:10:54 +08:00
|
|
|
/*
|
|
|
|
* Save the full runtime - to allow normalization during printout:
|
|
|
|
*/
|
2009-07-01 17:35:09 +08:00
|
|
|
if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
|
2009-09-04 21:36:08 +08:00
|
|
|
update_stats(&runtime_nsecs_stats, count[0]);
|
2009-07-01 17:35:09 +08:00
|
|
|
if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
|
2009-09-04 21:36:08 +08:00
|
|
|
update_stats(&runtime_cycles_stats, count[0]);
|
2009-10-18 19:29:23 +08:00
|
|
|
if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
|
|
|
|
update_stats(&runtime_branches_stats, count[0]);
|
2009-05-29 15:10:54 +08:00
|
|
|
}
|
|
|
|
|
2009-07-01 18:37:06 +08:00
|
|
|
static int run_perf_stat(int argc __used, const char **argv)
|
2009-06-13 20:57:28 +08:00
|
|
|
{
|
|
|
|
unsigned long long t0, t1;
|
|
|
|
int status = 0;
|
2010-03-23 00:10:28 +08:00
|
|
|
int counter, ncreated = 0;
|
2009-06-29 19:13:21 +08:00
|
|
|
int child_ready_pipe[2], go_pipe[2];
|
2010-03-18 22:36:03 +08:00
|
|
|
const bool forks = (argc > 0);
|
2009-06-29 19:13:21 +08:00
|
|
|
char buf;
|
2009-06-13 20:57:28 +08:00
|
|
|
|
|
|
|
if (!system_wide)
|
|
|
|
nr_cpus = 1;
|
|
|
|
|
2009-12-31 16:05:50 +08:00
|
|
|
if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
|
2009-06-29 19:13:21 +08:00
|
|
|
perror("failed to create pipes");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2009-12-31 16:05:50 +08:00
|
|
|
if (forks) {
|
2010-03-18 22:36:03 +08:00
|
|
|
if ((child_pid = fork()) < 0)
|
2009-12-31 16:05:50 +08:00
|
|
|
perror("failed to fork");
|
|
|
|
|
2010-03-18 22:36:03 +08:00
|
|
|
if (!child_pid) {
|
2009-12-31 16:05:50 +08:00
|
|
|
close(child_ready_pipe[0]);
|
|
|
|
close(go_pipe[1]);
|
|
|
|
fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do a dummy execvp to get the PLT entry resolved,
|
|
|
|
* so we avoid the resolver overhead on the real
|
|
|
|
* execvp call.
|
|
|
|
*/
|
|
|
|
execvp("", (char **)argv);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Tell the parent we're ready to go
|
|
|
|
*/
|
|
|
|
close(child_ready_pipe[1]);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait until the parent tells us to go.
|
|
|
|
*/
|
|
|
|
if (read(go_pipe[0], &buf, 1) == -1)
|
|
|
|
perror("unable to read pipe");
|
|
|
|
|
|
|
|
execvp(argv[0], (char **)argv);
|
|
|
|
|
|
|
|
perror(argv[0]);
|
|
|
|
exit(-1);
|
|
|
|
}
|
2009-06-29 19:13:21 +08:00
|
|
|
|
2010-03-18 22:36:05 +08:00
|
|
|
if (target_tid == -1 && target_pid == -1 && !system_wide)
|
|
|
|
all_tids[0] = child_pid;
|
|
|
|
|
2009-06-29 19:13:21 +08:00
|
|
|
/*
|
2009-12-31 16:05:50 +08:00
|
|
|
* Wait for the child to be ready to exec.
|
2009-06-29 19:13:21 +08:00
|
|
|
*/
|
|
|
|
close(child_ready_pipe[1]);
|
2009-12-31 16:05:50 +08:00
|
|
|
close(go_pipe[0]);
|
|
|
|
if (read(child_ready_pipe[0], &buf, 1) == -1)
|
2009-07-02 03:02:10 +08:00
|
|
|
perror("unable to read pipe");
|
2009-12-31 16:05:50 +08:00
|
|
|
close(child_ready_pipe[0]);
|
2009-06-29 19:13:21 +08:00
|
|
|
}
|
|
|
|
|
2009-06-13 20:57:28 +08:00
|
|
|
for (counter = 0; counter < nr_counters; counter++)
|
2010-03-23 00:10:28 +08:00
|
|
|
ncreated += create_perf_stat_counter(counter);
|
|
|
|
|
|
|
|
if (ncreated == 0) {
|
|
|
|
pr_err("No permission to collect %sstats.\n"
|
|
|
|
"Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n",
|
|
|
|
system_wide ? "system-wide " : "");
|
|
|
|
if (child_pid != -1)
|
|
|
|
kill(child_pid, SIGTERM);
|
|
|
|
return -1;
|
|
|
|
}
|
2009-06-13 20:57:28 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Enable counters and exec the command:
|
|
|
|
*/
|
|
|
|
t0 = rdclock();
|
|
|
|
|
2009-12-31 16:05:50 +08:00
|
|
|
if (forks) {
|
|
|
|
close(go_pipe[1]);
|
|
|
|
wait(&status);
|
|
|
|
} else {
|
2010-03-18 22:36:03 +08:00
|
|
|
while(!done) sleep(1);
|
2009-12-31 16:05:50 +08:00
|
|
|
}
|
2009-06-13 20:57:28 +08:00
|
|
|
|
|
|
|
t1 = rdclock();
|
|
|
|
|
2009-09-04 21:36:08 +08:00
|
|
|
update_stats(&walltime_nsecs_stats, t1 - t0);
|
2009-06-13 20:57:28 +08:00
|
|
|
|
|
|
|
for (counter = 0; counter < nr_counters; counter++)
|
|
|
|
read_counter(counter);
|
|
|
|
|
|
|
|
return WEXITSTATUS(status);
|
|
|
|
}
|
|
|
|
|
2009-09-05 00:23:38 +08:00
|
|
|
static void print_noise(int counter, double avg)
|
2009-06-13 20:57:28 +08:00
|
|
|
{
|
2009-09-05 00:23:38 +08:00
|
|
|
if (run_count == 1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
fprintf(stderr, " ( +- %7.3f%% )",
|
|
|
|
100 * stddev_stats(&event_res_stats[counter][0]) / avg);
|
2009-06-13 20:57:28 +08:00
|
|
|
}
|
|
|
|
|
2009-09-05 00:23:38 +08:00
|
|
|
static void nsec_printout(int counter, double avg)
|
2009-06-13 19:35:00 +08:00
|
|
|
{
|
2009-09-04 21:36:12 +08:00
|
|
|
double msecs = avg / 1e6;
|
2009-06-13 19:35:00 +08:00
|
|
|
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter));
|
2009-06-13 19:35:00 +08:00
|
|
|
|
2009-07-01 17:35:09 +08:00
|
|
|
if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
|
2009-09-04 21:36:12 +08:00
|
|
|
fprintf(stderr, " # %10.3f CPUs ",
|
|
|
|
avg / avg_stats(&walltime_nsecs_stats));
|
2009-06-13 19:35:00 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-09-05 00:23:38 +08:00
|
|
|
static void abs_printout(int counter, double avg)
|
2009-06-13 19:35:00 +08:00
|
|
|
{
|
2009-09-22 20:53:51 +08:00
|
|
|
double total, ratio = 0.0;
|
|
|
|
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
if (big_num)
|
|
|
|
fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter));
|
|
|
|
else
|
|
|
|
fprintf(stderr, " %18.0f %-24s", avg, event_name(counter));
|
2009-06-13 19:35:00 +08:00
|
|
|
|
2009-09-04 21:36:12 +08:00
|
|
|
if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
|
2009-09-22 20:53:51 +08:00
|
|
|
total = avg_stats(&runtime_cycles_stats);
|
|
|
|
|
|
|
|
if (total)
|
|
|
|
ratio = avg / total;
|
|
|
|
|
|
|
|
fprintf(stderr, " # %10.3f IPC ", ratio);
|
2009-11-15 22:05:08 +08:00
|
|
|
} else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
|
|
|
|
runtime_branches_stats.n != 0) {
|
2009-10-18 19:29:23 +08:00
|
|
|
total = avg_stats(&runtime_branches_stats);
|
|
|
|
|
|
|
|
if (total)
|
|
|
|
ratio = avg * 100 / total;
|
|
|
|
|
perf stat: Count branches first
Count branches first, cache-misses second. The reason is that
on x86 branches are not counted by all counters on all CPUs.
Before:
Performance counter stats for 'ls':
0.756653 task-clock-msecs # 0.802 CPUs
0 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
250 page-faults # 0.330 M/sec
2375725 cycles # 3139.781 M/sec
1628129 instructions # 0.685 IPC
19643 cache-references # 25.960 M/sec
4608 cache-misses # 6.090 M/sec
342532 branches # 452.694 M/sec
<not counted> branch-misses
0.000943356 seconds time elapsed
After:
Performance counter stats for 'ls':
1.056734 task-clock-msecs # 0.859 CPUs
0 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
259 page-faults # 0.245 M/sec
3345932 cycles # 3166.295 M/sec
3074090 instructions # 0.919 IPC
616928 branches # 583.806 M/sec
39279 branch-misses # 6.367 %
21312 cache-references # 20.168 M/sec
3661 cache-misses # 3.464 M/sec
0.001230551 seconds time elapsed
(also prettify the printout of branch misses, in case it's
getting scaled.)
Cc: Tim Blechmann <tim@klingt.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4ADC3975.8050109@klingt.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
tools/perf/builtin-stat.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c373683..95a55ea 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -59,6 +59,8 @@ static struct perf_event_attr default_attrs[] = {
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS},
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
};
---
tools/perf/builtin-stat.c | 20 ++++++++++----------
1 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 95a55ea..90e0a26 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -50,17 +50,17 @@
static struct perf_event_attr default_attrs[] = {
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
-
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS},
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
+
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
};
2009-10-19 19:33:03 +08:00
|
|
|
fprintf(stderr, " # %10.3f %% ", ratio);
|
2009-10-18 19:29:23 +08:00
|
|
|
|
2009-11-15 22:05:08 +08:00
|
|
|
} else if (runtime_nsecs_stats.n != 0) {
|
2009-09-22 20:53:51 +08:00
|
|
|
total = avg_stats(&runtime_nsecs_stats);
|
|
|
|
|
|
|
|
if (total)
|
|
|
|
ratio = 1000.0 * avg / total;
|
|
|
|
|
|
|
|
fprintf(stderr, " # %10.3f M/sec", ratio);
|
2009-06-13 19:35:00 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-05-29 15:10:54 +08:00
|
|
|
/*
|
|
|
|
* Print out the results of a single counter:
|
|
|
|
*/
|
|
|
|
static void print_counter(int counter)
|
|
|
|
{
|
2009-09-05 00:23:38 +08:00
|
|
|
double avg = avg_stats(&event_res_stats[counter][0]);
|
2009-09-04 23:03:13 +08:00
|
|
|
int scaled = event_scaled[counter];
|
2009-05-29 15:10:54 +08:00
|
|
|
|
|
|
|
if (scaled == -1) {
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
fprintf(stderr, " %18s %-24s\n",
|
2009-05-29 15:10:54 +08:00
|
|
|
"<not counted>", event_name(counter));
|
|
|
|
return;
|
|
|
|
}
|
2009-05-29 15:10:54 +08:00
|
|
|
|
2009-06-13 19:35:00 +08:00
|
|
|
if (nsec_counter(counter))
|
2009-09-05 00:23:38 +08:00
|
|
|
nsec_printout(counter, avg);
|
2009-06-13 19:35:00 +08:00
|
|
|
else
|
2009-09-05 00:23:38 +08:00
|
|
|
abs_printout(counter, avg);
|
|
|
|
|
|
|
|
print_noise(counter, avg);
|
2009-09-04 21:36:12 +08:00
|
|
|
|
|
|
|
if (scaled) {
|
|
|
|
double avg_enabled, avg_running;
|
|
|
|
|
|
|
|
avg_enabled = avg_stats(&event_res_stats[counter][1]);
|
|
|
|
avg_running = avg_stats(&event_res_stats[counter][2]);
|
2009-05-30 18:38:51 +08:00
|
|
|
|
2009-06-30 03:50:54 +08:00
|
|
|
fprintf(stderr, " (scaled from %.2f%%)",
|
2009-09-04 21:36:12 +08:00
|
|
|
100 * avg_running / avg_enabled);
|
|
|
|
}
|
2009-06-13 19:35:00 +08:00
|
|
|
|
2009-05-29 15:10:54 +08:00
|
|
|
fprintf(stderr, "\n");
|
|
|
|
}
|
|
|
|
|
2009-06-13 20:57:28 +08:00
|
|
|
static void print_stat(int argc, const char **argv)
|
|
|
|
{
|
|
|
|
int i, counter;
|
|
|
|
|
2009-04-20 21:37:32 +08:00
|
|
|
fflush(stdout);
|
|
|
|
|
|
|
|
fprintf(stderr, "\n");
|
2009-12-31 16:05:50 +08:00
|
|
|
fprintf(stderr, " Performance counter stats for ");
|
2010-03-18 22:36:05 +08:00
|
|
|
if(target_pid == -1 && target_tid == -1) {
|
2009-12-31 16:05:50 +08:00
|
|
|
fprintf(stderr, "\'%s", argv[0]);
|
|
|
|
for (i = 1; i < argc; i++)
|
|
|
|
fprintf(stderr, " %s", argv[i]);
|
2010-03-18 22:36:05 +08:00
|
|
|
} else if (target_pid != -1)
|
|
|
|
fprintf(stderr, "process id \'%d", target_pid);
|
|
|
|
else
|
|
|
|
fprintf(stderr, "thread id \'%d", target_tid);
|
2009-06-04 01:36:07 +08:00
|
|
|
|
2009-06-13 20:57:28 +08:00
|
|
|
fprintf(stderr, "\'");
|
|
|
|
if (run_count > 1)
|
|
|
|
fprintf(stderr, " (%d runs)", run_count);
|
|
|
|
fprintf(stderr, ":\n\n");
|
2009-05-29 15:10:54 +08:00
|
|
|
|
2009-05-29 15:10:54 +08:00
|
|
|
for (counter = 0; counter < nr_counters; counter++)
|
|
|
|
print_counter(counter);
|
2009-04-20 21:37:32 +08:00
|
|
|
|
|
|
|
fprintf(stderr, "\n");
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
fprintf(stderr, " %18.9f seconds time elapsed",
|
2009-09-04 21:36:12 +08:00
|
|
|
avg_stats(&walltime_nsecs_stats)/1e9);
|
2009-06-27 12:24:32 +08:00
|
|
|
if (run_count > 1) {
|
|
|
|
fprintf(stderr, " ( +- %7.3f%% )",
|
2009-09-04 21:36:12 +08:00
|
|
|
100*stddev_stats(&walltime_nsecs_stats) /
|
|
|
|
avg_stats(&walltime_nsecs_stats));
|
2009-06-27 12:24:32 +08:00
|
|
|
}
|
|
|
|
fprintf(stderr, "\n\n");
|
2009-04-20 21:37:32 +08:00
|
|
|
}
|
|
|
|
|
2009-06-10 21:55:59 +08:00
|
|
|
static volatile int signr = -1;
|
|
|
|
|
2009-05-26 15:17:18 +08:00
|
|
|
static void skip_signal(int signo)
|
2009-04-20 21:37:32 +08:00
|
|
|
{
|
2010-03-18 22:36:03 +08:00
|
|
|
if(child_pid == -1)
|
2009-12-31 16:05:50 +08:00
|
|
|
done = 1;
|
|
|
|
|
2009-06-10 21:55:59 +08:00
|
|
|
signr = signo;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sig_atexit(void)
|
|
|
|
{
|
2009-10-04 08:35:01 +08:00
|
|
|
if (child_pid != -1)
|
|
|
|
kill(child_pid, SIGTERM);
|
|
|
|
|
2009-06-10 21:55:59 +08:00
|
|
|
if (signr == -1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
signal(signr, SIG_DFL);
|
|
|
|
kill(getpid(), signr);
|
2009-05-26 15:17:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static const char * const stat_usage[] = {
|
2009-12-31 16:05:50 +08:00
|
|
|
"perf stat [<options>] [<command>]",
|
2009-05-26 15:17:18 +08:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct option options[] = {
|
|
|
|
OPT_CALLBACK('e', "event", NULL, "event",
|
2009-06-06 18:24:17 +08:00
|
|
|
"event selector. use 'perf list' to list available events",
|
|
|
|
parse_events),
|
2010-05-12 16:40:01 +08:00
|
|
|
OPT_BOOLEAN('i', "no-inherit", &no_inherit,
|
|
|
|
"child tasks do not inherit counters"),
|
2009-05-26 15:17:18 +08:00
|
|
|
OPT_INTEGER('p', "pid", &target_pid,
|
2010-03-18 22:36:05 +08:00
|
|
|
"stat events on existing process id"),
|
|
|
|
OPT_INTEGER('t', "tid", &target_tid,
|
|
|
|
"stat events on existing thread id"),
|
2009-05-26 15:17:18 +08:00
|
|
|
OPT_BOOLEAN('a', "all-cpus", &system_wide,
|
2009-06-24 20:49:34 +08:00
|
|
|
"system-wide collection from all CPUs"),
|
2009-08-07 16:18:39 +08:00
|
|
|
OPT_BOOLEAN('c', "scale", &scale,
|
2009-06-24 20:49:34 +08:00
|
|
|
"scale/normalize counters"),
|
2010-04-13 16:37:33 +08:00
|
|
|
OPT_INCR('v', "verbose", &verbose,
|
2009-06-07 23:06:46 +08:00
|
|
|
"be more verbose (show counter open errors, etc)"),
|
2009-06-13 20:57:28 +08:00
|
|
|
OPT_INTEGER('r', "repeat", &run_count,
|
|
|
|
"repeat command and print average + stddev (max: 100)"),
|
2009-06-27 12:10:30 +08:00
|
|
|
OPT_BOOLEAN('n', "null", &null_run,
|
|
|
|
"null run - dont start any counters"),
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
OPT_BOOLEAN('B', "big-num", &big_num,
|
|
|
|
"print large numbers with thousands\' separators"),
|
2010-05-28 18:00:01 +08:00
|
|
|
OPT_STRING('C', "cpu", &cpu_list, "cpu",
|
|
|
|
"list of cpus to monitor in system-wide"),
|
2009-05-26 15:17:18 +08:00
|
|
|
OPT_END()
|
|
|
|
};
|
|
|
|
|
2009-07-01 18:37:06 +08:00
|
|
|
int cmd_stat(int argc, const char **argv, const char *prefix __used)
|
2009-05-26 15:17:18 +08:00
|
|
|
{
|
2009-06-13 20:57:28 +08:00
|
|
|
int status;
|
2010-03-18 22:36:05 +08:00
|
|
|
int i,j;
|
2009-06-13 20:57:28 +08:00
|
|
|
|
perf stat: add perf stat -B to pretty print large numbers
It is hard to read very large numbers so provide an option to perf stat
to separate thousands using a separator. The patch leverages the locale
support of stdio. You need to set your LC_NUMERIC appropriately, for
instance LC_NUMERIC=en_US.UTF8. You need to pass -B to activate this
feature. This way existing scripts parsing the output do not need to be
changed. Here is an example.
$ perf stat noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.347031 task-clock-msecs # 0.998 CPUs
61 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
118 page-faults # 0.000 M/sec
4,138,410,900 cycles # 2070.917 M/sec (scaled from 70.01%)
2,062,650,268 instructions # 0.498 IPC (scaled from 70.01%)
2,057,653,466 branches # 1029.678 M/sec (scaled from 70.01%)
40,267 branch-misses # 0.002 % (scaled from 30.04%)
2,055,961,348 cache-references # 1028.831 M/sec (scaled from 30.03%)
53,725 cache-misses # 0.027 M/sec (scaled from 30.02%)
2.001393933 seconds time elapsed
$ perf stat -B noploop 2
noploop for 2 seconds
Performance counter stats for 'noploop 2':
1998.297883 task-clock-msecs # 0.998 CPUs
59 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
119 page-faults # 0.000 M/sec
4,131,380,160 cycles # 2067.450 M/sec (scaled from 70.01%)
2,059,096,507 instructions # 0.498 IPC (scaled from 70.01%)
2,054,681,303 branches # 1028.216 M/sec (scaled from 70.01%)
25,650 branch-misses # 0.001 % (scaled from 30.05%)
2,056,283,014 cache-references # 1029.017 M/sec (scaled from 30.03%)
47,097 cache-misses # 0.024 M/sec (scaled from 30.02%)
2.001391016 seconds time elapsed
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4bf28fe8.914ed80a.01ca.fffff5f5@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2010-05-18 21:00:01 +08:00
|
|
|
setlocale(LC_ALL, "");
|
|
|
|
|
2009-07-22 21:04:12 +08:00
|
|
|
argc = parse_options(argc, argv, options, stat_usage,
|
|
|
|
PARSE_OPT_STOP_AT_NON_OPTION);
|
2010-03-18 22:36:05 +08:00
|
|
|
if (!argc && target_pid == -1 && target_tid == -1)
|
2009-05-26 15:17:18 +08:00
|
|
|
usage_with_options(stat_usage, options);
|
2009-09-04 21:36:08 +08:00
|
|
|
if (run_count <= 0)
|
2009-06-13 20:57:28 +08:00
|
|
|
usage_with_options(stat_usage, options);
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2009-06-28 02:19:09 +08:00
|
|
|
/* Set attrs and nr_counters if no event is selected and !null_run */
|
|
|
|
if (!null_run && !nr_counters) {
|
|
|
|
memcpy(attrs, default_attrs, sizeof(default_attrs));
|
|
|
|
nr_counters = ARRAY_SIZE(default_attrs);
|
|
|
|
}
|
2009-04-20 21:37:32 +08:00
|
|
|
|
perf tools: Fix sparse CPU numbering related bugs
At present, the perf subcommands that do system-wide monitoring
(perf stat, perf record and perf top) don't work properly unless
the online cpus are numbered 0, 1, ..., N-1. These tools ask
for the number of online cpus with sysconf(_SC_NPROCESSORS_ONLN)
and then try to create events for cpus 0, 1, ..., N-1.
This creates problems for systems where the online cpus are
numbered sparsely. For example, a POWER6 system in
single-threaded mode (i.e. only running 1 hardware thread per
core) will have only even-numbered cpus online.
This fixes the problem by reading the /sys/devices/system/cpu/online
file to find out which cpus are online. The code that does that is in
tools/perf/util/cpumap.[ch], and consists of a read_cpu_map()
function that sets up a cpumap[] array and returns the number of
online cpus. If /sys/devices/system/cpu/online can't be read or
can't be parsed successfully, it falls back to using sysconf to
ask how many cpus are online and sets up an identity map in cpumap[].
The perf record, perf stat and perf top code then calls
read_cpu_map() in the system-wide monitoring case (instead of
sysconf) and uses cpumap[] to get the cpu numbers to pass to
perf_event_open.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Anton Blanchard <anton@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <20100310093609.GA3959@brick.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-10 17:36:09 +08:00
|
|
|
if (system_wide)
|
2010-05-28 18:00:01 +08:00
|
|
|
nr_cpus = read_cpu_map(cpu_list);
|
perf tools: Fix sparse CPU numbering related bugs
At present, the perf subcommands that do system-wide monitoring
(perf stat, perf record and perf top) don't work properly unless
the online cpus are numbered 0, 1, ..., N-1. These tools ask
for the number of online cpus with sysconf(_SC_NPROCESSORS_ONLN)
and then try to create events for cpus 0, 1, ..., N-1.
This creates problems for systems where the online cpus are
numbered sparsely. For example, a POWER6 system in
single-threaded mode (i.e. only running 1 hardware thread per
core) will have only even-numbered cpus online.
This fixes the problem by reading the /sys/devices/system/cpu/online
file to find out which cpus are online. The code that does that is in
tools/perf/util/cpumap.[ch], and consists of a read_cpu_map()
function that sets up a cpumap[] array and returns the number of
online cpus. If /sys/devices/system/cpu/online can't be read or
can't be parsed successfully, it falls back to using sysconf to
ask how many cpus are online and sets up an identity map in cpumap[].
The perf record, perf stat and perf top code then calls
read_cpu_map() in the system-wide monitoring case (instead of
sysconf) and uses cpumap[] to get the cpu numbers to pass to
perf_event_open.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Anton Blanchard <anton@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <20100310093609.GA3959@brick.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-03-10 17:36:09 +08:00
|
|
|
else
|
|
|
|
nr_cpus = 1;
|
2009-04-20 21:37:32 +08:00
|
|
|
|
2010-05-28 18:00:01 +08:00
|
|
|
if (nr_cpus < 1)
|
|
|
|
usage_with_options(stat_usage, options);
|
|
|
|
|
2010-03-18 22:36:05 +08:00
|
|
|
if (target_pid != -1) {
|
|
|
|
target_tid = target_pid;
|
|
|
|
thread_num = find_all_tid(target_pid, &all_tids);
|
|
|
|
if (thread_num <= 0) {
|
|
|
|
fprintf(stderr, "Can't find all threads of pid %d\n",
|
|
|
|
target_pid);
|
|
|
|
usage_with_options(stat_usage, options);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
all_tids=malloc(sizeof(pid_t));
|
|
|
|
if (!all_tids)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
all_tids[0] = target_tid;
|
|
|
|
thread_num = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_NR_CPUS; i++) {
|
|
|
|
for (j = 0; j < MAX_COUNTERS; j++) {
|
|
|
|
fd[i][j] = malloc(sizeof(int)*thread_num);
|
|
|
|
if (!fd[i][j])
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-05-15 17:03:23 +08:00
|
|
|
/*
|
|
|
|
* We dont want to block the signals - that would cause
|
|
|
|
* child tasks to inherit that and Ctrl-C would not work.
|
|
|
|
* What we want is for Ctrl-C to work in the exec()-ed
|
|
|
|
* task, but being ignored by perf stat itself:
|
|
|
|
*/
|
2009-06-10 21:55:59 +08:00
|
|
|
atexit(sig_atexit);
|
2009-05-15 17:03:23 +08:00
|
|
|
signal(SIGINT, skip_signal);
|
|
|
|
signal(SIGALRM, skip_signal);
|
|
|
|
signal(SIGABRT, skip_signal);
|
|
|
|
|
2009-06-13 20:57:28 +08:00
|
|
|
status = 0;
|
|
|
|
for (run_idx = 0; run_idx < run_count; run_idx++) {
|
|
|
|
if (run_count != 1 && verbose)
|
2009-06-24 20:49:34 +08:00
|
|
|
fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1);
|
2009-06-13 20:57:28 +08:00
|
|
|
status = run_perf_stat(argc, argv);
|
|
|
|
}
|
|
|
|
|
2010-03-23 00:10:28 +08:00
|
|
|
if (status != -1)
|
|
|
|
print_stat(argc, argv);
|
2009-06-13 20:57:28 +08:00
|
|
|
|
|
|
|
return status;
|
2009-04-20 21:37:32 +08:00
|
|
|
}
|