2006-09-26 16:52:42 +08:00
|
|
|
/*
|
2006-09-26 16:52:42 +08:00
|
|
|
* Thermal throttle event support code (such as syslog messaging and rate
|
|
|
|
* limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
|
2009-04-08 18:31:19 +08:00
|
|
|
*
|
2006-09-26 16:52:42 +08:00
|
|
|
* This allows consistent reporting of CPU thermal throttle events.
|
|
|
|
*
|
|
|
|
* Maintains a counter in /sys that keeps track of the number of thermal
|
|
|
|
* events, such that the user knows how bad the thermal problem might be
|
|
|
|
* (since the logging to syslog and mcelog is rate limited).
|
2006-09-26 16:52:42 +08:00
|
|
|
*
|
|
|
|
* Author: Dmitriy Zavin (dmitriyz@google.com)
|
|
|
|
*
|
|
|
|
* Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
|
2006-09-26 16:52:42 +08:00
|
|
|
* Inspired by Ross Biro's and Al Borchers' counter code.
|
2006-09-26 16:52:42 +08:00
|
|
|
*/
|
2009-06-15 16:25:27 +08:00
|
|
|
#include <linux/interrupt.h>
|
2009-04-08 18:31:19 +08:00
|
|
|
#include <linux/notifier.h>
|
|
|
|
#include <linux/jiffies.h>
|
2009-06-15 16:26:10 +08:00
|
|
|
#include <linux/kernel.h>
|
2006-09-26 16:52:42 +08:00
|
|
|
#include <linux/percpu.h>
|
2011-05-27 00:22:53 +08:00
|
|
|
#include <linux/export.h>
|
2009-06-15 16:26:10 +08:00
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/smp.h>
|
2006-09-26 16:52:42 +08:00
|
|
|
#include <linux/cpu.h>
|
2009-04-08 18:31:19 +08:00
|
|
|
|
2009-06-15 16:26:10 +08:00
|
|
|
#include <asm/processor.h>
|
|
|
|
#include <asm/apic.h>
|
2009-06-15 16:25:27 +08:00
|
|
|
#include <asm/mce.h>
|
2009-06-15 16:26:10 +08:00
|
|
|
#include <asm/msr.h>
|
x86, trace: Add irq vector tracepoints
[Purpose of this patch]
As Vaibhav explained in the thread below, tracepoints for irq vectors
are useful.
http://www.spinics.net/lists/mm-commits/msg85707.html
<snip>
The current interrupt traces from irq_handler_entry and irq_handler_exit
provide when an interrupt is handled. They provide good data about when
the system has switched to kernel space and how it affects the currently
running processes.
There are some IRQ vectors which trigger the system into kernel space,
which are not handled in generic IRQ handlers. Tracing such events gives
us the information about IRQ interaction with other system events.
The trace also tells where the system is spending its time. We want to
know which cores are handling interrupts and how they are affecting other
processes in the system. Also, the trace provides information about when
the cores are idle and which interrupts are changing that state.
<snip>
On the other hand, my usecase is tracing just local timer event and
getting a value of instruction pointer.
I suggested to add an argument local timer event to get instruction pointer before.
But there is another way to get it with external module like systemtap.
So, I don't need to add any argument to irq vector tracepoints now.
[Patch Description]
Vaibhav's patch shared a single trace point, irq_vector_entry/irq_vector_exit, across all events.
But there is an above use case to trace specific irq_vector rather than tracing all events.
In this case, we are concerned about overhead due to unwanted events.
So, add following tracepoints instead of introducing irq_vector_entry/exit.
so that we can enable them independently.
- local_timer_vector
- reschedule_vector
- call_function_vector
- call_function_single_vector
- irq_work_entry_vector
- error_apic_vector
- thermal_apic_vector
- threshold_apic_vector
- spurious_apic_vector
- x86_platform_ipi_vector
Also, introduce a logic switching IDT at enabling/disabling time so that a time penalty
makes a zero when tracepoints are disabled. Detailed explanations are as follows.
- Create trace irq handlers with entering_irq()/exiting_irq().
- Create a new IDT, trace_idt_table, at boot time by adding a logic to
_set_gate(). It is just a copy of original idt table.
- Register the new handlers for tracepoints to the new IDT by introducing
macros to alloc_intr_gate() called at registering time of irq_vector handlers.
- Add checking, whether irq vector tracing is on/off, into load_current_idt().
This has to be done below debug checking for these reasons.
- Switching to debug IDT may be kicked while tracing is enabled.
- On the other hand, switching to trace IDT is kicked only when debugging
is disabled.
In addition, the new IDT is created only when CONFIG_TRACING is enabled to avoid being
used for other purposes.
Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>
Link: http://lkml.kernel.org/r/51C323ED.5050708@hds.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
2013-06-20 23:46:53 +08:00
|
|
|
#include <asm/trace/irq_vectors.h>
|
2006-09-26 16:52:42 +08:00
|
|
|
|
|
|
|
/* How long to wait between reporting thermal events (300 * HZ jiffies) */
#define CHECK_INTERVAL		(300 * HZ)
|
2006-09-26 16:52:42 +08:00
|
|
|
|
2010-07-30 08:13:45 +08:00
|
|
|
/* Event types accepted by therm_throt_process() */
#define THERMAL_THROTTLING_EVENT	0
#define POWER_LIMIT_EVENT		1
|
|
|
|
|
2009-09-22 21:50:24 +08:00
|
|
|
/*
|
2010-07-30 08:13:45 +08:00
|
|
|
* Current thermal event state:
|
2009-09-22 21:50:24 +08:00
|
|
|
*/
|
2010-07-30 08:13:44 +08:00
|
|
|
struct _thermal_state {
|
2010-07-30 08:13:45 +08:00
|
|
|
bool new_event;
|
|
|
|
int event;
|
2009-09-22 21:50:24 +08:00
|
|
|
u64 next_check;
|
2010-07-30 08:13:45 +08:00
|
|
|
unsigned long count;
|
|
|
|
unsigned long last_count;
|
2009-09-22 21:50:24 +08:00
|
|
|
};
|
2009-04-08 18:31:19 +08:00
|
|
|
|
2010-07-30 08:13:44 +08:00
|
|
|
struct thermal_state {
|
2010-07-30 08:13:45 +08:00
|
|
|
struct _thermal_state core_throttle;
|
|
|
|
struct _thermal_state core_power_limit;
|
|
|
|
struct _thermal_state package_throttle;
|
|
|
|
struct _thermal_state package_power_limit;
|
2011-01-03 19:52:04 +08:00
|
|
|
struct _thermal_state core_thresh0;
|
|
|
|
struct _thermal_state core_thresh1;
|
2013-05-18 07:42:01 +08:00
|
|
|
struct _thermal_state pkg_thresh0;
|
|
|
|
struct _thermal_state pkg_thresh1;
|
2010-07-30 08:13:44 +08:00
|
|
|
};
|
|
|
|
|
2011-01-03 19:52:04 +08:00
|
|
|
/* Callback to handle core threshold interrupts */
|
|
|
|
int (*platform_thermal_notify)(__u64 msr_val);
|
2011-01-21 12:12:40 +08:00
|
|
|
EXPORT_SYMBOL(platform_thermal_notify);
|
2011-01-03 19:52:04 +08:00
|
|
|
|
2013-05-18 07:42:01 +08:00
|
|
|
/* Callback to handle core package threshold_interrupts */
|
|
|
|
int (*platform_thermal_package_notify)(__u64 msr_val);
|
|
|
|
EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
|
|
|
|
|
|
|
|
/* Callback support of rate control, return true, if
|
|
|
|
* callback has rate control */
|
|
|
|
bool (*platform_thermal_package_rate_control)(void);
|
|
|
|
EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
|
|
|
|
|
|
|
|
|
2009-09-22 21:50:24 +08:00
|
|
|
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
|
|
|
|
|
|
|
|
static atomic_t therm_throt_en = ATOMIC_INIT(0);
|
2006-09-26 16:52:42 +08:00
|
|
|
|
2009-11-10 09:38:24 +08:00
|
|
|
static u32 lvtthmr_init __read_mostly;
|
|
|
|
|
2006-09-26 16:52:42 +08:00
|
|
|
#ifdef CONFIG_SYSFS
|
2011-12-22 06:29:42 +08:00
|
|
|
/*
 * Declare a read-only (0444) device attribute called _name whose show
 * handler is therm_throt_device_show_<_name> and which has no store
 * handler.
 */
#define define_therm_throt_device_one_ro(_name)				\
	static DEVICE_ATTR(_name, 0444,					\
			   therm_throt_device_show_##_name,		\
			   NULL)
|
2009-04-08 18:31:19 +08:00
|
|
|
|
2011-12-22 06:29:42 +08:00
|
|
|
/*
 * Generate therm_throt_device_show_<event>_<name>(), a sysfs show handler
 * that prints per_cpu(thermal_state, cpu).<event>.<name> as "%lu\n" for
 * the CPU identified by dev->id.  If that CPU is not online nothing is
 * written and 0 is returned.
 */
#define define_therm_throt_device_show_func(event, name)		\
									\
static ssize_t therm_throt_device_show_##event##_##name(		\
			struct device *dev,				\
			struct device_attribute *attr,			\
			char *buf)					\
{									\
	unsigned int cpu = dev->id;					\
	ssize_t ret;							\
									\
	preempt_disable();	/* CPU hotplug */			\
	if (cpu_online(cpu)) {						\
		ret = sprintf(buf, "%lu\n",				\
			      per_cpu(thermal_state, cpu).event.name);	\
	} else								\
		ret = 0;						\
	preempt_enable();						\
									\
	return ret;							\
}
|
|
|
|
|
2011-12-22 06:29:42 +08:00
|
|
|
/*
 * Show handlers and read-only attributes for the "count" field of each
 * throttle / power-limit state.
 */
define_therm_throt_device_show_func(core_throttle, count);
define_therm_throt_device_one_ro(core_throttle_count);

define_therm_throt_device_show_func(core_power_limit, count);
define_therm_throt_device_one_ro(core_power_limit_count);

define_therm_throt_device_show_func(package_throttle, count);
define_therm_throt_device_one_ro(package_throttle_count);

define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
|
2010-07-30 08:13:45 +08:00
|
|
|
|
2006-09-26 16:52:42 +08:00
|
|
|
static struct attribute *thermal_throttle_attrs[] = {
|
2011-12-22 06:29:42 +08:00
|
|
|
&dev_attr_core_throttle_count.attr,
|
2006-09-26 16:52:42 +08:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2010-07-30 08:13:45 +08:00
|
|
|
static struct attribute_group thermal_attr_group = {
|
2009-04-08 18:31:19 +08:00
|
|
|
.attrs = thermal_throttle_attrs,
|
|
|
|
.name = "thermal_throttle"
|
2006-09-26 16:52:42 +08:00
|
|
|
};
|
|
|
|
#endif /* CONFIG_SYSFS */
|
2006-09-26 16:52:42 +08:00
|
|
|
|
2010-07-30 08:13:45 +08:00
|
|
|
/* Scope of a thermal event: a single core or the whole package */
#define CORE_LEVEL	0
#define PACKAGE_LEVEL	1
|
|
|
|
|
2006-09-26 16:52:42 +08:00
|
|
|
/***
|
2006-09-26 16:52:42 +08:00
|
|
|
* therm_throt_process - Process thermal throttling event from interrupt
|
2006-09-26 16:52:42 +08:00
|
|
|
* @curr: Whether the condition is current or not (boolean), since the
|
|
|
|
* thermal interrupt normally gets called both when the thermal
|
|
|
|
* event begins and once the event has ended.
|
|
|
|
*
|
2006-09-26 16:52:42 +08:00
|
|
|
* This function is called by the thermal interrupt after the
|
2006-09-26 16:52:42 +08:00
|
|
|
* IRQ has been acknowledged.
|
|
|
|
*
|
|
|
|
* It will take care of rate limiting and printing messages to the syslog.
|
|
|
|
*
|
|
|
|
* Returns: 0 : Event should NOT be further logged, i.e. still in
|
|
|
|
* "timeout" from previous log message.
|
|
|
|
* 1 : Event should be logged further, and a message has been
|
|
|
|
* printed to the syslog.
|
|
|
|
*/
|
2010-07-30 08:13:45 +08:00
|
|
|
static int therm_throt_process(bool new_event, int event, int level)
|
2006-09-26 16:52:42 +08:00
|
|
|
{
|
2010-07-30 08:13:44 +08:00
|
|
|
struct _thermal_state *state;
|
2010-07-30 08:13:45 +08:00
|
|
|
unsigned int this_cpu = smp_processor_id();
|
|
|
|
bool old_event;
|
2009-09-22 21:50:24 +08:00
|
|
|
u64 now;
|
2010-07-30 08:13:45 +08:00
|
|
|
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
|
2009-09-22 21:50:24 +08:00
|
|
|
|
|
|
|
now = get_jiffies_64();
|
2010-07-30 08:13:45 +08:00
|
|
|
if (level == CORE_LEVEL) {
|
|
|
|
if (event == THERMAL_THROTTLING_EVENT)
|
|
|
|
state = &pstate->core_throttle;
|
|
|
|
else if (event == POWER_LIMIT_EVENT)
|
|
|
|
state = &pstate->core_power_limit;
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
} else if (level == PACKAGE_LEVEL) {
|
|
|
|
if (event == THERMAL_THROTTLING_EVENT)
|
|
|
|
state = &pstate->package_throttle;
|
|
|
|
else if (event == POWER_LIMIT_EVENT)
|
|
|
|
state = &pstate->package_power_limit;
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
} else
|
|
|
|
return 0;
|
2009-09-22 21:50:24 +08:00
|
|
|
|
2010-07-30 08:13:45 +08:00
|
|
|
old_event = state->new_event;
|
|
|
|
state->new_event = new_event;
|
2006-09-26 16:52:42 +08:00
|
|
|
|
2010-07-30 08:13:45 +08:00
|
|
|
if (new_event)
|
|
|
|
state->count++;
|
2006-09-26 16:52:42 +08:00
|
|
|
|
2009-09-22 21:50:24 +08:00
|
|
|
if (time_before64(now, state->next_check) &&
|
2010-07-30 08:13:45 +08:00
|
|
|
state->count != state->last_count)
|
2006-09-26 16:52:42 +08:00
|
|
|
return 0;
|
|
|
|
|
2009-09-22 21:50:24 +08:00
|
|
|
state->next_check = now + CHECK_INTERVAL;
|
2010-07-30 08:13:45 +08:00
|
|
|
state->last_count = state->count;
|
2006-09-26 16:52:42 +08:00
|
|
|
|
|
|
|
/* if we just entered the thermal event */
|
2010-07-30 08:13:45 +08:00
|
|
|
if (new_event) {
|
|
|
|
if (event == THERMAL_THROTTLING_EVENT)
|
2016-02-02 11:45:02 +08:00
|
|
|
pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
|
2010-07-30 08:13:45 +08:00
|
|
|
this_cpu,
|
|
|
|
level == CORE_LEVEL ? "Core" : "Package",
|
|
|
|
state->count);
|
2009-08-16 22:54:37 +08:00
|
|
|
return 1;
|
|
|
|
}
|
2010-07-30 08:13:45 +08:00
|
|
|
if (old_event) {
|
|
|
|
if (event == THERMAL_THROTTLING_EVENT)
|
2016-02-02 11:45:02 +08:00
|
|
|
pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
|
2010-07-30 08:13:45 +08:00
|
|
|
level == CORE_LEVEL ? "Core" : "Package");
|
2009-08-16 22:54:37 +08:00
|
|
|
return 1;
|
2006-09-26 16:52:42 +08:00
|
|
|
}
|
|
|
|
|
2009-08-16 22:54:37 +08:00
|
|
|
return 0;
|
2006-09-26 16:52:42 +08:00
|
|
|
}
|
2006-09-26 16:52:42 +08:00
|
|
|
|
2013-05-18 07:42:01 +08:00
|
|
|
static int thresh_event_valid(int level, int event)
|
2011-01-03 19:52:04 +08:00
|
|
|
{
|
|
|
|
struct _thermal_state *state;
|
|
|
|
unsigned int this_cpu = smp_processor_id();
|
|
|
|
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
|
|
|
|
u64 now = get_jiffies_64();
|
|
|
|
|
2013-05-18 07:42:01 +08:00
|
|
|
if (level == PACKAGE_LEVEL)
|
|
|
|
state = (event == 0) ? &pstate->pkg_thresh0 :
|
|
|
|
&pstate->pkg_thresh1;
|
|
|
|
else
|
|
|
|
state = (event == 0) ? &pstate->core_thresh0 :
|
|
|
|
&pstate->core_thresh1;
|
2011-01-03 19:52:04 +08:00
|
|
|
|
|
|
|
if (time_before64(now, state->next_check))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
state->next_check = now + CHECK_INTERVAL;
|
2013-05-18 07:42:01 +08:00
|
|
|
|
2011-01-03 19:52:04 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2013-05-22 03:35:17 +08:00
|
|
|
static bool int_pln_enable;
|
|
|
|
static int __init int_pln_enable_setup(char *s)
|
|
|
|
{
|
|
|
|
int_pln_enable = true;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("int_pln_enable", int_pln_enable_setup);
|
|
|
|
|
2006-09-26 16:52:42 +08:00
|
|
|
#ifdef CONFIG_SYSFS
|
2009-04-08 18:31:19 +08:00
|
|
|
/* Add/Remove thermal_throttle interface for CPU device: */
|
x86: delete __cpuinit usage from all x86 files
The __cpuinit type of throwaway sections might have made sense
some time ago when RAM was more constrained, but now the savings
do not offset the cost and complications. For example, the fix in
commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time")
is a good example of the nasty type of bugs that can be created
with improper use of the various __init prefixes.
After a discussion on LKML[1] it was decided that cpuinit should go
the way of devinit and be phased out. Once all the users are gone,
we can then finally remove the macros themselves from linux/init.h.
Note that some harmless section mismatch warnings may result, since
notify_cpu_starting() and cpu_up() are arch independent (kernel/cpu.c)
are flagged as __cpuinit -- so if we remove the __cpuinit from
arch specific callers, we will also get section mismatch warnings.
As an intermediate step, we intend to turn the linux/init.h cpuinit
content into no-ops as early as possible, since that will get rid
of these warnings. In any case, they are temporary and harmless.
This removes all the arch/x86 uses of the __cpuinit macros from
all C files. x86 only had the one __CPUINIT used in assembly files,
and it wasn't paired off with a .previous or a __FINIT, so we can
delete it directly w/o any corresponding additional change there.
[1] https://lkml.org/lkml/2013/5/20/589
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2013-06-19 06:23:59 +08:00
|
|
|
static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
|
2006-09-26 16:52:42 +08:00
|
|
|
{
|
2010-07-30 08:13:44 +08:00
|
|
|
int err;
|
2010-08-20 15:36:34 +08:00
|
|
|
struct cpuinfo_x86 *c = &cpu_data(cpu);
|
2010-07-30 08:13:44 +08:00
|
|
|
|
2011-12-22 06:29:42 +08:00
|
|
|
err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
|
2010-07-30 08:13:44 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2013-05-22 03:35:17 +08:00
|
|
|
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
|
2011-12-22 06:29:42 +08:00
|
|
|
err = sysfs_add_file_to_group(&dev->kobj,
|
|
|
|
&dev_attr_core_power_limit_count.attr,
|
2010-07-30 08:13:45 +08:00
|
|
|
thermal_attr_group.name);
|
2010-08-26 16:29:05 +08:00
|
|
|
if (cpu_has(c, X86_FEATURE_PTS)) {
|
2011-12-22 06:29:42 +08:00
|
|
|
err = sysfs_add_file_to_group(&dev->kobj,
|
|
|
|
&dev_attr_package_throttle_count.attr,
|
2010-07-30 08:13:45 +08:00
|
|
|
thermal_attr_group.name);
|
2013-05-22 03:35:17 +08:00
|
|
|
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
|
2011-12-22 06:29:42 +08:00
|
|
|
err = sysfs_add_file_to_group(&dev->kobj,
|
|
|
|
&dev_attr_package_power_limit_count.attr,
|
2010-07-30 08:13:45 +08:00
|
|
|
thermal_attr_group.name);
|
2010-08-26 16:29:05 +08:00
|
|
|
}
|
2010-07-30 08:13:44 +08:00
|
|
|
|
|
|
|
return err;
|
2006-09-26 16:52:42 +08:00
|
|
|
}
|
|
|
|
|
x86: delete __cpuinit usage from all x86 files
The __cpuinit type of throwaway sections might have made sense
some time ago when RAM was more constrained, but now the savings
do not offset the cost and complications. For example, the fix in
commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time")
is a good example of the nasty type of bugs that can be created
with improper use of the various __init prefixes.
After a discussion on LKML[1] it was decided that cpuinit should go
the way of devinit and be phased out. Once all the users are gone,
we can then finally remove the macros themselves from linux/init.h.
Note that some harmless section mismatch warnings may result, since
notify_cpu_starting() and cpu_up() are arch independent (kernel/cpu.c)
are flagged as __cpuinit -- so if we remove the __cpuinit from
arch specific callers, we will also get section mismatch warnings.
As an intermediate step, we intend to turn the linux/init.h cpuinit
content into no-ops as early as possible, since that will get rid
of these warnings. In any case, they are temporary and harmless.
This removes all the arch/x86 uses of the __cpuinit macros from
all C files. x86 only had the one __CPUINIT used in assembly files,
and it wasn't paired off with a .previous or a __FINIT, so we can
delete it directly w/o any corresponding additional change there.
[1] https://lkml.org/lkml/2013/5/20/589
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
2013-06-19 06:23:59 +08:00
|
|
|
static void thermal_throttle_remove_dev(struct device *dev)
|
2006-09-26 16:52:42 +08:00
|
|
|
{
|
2011-12-22 06:29:42 +08:00
|
|
|
sysfs_remove_group(&dev->kobj, &thermal_attr_group);
|
2006-09-26 16:52:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Get notified when a cpu comes on/off. Be hotplug friendly. */

/* Hotplug online callback: add the thermal_throttle sysfs files for @cpu */
static int thermal_throttle_online(unsigned int cpu)
{
	struct device *dev = get_cpu_device(cpu);

	return thermal_throttle_add_dev(dev, cpu);
}
|
|
|
|
|
2016-11-21 20:15:32 +08:00
|
|
|
/*
 * Hotplug offline callback: remove the thermal_throttle sysfs files for
 * @cpu.  Always returns 0.
 */
static int thermal_throttle_offline(unsigned int cpu)
{
	struct device *dev = get_cpu_device(cpu);

	thermal_throttle_remove_dev(dev);
	return 0;
}
|
2006-09-26 16:52:42 +08:00
|
|
|
|
|
|
|
static __init int thermal_throttle_init_device(void)
|
|
|
|
{
|
2016-11-21 20:15:32 +08:00
|
|
|
int ret;
|
2006-09-26 16:52:42 +08:00
|
|
|
|
|
|
|
if (!atomic_read(&therm_throt_en))
|
|
|
|
return 0;
|
|
|
|
|
2016-11-21 20:15:32 +08:00
|
|
|
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
|
|
|
|
thermal_throttle_online,
|
|
|
|
thermal_throttle_offline);
|
|
|
|
return ret < 0 ? ret : 0;
|
2006-09-26 16:52:42 +08:00
|
|
|
}
|
|
|
|
device_initcall(thermal_throttle_init_device);
|
2009-06-15 16:25:27 +08:00
|
|
|
|
2006-09-26 16:52:42 +08:00
|
|
|
#endif /* CONFIG_SYSFS */
|
2009-06-15 16:25:27 +08:00
|
|
|
|
2013-05-18 07:42:01 +08:00
|
|
|
/*
 * Forward package threshold events found in @msr_val to
 * platform_thermal_package_notify(), if that callback is set.
 *
 * If the platform reports that it rate limits itself, the callback is
 * called once without further checks.  Otherwise each flagged threshold
 * goes through thresh_event_valid() separately, so the callback may be
 * called once per threshold with the same @msr_val.
 */
static void notify_package_thresholds(__u64 msr_val)
{
	bool notify_thres_0 = false;
	bool notify_thres_1 = false;

	if (!platform_thermal_package_notify)
		return;

	/* lower threshold check */
	if (msr_val & THERM_LOG_THRESHOLD0)
		notify_thres_0 = true;
	/* higher threshold check */
	if (msr_val & THERM_LOG_THRESHOLD1)
		notify_thres_1 = true;

	if (!notify_thres_0 && !notify_thres_1)
		return;

	if (platform_thermal_package_rate_control &&
		platform_thermal_package_rate_control()) {
		/* Rate control is implemented in callback */
		platform_thermal_package_notify(msr_val);
		return;
	}

	/* lower threshold reached */
	if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
		platform_thermal_package_notify(msr_val);
	/* higher threshold reached */
	if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
		platform_thermal_package_notify(msr_val);
}
|
|
|
|
|
2011-01-03 19:52:04 +08:00
|
|
|
/*
 * Forward core threshold events found in @msr_val to
 * platform_thermal_notify().  Each flagged threshold is rate limited
 * separately through thresh_event_valid(), so the callback may be called
 * once per threshold with the same @msr_val.
 */
static void notify_thresholds(__u64 msr_val)
{
	/* check whether the interrupt handler is defined;
	 * otherwise simply return
	 */
	if (!platform_thermal_notify)
		return;

	/* lower threshold reached */
	if ((msr_val & THERM_LOG_THRESHOLD0) &&
			thresh_event_valid(CORE_LEVEL, 0))
		platform_thermal_notify(msr_val);
	/* higher threshold reached */
	if ((msr_val & THERM_LOG_THRESHOLD1) &&
			thresh_event_valid(CORE_LEVEL, 1))
		platform_thermal_notify(msr_val);
}
|
|
|
|
|
2009-06-15 16:25:27 +08:00
|
|
|
/* Thermal transition interrupt handler */
|
2009-06-15 16:26:36 +08:00
|
|
|
static void intel_thermal_interrupt(void)
|
2009-06-15 16:25:27 +08:00
|
|
|
{
|
|
|
|
__u64 msr_val;
|
|
|
|
|
2016-03-24 12:07:39 +08:00
|
|
|
if (static_cpu_has(X86_FEATURE_HWP))
|
|
|
|
wrmsrl_safe(MSR_HWP_STATUS, 0);
|
|
|
|
|
2009-06-15 16:25:27 +08:00
|
|
|
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
|
2010-07-30 08:13:45 +08:00
|
|
|
|
2011-01-03 19:52:04 +08:00
|
|
|
/* Check for violation of core thermal thresholds*/
|
|
|
|
notify_thresholds(msr_val);
|
|
|
|
|
2010-07-30 08:13:44 +08:00
|
|
|
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
|
2010-07-30 08:13:45 +08:00
|
|
|
THERMAL_THROTTLING_EVENT,
|
2010-07-30 08:13:44 +08:00
|
|
|
CORE_LEVEL) != 0)
|
x86, mce, therm_throt: Don't report power limit and package level thermal throttle events in mcelog
Thermal throttle and power limit events are not defined as MCE errors in x86
architecture and should not generate MCE errors in mcelog.
Current kernel generates fake software defined MCE errors for these events.
This may confuse users because they may think the machine has real MCE errors
while actually only thermal throttle or power limit events happen.
To make it worse, buggy firmware on some platforms may falsely generate
the events. Therefore, kernel reports MCE errors which users think as real
hardware errors. Although the firmware bugs should be fixed, on the other hand,
kernel should not report MCE errors either.
So mcelog is not a good mechanism to report these events. To report the events, we count them in respective counters (core_power_limit_count,
package_power_limit_count, core_throttle_count, and package_throttle_count) in
/sys/devices/system/cpu/cpu#/thermal_throttle/. Users can check the counters
for each event on each CPU. Please note that all CPU's on one package report
duplicate counters. It's user application's responsibility to retrieve a package
level counter for one package.
This patch doesn't report package level power limit, core level power limit, and
package level thermal throttle events in mcelog. When the events happen, only
report them in respective counters in sysfs.
Since core level thermal throttle has been legacy code in kernel for a while and
users accepted it as MCE error in mcelog, core level thermal throttle is still
reported in mcelog. In the mean time, the event is counted in a counter in sysfs
as well.
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Borislav Petkov <bp@amd64.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20111215001945.GA21009@linux-os.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2011-11-05 04:31:23 +08:00
|
|
|
mce_log_therm_throt_event(msr_val);
|
2010-07-30 08:13:45 +08:00
|
|
|
|
2013-05-22 03:35:17 +08:00
|
|
|
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
|
x86, mce, therm_throt: Don't report power limit and package level thermal throttle events in mcelog
Thermal throttle and power limit events are not defined as MCE errors in x86
architecture and should not generate MCE errors in mcelog.
Current kernel generates fake software defined MCE errors for these events.
This may confuse users because they may think the machine has real MCE errors
while actually only thermal throttle or power limit events happen.
To make it worse, buggy firmware on some platforms may falsely generate
the events. Therefore, kernel reports MCE errors which users think as real
hardware errors. Although the firmware bugs should be fixed, on the other hand,
kernel should not report MCE errors either.
So mcelog is not a good mechanism to report these events. To report the events, we count them in respective counters (core_power_limit_count,
package_power_limit_count, core_throttle_count, and package_throttle_count) in
/sys/devices/system/cpu/cpu#/thermal_throttle/. Users can check the counters
for each event on each CPU. Please note that all CPU's on one package report
duplicate counters. It's user application's responsibility to retrieve a package
level counter for one package.
This patch doesn't report package level power limit, core level power limit, and
package level thermal throttle events in mcelog. When the events happen, only
report them in respective counters in sysfs.
Since core level thermal throttle has been legacy code in kernel for a while and
users accepted it as MCE error in mcelog, core level thermal throttle is still
reported in mcelog. In the mean time, the event is counted in a counter in sysfs
as well.
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Borislav Petkov <bp@amd64.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20111215001945.GA21009@linux-os.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2011-11-05 04:31:23 +08:00
|
|
|
therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
|
2010-07-30 08:13:45 +08:00
|
|
|
POWER_LIMIT_EVENT,
|
x86, mce, therm_throt: Don't report power limit and package level thermal throttle events in mcelog
Thermal throttle and power limit events are not defined as MCE errors in x86
architecture and should not generate MCE errors in mcelog.
Current kernel generates fake software defined MCE errors for these events.
This may confuse users because they may think the machine has real MCE errors
while actually only thermal throttle or power limit events happen.
To make it worse, buggy firmware on some platforms may falsely generate
the events. Therefore, kernel reports MCE errors which users think as real
hardware errors. Although the firmware bugs should be fixed, on the other hand,
kernel should not report MCE errors either.
So mcelog is not a good mechanism to report these events. To report the events, we count them in respective counters (core_power_limit_count,
package_power_limit_count, core_throttle_count, and package_throttle_count) in
/sys/devices/system/cpu/cpu#/thermal_throttle/. Users can check the counters
for each event on each CPU. Please note that all CPU's on one package report
duplicate counters. It's user application's responsibility to retrieve a package
level counter for one package.
This patch doesn't report package level power limit, core level power limit, and
package level thermal throttle events in mcelog. When the events happen, only
report them in respective counters in sysfs.
Since core level thermal throttle has been legacy code in kernel for a while and
users accepted it as MCE error in mcelog, core level thermal throttle is still
reported in mcelog. In the mean time, the event is counted in a counter in sysfs
as well.
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Borislav Petkov <bp@amd64.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20111215001945.GA21009@linux-os.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2011-11-05 04:31:23 +08:00
|
|
|
CORE_LEVEL);
|
2010-07-30 08:13:44 +08:00
|
|
|
|
2011-03-12 19:50:46 +08:00
|
|
|
if (this_cpu_has(X86_FEATURE_PTS)) {
|
2010-07-30 08:13:44 +08:00
|
|
|
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
|
2013-05-18 07:42:01 +08:00
|
|
|
/* check violations of package thermal thresholds */
|
|
|
|
notify_package_thresholds(msr_val);
|
x86, mce, therm_throt: Don't report power limit and package level thermal throttle events in mcelog
Thermal throttle and power limit events are not defined as MCE errors in x86
architecture and should not generate MCE errors in mcelog.
Current kernel generates fake software defined MCE errors for these events.
This may confuse users because they may think the machine has real MCE errors
while actually only thermal throttle or power limit events happen.
To make it worse, buggy firmware on some platforms may falsely generate
the events. Therefore, kernel reports MCE errors which users think as real
hardware errors. Although the firmware bugs should be fixed, on the other hand,
kernel should not report MCE errors either.
So mcelog is not a good mechanism to report these events. To report the events, we count them in respective counters (core_power_limit_count,
package_power_limit_count, core_throttle_count, and package_throttle_count) in
/sys/devices/system/cpu/cpu#/thermal_throttle/. Users can check the counters
for each event on each CPU. Please note that all CPU's on one package report
duplicate counters. It's user application's responsibility to retrieve a package
level counter for one package.
This patch doesn't report package level power limit, core level power limit, and
package level thermal throttle events in mcelog. When the events happen, only
report them in respective counters in sysfs.
Since core level thermal throttle has been legacy code in kernel for a while and
users accepted it as MCE error in mcelog, core level thermal throttle is still
reported in mcelog. In the mean time, the event is counted in a counter in sysfs
as well.
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Borislav Petkov <bp@amd64.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20111215001945.GA21009@linux-os.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2011-11-05 04:31:23 +08:00
|
|
|
therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
|
2010-07-30 08:13:45 +08:00
|
|
|
THERMAL_THROTTLING_EVENT,
|
x86, mce, therm_throt: Don't report power limit and package level thermal throttle events in mcelog
Thermal throttle and power limit events are not defined as MCE errors in x86
architecture and should not generate MCE errors in mcelog.
Current kernel generates fake software defined MCE errors for these events.
This may confuse users because they may think the machine has real MCE errors
while actually only thermal throttle or power limit events happen.
To make it worse, buggy firmware on some platforms may falsely generate
the events. Therefore, kernel reports MCE errors which users think as real
hardware errors. Although the firmware bugs should be fixed, on the other hand,
kernel should not report MCE errors either.
So mcelog is not a good mechanism to report these events. To report the events, we count them in respective counters (core_power_limit_count,
package_power_limit_count, core_throttle_count, and package_throttle_count) in
/sys/devices/system/cpu/cpu#/thermal_throttle/. Users can check the counters
for each event on each CPU. Please note that all CPU's on one package report
duplicate counters. It's the user application's responsibility to retrieve a package
level counter for one package.
This patch doesn't report package level power limit, core level power limit, and
package level thermal throttle events in mcelog. When the events happen, only
report them in respective counters in sysfs.
Since core level thermal throttle has been legacy code in kernel for a while and
users accepted it as MCE error in mcelog, core level thermal throttle is still
reported in mcelog. In the mean time, the event is counted in a counter in sysfs
as well.
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Borislav Petkov <bp@amd64.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20111215001945.GA21009@linux-os.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2011-11-05 04:31:23 +08:00
|
|
|
PACKAGE_LEVEL);
|
2013-05-22 03:35:17 +08:00
|
|
|
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
|
x86, mce, therm_throt: Don't report power limit and package level thermal throttle events in mcelog
Thermal throttle and power limit events are not defined as MCE errors in x86
architecture and should not generate MCE errors in mcelog.
Current kernel generates fake software defined MCE errors for these events.
This may confuse users because they may think the machine has real MCE errors
while actually only thermal throttle or power limit events happen.
To make it worse, buggy firmware on some platforms may falsely generate
the events. Therefore, kernel reports MCE errors which users think as real
hardware errors. Although the firmware bugs should be fixed, on the other hand,
kernel should not report MCE errors either.
So mcelog is not a good mechanism to report these events. To report the events, we count them in respective counters (core_power_limit_count,
package_power_limit_count, core_throttle_count, and package_throttle_count) in
/sys/devices/system/cpu/cpu#/thermal_throttle/. Users can check the counters
for each event on each CPU. Please note that all CPU's on one package report
duplicate counters. It's the user application's responsibility to retrieve a package
level counter for one package.
This patch doesn't report package level power limit, core level power limit, and
package level thermal throttle events in mcelog. When the events happen, only
report them in respective counters in sysfs.
Since core level thermal throttle has been legacy code in kernel for a while and
users accepted it as MCE error in mcelog, core level thermal throttle is still
reported in mcelog. In the mean time, the event is counted in a counter in sysfs
as well.
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Borislav Petkov <bp@amd64.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20111215001945.GA21009@linux-os.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2011-11-05 04:31:23 +08:00
|
|
|
therm_throt_process(msr_val &
|
2010-07-30 08:13:45 +08:00
|
|
|
PACKAGE_THERM_STATUS_POWER_LIMIT,
|
|
|
|
POWER_LIMIT_EVENT,
|
x86, mce, therm_throt: Don't report power limit and package level thermal throttle events in mcelog
Thermal throttle and power limit events are not defined as MCE errors in x86
architecture and should not generate MCE errors in mcelog.
Current kernel generates fake software defined MCE errors for these events.
This may confuse users because they may think the machine has real MCE errors
while actually only thermal throttle or power limit events happen.
To make it worse, buggy firmware on some platforms may falsely generate
the events. Therefore, kernel reports MCE errors which users think as real
hardware errors. Although the firmware bugs should be fixed, on the other hand,
kernel should not report MCE errors either.
So mcelog is not a good mechanism to report these events. To report the events, we count them in respective counters (core_power_limit_count,
package_power_limit_count, core_throttle_count, and package_throttle_count) in
/sys/devices/system/cpu/cpu#/thermal_throttle/. Users can check the counters
for each event on each CPU. Please note that all CPU's on one package report
duplicate counters. It's the user application's responsibility to retrieve a package
level counter for one package.
This patch doesn't report package level power limit, core level power limit, and
package level thermal throttle events in mcelog. When the events happen, only
report them in respective counters in sysfs.
Since core level thermal throttle has been legacy code in kernel for a while and
users accepted it as MCE error in mcelog, core level thermal throttle is still
reported in mcelog. In the mean time, the event is counted in a counter in sysfs
as well.
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Acked-by: Borislav Petkov <bp@amd64.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/20111215001945.GA21009@linux-os.sc.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2011-11-05 04:31:23 +08:00
|
|
|
PACKAGE_LEVEL);
|
2010-07-30 08:13:44 +08:00
|
|
|
}
|
2009-06-15 16:25:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Fallback thermal interrupt handler, used as the initial value of
 * smp_thermal_vector.  It only reports that an LVT thermal interrupt was
 * delivered on the current CPU.
 */
static void unexpected_thermal_interrupt(void)
{
	int cpu = smp_processor_id();

	pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", cpu);
}
|
|
|
|
|
|
|
|
/*
 * Handler invoked from __smp_thermal_interrupt().  It starts out pointing at
 * the "unexpected interrupt" reporter and is switched to
 * intel_thermal_interrupt by intel_init_thermal().
 */
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
|
|
|
|
|
x86, trace: Introduce entering/exiting_irq()
When implementing tracepoints in interrupt handlers, if the tracepoints are
simply added in the performance sensitive path of interrupt handlers,
it may cause a potential performance problem due to the time penalty.
To solve the problem, an idea is to prepare non-trace/trace irq handlers and
switch their IDTs at the enabling/disabling time.
So, let's introduce entering_irq()/exiting_irq() for pre/post-
processing of each irq handler.
A way to use them is as follows.
Non-trace irq handler:
smp_irq_handler()
{
entering_irq(); /* pre-processing of this handler */
__smp_irq_handler(); /*
* common logic between non-trace and trace handlers
* in a vector.
*/
exiting_irq(); /* post-processing of this handler */
}
Trace irq_handler:
smp_trace_irq_handler()
{
entering_irq(); /* pre-processing of this handler */
trace_irq_entry(); /* tracepoint for irq entry */
__smp_irq_handler(); /*
* common logic between non-trace and trace handlers
* in a vector.
*/
trace_irq_exit(); /* tracepoint for irq exit */
exiting_irq(); /* post-processing of this handler */
}
If tracepoints can place outside entering_irq()/exiting_irq() as follows,
it looks cleaner.
smp_trace_irq_handler()
{
trace_irq_entry();
smp_irq_handler();
trace_irq_exit();
}
But it doesn't work.
The problem is with irq_enter/exit() being called. They must be called before
trace_irq_enter/exit(), because rcu_irq_enter() must be called before
any tracepoints are used, as tracepoints use RCU to synchronize.
As a possible alternative, we may be able to call irq_enter() first as follows
if irq_enter() can nest.
smp_trace_irq_handler()
{
irq_entry();
trace_irq_entry();
smp_irq_handler();
trace_irq_exit();
irq_exit();
}
But it doesn't work, either.
If irq_enter() is nested, it may have a time penalty because it has to check if it
was already called or not. The time penalty is not desired in performance sensitive
paths even if it is tiny.
Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>
Link: http://lkml.kernel.org/r/51C3238D.9040706@hds.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
2013-06-20 23:45:17 +08:00
|
|
|
static inline void __smp_thermal_interrupt(void)
|
2009-06-15 16:25:27 +08:00
|
|
|
{
|
|
|
|
inc_irq_stat(irq_thermal_count);
|
|
|
|
smp_thermal_vector();
|
x86, trace: Introduce entering/exiting_irq()
When implementing tracepoints in interrupt handers, if the tracepoints are
simply added in the performance sensitive path of interrupt handers,
it may cause potential performance problem due to the time penalty.
To solve the problem, an idea is to prepare non-trace/trace irq handers and
switch their IDTs at the enabling/disabling time.
So, let's introduce entering_irq()/exiting_irq() for pre/post-
processing of each irq handler.
A way to use them is as follows.
Non-trace irq handler:
smp_irq_handler()
{
entering_irq(); /* pre-processing of this handler */
__smp_irq_handler(); /*
* common logic between non-trace and trace handlers
* in a vector.
*/
exiting_irq(); /* post-processing of this handler */
}
Trace irq_handler:
smp_trace_irq_handler()
{
entering_irq(); /* pre-processing of this handler */
trace_irq_entry(); /* tracepoint for irq entry */
__smp_irq_handler(); /*
* common logic between non-trace and trace handlers
* in a vector.
*/
trace_irq_exit(); /* tracepoint for irq exit */
exiting_irq(); /* post-processing of this handler */
}
If tracepoints can place outside entering_irq()/exiting_irq() as follows,
it looks cleaner.
smp_trace_irq_handler()
{
trace_irq_entry();
smp_irq_handler();
trace_irq_exit();
}
But it doesn't work.
The problem is with irq_enter/exit() being called. They must be called before
trace_irq_enter/exit(), because of the rcu_irq_enter() must be called before
any tracepoints are used, as tracepoints use rcu to synchronize.
As a possible alternative, we may be able to call irq_enter() first as follows
if irq_enter() can nest.
smp_trace_irq_hander()
{
irq_entry();
trace_irq_entry();
smp_irq_handler();
trace_irq_exit();
irq_exit();
}
But it doesn't work, either.
If irq_enter() is nested, it may have a time penalty because it has to check if it
was already called or not. The time penalty is not desired in performance sensitive
paths even if it is tiny.
Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>
Link: http://lkml.kernel.org/r/51C3238D.9040706@hds.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
2013-06-20 23:45:17 +08:00
|
|
|
}
|
|
|
|
|
2014-05-02 06:44:37 +08:00
|
|
|
/*
 * Non-tracing entry point for the thermal interrupt.
 *
 * @regs: interrupted register state; not used by this function.
 *
 * The common handler body is bracketed by the irq pre-processing
 * (entering_irq()) and post-processing (exiting_ack_irq()) helpers.
 */
asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
{
	entering_irq();
	__smp_thermal_interrupt();
	exiting_ack_irq();
}
|
|
|
|
|
2014-05-02 06:44:37 +08:00
|
|
|
asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
|
x86, trace: Add irq vector tracepoints
[Purpose of this patch]
As Vaibhav explained in the thread below, tracepoints for irq vectors
are useful.
http://www.spinics.net/lists/mm-commits/msg85707.html
<snip>
The current interrupt traces from irq_handler_entry and irq_handler_exit
provide when an interrupt is handled. They provide good data about when
the system has switched to kernel space and how it affects the currently
running processes.
There are some IRQ vectors which trigger the system into kernel space,
which are not handled in generic IRQ handlers. Tracing such events gives
us the information about IRQ interaction with other system events.
The trace also tells where the system is spending its time. We want to
know which cores are handling interrupts and how they are affecting other
processes in the system. Also, the trace provides information about when
the cores are idle and which interrupts are changing that state.
<snip>
On the other hand, my usecase is tracing just local timer event and
getting a value of instruction pointer.
I suggested to add an argument local timer event to get instruction pointer before.
But there is another way to get it with external module like systemtap.
So, I don't need to add any argument to irq vector tracepoints now.
[Patch Description]
Vaibhav's patch shared a trace point ,irq_vector_entry/irq_vector_exit, in all events.
But there is an above use case to trace specific irq_vector rather than tracing all events.
In this case, we are concerned about overhead due to unwanted events.
So, add following tracepoints instead of introducing irq_vector_entry/exit.
so that we can enable them independently.
- local_timer_vector
- reschedule_vector
- call_function_vector
- call_function_single_vector
- irq_work_entry_vector
- error_apic_vector
- thermal_apic_vector
- threshold_apic_vector
- spurious_apic_vector
- x86_platform_ipi_vector
Also, introduce a logic switching IDT at enabling/disabling time so that a time penalty
makes a zero when tracepoints are disabled. Detailed explanations are as follows.
- Create trace irq handlers with entering_irq()/exiting_irq().
- Create a new IDT, trace_idt_table, at boot time by adding a logic to
_set_gate(). It is just a copy of original idt table.
- Register the new handlers for tracpoints to the new IDT by introducing
macros to alloc_intr_gate() called at registering time of irq_vector handlers.
- Add checking, whether irq vector tracing is on/off, into load_current_idt().
This has to be done below debug checking for these reasons.
- Switching to debug IDT may be kicked while tracing is enabled.
- On the other hands, switching to trace IDT is kicked only when debugging
is disabled.
In addition, the new IDT is created only when CONFIG_TRACING is enabled to avoid being
used for other purposes.
Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>
Link: http://lkml.kernel.org/r/51C323ED.5050708@hds.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
2013-06-20 23:46:53 +08:00
|
|
|
{
|
|
|
|
entering_irq();
|
|
|
|
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
|
|
|
|
__smp_thermal_interrupt();
|
|
|
|
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
|
|
|
|
exiting_ack_irq();
|
|
|
|
}
|
|
|
|
|
2009-12-14 16:57:00 +08:00
|
|
|
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
|
|
|
|
static int intel_thermal_supported(struct cpuinfo_x86 *c)
|
|
|
|
{
|
2016-04-05 04:25:00 +08:00
|
|
|
if (!boot_cpu_has(X86_FEATURE_APIC))
|
2009-12-14 16:57:00 +08:00
|
|
|
return 0;
|
|
|
|
if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-11-11 15:51:25 +08:00
|
|
|
/*
 * Called on the boot CPU only.  When thermal monitoring is supported, record
 * the BSP's initial thermal LVT value in lvtthmr_init; that saved value is
 * used later to restore the thermal LVT entry the BIOS programmed on the APs.
 */
void __init mcheck_intel_therm_init(void)
{
	if (!intel_thermal_supported(&boot_cpu_data))
		return;

	lvtthmr_init = apic_read(APIC_LVTTHMR);
}
|
|
|
|
|
2009-11-12 14:52:40 +08:00
|
|
|
void intel_init_thermal(struct cpuinfo_x86 *c)
|
2009-06-15 16:26:10 +08:00
|
|
|
{
|
|
|
|
unsigned int cpu = smp_processor_id();
|
|
|
|
int tm2 = 0;
|
|
|
|
u32 l, h;
|
|
|
|
|
2009-12-14 16:57:00 +08:00
|
|
|
if (!intel_thermal_supported(c))
|
2009-06-15 16:26:10 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* First check if its enabled already, in which case there might
|
|
|
|
* be some SMM goo which handles it, so we can't even put a handler
|
|
|
|
* since it might be delivered via SMI already:
|
|
|
|
*/
|
|
|
|
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
|
2009-11-10 09:38:24 +08:00
|
|
|
|
2011-04-22 00:22:43 +08:00
|
|
|
h = lvtthmr_init;
|
2009-11-10 09:38:24 +08:00
|
|
|
/*
|
|
|
|
* The initial value of thermal LVT entries on all APs always reads
|
|
|
|
* 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
|
|
|
|
* sequence to them and LVT registers are reset to 0s except for
|
|
|
|
* the mask bits which are set to 1s when APs receive INIT IPI.
|
2011-04-22 00:22:43 +08:00
|
|
|
* If BIOS takes over the thermal interrupt and sets its interrupt
|
|
|
|
* delivery mode to SMI (not fixed), it restores the value that the
|
|
|
|
* BIOS has programmed on AP based on BSP's info we saved since BIOS
|
|
|
|
* is always setting the same value for all threads/cores.
|
2009-11-10 09:38:24 +08:00
|
|
|
*/
|
2011-04-22 00:22:43 +08:00
|
|
|
if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
|
|
|
|
apic_write(APIC_LVTTHMR, lvtthmr_init);
|
2009-11-10 09:38:24 +08:00
|
|
|
|
|
|
|
|
2009-06-15 16:26:10 +08:00
|
|
|
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
|
2014-09-19 03:22:15 +08:00
|
|
|
if (system_state == SYSTEM_BOOTING)
|
2016-02-02 11:45:02 +08:00
|
|
|
pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
|
2009-06-15 16:26:10 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2009-07-29 06:04:59 +08:00
|
|
|
/* early Pentium M models use different method for enabling TM2 */
|
|
|
|
if (cpu_has(c, X86_FEATURE_TM2)) {
|
|
|
|
if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
|
|
|
|
rdmsr(MSR_THERM2_CTL, l, h);
|
|
|
|
if (l & MSR_THERM2_CTL_TM_SELECT)
|
|
|
|
tm2 = 1;
|
|
|
|
} else if (l & MSR_IA32_MISC_ENABLE_TM2)
|
|
|
|
tm2 = 1;
|
|
|
|
}
|
|
|
|
|
2009-06-15 16:26:10 +08:00
|
|
|
/* We'll mask the thermal vector in the lapic till we're ready: */
|
|
|
|
h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
|
|
|
|
apic_write(APIC_LVTTHMR, h);
|
|
|
|
|
|
|
|
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
|
2013-05-22 03:35:17 +08:00
|
|
|
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
|
2010-07-30 08:13:45 +08:00
|
|
|
wrmsr(MSR_IA32_THERM_INTERRUPT,
|
2013-05-22 03:35:17 +08:00
|
|
|
(l | (THERM_INT_LOW_ENABLE
|
|
|
|
| THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
|
|
|
|
else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
|
2010-07-30 08:13:45 +08:00
|
|
|
wrmsr(MSR_IA32_THERM_INTERRUPT,
|
2013-05-22 03:35:17 +08:00
|
|
|
l | (THERM_INT_LOW_ENABLE
|
2010-07-30 08:13:45 +08:00
|
|
|
| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
|
|
|
|
else
|
|
|
|
wrmsr(MSR_IA32_THERM_INTERRUPT,
|
|
|
|
l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
|
2009-06-15 16:26:10 +08:00
|
|
|
|
2010-07-30 08:13:44 +08:00
|
|
|
if (cpu_has(c, X86_FEATURE_PTS)) {
|
|
|
|
rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
|
2013-05-22 03:35:17 +08:00
|
|
|
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
|
2010-07-30 08:13:45 +08:00
|
|
|
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
|
2013-05-22 03:35:17 +08:00
|
|
|
(l | (PACKAGE_THERM_INT_LOW_ENABLE
|
|
|
|
| PACKAGE_THERM_INT_HIGH_ENABLE))
|
|
|
|
& ~PACKAGE_THERM_INT_PLN_ENABLE, h);
|
|
|
|
else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
|
|
|
|
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
|
|
|
|
l | (PACKAGE_THERM_INT_LOW_ENABLE
|
2010-07-30 08:13:45 +08:00
|
|
|
| PACKAGE_THERM_INT_HIGH_ENABLE
|
|
|
|
| PACKAGE_THERM_INT_PLN_ENABLE), h);
|
|
|
|
else
|
|
|
|
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
|
|
|
|
l | (PACKAGE_THERM_INT_LOW_ENABLE
|
|
|
|
| PACKAGE_THERM_INT_HIGH_ENABLE), h);
|
2010-07-30 08:13:44 +08:00
|
|
|
}
|
|
|
|
|
2009-06-15 16:26:36 +08:00
|
|
|
smp_thermal_vector = intel_thermal_interrupt;
|
2009-06-15 16:26:10 +08:00
|
|
|
|
|
|
|
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
|
|
|
|
wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
|
|
|
|
|
|
|
|
/* Unmask the thermal vector: */
|
|
|
|
l = apic_read(APIC_LVTTHMR);
|
|
|
|
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
|
|
|
|
|
2016-02-02 11:45:02 +08:00
|
|
|
pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
|
|
|
|
tm2 ? "TM2" : "TM1");
|
2009-06-15 16:26:10 +08:00
|
|
|
|
|
|
|
/* enable thermal throttle processing */
|
|
|
|
atomic_set(&therm_throt_en, 1);
|
|
|
|
}
|