2011-11-30 10:46:42 +08:00
|
|
|
/*
 *  processor_idle - idle state cpuidle driver.
 *  Adapted from drivers/idle/intel_idle.c and
 *  drivers/acpi/processor_idle.c
 *
 */
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/moduleparam.h>
|
|
|
|
#include <linux/cpuidle.h>
|
|
|
|
#include <linux/cpu.h>
|
2012-05-21 02:34:27 +08:00
|
|
|
#include <linux/notifier.h>
|
2011-11-30 10:46:42 +08:00
|
|
|
|
|
|
|
#include <asm/paca.h>
|
|
|
|
#include <asm/reg.h>
|
|
|
|
#include <asm/machdep.h>
|
|
|
|
#include <asm/firmware.h>
|
2012-03-29 01:30:02 +08:00
|
|
|
#include <asm/runlatch.h>
|
2013-08-22 17:53:52 +08:00
|
|
|
#include <asm/plpar_wrappers.h>
|
2011-11-30 10:46:42 +08:00
|
|
|
|
|
|
|
struct cpuidle_driver pseries_idle_driver = {
|
2013-04-03 20:15:22 +08:00
|
|
|
.name = "pseries_idle",
|
|
|
|
.owner = THIS_MODULE,
|
2011-11-30 10:46:42 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define MAX_IDLE_STATE_COUNT 2
|
|
|
|
|
|
|
|
static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
|
|
|
|
static struct cpuidle_device __percpu *pseries_cpuidle_devices;
|
|
|
|
static struct cpuidle_state *cpuidle_state_table;
|
|
|
|
|
2013-04-03 20:15:22 +08:00
|
|
|
static inline void idle_loop_prolog(unsigned long *in_purr)
|
2011-11-30 10:46:42 +08:00
|
|
|
{
|
|
|
|
*in_purr = mfspr(SPRN_PURR);
|
|
|
|
/*
|
|
|
|
* Indicate to the HV that we are idle. Now would be
|
|
|
|
* a good time to find other work to dispatch.
|
|
|
|
*/
|
|
|
|
get_lppaca()->idle = 1;
|
|
|
|
}
|
|
|
|
|
2013-04-03 20:15:22 +08:00
|
|
|
static inline void idle_loop_epilog(unsigned long in_purr)
|
2011-11-30 10:46:42 +08:00
|
|
|
{
|
2013-08-07 00:01:46 +08:00
|
|
|
u64 wait_cycles;
|
|
|
|
|
|
|
|
wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
|
|
|
|
wait_cycles += mfspr(SPRN_PURR) - in_purr;
|
|
|
|
get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
|
2011-11-30 10:46:42 +08:00
|
|
|
get_lppaca()->idle = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int snooze_loop(struct cpuidle_device *dev,
|
|
|
|
struct cpuidle_driver *drv,
|
|
|
|
int index)
|
|
|
|
{
|
|
|
|
unsigned long in_purr;
|
cpuidle/powerpc: Fix snooze state problem in the cpuidle design on pseries.
Earlier without cpuidle framework on pseries, the native arch
idle routine comprised of both snooze and nap
states. smt_snooze_delay variable was used to delay
the idle process entry to deeper idle state like nap.
With the coming of cpuidle, this arch specific idle was replaced
by two different idle routines, one for supporting snooze and other
for nap. This enabled addition of more
low level idle states on pseries in the future.
On adopting the generic cpuidle framework for POWER systems,
the decision of which idle state to choose from, given a predicted
idle time is taken by the menu governor based on
target_residency and exit_latency of the idle states.
target_residency is the minimum time to be resident in that idle state.
Exit_latency is time taken to exit out of idle state.
Deeper the idle state, both the target residency and exit latency
would be higher.
In the current design, smt_snooze_delay is used as target_residency
for the snooze state which is incorrect, as it is not the
minimum but the maximum duration to be in snooze state.
This would result in the governor in taking bad decision,
as presently target_residency of nap < target_residency of snooze
inspite of nap being deeper idle state.
This patch aims to fix this problem by replacing the smt_snooze_delay loop
in snooze state, with the need_resched() as the governor is aware of
entry and exit of various idle transitions based on which
next idle time prediction.
The governor is intelligent enough to determine the idle state the needs to
be transitioned to and maintains a whole of heuristics including
io load, previous idle states predictions etc for the same, based on
which idle state entry decision is taken.
With this fix, of setting target_residency of snooze to 0
nap to smt_snooze_delay
if the predicted idle time is less
than smt_snooze_delay (target_residency of nap)
value governor would pick snooze state, else nap. This adhers to the
previous native idle design.
Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2012-10-04 02:42:26 +08:00
|
|
|
int cpu = dev->cpu;
|
2011-11-30 10:46:42 +08:00
|
|
|
|
2013-04-03 20:15:22 +08:00
|
|
|
idle_loop_prolog(&in_purr);
|
cpuidle/powerpc: Fix snooze state problem in the cpuidle design on pseries.
Earlier without cpuidle framework on pseries, the native arch
idle routine comprised of both snooze and nap
states. smt_snooze_delay variable was used to delay
the idle process entry to deeper idle state like nap.
With the coming of cpuidle, this arch specific idle was replaced
by two different idle routines, one for supporting snooze and other
for nap. This enabled addition of more
low level idle states on pseries in the future.
On adopting the generic cpuidle framework for POWER systems,
the decision of which idle state to choose from, given a predicted
idle time is taken by the menu governor based on
target_residency and exit_latency of the idle states.
target_residency is the minimum time to be resident in that idle state.
Exit_latency is time taken to exit out of idle state.
Deeper the idle state, both the target residency and exit latency
would be higher.
In the current design, smt_snooze_delay is used as target_residency
for the snooze state which is incorrect, as it is not the
minimum but the maximum duration to be in snooze state.
This would result in the governor in taking bad decision,
as presently target_residency of nap < target_residency of snooze
inspite of nap being deeper idle state.
This patch aims to fix this problem by replacing the smt_snooze_delay loop
in snooze state, with the need_resched() as the governor is aware of
entry and exit of various idle transitions based on which
next idle time prediction.
The governor is intelligent enough to determine the idle state the needs to
be transitioned to and maintains a whole of heuristics including
io load, previous idle states predictions etc for the same, based on
which idle state entry decision is taken.
With this fix, of setting target_residency of snooze to 0
nap to smt_snooze_delay
if the predicted idle time is less
than smt_snooze_delay (target_residency of nap)
value governor would pick snooze state, else nap. This adhers to the
previous native idle design.
Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2012-10-04 02:42:26 +08:00
|
|
|
local_irq_enable();
|
|
|
|
set_thread_flag(TIF_POLLING_NRFLAG);
|
2011-11-30 10:46:42 +08:00
|
|
|
|
cpuidle/powerpc: Fix snooze state problem in the cpuidle design on pseries.
Earlier without cpuidle framework on pseries, the native arch
idle routine comprised of both snooze and nap
states. smt_snooze_delay variable was used to delay
the idle process entry to deeper idle state like nap.
With the coming of cpuidle, this arch specific idle was replaced
by two different idle routines, one for supporting snooze and other
for nap. This enabled addition of more
low level idle states on pseries in the future.
On adopting the generic cpuidle framework for POWER systems,
the decision of which idle state to choose from, given a predicted
idle time is taken by the menu governor based on
target_residency and exit_latency of the idle states.
target_residency is the minimum time to be resident in that idle state.
Exit_latency is time taken to exit out of idle state.
Deeper the idle state, both the target residency and exit latency
would be higher.
In the current design, smt_snooze_delay is used as target_residency
for the snooze state which is incorrect, as it is not the
minimum but the maximum duration to be in snooze state.
This would result in the governor in taking bad decision,
as presently target_residency of nap < target_residency of snooze
inspite of nap being deeper idle state.
This patch aims to fix this problem by replacing the smt_snooze_delay loop
in snooze state, with the need_resched() as the governor is aware of
entry and exit of various idle transitions based on which
next idle time prediction.
The governor is intelligent enough to determine the idle state the needs to
be transitioned to and maintains a whole of heuristics including
io load, previous idle states predictions etc for the same, based on
which idle state entry decision is taken.
With this fix, of setting target_residency of snooze to 0
nap to smt_snooze_delay
if the predicted idle time is less
than smt_snooze_delay (target_residency of nap)
value governor would pick snooze state, else nap. This adhers to the
previous native idle design.
Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2012-10-04 02:42:26 +08:00
|
|
|
while ((!need_resched()) && cpu_online(cpu)) {
|
|
|
|
ppc64_runlatch_off();
|
|
|
|
HMT_low();
|
|
|
|
HMT_very_low();
|
2011-11-30 10:46:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
HMT_medium();
|
cpuidle/powerpc: Fix snooze state problem in the cpuidle design on pseries.
Earlier without cpuidle framework on pseries, the native arch
idle routine comprised of both snooze and nap
states. smt_snooze_delay variable was used to delay
the idle process entry to deeper idle state like nap.
With the coming of cpuidle, this arch specific idle was replaced
by two different idle routines, one for supporting snooze and other
for nap. This enabled addition of more
low level idle states on pseries in the future.
On adopting the generic cpuidle framework for POWER systems,
the decision of which idle state to choose from, given a predicted
idle time is taken by the menu governor based on
target_residency and exit_latency of the idle states.
target_residency is the minimum time to be resident in that idle state.
Exit_latency is time taken to exit out of idle state.
Deeper the idle state, both the target residency and exit latency
would be higher.
In the current design, smt_snooze_delay is used as target_residency
for the snooze state which is incorrect, as it is not the
minimum but the maximum duration to be in snooze state.
This would result in the governor in taking bad decision,
as presently target_residency of nap < target_residency of snooze
inspite of nap being deeper idle state.
This patch aims to fix this problem by replacing the smt_snooze_delay loop
in snooze state, with the need_resched() as the governor is aware of
entry and exit of various idle transitions based on which
next idle time prediction.
The governor is intelligent enough to determine the idle state the needs to
be transitioned to and maintains a whole of heuristics including
io load, previous idle states predictions etc for the same, based on
which idle state entry decision is taken.
With this fix, of setting target_residency of snooze to 0
nap to smt_snooze_delay
if the predicted idle time is less
than smt_snooze_delay (target_residency of nap)
value governor would pick snooze state, else nap. This adhers to the
previous native idle design.
Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2012-10-04 02:42:26 +08:00
|
|
|
clear_thread_flag(TIF_POLLING_NRFLAG);
|
|
|
|
smp_mb();
|
|
|
|
|
2013-04-03 20:15:22 +08:00
|
|
|
idle_loop_epilog(in_purr);
|
|
|
|
|
2011-11-30 10:46:42 +08:00
|
|
|
return index;
|
|
|
|
}
|
|
|
|
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
/*
 * check_and_cede_processor - cede the processor if no interrupt is pending.
 *
 * Calls cede_processor() only when prep_irq_for_idle() returns non-zero.
 * With CONFIG_TRACE_IRQFLAGS, additionally warns and hard-enables
 * interrupts if MSR_EE is found clear after the cede returns.
 */
static void check_and_cede_processor(void)
{
	/*
	 * Ensure our interrupt state is properly tracked,
	 * also checks if no interrupt has occurred while we
	 * were soft-disabled
	 */
	if (prep_irq_for_idle()) {
		cede_processor();
#ifdef CONFIG_TRACE_IRQFLAGS
		/* Ensure that H_CEDE returns with IRQs on */
		if (WARN_ON(!(mfmsr() & MSR_EE)))
			__hard_irq_enable();
#endif
	}
}
|
|
|
|
|
2011-11-30 10:46:42 +08:00
|
|
|
static int dedicated_cede_loop(struct cpuidle_device *dev,
|
|
|
|
struct cpuidle_driver *drv,
|
|
|
|
int index)
|
|
|
|
{
|
|
|
|
unsigned long in_purr;
|
|
|
|
|
2013-04-03 20:15:22 +08:00
|
|
|
idle_loop_prolog(&in_purr);
|
2011-11-30 10:46:42 +08:00
|
|
|
get_lppaca()->donate_dedicated_cpu = 1;
|
|
|
|
|
|
|
|
ppc64_runlatch_off();
|
|
|
|
HMT_medium();
|
powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.
We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.
The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.
Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.
This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.
The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.
When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.
We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).
This removes the need to play with the decrementer to try to create
fake interrupts, among others.
In addition, this adds a few refinements:
- We no longer hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.
- Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.
- On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)
- We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.
Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2:
- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
to retrigger an interrupt without preventing hard-enable
v3:
- Fix or vs. ori bug on Book3E
- Fix enabling of interrupts for some exceptions on Book3E
v4:
- Fix resend of doorbells on return from interrupt on Book3E
v5:
- Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.
v6:
- 32-bit compile fix
- more compile fixes with various .config combos
- factor out the asm code to soft-disable interrupts
- remove the C wrapper around preempt_schedule_irq
v7:
- Fix a bug with hard irq state tracking on native power7
2012-03-06 15:27:59 +08:00
|
|
|
check_and_cede_processor();
|
2011-11-30 10:46:42 +08:00
|
|
|
|
|
|
|
get_lppaca()->donate_dedicated_cpu = 0;
|
2013-04-03 20:15:22 +08:00
|
|
|
|
|
|
|
idle_loop_epilog(in_purr);
|
|
|
|
|
2011-11-30 10:46:42 +08:00
|
|
|
return index;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * shared_cede_loop() - .enter handler for the "Shared Cede" idle state.
 * @dev:   cpuidle device of the calling CPU (not used here)
 * @drv:   cpuidle driver (not used here)
 * @index: index of the state being entered
 *
 * Brackets a cede to the hypervisor with the idle prolog/epilog helpers.
 * The prolog samples the PURR and marks the lppaca as idle; the epilog
 * is handed that PURR sample back so it can account the time spent here.
 *
 * Returns @index unchanged.
 */
static int shared_cede_loop(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index)
{
	unsigned long purr_at_entry;

	idle_loop_prolog(&purr_at_entry);

	/*
	 * Hand the processor back to the hypervisor. Control comes back
	 * either because an external interrupt arrived (it has already
	 * been driven by the time we return) or because another processor
	 * prodded us. External interrupts are enabled once we are back.
	 */
	check_and_cede_processor();

	idle_loop_epilog(purr_at_entry);

	return index;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* States for dedicated partition case.
|
|
|
|
*/
|
|
|
|
static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
|
|
|
|
{ /* Snooze */
|
|
|
|
.name = "snooze",
|
|
|
|
.desc = "snooze",
|
|
|
|
.flags = CPUIDLE_FLAG_TIME_VALID,
|
|
|
|
.exit_latency = 0,
|
|
|
|
.target_residency = 0,
|
|
|
|
.enter = &snooze_loop },
|
|
|
|
{ /* CEDE */
|
|
|
|
.name = "CEDE",
|
|
|
|
.desc = "CEDE",
|
|
|
|
.flags = CPUIDLE_FLAG_TIME_VALID,
|
cpuidle/powerpc: Fix snooze state problem in the cpuidle design on pseries.
Earlier without cpuidle framework on pseries, the native arch
idle routine comprised of both snooze and nap
states. smt_snooze_delay variable was used to delay
the idle process entry to deeper idle state like nap.
With the coming of cpuidle, this arch specific idle was replaced
by two different idle routines, one for supporting snooze and other
for nap. This enabled addition of more
low level idle states on pseries in the future.
On adopting the generic cpuidle framework for POWER systems,
the decision of which idle state to choose from, given a predicted
idle time is taken by the menu governor based on
target_residency and exit_latency of the idle states.
target_residency is the minimum time to be resident in that idle state.
Exit_latency is time taken to exit out of idle state.
Deeper the idle state, both the target residency and exit latency
would be higher.
In the current design, smt_snooze_delay is used as target_residency
for the snooze state which is incorrect, as it is not the
minimum but the maximum duration to be in snooze state.
This would result in the governor in taking bad decision,
as presently target_residency of nap < target_residency of snooze
inspite of nap being deeper idle state.
This patch aims to fix this problem by replacing the smt_snooze_delay loop
in snooze state, with the need_resched() as the governor is aware of
entry and exit of various idle transitions based on which
next idle time prediction.
The governor is intelligent enough to determine the idle state the needs to
be transitioned to and maintains a whole of heuristics including
io load, previous idle states predictions etc for the same, based on
which idle state entry decision is taken.
With this fix, of setting target_residency of snooze to 0
nap to smt_snooze_delay
if the predicted idle time is less
than smt_snooze_delay (target_residency of nap)
value governor would pick snooze state, else nap. This adhers to the
previous native idle design.
Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2012-10-04 02:42:26 +08:00
|
|
|
.exit_latency = 10,
|
|
|
|
.target_residency = 100,
|
2011-11-30 10:46:42 +08:00
|
|
|
.enter = &dedicated_cede_loop },
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* States for shared partition case.
|
|
|
|
*/
|
|
|
|
static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
|
|
|
|
{ /* Shared Cede */
|
|
|
|
.name = "Shared Cede",
|
|
|
|
.desc = "Shared Cede",
|
|
|
|
.flags = CPUIDLE_FLAG_TIME_VALID,
|
|
|
|
.exit_latency = 0,
|
|
|
|
.target_residency = 0,
|
|
|
|
.enter = &shared_cede_loop },
|
|
|
|
};
|
|
|
|
|
2012-10-04 02:42:18 +08:00
|
|
|
void update_smt_snooze_delay(int cpu, int residency)
|
|
|
|
{
|
|
|
|
struct cpuidle_driver *drv = cpuidle_get_driver();
|
|
|
|
struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
|
|
|
|
|
|
|
|
if (cpuidle_state_table != dedicated_states)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (residency < 0) {
|
|
|
|
/* Disable the Nap state on that cpu */
|
|
|
|
if (dev)
|
|
|
|
dev->states_usage[1].disable = 1;
|
|
|
|
} else
|
|
|
|
if (drv)
|
cpuidle/powerpc: Fix snooze state problem in the cpuidle design on pseries.
Earlier without cpuidle framework on pseries, the native arch
idle routine comprised of both snooze and nap
states. smt_snooze_delay variable was used to delay
the idle process entry to deeper idle state like nap.
With the coming of cpuidle, this arch specific idle was replaced
by two different idle routines, one for supporting snooze and other
for nap. This enabled addition of more
low level idle states on pseries in the future.
On adopting the generic cpuidle framework for POWER systems,
the decision of which idle state to choose from, given a predicted
idle time is taken by the menu governor based on
target_residency and exit_latency of the idle states.
target_residency is the minimum time to be resident in that idle state.
Exit_latency is time taken to exit out of idle state.
Deeper the idle state, both the target residency and exit latency
would be higher.
In the current design, smt_snooze_delay is used as target_residency
for the snooze state which is incorrect, as it is not the
minimum but the maximum duration to be in snooze state.
This would result in the governor in taking bad decision,
as presently target_residency of nap < target_residency of snooze
inspite of nap being deeper idle state.
This patch aims to fix this problem by replacing the smt_snooze_delay loop
in snooze state, with the need_resched() as the governor is aware of
entry and exit of various idle transitions based on which
next idle time prediction.
The governor is intelligent enough to determine the idle state the needs to
be transitioned to and maintains a whole of heuristics including
io load, previous idle states predictions etc for the same, based on
which idle state entry decision is taken.
With this fix, of setting target_residency of snooze to 0
nap to smt_snooze_delay
if the predicted idle time is less
than smt_snooze_delay (target_residency of nap)
value governor would pick snooze state, else nap. This adhers to the
previous native idle design.
Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2012-10-04 02:42:26 +08:00
|
|
|
drv->states[1].target_residency = residency;
|
2012-10-04 02:42:18 +08:00
|
|
|
}
|
|
|
|
|
2012-05-21 02:34:27 +08:00
|
|
|
static int pseries_cpuidle_add_cpu_notifier(struct notifier_block *n,
|
|
|
|
unsigned long action, void *hcpu)
|
2011-11-30 10:46:42 +08:00
|
|
|
{
|
2012-05-21 02:34:27 +08:00
|
|
|
int hotcpu = (unsigned long)hcpu;
|
2011-11-30 10:46:42 +08:00
|
|
|
struct cpuidle_device *dev =
|
2012-05-21 02:34:27 +08:00
|
|
|
per_cpu_ptr(pseries_cpuidle_devices, hotcpu);
|
|
|
|
|
2012-07-04 04:07:22 +08:00
|
|
|
if (dev && cpuidle_get_driver()) {
|
|
|
|
switch (action) {
|
|
|
|
case CPU_ONLINE:
|
|
|
|
case CPU_ONLINE_FROZEN:
|
|
|
|
cpuidle_pause_and_lock();
|
2012-05-21 02:34:27 +08:00
|
|
|
cpuidle_enable_device(dev);
|
2012-07-04 04:07:22 +08:00
|
|
|
cpuidle_resume_and_unlock();
|
|
|
|
break;
|
|
|
|
|
|
|
|
case CPU_DEAD:
|
|
|
|
case CPU_DEAD_FROZEN:
|
|
|
|
cpuidle_pause_and_lock();
|
|
|
|
cpuidle_disable_device(dev);
|
|
|
|
cpuidle_resume_and_unlock();
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
return NOTIFY_DONE;
|
2012-05-21 02:34:27 +08:00
|
|
|
}
|
2011-11-30 10:46:42 +08:00
|
|
|
}
|
2012-05-21 02:34:27 +08:00
|
|
|
return NOTIFY_OK;
|
2011-11-30 10:46:42 +08:00
|
|
|
}
|
|
|
|
|
2012-05-21 02:34:27 +08:00
|
|
|
static struct notifier_block setup_hotplug_notifier = {
|
|
|
|
.notifier_call = pseries_cpuidle_add_cpu_notifier,
|
|
|
|
};
|
|
|
|
|
2011-11-30 10:46:42 +08:00
|
|
|
/*
|
|
|
|
* pseries_cpuidle_driver_init()
|
|
|
|
*/
|
|
|
|
static int pseries_cpuidle_driver_init(void)
|
|
|
|
{
|
|
|
|
int idle_state;
|
|
|
|
struct cpuidle_driver *drv = &pseries_idle_driver;
|
|
|
|
|
|
|
|
drv->state_count = 0;
|
|
|
|
|
|
|
|
for (idle_state = 0; idle_state < MAX_IDLE_STATE_COUNT; ++idle_state) {
|
|
|
|
|
|
|
|
if (idle_state > max_idle_state)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* is the state not enabled? */
|
|
|
|
if (cpuidle_state_table[idle_state].enter == NULL)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
drv->states[drv->state_count] = /* structure copy */
|
|
|
|
cpuidle_state_table[idle_state];
|
|
|
|
|
|
|
|
drv->state_count += 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pseries_idle_devices_uninit(void)
|
|
|
|
* unregister cpuidle devices and de-allocate memory
|
|
|
|
*/
|
|
|
|
static void pseries_idle_devices_uninit(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct cpuidle_device *dev;
|
|
|
|
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
dev = per_cpu_ptr(pseries_cpuidle_devices, i);
|
|
|
|
cpuidle_unregister_device(dev);
|
|
|
|
}
|
|
|
|
|
|
|
|
free_percpu(pseries_cpuidle_devices);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pseries_idle_devices_init()
|
|
|
|
* allocate, initialize and register cpuidle device
|
|
|
|
*/
|
|
|
|
static int pseries_idle_devices_init(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct cpuidle_driver *drv = &pseries_idle_driver;
|
|
|
|
struct cpuidle_device *dev;
|
|
|
|
|
|
|
|
pseries_cpuidle_devices = alloc_percpu(struct cpuidle_device);
|
|
|
|
if (pseries_cpuidle_devices == NULL)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
dev = per_cpu_ptr(pseries_cpuidle_devices, i);
|
|
|
|
dev->state_count = drv->state_count;
|
|
|
|
dev->cpu = i;
|
|
|
|
if (cpuidle_register_device(dev)) {
|
|
|
|
printk(KERN_DEBUG \
|
|
|
|
"cpuidle_register_device %d failed!\n", i);
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pseries_idle_probe()
|
|
|
|
* Choose state table for shared versus dedicated partition
|
|
|
|
*/
|
|
|
|
static int pseries_idle_probe(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
|
|
|
|
return -ENODEV;
|
|
|
|
|
2011-11-30 10:47:03 +08:00
|
|
|
if (cpuidle_disable != IDLE_NO_OVERRIDE)
|
|
|
|
return -ENODEV;
|
|
|
|
|
2011-11-30 10:46:42 +08:00
|
|
|
if (max_idle_state == 0) {
|
|
|
|
printk(KERN_DEBUG "pseries processor idle disabled.\n");
|
|
|
|
return -EPERM;
|
|
|
|
}
|
|
|
|
|
2013-08-07 00:01:26 +08:00
|
|
|
if (lppaca_shared_proc(get_lppaca()))
|
2011-11-30 10:46:42 +08:00
|
|
|
cpuidle_state_table = shared_states;
|
|
|
|
else
|
|
|
|
cpuidle_state_table = dedicated_states;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * pseries_processor_idle_init() - module entry point.
 *
 * Probes the platform, builds and registers the cpuidle driver,
 * registers the per-cpu devices and finally hooks the CPU hotplug
 * notifier. If device setup fails, the devices and the driver are torn
 * down again before the error is returned.
 *
 * Returns 0 on success or the first error encountered.
 */
static int __init pseries_processor_idle_init(void)
{
	int err;

	err = pseries_idle_probe();
	if (err)
		return err;

	pseries_cpuidle_driver_init();

	err = cpuidle_register_driver(&pseries_idle_driver);
	if (err) {
		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
		return err;
	}

	err = pseries_idle_devices_init();
	if (err)
		goto undo_devices;

	register_cpu_notifier(&setup_hotplug_notifier);
	printk(KERN_DEBUG "pseries_idle_driver registered\n");

	return 0;

undo_devices:
	pseries_idle_devices_uninit();
	cpuidle_unregister_driver(&pseries_idle_driver);
	return err;
}
|
|
|
|
|
|
|
|
/*
 * pseries_processor_idle_exit() - module exit point.
 *
 * Tears down in the reverse order of pseries_processor_idle_init(): the
 * hotplug notifier goes first, before the per-cpu devices its callback
 * looks up are unregistered and freed, then the driver is unregistered.
 */
static void __exit pseries_processor_idle_exit(void)
{
	unregister_cpu_notifier(&setup_hotplug_notifier);
	pseries_idle_devices_uninit();
	cpuidle_unregister_driver(&pseries_idle_driver);
}
|
|
|
|
|
|
|
|
module_init(pseries_processor_idle_init);
|
|
|
|
module_exit(pseries_processor_idle_exit);
|
|
|
|
|
|
|
|
MODULE_AUTHOR("Deepthi Dharwar <deepthi@linux.vnet.ibm.com>");
|
|
|
|
MODULE_DESCRIPTION("Cpuidle driver for POWER");
|
|
|
|
MODULE_LICENSE("GPL");
|