linux/drivers/cpufreq/intel_pstate.c

1512 lines
36 KiB
C
Raw Normal View History

/*
* intel_pstate.c: Native P state management for Intel processors
*
* (C) Copyright 2012 Intel Corporation
* Author: Dirk Brandewie <dirk.j.brandewie@intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
#include <linux/acpi.h>
x86/mm: Decouple <linux/vmalloc.h> from <asm/io.h> Nothing in <asm/io.h> uses anything from <linux/vmalloc.h>, so remove it from there and fix up the resulting build problems triggered on x86 {64|32}-bit {def|allmod|allno}configs. The breakages were triggering in places where x86 builds relied on vmalloc() facilities but did not include <linux/vmalloc.h> explicitly and relied on the implicit inclusion via <asm/io.h>. Also add: - <linux/init.h> to <linux/io.h> - <asm/pgtable_types> to <asm/io.h> ... which were two other implicit header file dependencies. Suggested-by: David Miller <davem@davemloft.net> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> [ Tidied up the changelog. ] Acked-by: David Miller <davem@davemloft.net> Acked-by: Takashi Iwai <tiwai@suse.de> Acked-by: Viresh Kumar <viresh.kumar@linaro.org> Acked-by: Vinod Koul <vinod.koul@intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Anton Vorontsov <anton@enomsg.org> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> Cc: Colin Cross <ccross@android.com> Cc: David Vrabel <david.vrabel@citrix.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Haiyang Zhang <haiyangz@microsoft.com> Cc: James E.J. Bottomley <JBottomley@odin.com> Cc: Jaroslav Kysela <perex@perex.cz> Cc: K. Y. Srinivasan <kys@microsoft.com> Cc: Kees Cook <keescook@chromium.org> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Kristen Carlson Accardi <kristen@linux.intel.com> Cc: Len Brown <lenb@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rafael J. Wysocki <rjw@rjwysocki.net> Cc: Suma Ramars <sramars@cisco.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tony Luck <tony.luck@intel.com> Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-06-02 17:01:38 +08:00
#include <linux/vmalloc.h>
#include <trace/events/power.h>
#include <asm/div64.h>
#include <asm/msr.h>
#include <asm/cpu_device_id.h>
#include <asm/cpufeature.h>
#define ATOM_RATIOS 0x66a
#define ATOM_VIDS 0x66b
#define ATOM_TURBO_RATIOS 0x66c
#define ATOM_TURBO_VIDS 0x66d
#define FRAC_BITS 8
#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
#define fp_toint(X) ((X) >> FRAC_BITS)
static inline int32_t mul_fp(int32_t x, int32_t y)
{
return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
}
intel_pstate: Fix overflow in busy_scaled due to long delay The kernel may delay interrupts for a long time which can result in timers being delayed. If this occurs the intel_pstate driver will crash with a divide by zero error: divide error: 0000 [#1] SMP Modules linked in: btrfs zlib_deflate raid6_pq xor msdos ext4 mbcache jbd2 binfmt_misc arc4 md4 nls_utf8 cifs dns_resolver tcp_lp bnep bluetooth rfkill fuse dm_service_time iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi nf_conntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_ftp ip6t_rpfilter ip6t_REJECT ipt_REJECT xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables intel_powerclamp coretemp vfat fat kvm_intel iTCO_wdt iTCO_vendor_support ipmi_devintf sr_mod kvm crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel cdc_ether lrw usbnet cdrom mii gf128mul glue_helper ablk_helper cryptd lpc_ich mfd_core pcspkr sb_edac edac_core ipmi_si ipmi_msghandler ioatdma wmi shpchp acpi_pad nfsd auth_rpcgss nfs_acl lockd uinput dm_multipath sunrpc xfs libcrc32c usb_storage sd_mod crc_t10dif crct10dif_common ixgbe mgag200 syscopyarea sysfillrect sysimgblt mdio drm_kms_helper ttm igb drm ptp pps_core dca i2c_algo_bit megaraid_sas i2c_core dm_mirror dm_region_hash dm_log dm_mod CPU: 113 PID: 0 Comm: swapper/113 Tainted: G W -------------- 3.10.0-229.1.2.el7.x86_64 #1 Hardware name: IBM x3950 X6 -[3837AC2]-/00FN827, BIOS -[A8E112BUS-1.00]- 08/27/2014 task: ffff880fe8abe660 ti: ffff880fe8ae4000 task.ti: ffff880fe8ae4000 RIP: 0010:[<ffffffff814a9279>] [<ffffffff814a9279>] intel_pstate_timer_func+0x179/0x3d0 RSP: 0018:ffff883fff4e3db8 EFLAGS: 00010206 RAX: 0000000027100000 RBX: ffff883fe6965100 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000010 RDI: 000000002e53632d RBP: ffff883fff4e3e20 R08: 000e6f69a5a125c0 R09: ffff883fe84ec001 R10: 0000000000000002 R11: 0000000000000005 R12: 00000000000049f5 R13: 0000000000271000 R14: 00000000000049f5 R15: 0000000000000246 FS: 0000000000000000(0000) GS:ffff883fff4e0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7668601000 CR3: 000000000190a000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff883fff4e3e58 ffffffff81099dc1 0000000000000086 0000000000000071 ffff883fff4f3680 0000000000000071 fbdc8a965e33afee ffffffff810b69dd ffff883fe84ec000 ffff883fe6965108 0000000000000100 ffffffff814a9100 Call Trace: <IRQ> [<ffffffff81099dc1>] ? run_posix_cpu_timers+0x51/0x840 [<ffffffff810b69dd>] ? trigger_load_balance+0x5d/0x200 [<ffffffff814a9100>] ? pid_param_set+0x130/0x130 [<ffffffff8107df56>] call_timer_fn+0x36/0x110 [<ffffffff814a9100>] ? pid_param_set+0x130/0x130 [<ffffffff8107fdcf>] run_timer_softirq+0x21f/0x320 [<ffffffff81077b2f>] __do_softirq+0xef/0x280 [<ffffffff816156dc>] call_softirq+0x1c/0x30 [<ffffffff81015d95>] do_softirq+0x65/0xa0 [<ffffffff81077ec5>] irq_exit+0x115/0x120 [<ffffffff81616355>] smp_apic_timer_interrupt+0x45/0x60 [<ffffffff81614a1d>] apic_timer_interrupt+0x6d/0x80 <EOI> [<ffffffff814a9c32>] ? cpuidle_enter_state+0x52/0xc0 [<ffffffff814a9c28>] ? cpuidle_enter_state+0x48/0xc0 [<ffffffff814a9d65>] cpuidle_idle_call+0xc5/0x200 [<ffffffff8101d14e>] arch_cpu_idle+0xe/0x30 [<ffffffff810c67c1>] cpu_startup_entry+0xf1/0x290 [<ffffffff8104228a>] start_secondary+0x1ba/0x230 Code: 42 0f 00 45 89 e6 48 01 c2 43 8d 44 6d 00 39 d0 73 26 49 c1 e5 08 89 d2 4d 63 f4 49 63 c5 48 c1 e2 08 48 c1 e0 08 48 63 ca 48 99 <48> f7 f9 48 98 4c 0f af f0 49 c1 ee 08 8b 43 78 c1 e0 08 44 29 RIP [<ffffffff814a9279>] intel_pstate_timer_func+0x179/0x3d0 RSP <ffff883fff4e3db8> The kernel values for cpudata for CPU 113 were: struct cpudata { cpu = 113, timer = { entry = { next = 0x0, prev = 0xdead000000200200 }, expires = 8357799745, base = 0xffff883fe84ec001, function = 0xffffffff814a9100 <intel_pstate_timer_func>, data = 18446612406765768960, <snip> i_gain = 0, d_gain = 0, deadband = 0, last_err = 22489 }, last_sample_time = { tv64 = 4063132438017305 }, prev_aperf = 287326796397463, prev_mperf = 251427432090198, sample = { core_pct_busy = 23081, aperf = 2937407, mperf = 3257884, freq = 2524484, time = { tv64 = 4063149215234118 } } } which results in the time between samples = last_sample_time - sample.time = 4063149215234118 - 4063132438017305 = 16777216813 which is 16.777 seconds. The duration between reads of the APERF and MPERF registers overflowed a s32 sized integer in intel_pstate_get_scaled_busy()'s call to div_fp(). The result is that int_tofp(duration_us) == 0, and the kernel attempts to divide by 0. While the kernel shouldn't be delaying for a long time, it can and does happen and the intel_pstate driver should not panic in this situation. This patch changes the div_fp() function to use div64_s64() to allow for "long" division. This will avoid the overflow condition on long delays. [v2]: use div64_s64() in div_fp() Signed-off-by: Prarit Bhargava <prarit@redhat.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-16 01:43:29 +08:00
static inline int32_t div_fp(s64 x, s64 y)
{
intel_pstate: Fix overflow in busy_scaled due to long delay The kernel may delay interrupts for a long time which can result in timers being delayed. If this occurs the intel_pstate driver will crash with a divide by zero error: divide error: 0000 [#1] SMP Modules linked in: btrfs zlib_deflate raid6_pq xor msdos ext4 mbcache jbd2 binfmt_misc arc4 md4 nls_utf8 cifs dns_resolver tcp_lp bnep bluetooth rfkill fuse dm_service_time iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi nf_conntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_ftp ip6t_rpfilter ip6t_REJECT ipt_REJECT xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables intel_powerclamp coretemp vfat fat kvm_intel iTCO_wdt iTCO_vendor_support ipmi_devintf sr_mod kvm crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel cdc_ether lrw usbnet cdrom mii gf128mul glue_helper ablk_helper cryptd lpc_ich mfd_core pcspkr sb_edac edac_core ipmi_si ipmi_msghandler ioatdma wmi shpchp acpi_pad nfsd auth_rpcgss nfs_acl lockd uinput dm_multipath sunrpc xfs libcrc32c usb_storage sd_mod crc_t10dif crct10dif_common ixgbe mgag200 syscopyarea sysfillrect sysimgblt mdio drm_kms_helper ttm igb drm ptp pps_core dca i2c_algo_bit megaraid_sas i2c_core dm_mirror dm_region_hash dm_log dm_mod CPU: 113 PID: 0 Comm: swapper/113 Tainted: G W -------------- 3.10.0-229.1.2.el7.x86_64 #1 Hardware name: IBM x3950 X6 -[3837AC2]-/00FN827, BIOS -[A8E112BUS-1.00]- 08/27/2014 task: ffff880fe8abe660 ti: ffff880fe8ae4000 task.ti: ffff880fe8ae4000 RIP: 0010:[<ffffffff814a9279>] [<ffffffff814a9279>] intel_pstate_timer_func+0x179/0x3d0 RSP: 0018:ffff883fff4e3db8 EFLAGS: 00010206 RAX: 0000000027100000 RBX: ffff883fe6965100 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000010 RDI: 000000002e53632d RBP: ffff883fff4e3e20 R08: 000e6f69a5a125c0 R09: ffff883fe84ec001 R10: 0000000000000002 R11: 0000000000000005 R12: 00000000000049f5 R13: 0000000000271000 R14: 00000000000049f5 R15: 0000000000000246 FS: 0000000000000000(0000) GS:ffff883fff4e0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7668601000 CR3: 000000000190a000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff883fff4e3e58 ffffffff81099dc1 0000000000000086 0000000000000071 ffff883fff4f3680 0000000000000071 fbdc8a965e33afee ffffffff810b69dd ffff883fe84ec000 ffff883fe6965108 0000000000000100 ffffffff814a9100 Call Trace: <IRQ> [<ffffffff81099dc1>] ? run_posix_cpu_timers+0x51/0x840 [<ffffffff810b69dd>] ? trigger_load_balance+0x5d/0x200 [<ffffffff814a9100>] ? pid_param_set+0x130/0x130 [<ffffffff8107df56>] call_timer_fn+0x36/0x110 [<ffffffff814a9100>] ? pid_param_set+0x130/0x130 [<ffffffff8107fdcf>] run_timer_softirq+0x21f/0x320 [<ffffffff81077b2f>] __do_softirq+0xef/0x280 [<ffffffff816156dc>] call_softirq+0x1c/0x30 [<ffffffff81015d95>] do_softirq+0x65/0xa0 [<ffffffff81077ec5>] irq_exit+0x115/0x120 [<ffffffff81616355>] smp_apic_timer_interrupt+0x45/0x60 [<ffffffff81614a1d>] apic_timer_interrupt+0x6d/0x80 <EOI> [<ffffffff814a9c32>] ? cpuidle_enter_state+0x52/0xc0 [<ffffffff814a9c28>] ? cpuidle_enter_state+0x48/0xc0 [<ffffffff814a9d65>] cpuidle_idle_call+0xc5/0x200 [<ffffffff8101d14e>] arch_cpu_idle+0xe/0x30 [<ffffffff810c67c1>] cpu_startup_entry+0xf1/0x290 [<ffffffff8104228a>] start_secondary+0x1ba/0x230 Code: 42 0f 00 45 89 e6 48 01 c2 43 8d 44 6d 00 39 d0 73 26 49 c1 e5 08 89 d2 4d 63 f4 49 63 c5 48 c1 e2 08 48 c1 e0 08 48 63 ca 48 99 <48> f7 f9 48 98 4c 0f af f0 49 c1 ee 08 8b 43 78 c1 e0 08 44 29 RIP [<ffffffff814a9279>] intel_pstate_timer_func+0x179/0x3d0 RSP <ffff883fff4e3db8> The kernel values for cpudata for CPU 113 were: struct cpudata { cpu = 113, timer = { entry = { next = 0x0, prev = 0xdead000000200200 }, expires = 8357799745, base = 0xffff883fe84ec001, function = 0xffffffff814a9100 <intel_pstate_timer_func>, data = 18446612406765768960, <snip> i_gain = 0, d_gain = 0, deadband = 0, last_err = 22489 }, last_sample_time = { tv64 = 4063132438017305 }, prev_aperf = 287326796397463, prev_mperf = 251427432090198, sample = { core_pct_busy = 23081, aperf = 2937407, mperf = 3257884, freq = 2524484, time = { tv64 = 4063149215234118 } } } which results in the time between samples = last_sample_time - sample.time = 4063149215234118 - 4063132438017305 = 16777216813 which is 16.777 seconds. The duration between reads of the APERF and MPERF registers overflowed a s32 sized integer in intel_pstate_get_scaled_busy()'s call to div_fp(). The result is that int_tofp(duration_us) == 0, and the kernel attempts to divide by 0. While the kernel shouldn't be delaying for a long time, it can and does happen and the intel_pstate driver should not panic in this situation. This patch changes the div_fp() function to use div64_s64() to allow for "long" division. This will avoid the overflow condition on long delays. [v2]: use div64_s64() in div_fp() Signed-off-by: Prarit Bhargava <prarit@redhat.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-06-16 01:43:29 +08:00
return div64_s64((int64_t)x << FRAC_BITS, y);
}
static inline int ceiling_fp(int32_t x)
{
int mask, ret;
ret = fp_toint(x);
mask = (1 << FRAC_BITS) - 1;
if (x & mask)
ret += 1;
return ret;
}
struct sample {
int32_t core_pct_busy;
int32_t busy_scaled;
u64 aperf;
u64 mperf;
u64 tsc;
int freq;
u64 time;
};
struct pstate_data {
int current_pstate;
int min_pstate;
int max_pstate;
int max_pstate_physical;
int scaling;
int turbo_pstate;
};
struct vid_data {
int min;
int max;
int turbo;
int32_t ratio;
};
struct _pid {
int setpoint;
int32_t integral;
int32_t p_gain;
int32_t i_gain;
int32_t d_gain;
int deadband;
int32_t last_err;
};
struct cpudata {
int cpu;
struct update_util_data update_util;
struct pstate_data pstate;
struct vid_data vid;
struct _pid pid;
u64 last_sample_time;
u64 prev_aperf;
u64 prev_mperf;
u64 prev_tsc;
u64 prev_cummulative_iowait;
struct sample sample;
};
static struct cpudata **all_cpu_data;
struct pstate_adjust_policy {
int sample_rate_ms;
s64 sample_rate_ns;
int deadband;
int setpoint;
int p_gain_pct;
int d_gain_pct;
int i_gain_pct;
};
struct pstate_funcs {
int (*get_max)(void);
int (*get_max_physical)(void);
int (*get_min)(void);
int (*get_turbo)(void);
int (*get_scaling)(void);
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
u64 (*get_val)(struct cpudata*, int pstate);
void (*get_vid)(struct cpudata *);
int32_t (*get_target_pstate)(struct cpudata *);
};
struct cpu_defaults {
struct pstate_adjust_policy pid_policy;
struct pstate_funcs funcs;
};
static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu);
static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
static struct pstate_adjust_policy pid_params;
static struct pstate_funcs pstate_funcs;
static int hwp_active;
struct perf_limits {
int no_turbo;
int turbo_disabled;
int max_perf_pct;
int min_perf_pct;
int32_t max_perf;
int32_t min_perf;
int max_policy_pct;
int max_sysfs_pct;
int min_policy_pct;
int min_sysfs_pct;
};
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
static struct perf_limits performance_limits = {
.no_turbo = 0,
.turbo_disabled = 0,
.max_perf_pct = 100,
.max_perf = int_tofp(1),
.min_perf_pct = 100,
.min_perf = int_tofp(1),
.max_policy_pct = 100,
.max_sysfs_pct = 100,
.min_policy_pct = 0,
.min_sysfs_pct = 0,
};
static struct perf_limits powersave_limits = {
.no_turbo = 0,
.turbo_disabled = 0,
.max_perf_pct = 100,
.max_perf = int_tofp(1),
.min_perf_pct = 0,
.min_perf = 0,
.max_policy_pct = 100,
.max_sysfs_pct = 100,
.min_policy_pct = 0,
.min_sysfs_pct = 0,
};
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE
static struct perf_limits *limits = &performance_limits;
#else
static struct perf_limits *limits = &powersave_limits;
#endif
static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
int deadband, int integral) {
pid->setpoint = int_tofp(setpoint);
pid->deadband = int_tofp(deadband);
pid->integral = int_tofp(integral);
pid->last_err = int_tofp(setpoint) - int_tofp(busy);
}
static inline void pid_p_gain_set(struct _pid *pid, int percent)
{
pid->p_gain = div_fp(int_tofp(percent), int_tofp(100));
}
static inline void pid_i_gain_set(struct _pid *pid, int percent)
{
pid->i_gain = div_fp(int_tofp(percent), int_tofp(100));
}
static inline void pid_d_gain_set(struct _pid *pid, int percent)
{
pid->d_gain = div_fp(int_tofp(percent), int_tofp(100));
}
static signed int pid_calc(struct _pid *pid, int32_t busy)
{
signed int result;
int32_t pterm, dterm, fp_error;
int32_t integral_limit;
fp_error = pid->setpoint - busy;
if (abs(fp_error) <= pid->deadband)
return 0;
pterm = mul_fp(pid->p_gain, fp_error);
pid->integral += fp_error;
/*
* We limit the integral here so that it will never
* get higher than 30. This prevents it from becoming
* too large an input over long periods of time and allows
* it to get factored out sooner.
*
* The value of 30 was chosen through experimentation.
*/
integral_limit = int_tofp(30);
if (pid->integral > integral_limit)
pid->integral = integral_limit;
if (pid->integral < -integral_limit)
pid->integral = -integral_limit;
dterm = mul_fp(pid->d_gain, fp_error - pid->last_err);
pid->last_err = fp_error;
result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm;
result = result + (1 << (FRAC_BITS-1));
return (signed int)fp_toint(result);
}
static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu)
{
pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct);
pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct);
pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct);
pid_reset(&cpu->pid, pid_params.setpoint, 100, pid_params.deadband, 0);
}
static inline void intel_pstate_reset_all_pid(void)
{
unsigned int cpu;
for_each_online_cpu(cpu) {
if (all_cpu_data[cpu])
intel_pstate_busy_pid_reset(all_cpu_data[cpu]);
}
}
static inline void update_turbo_state(void)
{
u64 misc_en;
struct cpudata *cpu;
cpu = all_cpu_data[0];
rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
limits->turbo_disabled =
(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
}
intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory. While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was: intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem); pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 12:57:46 +08:00
static void intel_pstate_hwp_set(const struct cpumask *cpumask)
{
int min, hw_min, max, hw_max, cpu, range, adj_range;
u64 value, cap;
rdmsrl(MSR_HWP_CAPABILITIES, cap);
hw_min = HWP_LOWEST_PERF(cap);
hw_max = HWP_HIGHEST_PERF(cap);
range = hw_max - hw_min;
intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory. While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was: intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem); pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 12:57:46 +08:00
for_each_cpu(cpu, cpumask) {
rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
adj_range = limits->min_perf_pct * range / 100;
min = hw_min + adj_range;
value &= ~HWP_MIN_PERF(~0L);
value |= HWP_MIN_PERF(min);
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
adj_range = limits->max_perf_pct * range / 100;
max = hw_min + adj_range;
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
if (limits->no_turbo) {
hw_max = HWP_GUARANTEED_PERF(cap);
if (hw_max < max)
max = hw_max;
}
value &= ~HWP_MAX_PERF(~0L);
value |= HWP_MAX_PERF(max);
wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
}
intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory. While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was: intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem); pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 12:57:46 +08:00
}
intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory. While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was: intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem); pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 12:57:46 +08:00
static void intel_pstate_hwp_set_online_cpus(void)
{
get_online_cpus();
intel_pstate_hwp_set(cpu_online_mask);
put_online_cpus();
}
/************************** debugfs begin ************************/
static int pid_param_set(void *data, u64 val)
{
*(u32 *)data = val;
intel_pstate_reset_all_pid();
return 0;
}
static int pid_param_get(void *data, u64 *val)
{
*val = *(u32 *)data;
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, pid_param_set, "%llu\n");
struct pid_param {
char *name;
void *value;
};
static struct pid_param pid_files[] = {
{"sample_rate_ms", &pid_params.sample_rate_ms},
{"d_gain_pct", &pid_params.d_gain_pct},
{"i_gain_pct", &pid_params.i_gain_pct},
{"deadband", &pid_params.deadband},
{"setpoint", &pid_params.setpoint},
{"p_gain_pct", &pid_params.p_gain_pct},
{NULL, NULL}
};
static void __init intel_pstate_debug_expose_params(void)
{
struct dentry *debugfs_parent;
int i = 0;
if (hwp_active)
return;
debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
if (IS_ERR_OR_NULL(debugfs_parent))
return;
while (pid_files[i].name) {
debugfs_create_file(pid_files[i].name, 0660,
debugfs_parent, pid_files[i].value,
&fops_pid_param);
i++;
}
}
/************************** debugfs end ************************/
/************************** sysfs begin ************************/
#define show_one(file_name, object) \
static ssize_t show_##file_name \
(struct kobject *kobj, struct attribute *attr, char *buf) \
{ \
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
return sprintf(buf, "%u\n", limits->object); \
}
static ssize_t show_turbo_pct(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct cpudata *cpu;
int total, no_turbo, turbo_pct;
uint32_t turbo_fp;
cpu = all_cpu_data[0];
total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
turbo_fp = div_fp(int_tofp(no_turbo), int_tofp(total));
turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
return sprintf(buf, "%u\n", turbo_pct);
}
static ssize_t show_num_pstates(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct cpudata *cpu;
int total;
cpu = all_cpu_data[0];
total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
return sprintf(buf, "%u\n", total);
}
static ssize_t show_no_turbo(struct kobject *kobj,
struct attribute *attr, char *buf)
{
ssize_t ret;
update_turbo_state();
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
if (limits->turbo_disabled)
ret = sprintf(buf, "%u\n", limits->turbo_disabled);
else
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
ret = sprintf(buf, "%u\n", limits->no_turbo);
return ret;
}
static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
const char *buf, size_t count)
{
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
if (ret != 1)
return -EINVAL;
update_turbo_state();
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
if (limits->turbo_disabled) {
pr_warn("intel_pstate: Turbo disabled by BIOS or unavailable on processor\n");
return -EPERM;
}
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
limits->no_turbo = clamp_t(int, input, 0, 1);
if (hwp_active)
intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory. While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was: intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem); pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 12:57:46 +08:00
intel_pstate_hwp_set_online_cpus();
return count;
}
static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
const char *buf, size_t count)
{
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
if (ret != 1)
return -EINVAL;
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
limits->max_sysfs_pct = clamp_t(int, input, 0 , 100);
limits->max_perf_pct = min(limits->max_policy_pct,
limits->max_sysfs_pct);
limits->max_perf_pct = max(limits->min_policy_pct,
limits->max_perf_pct);
limits->max_perf_pct = max(limits->min_perf_pct,
limits->max_perf_pct);
limits->max_perf = div_fp(int_tofp(limits->max_perf_pct),
int_tofp(100));
if (hwp_active)
intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory. While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was: intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem); pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 12:57:46 +08:00
intel_pstate_hwp_set_online_cpus();
return count;
}
static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
const char *buf, size_t count)
{
unsigned int input;
int ret;
ret = sscanf(buf, "%u", &input);
if (ret != 1)
return -EINVAL;
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
limits->min_sysfs_pct = clamp_t(int, input, 0 , 100);
limits->min_perf_pct = max(limits->min_policy_pct,
limits->min_sysfs_pct);
limits->min_perf_pct = min(limits->max_policy_pct,
limits->min_perf_pct);
limits->min_perf_pct = min(limits->max_perf_pct,
limits->min_perf_pct);
limits->min_perf = div_fp(int_tofp(limits->min_perf_pct),
int_tofp(100));
if (hwp_active)
intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory. While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was: intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem); pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 12:57:46 +08:00
intel_pstate_hwp_set_online_cpus();
return count;
}
show_one(max_perf_pct, max_perf_pct);
show_one(min_perf_pct, min_perf_pct);
define_one_global_rw(no_turbo);
define_one_global_rw(max_perf_pct);
define_one_global_rw(min_perf_pct);
define_one_global_ro(turbo_pct);
define_one_global_ro(num_pstates);
static struct attribute *intel_pstate_attributes[] = {
&no_turbo.attr,
&max_perf_pct.attr,
&min_perf_pct.attr,
&turbo_pct.attr,
&num_pstates.attr,
NULL
};
static struct attribute_group intel_pstate_attr_group = {
.attrs = intel_pstate_attributes,
};
static void __init intel_pstate_sysfs_expose_params(void)
{
struct kobject *intel_pstate_kobject;
int rc;
intel_pstate_kobject = kobject_create_and_add("intel_pstate",
&cpu_subsys.dev_root->kobj);
BUG_ON(!intel_pstate_kobject);
rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
BUG_ON(rc);
}
/************************** sysfs end ************************/
static void intel_pstate_hwp_enable(struct cpudata *cpudata)
{
/* First disable HWP notification interrupt as we don't process them */
wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
}
static int atom_get_min_pstate(void)
{
u64 value;
rdmsrl(ATOM_RATIOS, value);
return (value >> 8) & 0x7F;
}
static int atom_get_max_pstate(void)
{
u64 value;
rdmsrl(ATOM_RATIOS, value);
return (value >> 16) & 0x7F;
}
static int atom_get_turbo_pstate(void)
{
u64 value;
rdmsrl(ATOM_TURBO_RATIOS, value);
return value & 0x7F;
}
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
static u64 atom_get_val(struct cpudata *cpudata, int pstate)
{
u64 val;
int32_t vid_fp;
u32 vid;
val = (u64)pstate << 8;
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
if (limits->no_turbo && !limits->turbo_disabled)
val |= (u64)1 << 32;
vid_fp = cpudata->vid.min + mul_fp(
int_tofp(pstate - cpudata->pstate.min_pstate),
cpudata->vid.ratio);
vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
vid = ceiling_fp(vid_fp);
if (pstate > cpudata->pstate.max_pstate)
vid = cpudata->vid.turbo;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
return val | vid;
}
static int silvermont_get_scaling(void)
{
u64 value;
int i;
/* Defined in Table 35-6 from SDM (Sept 2015) */
static int silvermont_freq_table[] = {
83300, 100000, 133300, 116700, 80000};
rdmsrl(MSR_FSB_FREQ, value);
i = value & 0x7;
WARN_ON(i > 4);
return silvermont_freq_table[i];
}
static int airmont_get_scaling(void)
{
u64 value;
int i;
/* Defined in Table 35-10 from SDM (Sept 2015) */
static int airmont_freq_table[] = {
83300, 100000, 133300, 116700, 80000,
93300, 90000, 88900, 87500};
rdmsrl(MSR_FSB_FREQ, value);
i = value & 0xF;
WARN_ON(i > 8);
return airmont_freq_table[i];
}
static void atom_get_vid(struct cpudata *cpudata)
{
u64 value;
rdmsrl(ATOM_VIDS, value);
cpudata->vid.min = int_tofp((value >> 8) & 0x7f);
cpudata->vid.max = int_tofp((value >> 16) & 0x7f);
cpudata->vid.ratio = div_fp(
cpudata->vid.max - cpudata->vid.min,
int_tofp(cpudata->pstate.max_pstate -
cpudata->pstate.min_pstate));
rdmsrl(ATOM_TURBO_VIDS, value);
cpudata->vid.turbo = value & 0x7f;
}
static int core_get_min_pstate(void)
{
u64 value;
rdmsrl(MSR_PLATFORM_INFO, value);
return (value >> 40) & 0xFF;
}
static int core_get_max_pstate_physical(void)
{
u64 value;
rdmsrl(MSR_PLATFORM_INFO, value);
return (value >> 8) & 0xFF;
}
static int core_get_max_pstate(void)
{
u64 tar;
u64 plat_info;
int max_pstate;
int err;
rdmsrl(MSR_PLATFORM_INFO, plat_info);
max_pstate = (plat_info >> 8) & 0xFF;
err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar);
if (!err) {
/* Do some sanity checking for safety */
if (plat_info & 0x600000000) {
u64 tdp_ctrl;
u64 tdp_ratio;
int tdp_msr;
err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl);
if (err)
goto skip_tar;
tdp_msr = MSR_CONFIG_TDP_NOMINAL + tdp_ctrl;
err = rdmsrl_safe(tdp_msr, &tdp_ratio);
if (err)
goto skip_tar;
if (tdp_ratio - 1 == tar) {
max_pstate = tar;
pr_debug("max_pstate=TAC %x\n", max_pstate);
} else {
goto skip_tar;
}
}
}
skip_tar:
return max_pstate;
}
static int core_get_turbo_pstate(void)
{
u64 value;
int nont, ret;
rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value);
nont = core_get_max_pstate();
ret = (value) & 255;
if (ret <= nont)
ret = nont;
return ret;
}
static inline int core_get_scaling(void)
{
return 100000;
}
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
static u64 core_get_val(struct cpudata *cpudata, int pstate)
{
u64 val;
val = (u64)pstate << 8;
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
if (limits->no_turbo && !limits->turbo_disabled)
val |= (u64)1 << 32;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
return val;
}
static int knl_get_turbo_pstate(void)
{
u64 value;
int nont, ret;
rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value);
nont = core_get_max_pstate();
ret = (((value) >> 8) & 0xFF);
if (ret <= nont)
ret = nont;
return ret;
}
static struct cpu_defaults core_params = {
.pid_policy = {
.sample_rate_ms = 10,
.deadband = 0,
.setpoint = 97,
.p_gain_pct = 20,
.d_gain_pct = 0,
.i_gain_pct = 0,
},
.funcs = {
.get_max = core_get_max_pstate,
.get_max_physical = core_get_max_pstate_physical,
.get_min = core_get_min_pstate,
.get_turbo = core_get_turbo_pstate,
.get_scaling = core_get_scaling,
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
.get_val = core_get_val,
.get_target_pstate = get_target_pstate_use_performance,
},
};
static struct cpu_defaults silvermont_params = {
.pid_policy = {
.sample_rate_ms = 10,
.deadband = 0,
.setpoint = 60,
.p_gain_pct = 14,
.d_gain_pct = 0,
.i_gain_pct = 4,
},
.funcs = {
.get_max = atom_get_max_pstate,
.get_max_physical = atom_get_max_pstate,
.get_min = atom_get_min_pstate,
.get_turbo = atom_get_turbo_pstate,
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
.get_val = atom_get_val,
.get_scaling = silvermont_get_scaling,
.get_vid = atom_get_vid,
.get_target_pstate = get_target_pstate_use_cpu_load,
},
};
static struct cpu_defaults airmont_params = {
.pid_policy = {
.sample_rate_ms = 10,
.deadband = 0,
.setpoint = 60,
.p_gain_pct = 14,
.d_gain_pct = 0,
.i_gain_pct = 4,
},
.funcs = {
.get_max = atom_get_max_pstate,
.get_max_physical = atom_get_max_pstate,
.get_min = atom_get_min_pstate,
.get_turbo = atom_get_turbo_pstate,
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
.get_val = atom_get_val,
.get_scaling = airmont_get_scaling,
.get_vid = atom_get_vid,
.get_target_pstate = get_target_pstate_use_cpu_load,
},
};
static struct cpu_defaults knl_params = {
.pid_policy = {
.sample_rate_ms = 10,
.deadband = 0,
.setpoint = 97,
.p_gain_pct = 20,
.d_gain_pct = 0,
.i_gain_pct = 0,
},
.funcs = {
.get_max = core_get_max_pstate,
.get_max_physical = core_get_max_pstate_physical,
.get_min = core_get_min_pstate,
.get_turbo = knl_get_turbo_pstate,
.get_scaling = core_get_scaling,
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
.get_val = core_get_val,
.get_target_pstate = get_target_pstate_use_performance,
},
};
static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
{
int max_perf = cpu->pstate.turbo_pstate;
int max_perf_adj;
int min_perf;
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
if (limits->no_turbo || limits->turbo_disabled)
max_perf = cpu->pstate.max_pstate;
/*
* performance can be limited by user through sysfs, by cpufreq
* policy, or by cpu specific default values determined through
* experimentation.
*/
max_perf_adj = fp_toint(max_perf * limits->max_perf);
*max = clamp_t(int, max_perf_adj,
cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
min_perf = fp_toint(max_perf * limits->min_perf);
*min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
}
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
static inline void intel_pstate_record_pstate(struct cpudata *cpu, int pstate)
{
trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
cpu->pstate.current_pstate = pstate;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
}
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
static void intel_pstate_set_min_pstate(struct cpudata *cpu)
{
int pstate = cpu->pstate.min_pstate;
intel_pstate_record_pstate(cpu, pstate);
/*
* Generally, there is no guarantee that this code will always run on
* the CPU being updated, so force the register update to run on the
* right CPU.
*/
wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL,
pstate_funcs.get_val(cpu, pstate));
}
static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
{
cpu->pstate.min_pstate = pstate_funcs.get_min();
cpu->pstate.max_pstate = pstate_funcs.get_max();
cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
cpu->pstate.scaling = pstate_funcs.get_scaling();
if (pstate_funcs.get_vid)
pstate_funcs.get_vid(cpu);
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
intel_pstate_set_min_pstate(cpu);
}
static inline void intel_pstate_calc_busy(struct cpudata *cpu)
{
struct sample *sample = &cpu->sample;
int64_t core_pct;
core_pct = int_tofp(sample->aperf) * int_tofp(100);
core_pct = div64_u64(core_pct, int_tofp(sample->mperf));
sample->core_pct_busy = (int32_t)core_pct;
}
static inline bool intel_pstate_sample(struct cpudata *cpu, u64 time)
{
u64 aperf, mperf;
unsigned long flags;
u64 tsc;
local_irq_save(flags);
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
tsc = rdtsc();
if (cpu->prev_mperf == mperf || cpu->prev_tsc == tsc) {
local_irq_restore(flags);
return false;
}
local_irq_restore(flags);
cpu->last_sample_time = cpu->sample.time;
cpu->sample.time = time;
cpu->sample.aperf = aperf;
cpu->sample.mperf = mperf;
cpu->sample.tsc = tsc;
cpu->sample.aperf -= cpu->prev_aperf;
cpu->sample.mperf -= cpu->prev_mperf;
cpu->sample.tsc -= cpu->prev_tsc;
cpu->prev_aperf = aperf;
cpu->prev_mperf = mperf;
cpu->prev_tsc = tsc;
intel_pstate: Avoid extra invocation of intel_pstate_sample() The initialization of intel_pstate for a given CPU involves populating the fields of its struct cpudata that represent the previous sample, but currently that is done in a problematic way. Namely, intel_pstate_init_cpu() makes an extra call to intel_pstate_sample() so it reads the current register values that will be used to populate the "previous sample" record during the next invocation of intel_pstate_sample(). However, after commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) that doesn't work for last_sample_time, because the time value is passed to intel_pstate_sample() as an argument now. Passing 0 to it from intel_pstate_init_cpu() is problematic, because that causes cpu->last_sample_time == 0 to be visible in get_target_pstate_use_performance() (and hence the extra cpu->last_sample_time > 0 check in there) and effectively allows the first invocation of intel_pstate_sample() from intel_pstate_update_util() to happen immediately after the initialization which may lead to a significant "turn on" effect in the governor algorithm. To mitigate that issue, rework the initialization to avoid the extra intel_pstate_sample() call from intel_pstate_init_cpu(). Instead, make intel_pstate_sample() return false if it has been called with cpu->sample.time equal to zero, which will make intel_pstate_update_util() skip the sample in that case, and reset cpu->sample.time from intel_pstate_set_update_util_hook() to make the algorithm start properly every time the hook is set. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-04-02 07:06:21 +08:00
/*
* First time this function is invoked in a given cycle, all of the
* previous sample data fields are equal to zero or stale and they must
* be populated with meaningful numbers for things to work, so assume
* that sample.time will always be reset before setting the utilization
* update hook and make the caller skip the sample then.
*/
return !!cpu->last_sample_time;
}
static inline int32_t get_avg_frequency(struct cpudata *cpu)
{
return div64_u64(cpu->pstate.max_pstate_physical * cpu->sample.aperf *
cpu->pstate.scaling, cpu->sample.mperf);
}
static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
{
struct sample *sample = &cpu->sample;
u64 cummulative_iowait, delta_iowait_us;
u64 delta_iowait_mperf;
u64 mperf, now;
int32_t cpu_load;
cummulative_iowait = get_cpu_iowait_time_us(cpu->cpu, &now);
/*
* Convert iowait time into number of IO cycles spent at max_freq.
* IO is considered as busy only for the cpu_load algorithm. For
* performance this is not needed since we always try to reach the
* maximum P-State, so we are already boosting the IOs.
*/
delta_iowait_us = cummulative_iowait - cpu->prev_cummulative_iowait;
delta_iowait_mperf = div64_u64(delta_iowait_us * cpu->pstate.scaling *
cpu->pstate.max_pstate, MSEC_PER_SEC);
mperf = cpu->sample.mperf + delta_iowait_mperf;
cpu->prev_cummulative_iowait = cummulative_iowait;
/*
* The load can be estimated as the ratio of the mperf counter
* running at a constant frequency during active periods
* (C0) and the time stamp counter running at the same frequency
* also during C-states.
*/
cpu_load = div64_u64(int_tofp(100) * mperf, sample->tsc);
cpu->sample.busy_scaled = cpu_load;
return cpu->pstate.current_pstate - pid_calc(&cpu->pid, cpu_load);
}
static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
{
int32_t core_busy, max_pstate, current_pstate, sample_ratio;
u64 duration_ns;
intel_pstate_calc_busy(cpu);
/*
* core_busy is the ratio of actual performance to max
* max_pstate is the max non turbo pstate available
* current_pstate was the pstate that was requested during
* the last sample period.
*
* We normalize core_busy, which was our actual percent
* performance to what we requested during the last sample
* period. The result will be a percentage of busy at a
* specified pstate.
*/
core_busy = cpu->sample.core_pct_busy;
max_pstate = int_tofp(cpu->pstate.max_pstate_physical);
current_pstate = int_tofp(cpu->pstate.current_pstate);
core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));
/*
* Since our utilization update callback will not run unless we are
* in C0, check if the actual elapsed time is significantly greater (3x)
* than our sample interval. If it is, then we were idle for a long
* enough period of time to adjust our busyness.
*/
duration_ns = cpu->sample.time - cpu->last_sample_time;
intel_pstate: Avoid extra invocation of intel_pstate_sample() The initialization of intel_pstate for a given CPU involves populating the fields of its struct cpudata that represent the previous sample, but currently that is done in a problematic way. Namely, intel_pstate_init_cpu() makes an extra call to intel_pstate_sample() so it reads the current register values that will be used to populate the "previous sample" record during the next invocation of intel_pstate_sample(). However, after commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) that doesn't work for last_sample_time, because the time value is passed to intel_pstate_sample() as an argument now. Passing 0 to it from intel_pstate_init_cpu() is problematic, because that causes cpu->last_sample_time == 0 to be visible in get_target_pstate_use_performance() (and hence the extra cpu->last_sample_time > 0 check in there) and effectively allows the first invocation of intel_pstate_sample() from intel_pstate_update_util() to happen immediately after the initialization which may lead to a significant "turn on" effect in the governor algorithm. To mitigate that issue, rework the initialization to avoid the extra intel_pstate_sample() call from intel_pstate_init_cpu(). Instead, make intel_pstate_sample() return false if it has been called with cpu->sample.time equal to zero, which will make intel_pstate_update_util() skip the sample in that case, and reset cpu->sample.time from intel_pstate_set_update_util_hook() to make the algorithm start properly every time the hook is set. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-04-02 07:06:21 +08:00
if ((s64)duration_ns > pid_params.sample_rate_ns * 3) {
sample_ratio = div_fp(int_tofp(pid_params.sample_rate_ns),
int_tofp(duration_ns));
core_busy = mul_fp(core_busy, sample_ratio);
}
cpu->sample.busy_scaled = core_busy;
return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy);
}
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
{
int max_perf, min_perf;
update_turbo_state();
intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
pstate = clamp_t(int, pstate, min_perf, max_perf);
if (pstate == cpu->pstate.current_pstate)
return;
intel_pstate_record_pstate(cpu, pstate);
wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate));
}
static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
{
int from, target_pstate;
struct sample *sample;
from = cpu->pstate.current_pstate;
target_pstate = pstate_funcs.get_target_pstate(cpu);
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
intel_pstate_update_pstate(cpu, target_pstate);
sample = &cpu->sample;
trace_pstate_sample(fp_toint(sample->core_pct_busy),
fp_toint(sample->busy_scaled),
from,
cpu->pstate.current_pstate,
sample->mperf,
sample->aperf,
sample->tsc,
get_avg_frequency(cpu));
}
static void intel_pstate_update_util(struct update_util_data *data, u64 time,
unsigned long util, unsigned long max)
{
struct cpudata *cpu = container_of(data, struct cpudata, update_util);
u64 delta_ns = time - cpu->sample.time;
if ((s64)delta_ns >= pid_params.sample_rate_ns) {
bool sample_taken = intel_pstate_sample(cpu, time);
if (sample_taken && !hwp_active)
intel_pstate_adjust_busy_pstate(cpu);
}
}
#define ICPU(model, policy) \
{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
(unsigned long)&policy }
static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
ICPU(0x2a, core_params),
ICPU(0x2d, core_params),
ICPU(0x37, silvermont_params),
ICPU(0x3a, core_params),
ICPU(0x3c, core_params),
ICPU(0x3d, core_params),
ICPU(0x3e, core_params),
ICPU(0x3f, core_params),
ICPU(0x45, core_params),
ICPU(0x46, core_params),
ICPU(0x47, core_params),
ICPU(0x4c, airmont_params),
ICPU(0x4e, core_params),
ICPU(0x4f, core_params),
ICPU(0x5e, core_params),
ICPU(0x56, core_params),
ICPU(0x57, knl_params),
{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] = {
ICPU(0x56, core_params),
{}
};
static int intel_pstate_init_cpu(unsigned int cpunum)
{
struct cpudata *cpu;
if (!all_cpu_data[cpunum])
all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata),
GFP_KERNEL);
if (!all_cpu_data[cpunum])
return -ENOMEM;
cpu = all_cpu_data[cpunum];
cpu->cpu = cpunum;
if (hwp_active) {
intel_pstate_hwp_enable(cpu);
pid_params.sample_rate_ms = 50;
pid_params.sample_rate_ns = 50 * NSEC_PER_MSEC;
}
intel_pstate_get_cpu_pstates(cpu);
intel_pstate_busy_pid_reset(cpu);
cpu->update_util.func = intel_pstate_update_util;
pr_debug("intel_pstate: controlling: cpu %d\n", cpunum);
return 0;
}
static unsigned int intel_pstate_get(unsigned int cpu_num)
{
struct sample *sample;
struct cpudata *cpu;
cpu = all_cpu_data[cpu_num];
if (!cpu)
return 0;
sample = &cpu->sample;
return get_avg_frequency(cpu);
}
intel_pstate: Avoid extra invocation of intel_pstate_sample() The initialization of intel_pstate for a given CPU involves populating the fields of its struct cpudata that represent the previous sample, but currently that is done in a problematic way. Namely, intel_pstate_init_cpu() makes an extra call to intel_pstate_sample() so it reads the current register values that will be used to populate the "previous sample" record during the next invocation of intel_pstate_sample(). However, after commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) that doesn't work for last_sample_time, because the time value is passed to intel_pstate_sample() as an argument now. Passing 0 to it from intel_pstate_init_cpu() is problematic, because that causes cpu->last_sample_time == 0 to be visible in get_target_pstate_use_performance() (and hence the extra cpu->last_sample_time > 0 check in there) and effectively allows the first invocation of intel_pstate_sample() from intel_pstate_update_util() to happen immediately after the initialization which may lead to a significant "turn on" effect in the governor algorithm. To mitigate that issue, rework the initialization to avoid the extra intel_pstate_sample() call from intel_pstate_init_cpu(). Instead, make intel_pstate_sample() return false if it has been called with cpu->sample.time equal to zero, which will make intel_pstate_update_util() skip the sample in that case, and reset cpu->sample.time from intel_pstate_set_update_util_hook() to make the algorithm start properly every time the hook is set. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-04-02 07:06:21 +08:00
static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
{
intel_pstate: Avoid extra invocation of intel_pstate_sample() The initialization of intel_pstate for a given CPU involves populating the fields of its struct cpudata that represent the previous sample, but currently that is done in a problematic way. Namely, intel_pstate_init_cpu() makes an extra call to intel_pstate_sample() so it reads the current register values that will be used to populate the "previous sample" record during the next invocation of intel_pstate_sample(). However, after commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) that doesn't work for last_sample_time, because the time value is passed to intel_pstate_sample() as an argument now. Passing 0 to it from intel_pstate_init_cpu() is problematic, because that causes cpu->last_sample_time == 0 to be visible in get_target_pstate_use_performance() (and hence the extra cpu->last_sample_time > 0 check in there) and effectively allows the first invocation of intel_pstate_sample() from intel_pstate_update_util() to happen immediately after the initialization which may lead to a significant "turn on" effect in the governor algorithm. To mitigate that issue, rework the initialization to avoid the extra intel_pstate_sample() call from intel_pstate_init_cpu(). Instead, make intel_pstate_sample() return false if it has been called with cpu->sample.time equal to zero, which will make intel_pstate_update_util() skip the sample in that case, and reset cpu->sample.time from intel_pstate_set_update_util_hook() to make the algorithm start properly every time the hook is set. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-04-02 07:06:21 +08:00
struct cpudata *cpu = all_cpu_data[cpu_num];
/* Prevent intel_pstate_update_util() from using stale data. */
cpu->sample.time = 0;
cpufreq_set_update_util_data(cpu_num, &cpu->update_util);
}
static void intel_pstate_clear_update_util_hook(unsigned int cpu)
{
cpufreq_set_update_util_data(cpu, NULL);
synchronize_sched();
}
static int intel_pstate_set_policy(struct cpufreq_policy *policy)
{
if (!policy->cpuinfo.max_freq)
return -ENODEV;
intel_pstate_clear_update_util_hook(policy->cpu);
if (policy->policy == CPUFREQ_POLICY_PERFORMANCE &&
policy->max >= policy->cpuinfo.max_freq) {
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
pr_debug("intel_pstate: set performance\n");
limits = &performance_limits;
goto out;
}
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
pr_debug("intel_pstate: set powersave\n");
limits = &powersave_limits;
limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100);
limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
policy->cpuinfo.max_freq);
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0 , 100);
/* Normalize user input to [min_policy_pct, max_policy_pct] */
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
limits->min_perf_pct = max(limits->min_policy_pct,
limits->min_sysfs_pct);
limits->min_perf_pct = min(limits->max_policy_pct,
limits->min_perf_pct);
limits->max_perf_pct = min(limits->max_policy_pct,
limits->max_sysfs_pct);
limits->max_perf_pct = max(limits->min_policy_pct,
limits->max_perf_pct);
limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
/* Make sure min_perf_pct <= max_perf_pct */
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
limits->min_perf = div_fp(int_tofp(limits->min_perf_pct),
int_tofp(100));
limits->max_perf = div_fp(int_tofp(limits->max_perf_pct),
int_tofp(100));
out:
intel_pstate_set_update_util_hook(policy->cpu);
if (hwp_active)
intel_pstate: Update frequencies of policy->cpus only from ->set_policy() The intel-pstate driver is using intel_pstate_hwp_set() from two separate paths, i.e. ->set_policy() callback and sysfs update path for the files present in /sys/devices/system/cpu/intel_pstate/ directory. While an update to the sysfs path applies to all the CPUs being managed by the driver (which essentially means all the online CPUs), the update via the ->set_policy() callback applies to a smaller group of CPUs managed by the policy for which ->set_policy() is called. And so, intel_pstate_hwp_set() should update frequencies of only the CPUs that are part of policy->cpus mask, while it is called from ->set_policy() callback. In order to do that, add a parameter (cpumask) to intel_pstate_hwp_set() and apply the frequency changes only to the concerned CPUs. For ->set_policy() path, we are only concerned about policy->cpus, and so policy->rwsem lock taken by the core prior to calling ->set_policy() is enough to take care of any races. The larger lock acquired by get_online_cpus() is required only for the updates to sysfs files. Add another routine, intel_pstate_hwp_set_online_cpus(), and call it from the sysfs update paths. This also fixes a lockdep reported recently, where policy->rwsem and get_online_cpus() could have been acquired in any order causing an ABBA deadlock. The sequence of events leading to that was: intel_pstate_init(...) ...cpufreq_online(...) down_write(&policy->rwsem); // Locks policy->rwsem ... cpufreq_init_policy(policy); ...intel_pstate_hwp_set(); get_online_cpus(); // Temporarily locks cpu_hotplug.lock ... up_write(&policy->rwsem); pm_suspend(...) ...disable_nonboot_cpus() _cpu_down() cpu_hotplug_begin(); // Locks cpu_hotplug.lock __cpu_notify(CPU_DOWN_PREPARE, ...); ...cpufreq_offline_prepare(); down_write(&policy->rwsem); // Locks policy->rwsem Reported-and-tested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-02-22 12:57:46 +08:00
intel_pstate_hwp_set(policy->cpus);
return 0;
}
static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
{
cpufreq_verify_within_cpu_limits(policy);
if (policy->policy != CPUFREQ_POLICY_POWERSAVE &&
policy->policy != CPUFREQ_POLICY_PERFORMANCE)
return -EINVAL;
return 0;
}
static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
{
int cpu_num = policy->cpu;
struct cpudata *cpu = all_cpu_data[cpu_num];
pr_debug("intel_pstate: CPU %d exiting\n", cpu_num);
intel_pstate_clear_update_util_hook(cpu_num);
if (hwp_active)
return;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
intel_pstate_set_min_pstate(cpu);
}
static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
{
struct cpudata *cpu;
int rc;
rc = intel_pstate_init_cpu(policy->cpu);
if (rc)
return rc;
cpu = all_cpu_data[policy->cpu];
cpufreq: intel_pstate: Fix intel_pstate powersave min_perf_pct value On systems that initialize the intel_pstate driver with the performance governor, and then switch to the powersave governor will not transition to lower cpu frequencies until /sys/devices/system/cpu/intel_pstate/min_perf_pct is set to a low value. The behavior of governor switching changed after commit a04759924e25 ("[cpufreq] intel_pstate: honor user space min_perf_pct override on resume"). The commit introduced tracking of performance percentage changes via sysfs in order to restore userspace changes during suspend/resume. The problem occurs because the global values of the newly introduced max_sysfs_pct and min_sysfs_pct are not lowered on the governor change and this causes the powersave governor to inherit the performance governor's settings. A simple change would have been to reset max_sysfs_pct to 100 and min_sysfs_pct to 0 on a governor change, which fixes the problem with governor switching. However, since we cannot break userspace[1] the fix is now to give each governor its own limits storage area so that governor specific changes are tracked. I successfully tested this by booting with both the performance governor and the powersave governor by default, and switching between the two governors (while monitoring /sys/devices/system/cpu/intel_pstate/ values, and looking at the output of cpupower frequency-info). Suspend/Resume testing was performed by Doug Smythies. [1] Systems which suspend/resume using the unmaintained pm-utils package will always transition to the performance governor before the suspend and after the resume. This means a system using the powersave governor will go from powersave to performance, then suspend/resume, performance to powersave. The simple change during governor changes would have been overwritten when the governor changed before and after the suspend/resume. I have submitted https://bugzilla.redhat.com/show_bug.cgi?id=1271225 against Fedora to remove the 94cpufreq file that causes the problem. It should be noted that pm-utils is obsoleted with newer versions of systemd. Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Kristen Carlson Accardi <kristen@linux.intel.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2015-10-15 19:34:15 +08:00
if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
policy->policy = CPUFREQ_POLICY_PERFORMANCE;
else
policy->policy = CPUFREQ_POLICY_POWERSAVE;
policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
/* cpuinfo and default policy values */
policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
policy->cpuinfo.max_freq =
cpu->pstate.turbo_pstate * cpu->pstate.scaling;
policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
cpumask_set_cpu(policy->cpu, policy->cpus);
return 0;
}
static struct cpufreq_driver intel_pstate_driver = {
.flags = CPUFREQ_CONST_LOOPS,
.verify = intel_pstate_verify_policy,
.setpolicy = intel_pstate_set_policy,
.get = intel_pstate_get,
.init = intel_pstate_cpu_init,
.stop_cpu = intel_pstate_stop_cpu,
.name = "intel_pstate",
};
static int __initdata no_load;
static int __initdata no_hwp;
static int __initdata hwp_only;
static unsigned int force_load;
static int intel_pstate_msrs_not_valid(void)
{
if (!pstate_funcs.get_max() ||
!pstate_funcs.get_min() ||
!pstate_funcs.get_turbo())
return -ENODEV;
return 0;
}
static void copy_pid_params(struct pstate_adjust_policy *policy)
{
pid_params.sample_rate_ms = policy->sample_rate_ms;
pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
pid_params.p_gain_pct = policy->p_gain_pct;
pid_params.i_gain_pct = policy->i_gain_pct;
pid_params.d_gain_pct = policy->d_gain_pct;
pid_params.deadband = policy->deadband;
pid_params.setpoint = policy->setpoint;
}
static void copy_cpu_funcs(struct pstate_funcs *funcs)
{
pstate_funcs.get_max = funcs->get_max;
pstate_funcs.get_max_physical = funcs->get_max_physical;
pstate_funcs.get_min = funcs->get_min;
pstate_funcs.get_turbo = funcs->get_turbo;
pstate_funcs.get_scaling = funcs->get_scaling;
intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer <jwboyer@fedoraproject.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2016-03-19 06:20:02 +08:00
pstate_funcs.get_val = funcs->get_val;
pstate_funcs.get_vid = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
}
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
#if IS_ENABLED(CONFIG_ACPI)
#include <acpi/processor.h>
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
static bool intel_pstate_no_acpi_pss(void)
{
int i;
for_each_possible_cpu(i) {
acpi_status status;
union acpi_object *pss;
struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
struct acpi_processor *pr = per_cpu(processors, i);
if (!pr)
continue;
status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
if (ACPI_FAILURE(status))
continue;
pss = buffer.pointer;
if (pss && pss->type == ACPI_TYPE_PACKAGE) {
kfree(pss);
return false;
}
kfree(pss);
}
return true;
}
static bool intel_pstate_has_acpi_ppc(void)
{
int i;
for_each_possible_cpu(i) {
struct acpi_processor *pr = per_cpu(processors, i);
if (!pr)
continue;
if (acpi_has_method(pr->handle, "_PPC"))
return true;
}
return false;
}
enum {
PSS,
PPC,
};
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
struct hw_vendor_info {
u16 valid;
char oem_id[ACPI_OEM_ID_SIZE];
char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
int oem_pwr_table;
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
};
/* Hardware vendor-specific info that has its own power management modes */
static struct hw_vendor_info vendor_info[] = {
{1, "HP ", "ProLiant", PSS},
{1, "ORACLE", "X4-2 ", PPC},
{1, "ORACLE", "X4-2L ", PPC},
{1, "ORACLE", "X4-2B ", PPC},
{1, "ORACLE", "X3-2 ", PPC},
{1, "ORACLE", "X3-2L ", PPC},
{1, "ORACLE", "X3-2B ", PPC},
{1, "ORACLE", "X4470M2 ", PPC},
{1, "ORACLE", "X4270M3 ", PPC},
{1, "ORACLE", "X4270M2 ", PPC},
{1, "ORACLE", "X4170M2 ", PPC},
{1, "ORACLE", "X4170 M3", PPC},
{1, "ORACLE", "X4275 M3", PPC},
{1, "ORACLE", "X6-2 ", PPC},
{1, "ORACLE", "Sudbury ", PPC},
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
{0, "", ""},
};
static bool intel_pstate_platform_pwr_mgmt_exists(void)
{
struct acpi_table_header hdr;
struct hw_vendor_info *v_info;
const struct x86_cpu_id *id;
u64 misc_pwr;
id = x86_match_cpu(intel_pstate_cpu_oob_ids);
if (id) {
rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
if ( misc_pwr & (1 << 8))
return true;
}
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
if (acpi_disabled ||
ACPI_FAILURE(acpi_get_table_header(ACPI_SIG_FADT, 0, &hdr)))
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
return false;
for (v_info = vendor_info; v_info->valid; v_info++) {
if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
!strncmp(hdr.oem_table_id, v_info->oem_table_id,
ACPI_OEM_TABLE_ID_SIZE))
switch (v_info->oem_pwr_table) {
case PSS:
return intel_pstate_no_acpi_pss();
case PPC:
return intel_pstate_has_acpi_ppc() &&
(!force_load);
}
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
}
return false;
}
#else /* CONFIG_ACPI not enabled */
static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
intel_pstate: skip the driver if ACPI has power mgmt option Do not load the Intel pstate driver if the platform firmware (ACPI BIOS) supports the power management alternatives. The ACPI BIOS indicates that the OS control mode can be used if the _PSS (Performance Supported States) is defined in ACPI table. For the OS control mode, the Intel pstate driver will be loaded. HP BIOS has several power management modes (firmware, OS-control and so on). For the OS control mode in HP BIOS, the Intel p-state driver will be loaded. When the customer chooses the firmware power management in HP BIOS, the Intel p-state driver will be ignored. I put hw_vendor_info vendor_info in case other vendors (Dell, Lenovo...) have their firmware power management. Vendors should make sure their firmware power management works properly, and they can go for adding their vendor info to the variable. I have verified the patch on HP ProLiant servers. The patch worked correctly. Signed-off-by: Adrian Huang <adrianhuang0701@gmail.com> [rjw: Fixed up !CONFIG_ACPI build] [Linda Knippers: As Adrian has recently left HP, I retested the updated patch on an HP ProLiant server and verified that it is behaving correctly. When the BIOS is configured for OS control for power management, the intel_pstate driver loads as expected. When the BIOS is configured to provide the power management, the intel_pstate driver does not load and we get the pcc_cpufreq driver instead.] Signed-off-by: Linda Knippers <linda.knippers@hp.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-10-31 23:24:05 +08:00
#endif /* CONFIG_ACPI */
static const struct x86_cpu_id hwp_support_ids[] __initconst = {
{ X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP },
{}
};
static int __init intel_pstate_init(void)
{
int cpu, rc = 0;
const struct x86_cpu_id *id;
struct cpu_defaults *cpu_def;
if (no_load)
return -ENODEV;
if (x86_match_cpu(hwp_support_ids) && !no_hwp) {
copy_cpu_funcs(&core_params.funcs);
hwp_active++;
goto hwp_cpu_matched;
}
id = x86_match_cpu(intel_pstate_cpu_ids);
if (!id)
return -ENODEV;
cpu_def = (struct cpu_defaults *)id->driver_data;
copy_pid_params(&cpu_def->pid_policy);
copy_cpu_funcs(&cpu_def->funcs);
if (intel_pstate_msrs_not_valid())
return -ENODEV;
hwp_cpu_matched:
/*
* The Intel pstate driver will be ignored if the platform
* firmware has its own power management modes.
*/
if (intel_pstate_platform_pwr_mgmt_exists())
return -ENODEV;
pr_info("Intel P-state driver initializing.\n");
all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
if (!all_cpu_data)
return -ENOMEM;
if (!hwp_active && hwp_only)
goto out;
rc = cpufreq_register_driver(&intel_pstate_driver);
if (rc)
goto out;
intel_pstate_debug_expose_params();
intel_pstate_sysfs_expose_params();
if (hwp_active)
pr_info("intel_pstate: HWP enabled\n");
return rc;
out:
get_online_cpus();
for_each_online_cpu(cpu) {
if (all_cpu_data[cpu]) {
intel_pstate_clear_update_util_hook(cpu);
kfree(all_cpu_data[cpu]);
}
}
put_online_cpus();
vfree(all_cpu_data);
return -ENODEV;
}
device_initcall(intel_pstate_init);
static int __init intel_pstate_setup(char *str)
{
if (!str)
return -EINVAL;
if (!strcmp(str, "disable"))
no_load = 1;
if (!strcmp(str, "no_hwp")) {
pr_info("intel_pstate: HWP disabled\n");
no_hwp = 1;
}
if (!strcmp(str, "force"))
force_load = 1;
if (!strcmp(str, "hwp_only"))
hwp_only = 1;
return 0;
}
early_param("intel_pstate", intel_pstate_setup);
MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
MODULE_DESCRIPTION("'intel_pstate' - P state driver Intel Core processors");
MODULE_LICENSE("GPL");