From 9fd0c40451468754754271ba0cbb63b6927911df Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 3 May 2018 10:09:10 +0100 Subject: [PATCH 01/19] cpupower: fix spelling mistake: "logilename" -> "logfilename" Trivial fix to spelling mistake in dprintf message Signed-off-by: Colin Ian King Signed-off-by: Shuah Khan (Samsung OSG) --- tools/power/cpupower/bench/parse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/bench/parse.c b/tools/power/cpupower/bench/parse.c index 9b65f052081f..9ba8a44ad2a7 100644 --- a/tools/power/cpupower/bench/parse.c +++ b/tools/power/cpupower/bench/parse.c @@ -104,7 +104,7 @@ FILE *prepare_output(const char *dirname) dirname, time(NULL)); } - dprintf("logilename: %s\n", filename); + dprintf("logfilename: %s\n", filename); output = fopen(filename, "w+"); if (output == NULL) { From f9652d5cae04eb5e85303c087f5842d320499c65 Mon Sep 17 00:00:00 2001 From: Abhishek Goel Date: Mon, 28 May 2018 06:03:03 -0500 Subject: [PATCH 02/19] cpupower : Fix header name to read idle state name The names of the idle states in the output of cpupower monitor command are truncated to 4 characters. On POWER9, this creates ambiguity as the states are named "stop0", "stop1", etc. root:~# cpupower monitor |Idle_Stats PKG |CORE|CPU | snoo | stop | stop | stop | stop | stop | stop 0| 0| 0| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 1.90 0| 0| 1| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 0| 2| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 0| 3| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 This patch modifies the output to print the state name that results in a legible output. The names will be printed with at most one space of padding on the left. 
root:~# cpupower monitor | Idle_Stats PKG|CORE| CPU|snooze|stop0L| stop0|stop1L| stop1|stop2L| stop2 0| 0| 0| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.72 0| 0| 1| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 0| 2| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 0| 0| 3| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.00 This patch does not affect the output for intel. Output for intel before applying the patch: root:~# cpupower monitor |Idle_Stats CPU | POLL | C1-S | C1E- | C3-S | C6-S | C7s- | C8-S | C9-S | C10- 0| 0.00| 0.14| 0.39| 0.35| 7.41| 0.00| 17.67| 1.01| 70.03 2| 0.00| 0.19| 0.47| 0.10| 6.50| 0.00| 29.66| 2.17| 58.07 1| 0.00| 0.11| 0.50| 1.50| 9.11| 0.18| 18.19| 0.40| 66.63 3| 0.00| 0.67| 0.42| 0.03| 5.84| 0.00| 12.58| 0.77| 77.14 Output for intel after applying the patch: root:~# cpupower monitor | Idle_Stats CPU| POLL | C1-S | C1E- | C3-S | C6-S | C7s- | C8-S | C9-S | C10- 0| 0.03| 0.33| 1.01| 0.27| 3.03| 0.00| 19.18| 0.00| 71.24 2| 0.00| 1.58| 0.58| 0.42| 8.55| 0.09| 21.11| 0.99| 63.32 1| 0.00| 1.26| 0.88| 0.43| 9.00| 0.02| 7.78| 4.65| 71.91 3| 0.00| 0.30| 0.42| 0.06| 13.62| 0.21| 30.29| 0.00| 52.45 Signed-off-by: Abhishek Goel Signed-off-by: Shuah Khan (Samsung OSG) --- .../utils/idle_monitor/cpuidle_sysfs.c | 15 ++++++++ .../utils/idle_monitor/cpupower-monitor.c | 35 +++++++++++-------- .../utils/idle_monitor/cpupower-monitor.h | 9 +++++ 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c index 5b3205f16217..5b8c4956ff9a 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c +++ b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c @@ -126,6 +126,20 @@ void fix_up_intel_idle_driver_name(char *tmp, int num) } } +#ifdef __powerpc__ +void map_power_idle_state_name(char *tmp) +{ + if (!strncmp(tmp, "stop0_lite", CSTATE_NAME_LEN)) + strcpy(tmp, "stop0L"); + else if (!strncmp(tmp, "stop1_lite", CSTATE_NAME_LEN)) + strcpy(tmp, 
"stop1L"); + else if (!strncmp(tmp, "stop2_lite", CSTATE_NAME_LEN)) + strcpy(tmp, "stop2L"); +} +#else +void map_power_idle_state_name(char *tmp) { } +#endif + static struct cpuidle_monitor *cpuidle_register(void) { int num; @@ -145,6 +159,7 @@ static struct cpuidle_monitor *cpuidle_register(void) if (tmp == NULL) continue; + map_power_idle_state_name(tmp); fix_up_intel_idle_driver_name(tmp, num); strncpy(cpuidle_cstates[num].name, tmp, CSTATE_NAME_LEN - 1); free(tmp); diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c index 05f953f0f0a0..051da0a7c454 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c @@ -70,36 +70,43 @@ void print_n_spaces(int n) printf(" "); } -/* size of s must be at least n + 1 */ +/*s is filled with left and right spaces + *to make its length atleast n+1 + */ int fill_string_with_spaces(char *s, int n) { + char *temp; int len = strlen(s); - if (len > n) + + if (len >= n) return -1; + + temp = malloc(sizeof(char) * (n+1)); for (; len < n; len++) s[len] = ' '; s[len] = '\0'; + snprintf(temp, n+1, " %s", s); + strcpy(s, temp); + free(temp); return 0; } +#define MAX_COL_WIDTH 6 void print_header(int topology_depth) { int unsigned mon; int state, need_len; cstate_t s; char buf[128] = ""; - int percent_width = 4; fill_string_with_spaces(buf, topology_depth * 5 - 1); printf("%s|", buf); for (mon = 0; mon < avail_monitors; mon++) { - need_len = monitors[mon]->hw_states_num * (percent_width + 3) + need_len = monitors[mon]->hw_states_num * (MAX_COL_WIDTH + 1) - 1; - if (mon != 0) { - printf("|| "); - need_len--; - } + if (mon != 0) + printf("||"); sprintf(buf, "%s", monitors[mon]->name); fill_string_with_spaces(buf, need_len); printf("%s", buf); @@ -107,23 +114,21 @@ void print_header(int topology_depth) printf("\n"); if (topology_depth > 2) - printf("PKG |"); + printf(" PKG|"); if 
(topology_depth > 1) printf("CORE|"); if (topology_depth > 0) - printf("CPU |"); + printf(" CPU|"); for (mon = 0; mon < avail_monitors; mon++) { if (mon != 0) - printf("|| "); - else - printf(" "); + printf("||"); for (state = 0; state < monitors[mon]->hw_states_num; state++) { if (state != 0) - printf(" | "); + printf("|"); s = monitors[mon]->hw_states[state]; sprintf(buf, "%s", s.name); - fill_string_with_spaces(buf, percent_width); + fill_string_with_spaces(buf, MAX_COL_WIDTH); printf("%s", buf); } printf(" "); diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h index 9e43f3371fbc..2ae50b499e0a 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h @@ -15,7 +15,16 @@ #define MONITORS_MAX 20 #define MONITOR_NAME_LEN 20 + +/* CSTATE_NAME_LEN is limited by header field width defined + * in cpupower-monitor.c. Header field width is defined to be + * sum of percent width and two spaces for padding. + */ +#ifdef __powerpc__ +#define CSTATE_NAME_LEN 7 +#else #define CSTATE_NAME_LEN 5 +#endif #define CSTATE_DESC_LEN 60 int cpu_count; From ac28927659bec665be97fc2c2dfc059f1f913fbb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Jun 2018 13:44:24 +0200 Subject: [PATCH 03/19] cpufreq: kryo: allow building as a loadable module Building the kryo cpufreq driver while QCOM_SMEM is a loadable module results in a link error: drivers/cpufreq/qcom-cpufreq-kryo.o: In function `qcom_cpufreq_kryo_probe': qcom-cpufreq-kryo.c:(.text+0xbc): undefined reference to `qcom_smem_get' The problem is that Kconfig interprets the dependency as met when the dependent symbol is a 'bool' one. By making it 'tristate', it will be forced to be a module here, which builds successfully. Fixes: 46e2856b8e18 (cpufreq: Add Kryo CPU scaling driver) Signed-off-by: Arnd Bergmann Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/Kconfig.arm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index c7ce928fbf1f..52f5f1a2040c 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -125,7 +125,7 @@ config ARM_OMAP2PLUS_CPUFREQ default ARCH_OMAP2PLUS config ARM_QCOM_CPUFREQ_KRYO - bool "Qualcomm Kryo based CPUFreq" + tristate "Qualcomm Kryo based CPUFreq" depends on ARM64 depends on QCOM_QFPROM depends on QCOM_SMEM From 08e9cc4032626792b49009547454bcef44c2e12b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 1 Jun 2018 14:05:12 +0100 Subject: [PATCH 04/19] cpufreq: ACPI: make function acpi_cpufreq_fast_switch() static The acpi_cpufreq_fast_switch() function is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: drivers/cpufreq/acpi-cpufreq.c:468:14: warning: symbol 'acpi_cpufreq_fast_switch' was not declared. Should it be static? Signed-off-by: Colin Ian King Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 9449657d72f0..32ba4bc972e7 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -465,8 +465,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, return result; } -unsigned int acpi_cpufreq_fast_switch(struct cpufreq_policy *policy, - unsigned int target_freq) +static unsigned int acpi_cpufreq_fast_switch(struct cpufreq_policy *policy, + unsigned int target_freq) { struct acpi_cpufreq_data *data = policy->driver_data; struct acpi_processor_performance *perf; From e5d295b06d69a1924665a16a4987be475addd00f Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Thu, 31 May 2018 17:21:43 -0500 Subject: [PATCH 05/19] cpufreq: ti-cpufreq: Fix an incorrect error return value Commit 05829d9431df (cpufreq: ti-cpufreq: kfree opp_data when failure) has fixed a memory leak in the failure path, however the patch returned a positive value on get_cpu_device() failure instead of the previous negative value. Fix this incorrect error return value properly. Fixes: 05829d9431df (cpufreq: ti-cpufreq: kfree opp_data when failure) Cc: 4.14+ # v4.14+ Signed-off-by: Suman Anna Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/ti-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c index 6ba709b6f095..896caba5dfe5 100644 --- a/drivers/cpufreq/ti-cpufreq.c +++ b/drivers/cpufreq/ti-cpufreq.c @@ -226,7 +226,7 @@ static int ti_cpufreq_probe(struct platform_device *pdev) opp_data->cpu_dev = get_cpu_device(0); if (!opp_data->cpu_dev) { pr_err("%s: Failed to get device for CPU0\n", __func__); - ret = ENODEV; + ret = -ENODEV; goto free_opp_data; } From d7231f993ad4081da2c2784e2692617e2bd0551e Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Thu, 31 May 2018 17:21:44 -0500 Subject: [PATCH 06/19] cpufreq: ti-cpufreq: Use devres managed API in probe() The ti_cpufreq_probe() function uses regular kzalloc to allocate the ti_cpufreq_data structure and kfree for freeing this memory on failures. Simplify this code by using the devres managed API. Signed-off-by: Suman Anna Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/ti-cpufreq.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c index 896caba5dfe5..3f0e2a14895a 100644 --- a/drivers/cpufreq/ti-cpufreq.c +++ b/drivers/cpufreq/ti-cpufreq.c @@ -217,7 +217,7 @@ static int ti_cpufreq_probe(struct platform_device *pdev) if (!match) return -ENODEV; - opp_data = kzalloc(sizeof(*opp_data), GFP_KERNEL); + opp_data = devm_kzalloc(&pdev->dev, sizeof(*opp_data), GFP_KERNEL); if (!opp_data) return -ENOMEM; @@ -226,8 +226,7 @@ static int ti_cpufreq_probe(struct platform_device *pdev) opp_data->cpu_dev = get_cpu_device(0); if (!opp_data->cpu_dev) { pr_err("%s: Failed to get device for CPU0\n", __func__); - ret = -ENODEV; - goto free_opp_data; + return -ENODEV; } opp_data->opp_node = dev_pm_opp_of_get_opp_desc_node(opp_data->cpu_dev); @@ -285,8 +284,6 @@ static int ti_cpufreq_probe(struct platform_device *pdev) fail_put_node: of_node_put(opp_data->opp_node); -free_opp_data: - kfree(opp_data); return ret; } From e0efd5be63e821066b5e6325cf237eb41367552f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 5 Jun 2018 14:42:39 -0700 Subject: [PATCH 07/19] cpufreq: intel_pstate: Add HWP boost utility and sched util hooks Added two utility functions to HWP boost up gradually and boost down to the default cached HWP request values. Boost up: Boost up updates HWP request minimum value in steps. This minimum value can reach up to the HWP request maximum value, depending on how frequently this boost up function is called. At max, boost up will take three steps to reach the maximum, depending on the current HWP request levels and HWP capabilities. For example, if the current settings are: If P0 (Turbo max) = P1 (Guaranteed max) = min No boost at all. If P0 (Turbo max) > P1 (Guaranteed max) = min Should result in one level boost only for P0. If P0 (Turbo max) = P1 (Guaranteed max) > min Should result in two level boost: (min + p1)/2 and P1. 
If P0 (Turbo max) > P1 (Guaranteed max) > min Should result in three level boost: (min + p1)/2, P1 and P0. We don't set any level between P0 and P1 as there is no guarantee that they will be honored. Boost down: After the system is idle for hold time of 3ms, the HWP request is reset to the default value from HWP init or user modified one via sysfs. Caching of HWP Request and Capabilities Store the HWP request value last set using MSR_HWP_REQUEST and read MSR_HWP_CAPABILITIES. This avoids reading MSRs in the boost utility functions. The limits calculated by these boost utility functions are based on the latest HWP request value, which can be modified by setpolicy() callback. So if user space modifies the minimum perf value, that will be accounted for every time the boost up is called. There can be cases of contention with the user modified minimum perf; in that case the user value takes precedence. For example just before HWP_REQUEST MSR is updated from setpolicy() callback, the boost up function is called via scheduler tick callback. Here the cached MSR value is already the latest and limits are updated based on the latest user limits, but on return the MSR write callback called from setpolicy() callback will update the HWP_REQUEST value. This will be used until the next time the boost up function is called. In addition add a variable to control HWP dynamic boosting. When HWP dynamic boost is active then set the HWP specific update util hook. The contents in the utility hooks will be filled in the subsequent patches. Reported-by: Mel Gorman Tested-by: Giovanni Gherdovich Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 100 ++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 08960a55eb27..3949e3861f55 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -221,6 +221,9 @@ struct global_params { * preference/bias * @epp_saved: Saved EPP/EPB during system suspend or CPU offline * operation + * @hwp_req_cached: Cached value of the last HWP Request MSR + * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR + * @hwp_boost_min: Last HWP boosted min performance * * This structure stores per CPU instance data for all CPUs. */ @@ -253,6 +256,9 @@ struct cpudata { s16 epp_policy; s16 epp_default; s16 epp_saved; + u64 hwp_req_cached; + u64 hwp_cap_cached; + u32 hwp_boost_min; }; static struct cpudata **all_cpu_data; @@ -285,6 +291,7 @@ static struct pstate_funcs pstate_funcs __read_mostly; static int hwp_active __read_mostly; static bool per_cpu_limits __read_mostly; +static bool hwp_boost __read_mostly; static struct cpufreq_driver *intel_pstate_driver __read_mostly; @@ -689,6 +696,7 @@ static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max, u64 cap; rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); + WRITE_ONCE(all_cpu_data[cpu]->hwp_cap_cached, cap); if (global.no_turbo) *current_max = HWP_GUARANTEED_PERF(cap); else @@ -763,6 +771,7 @@ static void intel_pstate_hwp_set(unsigned int cpu) intel_pstate_set_epb(cpu, epp); } skip_epp: + WRITE_ONCE(cpu_data->hwp_req_cached, value); wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } @@ -1381,6 +1390,81 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) intel_pstate_set_min_pstate(cpu); } +/* + * Long hold time will keep high perf limits for long time, + * which negatively impacts perf/watt for some workloads, + * like specpower. 3ms is based on experiements on some + * workoads. 
+ */ +static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC; + +static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu) +{ + u64 hwp_req = READ_ONCE(cpu->hwp_req_cached); + u32 max_limit = (hwp_req & 0xff00) >> 8; + u32 min_limit = (hwp_req & 0xff); + u32 boost_level1; + + /* + * Cases to consider (User changes via sysfs or boot time): + * If, P0 (Turbo max) = P1 (Guaranteed max) = min: + * No boost, return. + * If, P0 (Turbo max) > P1 (Guaranteed max) = min: + * Should result in one level boost only for P0. + * If, P0 (Turbo max) = P1 (Guaranteed max) > min: + * Should result in two level boost: + * (min + p1)/2 and P1. + * If, P0 (Turbo max) > P1 (Guaranteed max) > min: + * Should result in three level boost: + * (min + p1)/2, P1 and P0. + */ + + /* If max and min are equal or already at max, nothing to boost */ + if (max_limit == min_limit || cpu->hwp_boost_min >= max_limit) + return; + + if (!cpu->hwp_boost_min) + cpu->hwp_boost_min = min_limit; + + /* level at half way mark between min and guranteed */ + boost_level1 = (HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) + min_limit) >> 1; + + if (cpu->hwp_boost_min < boost_level1) + cpu->hwp_boost_min = boost_level1; + else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(cpu->hwp_cap_cached)) + cpu->hwp_boost_min = HWP_GUARANTEED_PERF(cpu->hwp_cap_cached); + else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) && + max_limit != HWP_GUARANTEED_PERF(cpu->hwp_cap_cached)) + cpu->hwp_boost_min = max_limit; + else + return; + + hwp_req = (hwp_req & ~GENMASK_ULL(7, 0)) | cpu->hwp_boost_min; + wrmsrl(MSR_HWP_REQUEST, hwp_req); + cpu->last_update = cpu->sample.time; +} + +static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu) +{ + if (cpu->hwp_boost_min) { + bool expired; + + /* Check if we are idle for hold time to boost down */ + expired = time_after64(cpu->sample.time, cpu->last_update + + hwp_boost_hold_time_ns); + if (expired) { + wrmsrl(MSR_HWP_REQUEST, cpu->hwp_req_cached); + 
cpu->hwp_boost_min = 0; + } + } + cpu->last_update = cpu->sample.time; +} + +static inline void intel_pstate_update_util_hwp(struct update_util_data *data, + u64 time, unsigned int flags) +{ +} + static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu) { struct sample *sample = &cpu->sample; @@ -1684,7 +1768,7 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num) { struct cpudata *cpu = all_cpu_data[cpu_num]; - if (hwp_active) + if (hwp_active && !hwp_boost) return; if (cpu->update_util_set) @@ -1693,7 +1777,9 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num) /* Prevent intel_pstate_update_util() from using stale data. */ cpu->sample.time = 0; cpufreq_add_update_util_hook(cpu_num, &cpu->update_util, - intel_pstate_update_util); + (hwp_active ? + intel_pstate_update_util_hwp : + intel_pstate_update_util)); cpu->update_util_set = true; } @@ -1805,8 +1891,16 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) intel_pstate_set_update_util_hook(policy->cpu); } - if (hwp_active) + if (hwp_active) { + /* + * When hwp_boost was active before and dynamically it + * was turned off, in that case we need to clear the + * update util hook. + */ + if (!hwp_boost) + intel_pstate_clear_update_util_hook(policy->cpu); intel_pstate_hwp_set(policy->cpu); + } mutex_unlock(&intel_pstate_limits_lock); From 52ccc4314293272397b117f3cc6f0f368c81431c Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 5 Jun 2018 14:42:40 -0700 Subject: [PATCH 08/19] cpufreq: intel_pstate: HWP boost performance on IO wakeup This change uses SCHED_CPUFREQ_IOWAIT flag to boost HWP performance. Since SCHED_CPUFREQ_IOWAIT flag is set frequently, we don't start boosting steps unless we see two consecutive flags in two ticks. This avoids boosting due to IO because of regular system activities. To avoid synchronization issues, the actual processing of the flag is done on the local CPU callback. 
Reported-by: Mel Gorman Tested-by: Giovanni Gherdovich Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 39 ++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3949e3861f55..5b2b6b6d1ff4 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -223,6 +223,8 @@ struct global_params { * operation * @hwp_req_cached: Cached value of the last HWP Request MSR * @hwp_cap_cached: Cached value of the last HWP Capabilities MSR + * @last_io_update: Last time when IO wake flag was set + * @sched_flags: Store scheduler flags for possible cross CPU update * @hwp_boost_min: Last HWP boosted min performance * * This structure stores per CPU instance data for all CPUs. @@ -258,6 +260,8 @@ struct cpudata { s16 epp_saved; u64 hwp_req_cached; u64 hwp_cap_cached; + u64 last_io_update; + unsigned int sched_flags; u32 hwp_boost_min; }; @@ -1460,9 +1464,44 @@ static inline void intel_pstate_hwp_boost_down(struct cpudata *cpu) cpu->last_update = cpu->sample.time; } +static inline void intel_pstate_update_util_hwp_local(struct cpudata *cpu, + u64 time) +{ + cpu->sample.time = time; + + if (cpu->sched_flags & SCHED_CPUFREQ_IOWAIT) { + bool do_io = false; + + cpu->sched_flags = 0; + /* + * Set iowait_boost flag and update time. Since IO WAIT flag + * is set all the time, we can't just conclude that there is + * some IO bound activity is scheduled on this CPU with just + * one occurrence. If we receive at least two in two + * consecutive ticks, then we treat as boost candidate. 
+ */ + if (time_before64(time, cpu->last_io_update + 2 * TICK_NSEC)) + do_io = true; + + cpu->last_io_update = time; + + if (do_io) + intel_pstate_hwp_boost_up(cpu); + + } else { + intel_pstate_hwp_boost_down(cpu); + } +} + static inline void intel_pstate_update_util_hwp(struct update_util_data *data, u64 time, unsigned int flags) { + struct cpudata *cpu = container_of(data, struct cpudata, update_util); + + cpu->sched_flags |= flags; + + if (smp_processor_id() == cpu->cpu) + intel_pstate_update_util_hwp_local(cpu, time); } static inline void intel_pstate_calc_avg_perf(struct cpudata *cpu) From aaaece3de9d7709d79004dd5d5aa7c9b366f0675 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 5 Jun 2018 14:42:41 -0700 Subject: [PATCH 09/19] cpufreq: intel_pstate: New sysfs entry to control HWP boost A new attribute is added to intel_pstate sysfs to enable/disable HWP dynamic performance boost. Reported-by: Mel Gorman Tested-by: Giovanni Gherdovich Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 5b2b6b6d1ff4..70bf63bb4e0e 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1033,6 +1033,30 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, return count; } +static ssize_t show_hwp_dynamic_boost(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", hwp_boost); +} + +static ssize_t store_hwp_dynamic_boost(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + + ret = kstrtouint(buf, 10, &input); + if (ret) + return ret; + + mutex_lock(&intel_pstate_driver_lock); + hwp_boost = !!input; + intel_pstate_update_policies(); + mutex_unlock(&intel_pstate_driver_lock); + + return count; +} + show_one(max_perf_pct, max_perf_pct); show_one(min_perf_pct, min_perf_pct); @@ -1042,6 +1066,7 @@ define_one_global_rw(max_perf_pct); define_one_global_rw(min_perf_pct); define_one_global_ro(turbo_pct); define_one_global_ro(num_pstates); +define_one_global_rw(hwp_dynamic_boost); static struct attribute *intel_pstate_attributes[] = { &status.attr, @@ -1082,6 +1107,11 @@ static void __init intel_pstate_sysfs_expose_params(void) rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); WARN_ON(rc); + if (hwp_active) { + rc = sysfs_create_file(intel_pstate_kobject, + &hwp_dynamic_boost.attr); + WARN_ON(rc); + } } /************************** sysfs end ************************/ From 657c292ce1bb67b1e61cf927a2b6ea135fb700df Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 31 May 2018 12:59:55 +0200 Subject: [PATCH 10/19] PM / Domains: dt: Allow power-domain property to be a list of specifiers To be able to describe topologies where devices are partitioned across multiple power domains, let's extend the power-domain property to allow 
being a list of PM domain specifiers. Suggested-by: Jon Hunter Signed-off-by: Ulf Hansson Reviewed-by: Rob Herring Reviewed-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- .../bindings/power/power_domain.txt | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt index 4733f76cbe48..9b387f861aed 100644 --- a/Documentation/devicetree/bindings/power/power_domain.txt +++ b/Documentation/devicetree/bindings/power/power_domain.txt @@ -111,8 +111,8 @@ Example 3: ==PM domain consumers== Required properties: - - power-domains : A phandle and PM domain specifier as defined by bindings of - the power controller specified by phandle. + - power-domains : A list of PM domain specifiers, as defined by bindings of + the power controller that is the PM domain provider. Example: @@ -122,9 +122,18 @@ Example: power-domains = <&power 0>; }; -The node above defines a typical PM domain consumer device, which is located -inside a PM domain with index 0 of a power controller represented by a node -with the label "power". + leaky-device@12351000 { + compatible = "foo,i-leak-current"; + reg = <0x12351000 0x1000>; + power-domains = <&power 0>, <&power 1> ; + }; + +The first example above defines a typical PM domain consumer device, which is +located inside a PM domain with index 0 of a power controller represented by a +node with the label "power". +In the second example the consumer device are partitioned across two PM domains, +the first with index 0 and the second with index 1, of a power controller that +is represented by a node with the label "power. 
Optional properties: - required-opps: This contains phandle to an OPP node in another device's OPP From bcd931f298d4a5660a4ff6f6629831d917a916d8 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 31 May 2018 12:59:56 +0200 Subject: [PATCH 11/19] PM / Domains: Don't attach devices in genpd with multi PM domains The power-domain DT property may now contain a list of PM domain specifiers, which indicates that a device is partitioned across multiple PM domains. This leads to a new situation in genpd_dev_pm_attach(), as only one PM domain can be attached per device. To keep things simple for the most common configuration, when a single PM domain is used, let's treat the multiple PM domain case as being specific. In other words, let's change genpd_dev_pm_attach() to check for multiple PM domains and prevent it from attaching any PM domain for this case. Instead, leave this to be managed separately, from following changes to genpd. Suggested-by: Jon Hunter Signed-off-by: Ulf Hansson Acked-by: Jon Hunter Tested-by: Jon Hunter Reviewed-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 6f403d6fccb2..908c44779ae7 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2229,10 +2229,10 @@ static void genpd_dev_pm_sync(struct device *dev) * attaches the device to retrieved pm_domain ops. * * Returns 1 on successfully attached PM domain, 0 when the device don't need a - * PM domain or a negative error code in case of failures. Note that if a - * power-domain exists for the device, but it cannot be found or turned on, - * then return -EPROBE_DEFER to ensure that the device is not probed and to - * re-try again later. + * PM domain or when multiple power-domains exists for it, else a negative error + * code. 
Note that if a power-domain exists for the device, but it cannot be + * found or turned on, then return -EPROBE_DEFER to ensure that the device is + * not probed and to re-try again later. */ int genpd_dev_pm_attach(struct device *dev) { @@ -2243,10 +2243,18 @@ int genpd_dev_pm_attach(struct device *dev) if (!dev->of_node) return 0; + /* + * Devices with multiple PM domains must be attached separately, as we + * can only attach one PM domain per device. + */ + if (of_count_phandle_with_args(dev->of_node, "power-domains", + "#power-domain-cells") != 1) + return 0; + ret = of_parse_phandle_with_args(dev->of_node, "power-domains", "#power-domain-cells", 0, &pd_args); if (ret < 0) - return 0; + return ret; mutex_lock(&gpd_list_lock); pd = genpd_get_from_provider(&pd_args); From 8cb1cbd644d5bba5b72eedd632f249c1c677b792 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 31 May 2018 12:59:57 +0200 Subject: [PATCH 12/19] PM / Domains: Split genpd_dev_pm_attach() To extend genpd to deal with allowing multiple PM domains per device, some of the code in genpd_dev_pm_attach() can be re-used. Let's prepare for this by moving some of the code into a sub-function. Signed-off-by: Ulf Hansson Acked-by: Jon Hunter Tested-by: Jon Hunter Reviewed-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 60 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 908c44779ae7..b1fcbf917974 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2221,38 +2221,15 @@ static void genpd_dev_pm_sync(struct device *dev) genpd_queue_power_off_work(pd); } -/** - * genpd_dev_pm_attach - Attach a device to its PM domain using DT. - * @dev: Device to attach. - * - * Parse device's OF node to find a PM domain specifier. If such is found, - * attaches the device to retrieved pm_domain ops. 
- * - * Returns 1 on successfully attached PM domain, 0 when the device don't need a - * PM domain or when multiple power-domains exists for it, else a negative error - * code. Note that if a power-domain exists for the device, but it cannot be - * found or turned on, then return -EPROBE_DEFER to ensure that the device is - * not probed and to re-try again later. - */ -int genpd_dev_pm_attach(struct device *dev) +static int __genpd_dev_pm_attach(struct device *dev, struct device_node *np, + unsigned int index) { struct of_phandle_args pd_args; struct generic_pm_domain *pd; int ret; - if (!dev->of_node) - return 0; - - /* - * Devices with multiple PM domains must be attached separately, as we - * can only attach one PM domain per device. - */ - if (of_count_phandle_with_args(dev->of_node, "power-domains", - "#power-domain-cells") != 1) - return 0; - - ret = of_parse_phandle_with_args(dev->of_node, "power-domains", - "#power-domain-cells", 0, &pd_args); + ret = of_parse_phandle_with_args(np, "power-domains", + "#power-domain-cells", index, &pd_args); if (ret < 0) return ret; @@ -2290,6 +2267,35 @@ int genpd_dev_pm_attach(struct device *dev) return ret ? -EPROBE_DEFER : 1; } + +/** + * genpd_dev_pm_attach - Attach a device to its PM domain using DT. + * @dev: Device to attach. + * + * Parse device's OF node to find a PM domain specifier. If such is found, + * attaches the device to retrieved pm_domain ops. + * + * Returns 1 on successfully attached PM domain, 0 when the device don't need a + * PM domain or when multiple power-domains exists for it, else a negative error + * code. Note that if a power-domain exists for the device, but it cannot be + * found or turned on, then return -EPROBE_DEFER to ensure that the device is + * not probed and to re-try again later. 
+ */ +int genpd_dev_pm_attach(struct device *dev) +{ + if (!dev->of_node) + return 0; + + /* + * Devices with multiple PM domains must be attached separately, as we + * can only attach one PM domain per device. + */ + if (of_count_phandle_with_args(dev->of_node, "power-domains", + "#power-domain-cells") != 1) + return 0; + + return __genpd_dev_pm_attach(dev, dev->of_node, 0); +} EXPORT_SYMBOL_GPL(genpd_dev_pm_attach); static const struct of_device_id idle_state_match[] = { From 3c095f32a92be4d07f3172a777dab1aacdb6a728 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 31 May 2018 12:59:58 +0200 Subject: [PATCH 13/19] PM / Domains: Add support for multi PM domains per device to genpd To support devices being partitioned across multiple PM domains, let's begin with extending genpd to cope with these kind of configurations. Therefore, add a new exported function genpd_dev_pm_attach_by_id(), which is similar to the existing genpd_dev_pm_attach(), but with the difference that it allows its callers to provide an index to the PM domain that it wants to attach. Note that, genpd_dev_pm_attach_by_id() shall only be called by the driver core / PM core, similar to how the existing dev_pm_domain_attach() makes use of genpd_dev_pm_attach(). However, this is implemented by following changes on top. Because, only one PM domain can be attached per device, genpd needs to create a virtual device that it can attach/detach instead. More precisely, let the new function genpd_dev_pm_attach_by_id() register a virtual struct device via calling device_register(). Then let it attach this device to the corresponding PM domain, rather than the one that is provided by the caller. The actual attaching is done via re-using the existing genpd OF functions. At successful attachment, genpd_dev_pm_attach_by_id() returns the created virtual device, which allows the caller to operate on it to deal with power management. Following changes on top, provides more details in this regards. 
To deal with detaching of a PM domain for the multiple PM domains case, let's also extend the existing genpd_dev_pm_detach() function, to cover the cleanup of the created virtual device, via make it call device_unregister() on it. In this way, there is no need to introduce a new function to deal with detach for the multiple PM domain case, but instead the existing one is re-used. Signed-off-by: Ulf Hansson Acked-by: Jon Hunter Tested-by: Jon Hunter Reviewed-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 80 +++++++++++++++++++++++++++++++++++++ include/linux/pm_domain.h | 8 ++++ 2 files changed, 88 insertions(+) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index b1fcbf917974..4925af5c4cf0 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2171,6 +2171,15 @@ struct generic_pm_domain *of_genpd_remove_last(struct device_node *np) } EXPORT_SYMBOL_GPL(of_genpd_remove_last); +static void genpd_release_dev(struct device *dev) +{ + kfree(dev); +} + +static struct bus_type genpd_bus_type = { + .name = "genpd", +}; + /** * genpd_dev_pm_detach - Detach a device from its PM domain. * @dev: Device to detach. @@ -2208,6 +2217,10 @@ static void genpd_dev_pm_detach(struct device *dev, bool power_off) /* Check if PM domain can be powered off after removing this device. */ genpd_queue_power_off_work(pd); + + /* Unregister the device if it was created by genpd. */ + if (dev->bus == &genpd_bus_type) + device_unregister(dev); } static void genpd_dev_pm_sync(struct device *dev) @@ -2298,6 +2311,67 @@ int genpd_dev_pm_attach(struct device *dev) } EXPORT_SYMBOL_GPL(genpd_dev_pm_attach); +/** + * genpd_dev_pm_attach_by_id - Associate a device with one of its PM domains. + * @dev: The device used to lookup the PM domain. + * @index: The index of the PM domain. + * + * Parse device's OF node to find a PM domain specifier at the provided @index. 
+ * If such is found, creates a virtual device and attaches it to the retrieved + * pm_domain ops. To deal with detaching of the virtual device, the ->detach() + * callback in the struct dev_pm_domain are assigned to genpd_dev_pm_detach(). + * + * Returns the created virtual device if successfully attached PM domain, NULL + * when the device don't need a PM domain, else an ERR_PTR() in case of + * failures. If a power-domain exists for the device, but cannot be found or + * turned on, then ERR_PTR(-EPROBE_DEFER) is returned to ensure that the device + * is not probed and to re-try again later. + */ +struct device *genpd_dev_pm_attach_by_id(struct device *dev, + unsigned int index) +{ + struct device *genpd_dev; + int num_domains; + int ret; + + if (!dev->of_node) + return NULL; + + /* Deal only with devices using multiple PM domains. */ + num_domains = of_count_phandle_with_args(dev->of_node, "power-domains", + "#power-domain-cells"); + if (num_domains < 2 || index >= num_domains) + return NULL; + + /* Allocate and register device on the genpd bus. */ + genpd_dev = kzalloc(sizeof(*genpd_dev), GFP_KERNEL); + if (!genpd_dev) + return ERR_PTR(-ENOMEM); + + dev_set_name(genpd_dev, "genpd:%u:%s", index, dev_name(dev)); + genpd_dev->bus = &genpd_bus_type; + genpd_dev->release = genpd_release_dev; + + ret = device_register(genpd_dev); + if (ret) { + kfree(genpd_dev); + return ERR_PTR(ret); + } + + /* Try to attach the device to the PM domain at the specified index. */ + ret = __genpd_dev_pm_attach(genpd_dev, dev->of_node, index); + if (ret < 1) { + device_unregister(genpd_dev); + return ret ? 
ERR_PTR(ret) : NULL; + } + + pm_runtime_set_active(genpd_dev); + pm_runtime_enable(genpd_dev); + + return genpd_dev; +} +EXPORT_SYMBOL_GPL(genpd_dev_pm_attach_by_id); + static const struct of_device_id idle_state_match[] = { { .compatible = "domain-idle-state", }, { } @@ -2457,6 +2531,12 @@ unsigned int of_genpd_opp_to_performance_state(struct device *dev, } EXPORT_SYMBOL_GPL(of_genpd_opp_to_performance_state); +static int __init genpd_bus_init(void) +{ + return bus_register(&genpd_bus_type); +} +core_initcall(genpd_bus_init); + #endif /* CONFIG_PM_GENERIC_DOMAINS_OF */ diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 42e0d649e653..82458e8e2e01 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -237,6 +237,8 @@ unsigned int of_genpd_opp_to_performance_state(struct device *dev, struct device_node *opp_node); int genpd_dev_pm_attach(struct device *dev); +struct device *genpd_dev_pm_attach_by_id(struct device *dev, + unsigned int index); #else /* !CONFIG_PM_GENERIC_DOMAINS_OF */ static inline int of_genpd_add_provider_simple(struct device_node *np, struct generic_pm_domain *genpd) @@ -282,6 +284,12 @@ static inline int genpd_dev_pm_attach(struct device *dev) return 0; } +static inline struct device *genpd_dev_pm_attach_by_id(struct device *dev, + unsigned int index) +{ + return NULL; +} + static inline struct generic_pm_domain *of_genpd_remove_last(struct device_node *np) { From 82e12d9e0bd59f3d24be9c735258e2e98e4f54f6 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 31 May 2018 12:59:59 +0200 Subject: [PATCH 14/19] PM / Domains: Add dev_pm_domain_attach_by_id() to manage multi PM domains The existing dev_pm_domain_attach() function, allows a single PM domain to be attached per device. To be able to support devices that are partitioned across multiple PM domains, let's introduce a new interface, dev_pm_domain_attach_by_id(). 
The dev_pm_domain_attach_by_id() returns a newly allocated struct device with the corresponding attached PM domain. This enables for example a driver to operate on the new device from a power management point of view. The driver may then also benefit from using the received device, to set up so called device-links towards its original device. Depending on the situation, these links may then be dynamically changed. The new interface is typically called by drivers during their probe phase, in case they manage devices which use multiple PM domains. If that is the case, the driver also becomes responsible for managing the detaching of the PM domains, which typically should be done at the remove phase. Detaching is done by calling the existing dev_pm_domain_detach() function for each of the received devices from dev_pm_domain_attach_by_id(). Note, currently it's only genpd that supports multiple PM domains per device, but dev_pm_domain_attach_by_id() can easily be extended to cover other PM domain types, if/when needed. Signed-off-by: Ulf Hansson Acked-by: Jon Hunter Tested-by: Jon Hunter Reviewed-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/common.c | 43 ++++++++++++++++++++++++++++++++++--- include/linux/pm_domain.h | 7 ++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c index 7ae62b6355b8..df41b4780b3b 100644 --- a/drivers/base/power/common.c +++ b/drivers/base/power/common.c @@ -116,14 +116,51 @@ int dev_pm_domain_attach(struct device *dev, bool power_on) } EXPORT_SYMBOL_GPL(dev_pm_domain_attach); +/** + * dev_pm_domain_attach_by_id - Associate a device with one of its PM domains. + * @dev: The device used to lookup the PM domain. + * @index: The index of the PM domain. + * + * As @dev may only be attached to a single PM domain, the backend PM domain + * provider creates a virtual device to attach instead.
If attachment succeeds, + * the ->detach() callback in the struct dev_pm_domain are assigned by the + * corresponding backend attach function, as to deal with detaching of the + * created virtual device. + * + * This function should typically be invoked by a driver during the probe phase, + * in case its device requires power management through multiple PM domains. The + * driver may benefit from using the received device, to configure device-links + * towards its original device. Depending on the use-case and if needed, the + * links may be dynamically changed by the driver, which allows it to control + * the power to the PM domains independently from each other. + * + * Callers must ensure proper synchronization of this function with power + * management callbacks. + * + * Returns the virtual created device when successfully attached to its PM + * domain, NULL in case @dev don't need a PM domain, else an ERR_PTR(). + * Note that, to detach the returned virtual device, the driver shall call + * dev_pm_domain_detach() on it, typically during the remove phase. + */ +struct device *dev_pm_domain_attach_by_id(struct device *dev, + unsigned int index) +{ + if (dev->pm_domain) + return ERR_PTR(-EEXIST); + + return genpd_dev_pm_attach_by_id(dev, index); +} +EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_id); + /** * dev_pm_domain_detach - Detach a device from its PM domain. * @dev: Device to detach. * @power_off: Used to indicate whether we should power off the device. * - * This functions will reverse the actions from dev_pm_domain_attach() and thus - * try to detach the @dev from its PM domain. Typically it should be invoked - * from subsystem level code during the remove phase. + * This functions will reverse the actions from dev_pm_domain_attach() and + * dev_pm_domain_attach_by_id(), thus it detaches @dev from its PM domain. + * Typically it should be invoked during the remove phase, either from + * subsystem level code or from drivers. 
* * Callers must ensure proper synchronization of this function with power * management callbacks. diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 82458e8e2e01..9206a4fef9ac 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -299,6 +299,8 @@ struct generic_pm_domain *of_genpd_remove_last(struct device_node *np) #ifdef CONFIG_PM int dev_pm_domain_attach(struct device *dev, bool power_on); +struct device *dev_pm_domain_attach_by_id(struct device *dev, + unsigned int index); void dev_pm_domain_detach(struct device *dev, bool power_off); void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd); #else @@ -306,6 +308,11 @@ static inline int dev_pm_domain_attach(struct device *dev, bool power_on) { return 0; } +static inline struct device *dev_pm_domain_attach_by_id(struct device *dev, + unsigned int index) +{ + return NULL; +} static inline void dev_pm_domain_detach(struct device *dev, bool power_off) {} static inline void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd) {} From 2d5ed61ce9820a1fe7b076cc45c169524d767746 Mon Sep 17 00:00:00 2001 From: Ravi Chandra Sadineni Date: Fri, 1 Jun 2018 19:32:15 -0700 Subject: [PATCH 15/19] PM / wakeup: Export wakeup_count instead of event_count via sysfs Currently we export event_count instead of wakeup_count via the per-device wakeup_count sysfs attribute. Change it to wakeup_count to make it more meaningful. wakeup_count increments only when events_check_enabled is set, that is whenever user space writes the current wakeup count to /sys/power/wakeup_count. Also events_check_enabled is cleared on every resume. User space is expected to write to this just before suspend. This way pm_wakeup_event(), when called from IRQ handlers, will increment wakeup_count only if we are in a system-wide suspend-resume cycle and should give a fair approximation of how many times a device may have triggered a wakeup from system suspend.
event_count on the other hand will increment every time pm_wakeup_event() is called irrespective of whether we are in a suspend-resume cycle and some drivers call it on every interrupt which makes it less useful for system wakeup tracking. Signed-off-by: Ravi Chandra Sadineni Acked-by: Pavel Machek [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/base/power/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 0f651efc58a1..d713738ce796 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -353,7 +353,7 @@ static ssize_t wakeup_count_show(struct device *dev, spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { - count = dev->power.wakeup->event_count; + count = dev->power.wakeup->wakeup_count; enabled = true; } spin_unlock_irq(&dev->power.lock); From 41ab43c9c89e06ff08a4750d1b09e227ea97894f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 5 Jun 2018 14:42:42 -0700 Subject: [PATCH 16/19] cpufreq: intel_pstate: enable boost for Skylake Xeon Enable HWP boost on Skylake server and workstations. Reported-by: Mel Gorman Tested-by: Giovanni Gherdovich Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 70bf63bb4e0e..352d5b2d5b58 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1794,6 +1794,12 @@ static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[] = { {} }; +static const struct x86_cpu_id intel_pstate_hwp_boost_ids[] = { + ICPU(INTEL_FAM6_SKYLAKE_X, core_funcs), + ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_funcs), + {} +}; + static int intel_pstate_init_cpu(unsigned int cpunum) { struct cpudata *cpu; @@ -1824,6 +1830,10 @@ static int intel_pstate_init_cpu(unsigned int cpunum) intel_pstate_disable_ee(cpunum); intel_pstate_hwp_enable(cpu); + + id = x86_match_cpu(intel_pstate_hwp_boost_ids); + if (id) + hwp_boost = true; } intel_pstate_get_cpu_pstates(cpu); From 7592019634f8473f0b0973ce79297183077bdbc2 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 8 Jun 2018 09:07:33 +0800 Subject: [PATCH 17/19] cpufreq: governors: Fix long idle detection logic in load calculation According to current code implementation, detecting the long idle period is done by checking if the interval between two adjacent utilization update handlers is long enough. Although this mechanism can detect if the idle period is long enough (no utilization hooks invoked during idle period), it might not cover a corner case: if the task has occupied the CPU for too long which causes no context switches during that period, then no utilization handler will be launched until this high prio task is scheduled out. As a result, the idle_periods field might be calculated incorrectly because it regards the 100% load as 0% and confuses the conservative governor, which uses this field. Change the detection to compare the idle_time with sampling_rate directly. Reported-by: Artem S. Tashkinov Signed-off-by: Chen Yu Acked-by: Viresh Kumar Cc: All applicable Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/cpufreq_governor.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 871bf9cf55cf..1d50e97d49f1 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -165,7 +165,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) * calls, so the previous load value can be used then. */ load = j_cdbs->prev_load; - } else if (unlikely(time_elapsed > 2 * sampling_rate && + } else if (unlikely((int)idle_time > 2 * sampling_rate && j_cdbs->prev_load)) { /* * If the CPU had gone completely idle and a task has @@ -185,10 +185,8 @@ unsigned int dbs_update(struct cpufreq_policy *policy) * clear prev_load to guarantee that the load will be * computed again next time. * - * Detecting this situation is easy: the governor's - * utilization update handler would not have run during - * CPU-idle periods. Hence, an unusually large - * 'time_elapsed' (as compared to the sampling rate) + * Detecting this situation is easy: an unusually large + * 'idle_time' (as compared to the sampling rate) * indicates this scenario. */ load = j_cdbs->prev_load; @@ -217,8 +215,8 @@ unsigned int dbs_update(struct cpufreq_policy *policy) j_cdbs->prev_load = load; } - if (time_elapsed > 2 * sampling_rate) { - unsigned int periods = time_elapsed / sampling_rate; + if (unlikely((int)idle_time > 2 * sampling_rate)) { + unsigned int periods = idle_time / sampling_rate; if (periods < idle_periods) idle_periods = periods; From 0aa9abd4c212fc1cd111cc0a9fc571f0d86e63cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Szymanski?= Date: Tue, 22 May 2018 08:28:51 +0200 Subject: [PATCH 18/19] cpufreq: imx6q: check speed grades for i.MX6ULL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check the max speed supported from the fuses for i.MX6ULL and update the operating points table accordingly. 
Signed-off-by: Sébastien Szymanski Acked-by: Viresh Kumar Tested-by: Stefan Agner Reviewed-by: Stefan Agner Reviewed-by: Fabio Estevam Acked-by: Shawn Guo Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/imx6q-cpufreq.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index 83cf631fc9bc..f094687cae52 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -266,6 +266,8 @@ static void imx6q_opp_check_speed_grading(struct device *dev) } #define OCOTP_CFG3_6UL_SPEED_696MHZ 0x2 +#define OCOTP_CFG3_6ULL_SPEED_792MHZ 0x2 +#define OCOTP_CFG3_6ULL_SPEED_900MHZ 0x3 static void imx6ul_opp_check_speed_grading(struct device *dev) { @@ -287,16 +289,30 @@ static void imx6ul_opp_check_speed_grading(struct device *dev) * Speed GRADING[1:0] defines the max speed of ARM: * 2b'00: Reserved; * 2b'01: 528000000Hz; - * 2b'10: 696000000Hz; - * 2b'11: Reserved; + * 2b'10: 696000000Hz on i.MX6UL, 792000000Hz on i.MX6ULL; + * 2b'11: 900000000Hz on i.MX6ULL only; * We need to set the max speed of ARM according to fuse map. 
*/ val = readl_relaxed(base + OCOTP_CFG3); val >>= OCOTP_CFG3_SPEED_SHIFT; val &= 0x3; - if (val != OCOTP_CFG3_6UL_SPEED_696MHZ) - if (dev_pm_opp_disable(dev, 696000000)) - dev_warn(dev, "failed to disable 696MHz OPP\n"); + + if (of_machine_is_compatible("fsl,imx6ul")) { + if (val != OCOTP_CFG3_6UL_SPEED_696MHZ) + if (dev_pm_opp_disable(dev, 696000000)) + dev_warn(dev, "failed to disable 696MHz OPP\n"); + } + + if (of_machine_is_compatible("fsl,imx6ull")) { + if (val != OCOTP_CFG3_6ULL_SPEED_792MHZ) + if (dev_pm_opp_disable(dev, 792000000)) + dev_warn(dev, "failed to disable 792MHz OPP\n"); + + if (val != OCOTP_CFG3_6ULL_SPEED_900MHZ) + if (dev_pm_opp_disable(dev, 900000000)) + dev_warn(dev, "failed to disable 900MHz OPP\n"); + } + iounmap(base); put_node: of_node_put(np); @@ -356,7 +372,8 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev) goto put_reg; } - if (of_machine_is_compatible("fsl,imx6ul")) + if (of_machine_is_compatible("fsl,imx6ul") || + of_machine_is_compatible("fsl,imx6ull")) imx6ul_opp_check_speed_grading(cpu_dev); else imx6q_opp_check_speed_grading(cpu_dev); From b06c0b2f087ab498d51d50f5ae353133b602f614 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 12 Jun 2018 10:24:13 +0200 Subject: [PATCH 19/19] Revert "PM / runtime: Fixup reference counting of device link suppliers at probe" Revert commit 1e8378619841 (PM / runtime: Fixup reference counting of device link suppliers at probe), as it has introduced a regression and the condition it was designed to address should be covered by the existing code. Reported-by: Marek Szyprowski Signed-off-by: Rafael J. 
Wysocki --- drivers/base/dd.c | 3 ++- drivers/base/power/runtime.c | 27 ++++++++++++++++++++++++--- include/linux/pm_runtime.h | 6 ++++-- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index a41c91bfac0e..10454fe54482 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -580,7 +580,7 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); - pm_runtime_resume_suppliers(dev); + pm_runtime_get_suppliers(dev); if (dev->parent) pm_runtime_get_sync(dev->parent); @@ -591,6 +591,7 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) if (dev->parent) pm_runtime_put(dev->parent); + pm_runtime_put_suppliers(dev); return ret; } diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index c6030f100c08..beb85c31f3fa 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1563,16 +1563,37 @@ void pm_runtime_clean_up_links(struct device *dev) } /** - * pm_runtime_resume_suppliers - Resume supplier devices. + * pm_runtime_get_suppliers - Resume and reference-count supplier devices. * @dev: Consumer device. */ -void pm_runtime_resume_suppliers(struct device *dev) +void pm_runtime_get_suppliers(struct device *dev) { + struct device_link *link; int idx; idx = device_links_read_lock(); - rpm_get_suppliers(dev); + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) + if (link->flags & DL_FLAG_PM_RUNTIME) + pm_runtime_get_sync(link->supplier); + + device_links_read_unlock(idx); +} + +/** + * pm_runtime_put_suppliers - Drop references to supplier devices. + * @dev: Consumer device. 
+ */ +void pm_runtime_put_suppliers(struct device *dev) +{ + struct device_link *link; + int idx; + + idx = device_links_read_lock(); + + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) + if (link->flags & DL_FLAG_PM_RUNTIME) + pm_runtime_put(link->supplier); device_links_read_unlock(idx); } diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index db5dbbf7a48d..f0fc4700b6ff 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -56,7 +56,8 @@ extern void pm_runtime_update_max_time_suspended(struct device *dev, s64 delta_ns); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); extern void pm_runtime_clean_up_links(struct device *dev); -extern void pm_runtime_resume_suppliers(struct device *dev); +extern void pm_runtime_get_suppliers(struct device *dev); +extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device *dev); @@ -172,7 +173,8 @@ static inline unsigned long pm_runtime_autosuspend_expiration( static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} static inline void pm_runtime_clean_up_links(struct device *dev) {} -static inline void pm_runtime_resume_suppliers(struct device *dev) {} +static inline void pm_runtime_get_suppliers(struct device *dev) {} +static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device *dev) {}