From 8a7e2d2ea080d10a189a1d611344b0330468ebc3 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 28 Aug 2018 11:31:18 +0200 Subject: [PATCH 01/73] cpupower: remove stringop-truncation waring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The strncpy doesn't null terminate the string because the size is too short by one byte. parse.c: In function ‘prepare_default_config’: parse.c:148:2: warning: ‘strncpy’ output truncated before terminating nul copying 8 bytes from a string of the same length [-Wstringop-truncation] strncpy(config->governor, "ondemand", 8); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The normal method of passing the length of the destination buffer works correctly here. Fixes: 7fe2f6399a84 ("cpupowerutils - cpufrequtils extended with quite some features") Signed-off-by: Anders Roxell Signed-off-by: Shuah Khan (Samsung OSG) --- tools/power/cpupower/bench/parse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/bench/parse.c b/tools/power/cpupower/bench/parse.c index 9ba8a44ad2a7..84caee38418f 100644 --- a/tools/power/cpupower/bench/parse.c +++ b/tools/power/cpupower/bench/parse.c @@ -145,7 +145,7 @@ struct config *prepare_default_config() config->cpu = 0; config->prio = SCHED_HIGH; config->verbose = 0; - strncpy(config->governor, "ondemand", 8); + strncpy(config->governor, "ondemand", sizeof(config->governor)); config->output = stdout; From a4a008e53c9e5ac8d6070bd1b2eddfc1ae0d2338 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 31 Aug 2018 11:22:29 +0300 Subject: [PATCH 02/73] intel_idle: Get rid of custom ICPU() macro Replace custom grown macro with generic INTEL_CPU_FAM6() one. No functional change intended. Signed-off-by: Andy Shevchenko Acked-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 75 +++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index b2ccce5fb071..791b8a366e6e 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1066,46 +1066,43 @@ static const struct idle_cpu idle_cpu_dnv = { .disable_promotion_to_c1e = true, }; -#define ICPU(model, cpu) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&cpu } - static const struct x86_cpu_id intel_idle_ids[] __initconst = { - ICPU(INTEL_FAM6_NEHALEM_EP, idle_cpu_nehalem), - ICPU(INTEL_FAM6_NEHALEM, idle_cpu_nehalem), - ICPU(INTEL_FAM6_NEHALEM_G, idle_cpu_nehalem), - ICPU(INTEL_FAM6_WESTMERE, idle_cpu_nehalem), - ICPU(INTEL_FAM6_WESTMERE_EP, idle_cpu_nehalem), - ICPU(INTEL_FAM6_NEHALEM_EX, idle_cpu_nehalem), - ICPU(INTEL_FAM6_ATOM_PINEVIEW, idle_cpu_atom), - ICPU(INTEL_FAM6_ATOM_LINCROFT, idle_cpu_lincroft), - ICPU(INTEL_FAM6_WESTMERE_EX, idle_cpu_nehalem), - ICPU(INTEL_FAM6_SANDYBRIDGE, idle_cpu_snb), - ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), - ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), - ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), - ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), - ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), - ICPU(INTEL_FAM6_HASWELL_CORE, idle_cpu_hsw), - ICPU(INTEL_FAM6_HASWELL_X, idle_cpu_hsw), - ICPU(INTEL_FAM6_HASWELL_ULT, idle_cpu_hsw), - ICPU(INTEL_FAM6_HASWELL_GT3E, idle_cpu_hsw), - ICPU(INTEL_FAM6_ATOM_SILVERMONT2, idle_cpu_avn), - ICPU(INTEL_FAM6_BROADWELL_CORE, idle_cpu_bdw), - ICPU(INTEL_FAM6_BROADWELL_GT3E, idle_cpu_bdw), - ICPU(INTEL_FAM6_BROADWELL_X, idle_cpu_bdw), - ICPU(INTEL_FAM6_BROADWELL_XEON_D, idle_cpu_bdw), - ICPU(INTEL_FAM6_SKYLAKE_MOBILE, idle_cpu_skl), - ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, idle_cpu_skl), - ICPU(INTEL_FAM6_KABYLAKE_MOBILE, idle_cpu_skl), - ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl), - ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), - ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), - ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl), - ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), - ICPU(INTEL_FAM6_ATOM_GEMINI_LAKE, idle_cpu_bxt), - ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), + INTEL_CPU_FAM6(NEHALEM_EP, idle_cpu_nehalem), + INTEL_CPU_FAM6(NEHALEM, idle_cpu_nehalem), + INTEL_CPU_FAM6(NEHALEM_G, idle_cpu_nehalem), + INTEL_CPU_FAM6(WESTMERE, idle_cpu_nehalem), + INTEL_CPU_FAM6(WESTMERE_EP, idle_cpu_nehalem), + INTEL_CPU_FAM6(NEHALEM_EX, idle_cpu_nehalem), + INTEL_CPU_FAM6(ATOM_PINEVIEW, idle_cpu_atom), + INTEL_CPU_FAM6(ATOM_LINCROFT, idle_cpu_lincroft), + INTEL_CPU_FAM6(WESTMERE_EX, idle_cpu_nehalem), + INTEL_CPU_FAM6(SANDYBRIDGE, idle_cpu_snb), + INTEL_CPU_FAM6(SANDYBRIDGE_X, idle_cpu_snb), + INTEL_CPU_FAM6(ATOM_CEDARVIEW, idle_cpu_atom), + INTEL_CPU_FAM6(ATOM_SILVERMONT1, idle_cpu_byt), + INTEL_CPU_FAM6(ATOM_MERRIFIELD, idle_cpu_tangier), + INTEL_CPU_FAM6(ATOM_AIRMONT, idle_cpu_cht), + INTEL_CPU_FAM6(IVYBRIDGE, idle_cpu_ivb), + INTEL_CPU_FAM6(IVYBRIDGE_X, idle_cpu_ivt), + INTEL_CPU_FAM6(HASWELL_CORE, idle_cpu_hsw), + INTEL_CPU_FAM6(HASWELL_X, idle_cpu_hsw), + INTEL_CPU_FAM6(HASWELL_ULT, idle_cpu_hsw), + INTEL_CPU_FAM6(HASWELL_GT3E, idle_cpu_hsw), + INTEL_CPU_FAM6(ATOM_SILVERMONT2, idle_cpu_avn), + INTEL_CPU_FAM6(BROADWELL_CORE, idle_cpu_bdw), + INTEL_CPU_FAM6(BROADWELL_GT3E, idle_cpu_bdw), + INTEL_CPU_FAM6(BROADWELL_X, idle_cpu_bdw), + INTEL_CPU_FAM6(BROADWELL_XEON_D, idle_cpu_bdw), + INTEL_CPU_FAM6(SKYLAKE_MOBILE, idle_cpu_skl), + INTEL_CPU_FAM6(SKYLAKE_DESKTOP, idle_cpu_skl), + INTEL_CPU_FAM6(KABYLAKE_MOBILE, idle_cpu_skl), + INTEL_CPU_FAM6(KABYLAKE_DESKTOP, idle_cpu_skl), + INTEL_CPU_FAM6(SKYLAKE_X, idle_cpu_skx), + INTEL_CPU_FAM6(XEON_PHI_KNL, idle_cpu_knl), + INTEL_CPU_FAM6(XEON_PHI_KNM, idle_cpu_knl), + INTEL_CPU_FAM6(ATOM_GOLDMONT, idle_cpu_bxt), + INTEL_CPU_FAM6(ATOM_GEMINI_LAKE, idle_cpu_bxt), + INTEL_CPU_FAM6(ATOM_DENVERTON, idle_cpu_dnv), {} }; From 17ed15183c2453ba2f03a813fd8b1594bd9baf98 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 31 Aug 2018 11:25:13 +0300 Subject: [PATCH 03/73] powercap: RAPL: Get rid of custom RAPL_CPU() macro Replace custom grown macro with generic INTEL_CPU_FAM6() one. No functional change intended. Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 61 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 295d8dcba48c..bb92874b1175 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1133,47 +1133,40 @@ static const struct rapl_defaults rapl_defaults_cht = { .compute_time_window = rapl_compute_time_window_atom, }; -#define RAPL_CPU(_model, _ops) { \ - .vendor = X86_VENDOR_INTEL, \ - .family = 6, \ - .model = _model, \ - .driver_data = (kernel_ulong_t)&_ops, \ - } - static const struct x86_cpu_id rapl_ids[] __initconst = { - RAPL_CPU(INTEL_FAM6_SANDYBRIDGE, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_SANDYBRIDGE_X, rapl_defaults_core), + INTEL_CPU_FAM6(SANDYBRIDGE, rapl_defaults_core), + INTEL_CPU_FAM6(SANDYBRIDGE_X, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_IVYBRIDGE, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_IVYBRIDGE_X, rapl_defaults_core), + INTEL_CPU_FAM6(IVYBRIDGE, rapl_defaults_core), + INTEL_CPU_FAM6(IVYBRIDGE_X, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_HASWELL_CORE, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_HASWELL_ULT, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_HASWELL_GT3E, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_HASWELL_X, rapl_defaults_hsw_server), + INTEL_CPU_FAM6(HASWELL_CORE, rapl_defaults_core), + INTEL_CPU_FAM6(HASWELL_ULT, rapl_defaults_core), + INTEL_CPU_FAM6(HASWELL_GT3E, rapl_defaults_core), + INTEL_CPU_FAM6(HASWELL_X, rapl_defaults_hsw_server), - RAPL_CPU(INTEL_FAM6_BROADWELL_CORE, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_BROADWELL_GT3E, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_BROADWELL_XEON_D, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_BROADWELL_X, rapl_defaults_hsw_server), + INTEL_CPU_FAM6(BROADWELL_CORE, rapl_defaults_core), + INTEL_CPU_FAM6(BROADWELL_GT3E, rapl_defaults_core), + INTEL_CPU_FAM6(BROADWELL_XEON_D, rapl_defaults_core), + INTEL_CPU_FAM6(BROADWELL_X, rapl_defaults_hsw_server), - RAPL_CPU(INTEL_FAM6_SKYLAKE_DESKTOP, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_SKYLAKE_MOBILE, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_SKYLAKE_X, rapl_defaults_hsw_server), - RAPL_CPU(INTEL_FAM6_KABYLAKE_MOBILE, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_KABYLAKE_DESKTOP, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_CANNONLAKE_MOBILE, rapl_defaults_core), + INTEL_CPU_FAM6(SKYLAKE_DESKTOP, rapl_defaults_core), + INTEL_CPU_FAM6(SKYLAKE_MOBILE, rapl_defaults_core), + INTEL_CPU_FAM6(SKYLAKE_X, rapl_defaults_hsw_server), + INTEL_CPU_FAM6(KABYLAKE_MOBILE, rapl_defaults_core), + INTEL_CPU_FAM6(KABYLAKE_DESKTOP, rapl_defaults_core), + INTEL_CPU_FAM6(CANNONLAKE_MOBILE, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1, rapl_defaults_byt), - RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT, rapl_defaults_cht), - RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD, rapl_defaults_tng), - RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD, rapl_defaults_ann), - RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_ATOM_GEMINI_LAKE, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), + INTEL_CPU_FAM6(ATOM_SILVERMONT1, rapl_defaults_byt), + INTEL_CPU_FAM6(ATOM_AIRMONT, rapl_defaults_cht), + INTEL_CPU_FAM6(ATOM_MERRIFIELD, rapl_defaults_tng), + INTEL_CPU_FAM6(ATOM_MOOREFIELD, rapl_defaults_ann), + INTEL_CPU_FAM6(ATOM_GOLDMONT, rapl_defaults_core), + INTEL_CPU_FAM6(ATOM_GEMINI_LAKE, rapl_defaults_core), + INTEL_CPU_FAM6(ATOM_DENVERTON, rapl_defaults_core), - RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server), - RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM, rapl_defaults_hsw_server), + INTEL_CPU_FAM6(XEON_PHI_KNL, rapl_defaults_hsw_server), + INTEL_CPU_FAM6(XEON_PHI_KNM, rapl_defaults_hsw_server), {} }; MODULE_DEVICE_TABLE(x86cpu, rapl_ids); From 0e7ea2f3b0e09333e54fb8322f85a45cb01fdd61 Mon Sep 17 00:00:00 2001 From: Igor Stoppa Date: Fri, 7 Sep 2018 19:09:55 +0300 Subject: [PATCH 04/73] cpufreq: remove unnecessary unlikely() WARN_ON() already contains an unlikely(), so it's not necessary to wrap it into another. Signed-off-by: Igor Stoppa Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f53fb41efb7b..7aa3dcad2175 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -403,7 +403,7 @@ EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); void cpufreq_freq_transition_end(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int transition_failed) { - if (unlikely(WARN_ON(!policy->transition_ongoing))) + if (WARN_ON(!policy->transition_ongoing)) return; cpufreq_notify_post_transition(policy, freqs, transition_failed); From 3bb756449b2ddca8e059d011af2e732c857c61d7 Mon Sep 17 00:00:00 2001 From: "Vladimir D. Seleznev" Date: Sat, 1 Sep 2018 07:51:15 +0300 Subject: [PATCH 05/73] PM / hibernate: Documentation: fix image_size default value This commit updates the default value of /sys/power/image_size in the documentation. Since ac5c24ec1e983313ef0015258fba6f630e54e7cn the `image_size' value is set to about 2/5 of RAM, according to kernel/power/snapshot.c: image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; but this change was not reflected everywhere in the documentation. Signed-off-by: Vladimir D. Seleznev Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 2 +- Documentation/power/swsusp.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 2f813d644c69..18b7dc929234 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -99,7 +99,7 @@ Description: this file, the suspend image will be as small as possible. Reading from this file will display the current image size - limit, which is set to 500 MB by default. + limit, which is set to around 2/5 of available RAM by default. What: /sys/power/pm_trace Date: August 2006 diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt index cc87adf44c0a..236d1fb13640 100644 --- a/Documentation/power/swsusp.txt +++ b/Documentation/power/swsusp.txt @@ -56,7 +56,7 @@ If you want to limit the suspend image size to N bytes, do echo N > /sys/power/image_size -before suspend (it is limited to 500 MB by default). +before suspend (it is limited to around 2/5 of available RAM by default). . The resume process checks for the presence of the resume device, if found, it then checks the contents for the hibernation image signature. From 8412dbd642585107e29b20920876de98e7df5819 Mon Sep 17 00:00:00 2001 From: Todd Brandt Date: Wed, 22 Aug 2018 18:37:11 -0700 Subject: [PATCH 06/73] PM / sleep: Show freezing tasks that caused a suspend abort For debug purposes it would be nice to see which tasks caused a suspend abort, i.e. which tasks were still in the process of freezing when a wakeup event occurred. This patch adds the info to pm_debug_messages. Signed-off-by: Todd Brandt Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/process.c b/kernel/power/process.c index 7381d49a44db..4b6a54da7e65 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -96,7 +96,7 @@ static int try_to_freeze_tasks(bool user_only) if (wq_busy) show_workqueue_state(); - if (!wakeup) { + if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) From 51b177637b5c54061bbd12f228a26755f466cbb0 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 27 Aug 2018 20:52:15 -0500 Subject: [PATCH 07/73] cpufreq: Convert to using %pOFn instead of device_node.name In preparation to remove the node name pointer from struct device_node, convert printf users to use the %pOFn format specifier. Signed-off-by: Rob Herring Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/s5pv210-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c index 5d31c2db12a3..dbecd7667db2 100644 --- a/drivers/cpufreq/s5pv210-cpufreq.c +++ b/drivers/cpufreq/s5pv210-cpufreq.c @@ -611,8 +611,8 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev) for_each_compatible_node(np, NULL, "samsung,s5pv210-dmc") { id = of_alias_get_id(np, "dmc"); if (id < 0 || id >= ARRAY_SIZE(dmc_base)) { - pr_err("%s: failed to get alias of dmc node '%s'\n", - __func__, np->name); + pr_err("%s: failed to get alias of dmc node '%pOFn'\n", + __func__, np); of_node_put(np); return id; } From d1e1303173d7fe3c107a85805a62c13e760f2da4 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 11 Sep 2018 11:12:51 +0100 Subject: [PATCH 08/73] cpufreq: dt: Add support for r8a7744 Add the compatible strings for supporting the generic cpufreq driver on the Renesas RZ/G1N (R8A7744) SoC. Signed-off-by: Biju Das Reviewed-by: Fabrizio Castro Acked-by: Viresh Kumar Reviewed-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt-platdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index fe14c57de6ca..805f8a0c3470 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -58,6 +58,7 @@ static const struct of_device_id whitelist[] __initconst = { { .compatible = "renesas,r8a73a4", }, { .compatible = "renesas,r8a7740", }, { .compatible = "renesas,r8a7743", }, + { .compatible = "renesas,r8a7744", }, { .compatible = "renesas,r8a7745", }, { .compatible = "renesas,r8a7778", }, { .compatible = "renesas,r8a7779", }, From 6a5f95b5a4f4ff29e4071bc5b95f8f3a2aef046b Mon Sep 17 00:00:00 2001 From: Fieah Lim Date: Tue, 11 Sep 2018 03:59:01 +0800 Subject: [PATCH 09/73] cpuidle: Remove unnecessary wrapper cpuidle_get_last_residency() cpuidle_get_last_residency() is just a wrapper for retrieving the last_residency member of struct cpuidle_device. It's also weirdly the only wrapper function for accessing cpuidle_* struct member (by my best guess is it could be a leftover from v2.x). Anyhow, since the only two users (the ladder and menu governors) can access dev->last_residency directly, and it's more intuitive to do it that way, let's just get rid of the wrapper. This patch tidies up CPU idle code a bit without functional changes. Signed-off-by: Fieah Lim [ rjw: Changelog cleanup ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/ladder.c | 2 +- drivers/cpuidle/governors/menu.c | 2 +- include/linux/cpuidle.h | 10 ---------- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 704880a6612a..f0dddc66af26 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -80,7 +80,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, last_state = &ldev->states[last_idx]; - last_residency = cpuidle_get_last_residency(dev) - drv->states[last_idx].exit_latency; + last_residency = dev->last_residency - drv->states[last_idx].exit_latency; /* consider promotion */ if (last_idx < drv->state_count - 1 && diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index e26a40971b26..021b08dd139c 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -514,7 +514,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) measured_us = 9 * MAX_INTERESTING / 10; } else { /* measured value */ - measured_us = cpuidle_get_last_residency(dev); + measured_us = dev->last_residency; /* Deduct exit latency */ if (measured_us > 2 * target->exit_latency) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4325d6fdde9b..d262f620890e 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -99,16 +99,6 @@ struct cpuidle_device { DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices); DECLARE_PER_CPU(struct cpuidle_device, cpuidle_dev); -/** - * cpuidle_get_last_residency - retrieves the last state's residency time - * @dev: the target CPU - */ -static inline int cpuidle_get_last_residency(struct cpuidle_device *dev) -{ - return dev->last_residency; -} - - /**************************** * CPUIDLE DRIVER INTERFACE * ****************************/ From 7037b43e0076cce1c07c72540e219fb5db7ea01f Mon Sep 17 00:00:00 2001 From: Fieah Lim Date: Tue, 11 Sep 2018 05:47:25 +0800 Subject: [PATCH 10/73] cpuidle: enter_state: Don't needlessly calculate diff time Currently, ktime_us_delta() is invoked unconditionally to compute the idle residency of the CPU, but it only makes sense to do that if a valid idle state has been entered, so move the ktime_us_delta() invocation after the entered_state >= 0 check. While at it, merge two comment blocks in there into one and drop a space between type casting of diff. This patch has no functional changes. Signed-off-by: Fieah Lim [ rjw: Changelog cleanup, comment format fix ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 6df894d65d9e..4a97446f66d8 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -247,17 +247,17 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, if (!cpuidle_state_is_coupled(drv, index)) local_irq_enable(); - diff = ktime_us_delta(time_end, time_start); - if (diff > INT_MAX) - diff = INT_MAX; - - dev->last_residency = (int) diff; - if (entered_state >= 0) { - /* Update cpuidle counters */ - /* This can be moved to within driver enter routine + /* + * Update cpuidle counters + * This can be moved to within driver enter routine, * but that results in multiple copies of same code. */ + diff = ktime_us_delta(time_end, time_start); + if (diff > INT_MAX) + diff = INT_MAX; + + dev->last_residency = (int)diff; dev->states_usage[entered_state].time += dev->last_residency; dev->states_usage[entered_state].usage++; } else { From 2fbb8670b4ff4454f1c0de510f788d737edc4b90 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 11 Sep 2018 11:14:34 +0530 Subject: [PATCH 11/73] OPP: Free OPP table properly on performance state irregularities The OPP table was freed, but not the individual OPPs which is done from _dev_pm_opp_remove_table(). Fix it by calling _dev_pm_opp_remove_table() as well. Cc: 4.18 # v4.18 Fixes: 3ba98324e81a ("PM / OPP: Get performance state using genpd helper") Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/of.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 7af0ddec936b..20988c426650 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -425,6 +425,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np) dev_err(dev, "Not all nodes have performance state set (%d: %d)\n", count, pstate_count); ret = -ENOENT; + _dev_pm_opp_remove_table(opp_table, dev, false); goto put_opp_table; } From 404b1369ea26f598b96ae4d3183262a879703cfe Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 13 Sep 2018 13:09:27 +0530 Subject: [PATCH 12/73] OPP: Don't try to remove all OPP tables on failure dev_pm_opp_of_cpumask_add_table() creates the OPP table for all CPUs present in the cpumask and on errors it should revert all changes it has done. It actually is doing a bit more than that. On errors, it tries to free all the OPP tables, even the one it hasn't created yet. This may also end up freeing the OPP tables which were created from separate path, like dev_pm_opp_set_supported_hw(). Reported-and-tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/cpu.c | 8 ++++++-- drivers/opp/of.c | 4 ++-- drivers/opp/opp.h | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c index 0c0910709435..2eb5e2e7ff66 100644 --- a/drivers/opp/cpu.c +++ b/drivers/opp/cpu.c @@ -108,7 +108,8 @@ void dev_pm_opp_free_cpufreq_table(struct device *dev, EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table); #endif /* CONFIG_CPU_FREQ */ -void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of) +void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of, + int last_cpu) { struct device *cpu_dev; int cpu; @@ -116,6 +117,9 @@ void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of) WARN_ON(cpumask_empty(cpumask)); for_each_cpu(cpu, cpumask) { + if (cpu == last_cpu) + break; + cpu_dev = get_cpu_device(cpu); if (!cpu_dev) { pr_err("%s: failed to get cpu%d device\n", __func__, @@ -140,7 +144,7 @@ void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of) */ void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask) { - _dev_pm_opp_cpumask_remove_table(cpumask, false); + _dev_pm_opp_cpumask_remove_table(cpumask, false, -1); } EXPORT_SYMBOL_GPL(dev_pm_opp_cpumask_remove_table); diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 20988c426650..86222586f27b 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -592,7 +592,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed); */ void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask) { - _dev_pm_opp_cpumask_remove_table(cpumask, true); + _dev_pm_opp_cpumask_remove_table(cpumask, true, -1); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table); @@ -627,7 +627,7 @@ int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask) __func__, cpu, ret); /* Free all other OPPs */ - dev_pm_opp_of_cpumask_remove_table(cpumask); + _dev_pm_opp_cpumask_remove_table(cpumask, true, cpu); break; } } diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 7c540fd063b2..a9d22aa534c3 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -196,7 +196,7 @@ struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table); void _opp_free(struct dev_pm_opp *opp); int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table, bool rate_not_available); int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic); -void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of); +void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of, int last_cpu); struct opp_table *_add_opp_table(struct device *dev); #ifdef CONFIG_OF From 3d2556992a878a2210d3be498416aee39e0c32aa Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 3 Aug 2018 07:05:21 +0530 Subject: [PATCH 13/73] OPP: Protect dev_list with opp_table lock The dev_list needs to be protected with a lock, else we may have simultaneous access (addition/removal) to it and that would be racy. Extend scope of the opp_table lock to protect dev_list as well. Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 21 +++++++++++++++++++-- drivers/opp/cpu.c | 2 ++ drivers/opp/opp.h | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 31ff03dbeb83..9f8aa31265fe 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -48,9 +48,14 @@ static struct opp_device *_find_opp_dev(const struct device *dev, static struct opp_table *_find_opp_table_unlocked(struct device *dev) { struct opp_table *opp_table; + bool found; list_for_each_entry(opp_table, &opp_tables, node) { - if (_find_opp_dev(dev, opp_table)) { + mutex_lock(&opp_table->lock); + found = !!_find_opp_dev(dev, opp_table); + mutex_unlock(&opp_table->lock); + + if (found) { _get_opp_table_kref(opp_table); return opp_table; @@ -766,6 +771,8 @@ struct opp_device *_add_opp_dev(const struct device *dev, /* Initialize opp-dev */ opp_dev->dev = dev; + + mutex_lock(&opp_table->lock); list_add(&opp_dev->node, &opp_table->dev_list); /* Create debugfs entries for the opp_table */ @@ -773,6 +780,7 @@ struct opp_device *_add_opp_dev(const struct device *dev, if (ret) dev_err(dev, "%s: Failed to register opp debugfs (%d)\n", __func__, ret); + mutex_unlock(&opp_table->lock); return opp_dev; } @@ -791,6 +799,7 @@ static struct opp_table *_allocate_opp_table(struct device *dev) if (!opp_table) return NULL; + mutex_init(&opp_table->lock); INIT_LIST_HEAD(&opp_table->dev_list); opp_dev = _add_opp_dev(dev, opp_table); @@ -812,7 +821,6 @@ static struct opp_table *_allocate_opp_table(struct device *dev) BLOCKING_INIT_NOTIFIER_HEAD(&opp_table->head); INIT_LIST_HEAD(&opp_table->opp_list); - mutex_init(&opp_table->lock); kref_init(&opp_table->kref); /* Secure the device table modification */ @@ -854,6 +862,10 @@ static void _opp_table_kref_release(struct kref *kref) if (!IS_ERR(opp_table->clk)) clk_put(opp_table->clk); + /* + * No need to take opp_table->lock here as we are guaranteed that no + * references to the OPP table are taken at this point. + */ opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device, node); @@ -1716,6 +1728,9 @@ void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, { struct dev_pm_opp *opp, *tmp; + /* Protect dev_list */ + mutex_lock(&opp_table->lock); + /* Find if opp_table manages a single device */ if (list_is_singular(&opp_table->dev_list)) { /* Free static OPPs */ @@ -1733,6 +1748,8 @@ void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, } else { _remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table); } + + mutex_unlock(&opp_table->lock); } void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all) diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c index 2eb5e2e7ff66..36586f66cd83 100644 --- a/drivers/opp/cpu.c +++ b/drivers/opp/cpu.c @@ -226,8 +226,10 @@ int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) cpumask_clear(cpumask); if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) { + mutex_lock(&opp_table->lock); list_for_each_entry(opp_dev, &opp_table->dev_list, node) cpumask_set_cpu(opp_dev->dev->id, cpumask); + mutex_unlock(&opp_table->lock); } else { cpumask_set_cpu(cpu_dev->id, cpumask); } diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index a9d22aa534c3..88e9f47aadf1 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -126,7 +126,7 @@ enum opp_table_access { * @dev_list: list of devices that share these OPPs * @opp_list: table of opps * @kref: for reference count of the table. - * @lock: mutex protecting the opp_list. + * @lock: mutex protecting the opp_list and dev_list. * @np: struct device_node pointer for opp's DT node. * @clock_latency_ns_max: Max clock latency in nanoseconds. * @shared_opp: OPP is shared between multiple devices. From eb7c8743d6cf489e30091e6656fd4d3306621e9a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 5 Sep 2018 16:17:14 +0530 Subject: [PATCH 14/73] OPP: Pass index to _of_init_opp_table() This is a preparatory patch required for the next commit which will start using OPP table's node pointer in _of_init_opp_table(), which requires the index in order to read the OPP table's phandle. This commit adds the index argument in the call chains in order to get it delivered to _of_init_opp_table(). Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 19 +++++++++++++++---- drivers/opp/of.c | 12 +++++++----- drivers/opp/opp.h | 4 ++-- include/linux/pm_opp.h | 6 ++++++ 4 files changed, 30 insertions(+), 11 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 9f8aa31265fe..332748adc262 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -785,7 +785,7 @@ struct opp_device *_add_opp_dev(const struct device *dev, return opp_dev; } -static struct opp_table *_allocate_opp_table(struct device *dev) +static struct opp_table *_allocate_opp_table(struct device *dev, int index) { struct opp_table *opp_table; struct opp_device *opp_dev; @@ -808,7 +808,7 @@ static struct opp_table *_allocate_opp_table(struct device *dev) return NULL; } - _of_init_opp_table(opp_table, dev); + _of_init_opp_table(opp_table, dev, index); /* Find clk for the device */ opp_table->clk = clk_get(dev, NULL); @@ -833,7 +833,7 @@ void _get_opp_table_kref(struct opp_table *opp_table) kref_get(&opp_table->kref); } -struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) +static struct opp_table *_opp_get_opp_table(struct device *dev, int index) { struct opp_table *opp_table; @@ -844,15 +844,26 @@ struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) if (!IS_ERR(opp_table)) goto unlock; - opp_table = _allocate_opp_table(dev); + opp_table = _allocate_opp_table(dev, index); unlock: mutex_unlock(&opp_table_lock); return opp_table; } + +struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) +{ + return _opp_get_opp_table(dev, 0); +} EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_table); +struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device *dev, + int index) +{ + return _opp_get_opp_table(dev, index); +} + static void _opp_table_kref_release(struct kref *kref) { struct opp_table *opp_table = container_of(kref, struct opp_table, kref); diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 86222586f27b..1a9e1242a2a7 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -52,7 +52,8 @@ static struct opp_table *_managed_opp(const struct device_node *np) return managed_table; } -void _of_init_opp_table(struct opp_table *opp_table, struct device *dev) +void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, + int index) { struct device_node *np; @@ -378,7 +379,8 @@ static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev, } /* Initializes OPP tables based on new bindings */ -static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np) +static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, + int index) { struct device_node *np; struct opp_table *opp_table; @@ -393,7 +395,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np) goto put_opp_table; } - opp_table = dev_pm_opp_get_opp_table(dev); + opp_table = dev_pm_opp_get_opp_table_indexed(dev, index); if (!opp_table) return -ENOMEM; @@ -526,7 +528,7 @@ int dev_pm_opp_of_add_table(struct device *dev) return _of_add_opp_table_v1(dev); } - ret = _of_add_opp_table_v2(dev, opp_np); + ret = _of_add_opp_table_v2(dev, opp_np, 0); of_node_put(opp_np); return ret; @@ -574,7 +576,7 @@ int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) return -ENODEV; } - ret = _of_add_opp_table_v2(dev, opp_np); + ret = _of_add_opp_table_v2(dev, opp_np, index); of_node_put(opp_np); return ret; diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 88e9f47aadf1..b235e76fc8cc 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -200,9 +200,9 @@ void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of, in struct opp_table *_add_opp_table(struct device *dev); #ifdef CONFIG_OF -void _of_init_opp_table(struct opp_table *opp_table, struct device *dev); +void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index); #else -static inline void _of_init_opp_table(struct opp_table *opp_table, struct device *dev) {} +static inline void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index) {} #endif #ifdef CONFIG_DEBUG_FS diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 099b31960dec..5d399eeef172 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -79,6 +79,7 @@ struct dev_pm_set_opp_data { #if defined(CONFIG_PM_OPP) struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); +struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device *dev, int index); void dev_pm_opp_put_opp_table(struct opp_table *opp_table); unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp); @@ -136,6 +137,11 @@ static inline struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) return ERR_PTR(-ENOTSUPP); } +static inline struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device *dev, int index) +{ + return ERR_PTR(-ENOTSUPP); +} + static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {} static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp) From f06ed90e7051a3a50c2172001c86ff9645e5c2ba Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 14 Jun 2018 12:04:43 +0530 Subject: [PATCH 15/73] OPP: Parse OPP table's DT properties from _of_init_opp_table() Parse the DT properties present in the OPP table from _of_init_opp_table(), which is a dedicated routine for DT parsing. Minor relocation of helpers is required for this. It is possible now for _managed_opp() to return a partially initialized OPP table if the OPP table is created via the helpers like dev_pm_opp_set_supported_hw() and we need another flag to indicate if the static OPP are already parsed or not to make sure we don't incorrectly skip initializing the static OPPs. Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/of.c | 77 +++++++++++++++++++++++++++++------------------ drivers/opp/opp.h | 2 ++ 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 1a9e1242a2a7..4a19f76880d3 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -23,6 +23,24 @@ #include "opp.h" +/* + * Returns opp descriptor node for a device node, caller must + * do of_node_put(). + */ +static struct device_node *_opp_of_get_opp_desc_node(struct device_node *np, + int index) +{ + /* "operating-points-v2" can be an array for power domain providers */ + return of_parse_phandle(np, "operating-points-v2", index); +} + +/* Returns opp descriptor node for a device, caller must do of_node_put() */ +struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev) +{ + return _opp_of_get_opp_desc_node(dev->of_node, 0); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node); + static struct opp_table *_managed_opp(const struct device_node *np) { struct opp_table *opp_table, *managed_table = NULL; @@ -55,22 +73,37 @@ static struct opp_table *_managed_opp(const struct device_node *np) void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index) { - struct device_node *np; + struct device_node *np, *opp_np; + u32 val; /* * Only required for backward compatibility with v1 bindings, but isn't * harmful for other cases. And so we do it unconditionally. */ np = of_node_get(dev->of_node); - if (np) { - u32 val; + if (!np) + return; - if (!of_property_read_u32(np, "clock-latency", &val)) - opp_table->clock_latency_ns_max = val; - of_property_read_u32(np, "voltage-tolerance", - &opp_table->voltage_tolerance_v1); - of_node_put(np); - } + if (!of_property_read_u32(np, "clock-latency", &val)) + opp_table->clock_latency_ns_max = val; + of_property_read_u32(np, "voltage-tolerance", + &opp_table->voltage_tolerance_v1); + + /* Get OPP table node */ + opp_np = _opp_of_get_opp_desc_node(np, index); + of_node_put(np); + + if (!opp_np) + return; + + if (of_property_read_bool(opp_np, "opp-shared")) + opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED; + else + opp_table->shared_opp = OPP_TABLE_ACCESS_EXCLUSIVE; + + opp_table->np = opp_np; + + of_node_put(opp_np); } static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table, @@ -250,22 +283,6 @@ void dev_pm_opp_of_remove_table(struct device *dev) } EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); -/* Returns opp descriptor node for a device node, caller must - * do of_node_put() */ -static struct device_node *_opp_of_get_opp_desc_node(struct device_node *np, - int index) -{ - /* "operating-points-v2" can be an array for power domain providers */ - return of_parse_phandle(np, "operating-points-v2", index); -} - -/* Returns opp descriptor node for a device, caller must do of_node_put() */ -struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev) -{ - return _opp_of_get_opp_desc_node(dev->of_node, 0); -} -EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node); - /** * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings) * @opp_table: OPP table @@ -392,6 +409,9 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, /* OPPs are already managed */ if (!_add_opp_dev(dev, opp_table)) ret = -ENOMEM; + else if (!opp_table->parsed_static_opps) + goto initialize_static_opps; + goto put_opp_table; } @@ -399,6 +419,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, if (!opp_table) return -ENOMEM; +initialize_static_opps: /* We have opp-table node now, iterate over it and add OPPs */ for_each_available_child_of_node(opp_np, np) { count++; @@ -434,11 +455,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, if (pstate_count) opp_table->genpd_performance_state = true; - opp_table->np = opp_np; - if (of_property_read_bool(opp_np, "opp-shared")) - opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED; - else - opp_table->shared_opp = OPP_TABLE_ACCESS_EXCLUSIVE; + opp_table->parsed_static_opps = true; put_opp_table: dev_pm_opp_put_opp_table(opp_table); diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index b235e76fc8cc..b04c2b511c4d 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -129,6 +129,7 @@ enum opp_table_access { * @lock: mutex protecting the opp_list and dev_list. * @np: struct device_node pointer for opp's DT node. * @clock_latency_ns_max: Max clock latency in nanoseconds. + * @parsed_static_opps: True if OPPs are initialized from DT. * @shared_opp: OPP is shared between multiple devices. * @suspend_opp: Pointer to OPP to be used during device suspend. * @supported_hw: Array of version number to support. @@ -164,6 +165,7 @@ struct opp_table { /* For backward compatibility with v1 bindings */ unsigned int voltage_tolerance_v1; + bool parsed_static_opps; enum opp_table_access shared_opp; struct dev_pm_opp *suspend_opp; From 0ad8c623907c27f4b8572d36c4ba73ea103e3108 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 2 Aug 2018 14:23:09 +0530 Subject: [PATCH 16/73] OPP: Don't take OPP table's kref for static OPPs The reference count is only required to be incremented for every call that may lead to adding the OPP table. For static OPPs the same should be done from the parent routine which adds all static OPPs together and so only one refcount for all static OPPs. Update code to reflect that. The refcount is incremented every time a dynamic OPP is created (as that can lead to creating the OPP table) and the same is dropped when the OPP is removed. Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 332748adc262..2a6976265580 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -919,7 +919,6 @@ static void _opp_kref_release(struct kref *kref) kfree(opp); mutex_unlock(&opp_table->lock); - dev_pm_opp_put_opp_table(opp_table); } void dev_pm_opp_get(struct dev_pm_opp *opp) @@ -963,11 +962,15 @@ void dev_pm_opp_remove(struct device *dev, unsigned long freq) if (found) { dev_pm_opp_put(opp); + + /* Drop the reference taken by dev_pm_opp_add() */ + dev_pm_opp_put_opp_table(opp_table); } else { dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n", __func__, freq); } + /* Drop the reference taken by _find_opp_table() */ dev_pm_opp_put_opp_table(opp_table); } EXPORT_SYMBOL_GPL(dev_pm_opp_remove); @@ -1085,9 +1088,6 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, new_opp->opp_table = opp_table; kref_init(&new_opp->kref); - /* Get a reference to the OPP table */ - _get_opp_table_kref(opp_table); - ret = opp_debug_create_one(new_opp, opp_table); if (ret) dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n", @@ -1566,8 +1566,9 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) return -ENOMEM; ret = _opp_add_v1(opp_table, dev, freq, u_volt, true); + if (ret) + dev_pm_opp_put_opp_table(opp_table); - dev_pm_opp_put_opp_table(opp_table); return ret; } EXPORT_SYMBOL_GPL(dev_pm_opp_add); From d0e8ae6c26da7b9436775dfd9768d7a821ed47b7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 11 Sep 2018 11:14:11 +0530 Subject: [PATCH 17/73] OPP: Create separate kref for static OPPs list The static OPPs don't always get freed with the OPP table, it can happen before that as well. For example, if the OPP table is first created using helpers like dev_pm_opp_set_supported_hw() and the OPPs are created at a later point. Now when the OPPs are removed, the OPP table stays until the time dev_pm_opp_put_supported_hw() is called. Later patches will streamline the freeing of OPP table and that requires the static OPPs to get freed with help of a separate kernel reference. This patch prepares for that by creating a separate kref for static OPPs list. Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 33 ++++++++++++++++++++++++++++++++- drivers/opp/of.c | 7 +++++++ drivers/opp/opp.h | 3 +++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 2a6976265580..b555121b878b 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -892,6 +892,33 @@ static void _opp_table_kref_release(struct kref *kref) mutex_unlock(&opp_table_lock); } +void _opp_remove_all_static(struct opp_table *opp_table) +{ + struct dev_pm_opp *opp, *tmp; + + list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) { + if (!opp->dynamic) + dev_pm_opp_put(opp); + } + + opp_table->parsed_static_opps = false; +} + +static void _opp_table_list_kref_release(struct kref *kref) +{ + struct opp_table *opp_table = container_of(kref, struct opp_table, + list_kref); + + _opp_remove_all_static(opp_table); + mutex_unlock(&opp_table_lock); +} + +void _put_opp_list_kref(struct opp_table *opp_table) +{ + kref_put_mutex(&opp_table->list_kref, _opp_table_list_kref_release, + &opp_table_lock); +} + void dev_pm_opp_put_opp_table(struct opp_table *opp_table) { kref_put_mutex(&opp_table->kref, _opp_table_kref_release, @@ -1746,8 +1773,11 @@ void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, /* Find if opp_table manages a single device */ if (list_is_singular(&opp_table->dev_list)) { /* Free static OPPs */ + _put_opp_list_kref(opp_table); + + /* Free dynamic OPPs */ list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) { - if (remove_all || !opp->dynamic) + if (remove_all) dev_pm_opp_put(opp); } @@ -1758,6 +1788,7 @@ void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, if (opp_table->genpd_performance_state) dev_pm_genpd_set_performance_state(dev, 0); } else { + _put_opp_list_kref(opp_table); _remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table); } diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 4a19f76880d3..aaa4bab69846 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -411,6 +411,8 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, ret = -ENOMEM; else if (!opp_table->parsed_static_opps) goto initialize_static_opps; + else + kref_get(&opp_table->list_kref); goto put_opp_table; } @@ -420,6 +422,8 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, return -ENOMEM; initialize_static_opps: + kref_init(&opp_table->list_kref); + /* We have opp-table node now, iterate over it and add OPPs */ for_each_available_child_of_node(opp_np, np) { count++; @@ -437,6 +441,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, /* There should be one of more OPP defined */ if (WARN_ON(!count)) { ret = -ENOENT; + _put_opp_list_kref(opp_table); goto put_opp_table; } @@ -491,6 +496,8 @@ static int _of_add_opp_table_v1(struct device *dev) if (!opp_table) return -ENOMEM; + kref_init(&opp_table->list_kref); + val = prop->value; while (nr) { unsigned long freq = be32_to_cpup(val++) * 1000; diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index b04c2b511c4d..9274116c90e4 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -126,6 +126,7 @@ enum opp_table_access { * @dev_list: list of devices that share these OPPs * @opp_list: table of opps * @kref: for reference count of the table. + * @list_kref: for reference count of the OPP list. * @lock: mutex protecting the opp_list and dev_list. * @np: struct device_node pointer for opp's DT node. * @clock_latency_ns_max: Max clock latency in nanoseconds. @@ -157,6 +158,7 @@ struct opp_table { struct list_head dev_list; struct list_head opp_list; struct kref kref; + struct kref list_kref; struct mutex lock; struct device_node *np; @@ -200,6 +202,7 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *o int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic); void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of, int last_cpu); struct opp_table *_add_opp_table(struct device *dev); +void _put_opp_list_kref(struct opp_table *opp_table); #ifdef CONFIG_OF void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index); From 883071c4bd9a6e45bf0b5e45575d110eaeb2e6d0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 12 Sep 2018 12:25:22 +0530 Subject: [PATCH 18/73] cpufreq: mvebu: Remove OPPs using dev_pm_opp_remove() dev_pm_opp_cpumask_remove_table() is going to change in the next commit and will not remove dynamic OPPs automatically. They must be removed with a call to dev_pm_opp_remove(). Reviewed-by: Gregory CLEMENT Signed-off-by: Viresh Kumar --- drivers/cpufreq/mvebu-cpufreq.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/mvebu-cpufreq.c b/drivers/cpufreq/mvebu-cpufreq.c index 31513bd42705..6d33a639f902 100644 --- a/drivers/cpufreq/mvebu-cpufreq.c +++ b/drivers/cpufreq/mvebu-cpufreq.c @@ -84,9 +84,10 @@ static int __init armada_xp_pmsu_cpufreq_init(void) ret = dev_pm_opp_add(cpu_dev, clk_get_rate(clk) / 2, 0); if (ret) { + dev_pm_opp_remove(cpu_dev, clk_get_rate(clk)); clk_put(clk); dev_err(cpu_dev, "Failed to register OPPs\n"); - goto opp_register_failed; + return ret; } ret = dev_pm_opp_set_sharing_cpus(cpu_dev, @@ -99,11 +100,5 @@ static int __init armada_xp_pmsu_cpufreq_init(void) platform_device_register_simple("cpufreq-dt", -1, NULL, 0); return 0; - -opp_register_failed: - /* As registering has failed remove all the opp for all cpus */ - dev_pm_opp_cpumask_remove_table(cpu_possible_mask); - - return ret; } device_initcall(armada_xp_pmsu_cpufreq_init); From 2a4eb7358aba6beff7fa23f028c733310756e525 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 13 Sep 2018 13:14:36 +0530 Subject: [PATCH 19/73] OPP: Don't remove dynamic OPPs from _dev_pm_opp_remove_table() Only one platform was depending on this feature and it is already updated now. Stop removing dynamic OPPs from _dev_pm_opp_remove_table(). This simplifies lot of paths and removes unnecessary parameters. Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 20 +++++--------------- drivers/opp/cpu.c | 9 +++------ drivers/opp/of.c | 12 ++++++------ drivers/opp/opp.h | 6 +++--- 4 files changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index b555121b878b..2319ad4a0177 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1759,14 +1759,10 @@ int dev_pm_opp_unregister_notifier(struct device *dev, EXPORT_SYMBOL(dev_pm_opp_unregister_notifier); /* - * Free OPPs either created using static entries present in DT or even the - * dynamically added entries based on remove_all param. + * Free OPPs either created using static entries present in DT. */ -void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, - bool remove_all) +void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev) { - struct dev_pm_opp *opp, *tmp; - /* Protect dev_list */ mutex_lock(&opp_table->lock); @@ -1775,12 +1771,6 @@ void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, /* Free static OPPs */ _put_opp_list_kref(opp_table); - /* Free dynamic OPPs */ - list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) { - if (remove_all) - dev_pm_opp_put(opp); - } - /* * The OPP table is getting removed, drop the performance state * constraints. @@ -1795,7 +1785,7 @@ void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, mutex_unlock(&opp_table->lock); } -void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all) +void _dev_pm_opp_find_and_remove_table(struct device *dev) { struct opp_table *opp_table; @@ -1812,7 +1802,7 @@ void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all) return; } - _dev_pm_opp_remove_table(opp_table, dev, remove_all); + _dev_pm_opp_remove_table(opp_table, dev); dev_pm_opp_put_opp_table(opp_table); } @@ -1826,6 +1816,6 @@ void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all) */ void dev_pm_opp_remove_table(struct device *dev) { - _dev_pm_opp_find_and_remove_table(dev, true); + _dev_pm_opp_find_and_remove_table(dev); } EXPORT_SYMBOL_GPL(dev_pm_opp_remove_table); diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c index 36586f66cd83..ab6d07e78945 100644 --- a/drivers/opp/cpu.c +++ b/drivers/opp/cpu.c @@ -108,7 +108,7 @@ void dev_pm_opp_free_cpufreq_table(struct device *dev, EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table); #endif /* CONFIG_CPU_FREQ */ -void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of, +void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu) { struct device *cpu_dev; @@ -127,10 +127,7 @@ void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of, continue; } - if (of) - dev_pm_opp_of_remove_table(cpu_dev); - else - dev_pm_opp_remove_table(cpu_dev); + _dev_pm_opp_find_and_remove_table(cpu_dev); } } @@ -144,7 +141,7 @@ void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of, */ void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask) { - _dev_pm_opp_cpumask_remove_table(cpumask, false, -1); + _dev_pm_opp_cpumask_remove_table(cpumask, -1); } EXPORT_SYMBOL_GPL(dev_pm_opp_cpumask_remove_table); diff --git a/drivers/opp/of.c b/drivers/opp/of.c index aaa4bab69846..861cc75de329 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -279,7 +279,7 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, */ void dev_pm_opp_of_remove_table(struct device *dev) { - _dev_pm_opp_find_and_remove_table(dev, false); + _dev_pm_opp_find_and_remove_table(dev); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); @@ -432,7 +432,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, if (ret) { dev_err(dev, "%s: Failed to add OPP, %d\n", __func__, ret); - _dev_pm_opp_remove_table(opp_table, dev, false); + _dev_pm_opp_remove_table(opp_table, dev); of_node_put(np); goto put_opp_table; } @@ -453,7 +453,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, dev_err(dev, "Not all nodes have performance state set (%d: %d)\n", count, pstate_count); ret = -ENOENT; - _dev_pm_opp_remove_table(opp_table, dev, false); + _dev_pm_opp_remove_table(opp_table, dev); goto put_opp_table; } @@ -507,7 +507,7 @@ static int _of_add_opp_table_v1(struct device *dev) if (ret) { dev_err(dev, "%s: Failed to add OPP %ld (%d)\n", __func__, freq, ret); - _dev_pm_opp_remove_table(opp_table, dev, false); + _dev_pm_opp_remove_table(opp_table, dev); break; } nr -= 2; @@ -618,7 +618,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed); */ void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask) { - _dev_pm_opp_cpumask_remove_table(cpumask, true, -1); + _dev_pm_opp_cpumask_remove_table(cpumask, -1); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table); @@ -653,7 +653,7 @@ int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask) __func__, cpu, ret); /* Free all other OPPs */ - _dev_pm_opp_cpumask_remove_table(cpumask, true, cpu); + _dev_pm_opp_cpumask_remove_table(cpumask, cpu); break; } } diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 9274116c90e4..98dd7d39e1ad 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -194,13 +194,13 @@ void _get_opp_table_kref(struct opp_table *opp_table); int _get_opp_count(struct opp_table *opp_table); struct opp_table *_find_opp_table(struct device *dev); struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table); -void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, bool remove_all); -void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all); +void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev); +void _dev_pm_opp_find_and_remove_table(struct device *dev); struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table); void _opp_free(struct dev_pm_opp *opp); int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table, bool rate_not_available); int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic); -void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of, int last_cpu); +void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu); struct opp_table *_add_opp_table(struct device *dev); void _put_opp_list_kref(struct opp_table *opp_table); From cdd6ed90cdb6c2fd982909501f0a109274147fb4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 12 Sep 2018 12:35:19 +0530 Subject: [PATCH 20/73] OPP: Use a single mechanism to free the OPP table Currently there are two separate ways to free the OPP table based on how it is created in the first place. We call _dev_pm_opp_remove_table() to free the static and/or dynamic OPP, OPP list devices, etc. This is done for the case where the OPP table is added while initializing the OPPs, like via the path dev_pm_opp_of_add_table(). We also call dev_pm_opp_put_opp_table() in some cases which eventually frees the OPP table structure once the reference count reaches 0. This is used by the first case as well as other cases like dev_pm_opp_set_regulators() where the OPPs aren't necessarily initialized at this point. This whole thing is a bit unclear and messy and obstruct any further cleanup/fixup of OPP core. This patch tries to streamline this by keeping a single path for OPP table destruction, i.e. dev_pm_opp_put_opp_table(). All the cleanup happens in _opp_table_kref_release() now after the reference count reaches 0. _dev_pm_opp_remove_table() is removed as it isn't required anymore. We don't drop the reference to the OPP table after creating it from _of_add_opp_table_v{1|2}() anymore and the same is dropped only when we try to remove them. Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 54 ++++++++++++++-------------------------------- drivers/opp/of.c | 34 ++++++++++++++++------------- drivers/opp/opp.h | 2 +- 3 files changed, 36 insertions(+), 54 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 2319ad4a0177..d3e33fd32694 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -867,23 +867,24 @@ struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device *dev, static void _opp_table_kref_release(struct kref *kref) { struct opp_table *opp_table = container_of(kref, struct opp_table, kref); - struct opp_device *opp_dev; + struct opp_device *opp_dev, *temp; /* Release clk */ if (!IS_ERR(opp_table->clk)) clk_put(opp_table->clk); - /* - * No need to take opp_table->lock here as we are guaranteed that no - * references to the OPP table are taken at this point. - */ - opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device, - node); + WARN_ON(!list_empty(&opp_table->opp_list)); - _remove_opp_dev(opp_dev, opp_table); + list_for_each_entry_safe(opp_dev, temp, &opp_table->dev_list, node) { + /* + * The OPP table is getting removed, drop the performance state + * constraints. + */ + if (opp_table->genpd_performance_state) + dev_pm_genpd_set_performance_state((struct device *)(opp_dev->dev), 0); - /* dev_list must be empty now */ - WARN_ON(!list_empty(&opp_table->dev_list)); + _remove_opp_dev(opp_dev, opp_table); + } mutex_destroy(&opp_table->lock); list_del(&opp_table->node); @@ -1758,33 +1759,6 @@ int dev_pm_opp_unregister_notifier(struct device *dev, } EXPORT_SYMBOL(dev_pm_opp_unregister_notifier); -/* - * Free OPPs either created using static entries present in DT. - */ -void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev) -{ - /* Protect dev_list */ - mutex_lock(&opp_table->lock); - - /* Find if opp_table manages a single device */ - if (list_is_singular(&opp_table->dev_list)) { - /* Free static OPPs */ - _put_opp_list_kref(opp_table); - - /* - * The OPP table is getting removed, drop the performance state - * constraints. - */ - if (opp_table->genpd_performance_state) - dev_pm_genpd_set_performance_state(dev, 0); - } else { - _put_opp_list_kref(opp_table); - _remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table); - } - - mutex_unlock(&opp_table->lock); -} - void _dev_pm_opp_find_and_remove_table(struct device *dev) { struct opp_table *opp_table; @@ -1802,8 +1776,12 @@ void _dev_pm_opp_find_and_remove_table(struct device *dev) return; } - _dev_pm_opp_remove_table(opp_table, dev); + _put_opp_list_kref(opp_table); + /* Drop reference taken by _find_opp_table() */ + dev_pm_opp_put_opp_table(opp_table); + + /* Drop reference taken while the OPP table was added */ dev_pm_opp_put_opp_table(opp_table); } diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 861cc75de329..ae0436eaa911 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -407,14 +407,17 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, opp_table = _managed_opp(opp_np); if (opp_table) { /* OPPs are already managed */ - if (!_add_opp_dev(dev, opp_table)) + if (!_add_opp_dev(dev, opp_table)) { ret = -ENOMEM; - else if (!opp_table->parsed_static_opps) - goto initialize_static_opps; - else - kref_get(&opp_table->list_kref); + goto put_opp_table; + } - goto put_opp_table; + if (opp_table->parsed_static_opps) { + kref_get(&opp_table->list_kref); + return 0; + } + + goto initialize_static_opps; } opp_table = dev_pm_opp_get_opp_table_indexed(dev, index); @@ -432,17 +435,15 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, if (ret) { dev_err(dev, "%s: Failed to add OPP, %d\n", __func__, ret); - _dev_pm_opp_remove_table(opp_table, dev); of_node_put(np); - goto put_opp_table; + goto put_list_kref; } } /* There should be one of more OPP defined */ if (WARN_ON(!count)) { ret = -ENOENT; - _put_opp_list_kref(opp_table); - goto put_opp_table; + goto put_list_kref; } list_for_each_entry(opp, &opp_table->opp_list, node) @@ -453,8 +454,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, dev_err(dev, "Not all nodes have performance state set (%d: %d)\n", count, pstate_count); ret = -ENOENT; - _dev_pm_opp_remove_table(opp_table, dev); - goto put_opp_table; + goto put_list_kref; } if (pstate_count) @@ -462,6 +462,10 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, opp_table->parsed_static_opps = true; + return 0; + +put_list_kref: + _put_opp_list_kref(opp_table); put_opp_table: dev_pm_opp_put_opp_table(opp_table); @@ -507,13 +511,13 @@ static int _of_add_opp_table_v1(struct device *dev) if (ret) { dev_err(dev, "%s: Failed to add OPP %ld (%d)\n", __func__, freq, ret); - _dev_pm_opp_remove_table(opp_table, dev); - break; + _put_opp_list_kref(opp_table); + dev_pm_opp_put_opp_table(opp_table); + return ret; } nr -= 2; } - dev_pm_opp_put_opp_table(opp_table); return ret; } diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 98dd7d39e1ad..f9fbb7553fc4 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -190,11 +190,11 @@ struct opp_table { /* Routines internal to opp core */ void dev_pm_opp_get(struct dev_pm_opp *opp); +void _opp_remove_all_static(struct opp_table *opp_table); void _get_opp_table_kref(struct opp_table *opp_table); int _get_opp_count(struct opp_table *opp_table); struct opp_table *_find_opp_table(struct device *dev); struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table); -void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev); void _dev_pm_opp_find_and_remove_table(struct device *dev); struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table); void _opp_free(struct dev_pm_opp *opp); From 283d55e68d8a0f302057f57dcbd4d2e000c2ac85 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 7 Sep 2018 09:01:54 +0530 Subject: [PATCH 21/73] OPP: Prevent creating multiple OPP tables for devices sharing OPP nodes When two or more devices are sharing their clock and voltage rails, they share the same OPP table. But there are some corner cases where the OPP core incorrectly creates separate OPP tables for them. For example, CPU 0 and 1 share clock/voltage rails. The platform specific code calls dev_pm_opp_set_regulators() for CPU0 and the OPP core creates an OPP table for it (the individual OPPs aren't initialized as of now). The same is repeated for CPU1 then. Because _opp_get_opp_table() doesn't compare DT node pointers currently, it fails to find the link between CPU0 and CPU1 and so creates a new OPP table. Fix this by calling _managed_opp() from _opp_get_opp_table(). _managed_opp() gain an additional argument (index) to get the right node pointer. This resulted in simplifying code in _of_add_opp_table_v2() as well. Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 25 ++++++++++++++++++++++--- drivers/opp/of.c | 35 +++++++++++++---------------------- drivers/opp/opp.h | 2 ++ 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index d3e33fd32694..cdf918aaac34 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -759,8 +759,8 @@ static void _remove_opp_dev(struct opp_device *opp_dev, kfree(opp_dev); } -struct opp_device *_add_opp_dev(const struct device *dev, - struct opp_table *opp_table) +static struct opp_device *_add_opp_dev_unlocked(const struct device *dev, + struct opp_table *opp_table) { struct opp_device *opp_dev; int ret; @@ -772,7 +772,6 @@ struct opp_device *_add_opp_dev(const struct device *dev, /* Initialize opp-dev */ opp_dev->dev = dev; - mutex_lock(&opp_table->lock); list_add(&opp_dev->node, &opp_table->dev_list); /* Create debugfs entries for the opp_table */ @@ -780,6 +779,17 @@ struct opp_device *_add_opp_dev(const struct device *dev, if (ret) dev_err(dev, "%s: Failed to register opp debugfs (%d)\n", __func__, ret); + + return opp_dev; +} + +struct opp_device *_add_opp_dev(const struct device *dev, + struct opp_table *opp_table) +{ + struct opp_device *opp_dev; + + mutex_lock(&opp_table->lock); + opp_dev = _add_opp_dev_unlocked(dev, opp_table); mutex_unlock(&opp_table->lock); return opp_dev; @@ -844,6 +854,15 @@ static struct opp_table *_opp_get_opp_table(struct device *dev, int index) if (!IS_ERR(opp_table)) goto unlock; + opp_table = _managed_opp(dev, index); + if (opp_table) { + if (!_add_opp_dev_unlocked(dev, opp_table)) { + dev_pm_opp_put_opp_table(opp_table); + opp_table = NULL; + } + goto unlock; + } + opp_table = _allocate_opp_table(dev, index); unlock: diff --git a/drivers/opp/of.c b/drivers/opp/of.c index ae0436eaa911..1ddef52c27fd 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -41,11 +41,14 @@ struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev) } EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node); -static struct opp_table *_managed_opp(const struct device_node *np) +struct opp_table *_managed_opp(struct device *dev, int index) { struct opp_table *opp_table, *managed_table = NULL; + struct device_node *np; - mutex_lock(&opp_table_lock); + np = _opp_of_get_opp_desc_node(dev->of_node, index); + if (!np) + return NULL; list_for_each_entry(opp_table, &opp_tables, node) { if (opp_table->np == np) { @@ -65,7 +68,7 @@ static struct opp_table *_managed_opp(const struct device_node *np) } } - mutex_unlock(&opp_table_lock); + of_node_put(np); return managed_table; } @@ -401,30 +404,19 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, { struct device_node *np; struct opp_table *opp_table; - int ret = 0, count = 0, pstate_count = 0; + int ret, count = 0, pstate_count = 0; struct dev_pm_opp *opp; - opp_table = _managed_opp(opp_np); - if (opp_table) { - /* OPPs are already managed */ - if (!_add_opp_dev(dev, opp_table)) { - ret = -ENOMEM; - goto put_opp_table; - } - - if (opp_table->parsed_static_opps) { - kref_get(&opp_table->list_kref); - return 0; - } - - goto initialize_static_opps; - } - opp_table = dev_pm_opp_get_opp_table_indexed(dev, index); if (!opp_table) return -ENOMEM; -initialize_static_opps: + /* OPP table is already initialized for the device */ + if (opp_table->parsed_static_opps) { + kref_get(&opp_table->list_kref); + return 0; + } + kref_init(&opp_table->list_kref); /* We have opp-table node now, iterate over it and add OPPs */ @@ -466,7 +458,6 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, put_list_kref: _put_opp_list_kref(opp_table); -put_opp_table: dev_pm_opp_put_opp_table(opp_table); return ret; diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index f9fbb7553fc4..9c6544b4f4f9 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -206,8 +206,10 @@ void _put_opp_list_kref(struct opp_table *opp_table); #ifdef CONFIG_OF void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index); +struct opp_table *_managed_opp(struct device *dev, int index); #else static inline void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index) {} +static inline struct opp_table *_managed_opp(struct device *dev, int index) { return NULL; } #endif #ifdef CONFIG_DEBUG_FS From 5ed4cecd75e90232a19afa502cf477925854561e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 12 Sep 2018 11:21:17 +0530 Subject: [PATCH 22/73] OPP: Pass OPP table to _of_add_opp_table_v{1|2}() Both _of_add_opp_table_v1() and _of_add_opp_table_v2() contain similar code to get the OPP table and their parent routine also parses the DT to find the OPP table's node pointer. This can be simplified by getting the OPP table in advance and then passing it as argument to these routines. Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/opp/of.c | 72 ++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 42 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 1ddef52c27fd..a71ff3acca0f 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -399,18 +399,12 @@ static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev, } /* Initializes OPP tables based on new bindings */ -static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, - int index) +static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table) { struct device_node *np; - struct opp_table *opp_table; int ret, count = 0, pstate_count = 0; struct dev_pm_opp *opp; - opp_table = dev_pm_opp_get_opp_table_indexed(dev, index); - if (!opp_table) - return -ENOMEM; - /* OPP table is already initialized for the device */ if (opp_table->parsed_static_opps) { kref_get(&opp_table->list_kref); @@ -420,7 +414,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, kref_init(&opp_table->list_kref); /* We have opp-table node now, iterate over it and add OPPs */ - for_each_available_child_of_node(opp_np, np) { + for_each_available_child_of_node(opp_table->np, np) { count++; ret = _opp_add_static_v2(opp_table, dev, np); @@ -458,15 +452,13 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np, put_list_kref: _put_opp_list_kref(opp_table); - dev_pm_opp_put_opp_table(opp_table); return ret; } /* Initializes OPP tables based on old-deprecated bindings */ -static int _of_add_opp_table_v1(struct device *dev) +static int _of_add_opp_table_v1(struct device *dev, struct opp_table *opp_table) { - struct opp_table *opp_table; const struct property *prop; const __be32 *val; int nr, ret = 0; @@ -487,10 +479,6 @@ static int _of_add_opp_table_v1(struct device *dev) return -EINVAL; } - opp_table = dev_pm_opp_get_opp_table(dev); - if (!opp_table) - return -ENOMEM; - kref_init(&opp_table->list_kref); val = prop->value; @@ -503,7 +491,6 @@ static int _of_add_opp_table_v1(struct device *dev) dev_err(dev, "%s: Failed to add OPP %ld (%d)\n", __func__, freq, ret); _put_opp_list_kref(opp_table); - dev_pm_opp_put_opp_table(opp_table); return ret; } nr -= 2; @@ -531,24 +518,24 @@ static int _of_add_opp_table_v1(struct device *dev) */ int dev_pm_opp_of_add_table(struct device *dev) { - struct device_node *opp_np; + struct opp_table *opp_table; int ret; - /* - * OPPs have two version of bindings now. The older one is deprecated, - * try for the new binding first. - */ - opp_np = dev_pm_opp_of_get_opp_desc_node(dev); - if (!opp_np) { - /* - * Try old-deprecated bindings for backward compatibility with - * older dtbs. - */ - return _of_add_opp_table_v1(dev); - } + opp_table = dev_pm_opp_get_opp_table_indexed(dev, 0); + if (!opp_table) + return -ENOMEM; - ret = _of_add_opp_table_v2(dev, opp_np, 0); - of_node_put(opp_np); + /* + * OPPs have two version of bindings now. Also try the old (v1) + * bindings for backward compatibility with older dtbs. + */ + if (opp_table->np) + ret = _of_add_opp_table_v2(dev, opp_table); + else + ret = _of_add_opp_table_v1(dev, opp_table); + + if (ret) + dev_pm_opp_put_opp_table(opp_table); return ret; } @@ -575,28 +562,29 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table); */ int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) { - struct device_node *opp_np; + struct opp_table *opp_table; int ret, count; -again: - opp_np = _opp_of_get_opp_desc_node(dev->of_node, index); - if (!opp_np) { + if (index) { /* * If only one phandle is present, then the same OPP table * applies for all index requests. */ count = of_count_phandle_with_args(dev->of_node, "operating-points-v2", NULL); - if (count == 1 && index) { - index = 0; - goto again; - } + if (count != 1) + return -ENODEV; - return -ENODEV; + index = 0; } - ret = _of_add_opp_table_v2(dev, opp_np, index); - of_node_put(opp_np); + opp_table = dev_pm_opp_get_opp_table_indexed(dev, index); + if (!opp_table) + return -ENOMEM; + + ret = _of_add_opp_table_v2(dev, opp_table); + if (ret) + dev_pm_opp_put_opp_table(opp_table); return ret; } From 23c7b54ca1cd1797ef39169ab85e6d46f1c2d061 Mon Sep 17 00:00:00 2001 From: Enric Balletbo i Serra Date: Wed, 4 Jul 2018 10:45:50 +0200 Subject: [PATCH 23/73] PM / devfreq: Fix devfreq_add_device() when drivers are built as modules. When the devfreq driver and the governor driver are built as modules, the call to devfreq_add_device() or governor_store() fails because the governor driver is not loaded at the time the devfreq driver loads. The devfreq driver has a build dependency on the governor but also should have a runtime dependency. We need to make sure that the governor driver is loaded before the devfreq driver. This patch fixes this bug by adding a try_then_request_governor() function. First tries to find the governor, and then, if it is not found, it requests the module and tries again. Fixes: 1b5c1be2c88e (PM / devfreq: map devfreq drivers to governor using name) Signed-off-by: Enric Balletbo i Serra Reviewed-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/devfreq.c | 53 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 4c49bb1330b5..62ead442a872 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -221,6 +222,49 @@ static struct devfreq_governor *find_devfreq_governor(const char *name) return ERR_PTR(-ENODEV); } +/** + * try_then_request_governor() - Try to find the governor and request the + * module if is not found. + * @name: name of the governor + * + * Search the list of devfreq governors and request the module and try again + * if is not found. This can happen when both drivers (the governor driver + * and the driver that call devfreq_add_device) are built as modules. + * devfreq_list_lock should be held by the caller. Returns the matched + * governor's pointer. + */ +static struct devfreq_governor *try_then_request_governor(const char *name) +{ + struct devfreq_governor *governor; + int err = 0; + + if (IS_ERR_OR_NULL(name)) { + pr_err("DEVFREQ: %s: Invalid parameters\n", __func__); + return ERR_PTR(-EINVAL); + } + WARN(!mutex_is_locked(&devfreq_list_lock), + "devfreq_list_lock must be locked."); + + governor = find_devfreq_governor(name); + if (IS_ERR(governor)) { + mutex_unlock(&devfreq_list_lock); + + if (!strncmp(name, DEVFREQ_GOV_SIMPLE_ONDEMAND, + DEVFREQ_NAME_LEN)) + err = request_module("governor_%s", "simpleondemand"); + else + err = request_module("governor_%s", name); + /* Restore previous state before return */ + mutex_lock(&devfreq_list_lock); + if (err) + return NULL; + + governor = find_devfreq_governor(name); + } + + return governor; +} + static int devfreq_notify_transition(struct devfreq *devfreq, struct devfreq_freqs *freqs, unsigned int state) { @@ -646,9 +690,8 @@ struct devfreq *devfreq_add_device(struct device *dev, mutex_unlock(&devfreq->lock); mutex_lock(&devfreq_list_lock); - list_add(&devfreq->node, &devfreq_list); - governor = find_devfreq_governor(devfreq->governor_name); + governor = try_then_request_governor(devfreq->governor_name); if (IS_ERR(governor)) { dev_err(dev, "%s: Unable to find governor for the device\n", __func__); @@ -664,12 +707,14 @@ struct devfreq *devfreq_add_device(struct device *dev, __func__); goto err_init; } + + list_add(&devfreq->node, &devfreq_list); + mutex_unlock(&devfreq_list_lock); return devfreq; err_init: - list_del(&devfreq->node); mutex_unlock(&devfreq_list_lock); device_unregister(&devfreq->dev); @@ -991,7 +1036,7 @@ static ssize_t governor_store(struct device *dev, struct device_attribute *attr, return -EINVAL; mutex_lock(&devfreq_list_lock); - governor = find_devfreq_governor(str_governor); + governor = try_then_request_governor(str_governor); if (IS_ERR(governor)) { ret = PTR_ERR(governor); goto out; From d0e464205b8a1fa21357aad0bbf136500d7e688d Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 24 Apr 2018 12:46:39 -0700 Subject: [PATCH 24/73] PM / devfreq: Drop custom MIN/MAX macros Drop the custom MIN/MAX macros in favour of the standard min/max from kernel.h Signed-off-by: Bjorn Andersson Reviewed-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/devfreq.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 62ead442a872..329c5e3f3338 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -29,9 +29,6 @@ #include #include "governor.h" -#define MAX(a,b) ((a > b) ? a : b) -#define MIN(a,b) ((a < b) ? a : b) - static struct class *devfreq_class; /* @@ -324,8 +321,8 @@ int update_devfreq(struct devfreq *devfreq) * max_freq * min_freq */ - max_freq = MIN(devfreq->scaling_max_freq, devfreq->max_freq); - min_freq = MAX(devfreq->scaling_min_freq, devfreq->min_freq); + max_freq = min(devfreq->scaling_max_freq, devfreq->max_freq); + min_freq = max(devfreq->scaling_min_freq, devfreq->min_freq); if (min_freq && freq < min_freq) { freq = min_freq; @@ -1197,7 +1194,7 @@ static ssize_t min_freq_show(struct device *dev, struct device_attribute *attr, { struct devfreq *df = to_devfreq(dev); - return sprintf(buf, "%lu\n", MAX(df->scaling_min_freq, df->min_freq)); + return sprintf(buf, "%lu\n", max(df->scaling_min_freq, df->min_freq)); } static ssize_t max_freq_store(struct device *dev, struct device_attribute *attr, @@ -1233,7 +1230,7 @@ static ssize_t max_freq_show(struct device *dev, struct device_attribute *attr, { struct devfreq *df = to_devfreq(dev); - return sprintf(buf, "%lu\n", MIN(df->scaling_max_freq, df->max_freq)); + return sprintf(buf, "%lu\n", min(df->scaling_max_freq, df->max_freq)); } static DEVICE_ATTR_RW(max_freq); From df5cf4a36178c5d4f2b8b9469cb2f722e64cd102 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 3 Aug 2018 13:05:09 -0700 Subject: [PATCH 25/73] PM / devfreq: Fix handling of min/max_freq == 0 Commit ab8f58ad72c4 ("PM / devfreq: Set min/max_freq when adding the devfreq device") initializes df->min/max_freq with the min/max OPP when the device is added. Later commit f1d981eaecf8 ("PM / devfreq: Use the available min/max frequency") adds df->scaling_min/max_freq and the following to the frequency adjustment code: max_freq = MIN(devfreq->scaling_max_freq, devfreq->max_freq); With the current handling of min/max_freq this is incorrect: Even though df->max_freq is now initialized to a value != 0 user space can still set it to 0, in this case max_freq would be 0 instead of df->scaling_max_freq as intended. In consequence the frequency adjustment is not performed: if (max_freq && freq > max_freq) { freq = max_freq; To fix this set df->min/max freq to the min/max OPP in max/max_freq_store, when the user passes a value of 0. This also prevents df->max_freq from being set below the min OPP when df->min_freq is 0, and similar for min_freq. Since it is now guaranteed that df->min/max_freq can't be 0 the checks for this case can be removed. Fixes: f1d981eaecf8 ("PM / devfreq: Use the available min/max frequency") Signed-off-by: Matthias Kaehlcke Reviewed-by: Brian Norris Reviewed-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/devfreq.c | 42 ++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 329c5e3f3338..b5e7af60723c 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -324,11 +324,11 @@ int update_devfreq(struct devfreq *devfreq) max_freq = min(devfreq->scaling_max_freq, devfreq->max_freq); min_freq = max(devfreq->scaling_min_freq, devfreq->min_freq); - if (min_freq && freq < min_freq) { + if (freq < min_freq) { freq = min_freq; flags &= ~DEVFREQ_FLAG_LEAST_UPPER_BOUND; /* Use GLB */ } - if (max_freq && freq > max_freq) { + if (freq > max_freq) { freq = max_freq; flags |= DEVFREQ_FLAG_LEAST_UPPER_BOUND; /* Use LUB */ } @@ -1168,17 +1168,26 @@ static ssize_t min_freq_store(struct device *dev, struct device_attribute *attr, struct devfreq *df = to_devfreq(dev); unsigned long value; int ret; - unsigned long max; ret = sscanf(buf, "%lu", &value); if (ret != 1) return -EINVAL; mutex_lock(&df->lock); - max = df->max_freq; - if (value && max && value > max) { - ret = -EINVAL; - goto unlock; + + if (value) { + if (value > df->max_freq) { + ret = -EINVAL; + goto unlock; + } + } else { + unsigned long *freq_table = df->profile->freq_table; + + /* Get minimum frequency according to sorting order */ + if (freq_table[0] < freq_table[df->profile->max_state - 1]) + value = freq_table[0]; + else + value = freq_table[df->profile->max_state - 1]; } df->min_freq = value; @@ -1203,17 +1212,26 @@ static ssize_t max_freq_store(struct device *dev, struct device_attribute *attr, struct devfreq *df = to_devfreq(dev); unsigned long value; int ret; - unsigned long min; ret = sscanf(buf, "%lu", &value); if (ret != 1) return -EINVAL; mutex_lock(&df->lock); - min = df->min_freq; - if (value && min && value < min) { - ret = -EINVAL; - goto unlock; + + if (value) { + if (value < df->min_freq) { + ret = -EINVAL; + goto unlock; + } + } else { + unsigned long *freq_table = df->profile->freq_table; + + /* Get maximum frequency according to sorting order */ + if (freq_table[0] < freq_table[df->profile->max_state - 1]) + value = freq_table[df->profile->max_state - 1]; + else + value = freq_table[0]; } df->max_freq = value; From 6ff66e2a008337b8a005fd0ae2037bed716262cc Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 3 Aug 2018 13:05:10 -0700 Subject: [PATCH 26/73] PM / devfreq: Don't adjust to user limits in governors Several governors use the user space limits df->min/max_freq to adjust the target frequency. This is not necessary, since update_devfreq() already takes care of this. Instead the governor can request the available min/max frequency by setting the target frequency to DEVFREQ_MIN/MAX_FREQ and let update_devfreq() take care of any adjustments. Signed-off-by: Matthias Kaehlcke Reviewed-by: Brian Norris Reviewed-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/governor.h | 3 +++ drivers/devfreq/governor_performance.c | 5 +---- drivers/devfreq/governor_powersave.c | 2 +- drivers/devfreq/governor_simpleondemand.c | 12 +++--------- drivers/devfreq/governor_userspace.c | 16 ++++------------ 5 files changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h index cfc50a61a90d..b81700244ce3 100644 --- a/drivers/devfreq/governor.h +++ b/drivers/devfreq/governor.h @@ -25,6 +25,9 @@ #define DEVFREQ_GOV_SUSPEND 0x4 #define DEVFREQ_GOV_RESUME 0x5 +#define DEVFREQ_MIN_FREQ 0 +#define DEVFREQ_MAX_FREQ ULONG_MAX + /** * struct devfreq_governor - Devfreq policy governor * @node: list node - contains registered devfreq governors diff --git a/drivers/devfreq/governor_performance.c b/drivers/devfreq/governor_performance.c index 4d23ecfbd948..ded429fd51be 100644 --- a/drivers/devfreq/governor_performance.c +++ b/drivers/devfreq/governor_performance.c @@ -20,10 +20,7 @@ static int devfreq_performance_func(struct devfreq *df, * target callback should be able to get floor value as * said in devfreq.h */ - if (!df->max_freq) - *freq = UINT_MAX; - else - *freq = df->max_freq; + *freq = DEVFREQ_MAX_FREQ; return 0; } diff --git a/drivers/devfreq/governor_powersave.c b/drivers/devfreq/governor_powersave.c index 0c42f23249ef..9e8897f5ac42 100644 --- a/drivers/devfreq/governor_powersave.c +++ b/drivers/devfreq/governor_powersave.c @@ -20,7 +20,7 @@ static int devfreq_powersave_func(struct devfreq *df, * target callback should be able to get ceiling value as * said in devfreq.h */ - *freq = df->min_freq; + *freq = DEVFREQ_MIN_FREQ; return 0; } diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c index 28e0f2de7100..c0417f0e081e 100644 --- a/drivers/devfreq/governor_simpleondemand.c +++ b/drivers/devfreq/governor_simpleondemand.c @@ -27,7 +27,6 @@ static int devfreq_simple_ondemand_func(struct devfreq *df, unsigned int dfso_upthreshold = DFSO_UPTHRESHOLD; unsigned int dfso_downdifferential = DFSO_DOWNDIFFERENCTIAL; struct devfreq_simple_ondemand_data *data = df->data; - unsigned long max = (df->max_freq) ? df->max_freq : UINT_MAX; err = devfreq_update_stats(df); if (err) @@ -47,7 +46,7 @@ static int devfreq_simple_ondemand_func(struct devfreq *df, /* Assume MAX if it is going to be divided by zero */ if (stat->total_time == 0) { - *freq = max; + *freq = DEVFREQ_MAX_FREQ; return 0; } @@ -60,13 +59,13 @@ static int devfreq_simple_ondemand_func(struct devfreq *df, /* Set MAX if it's busy enough */ if (stat->busy_time * 100 > stat->total_time * dfso_upthreshold) { - *freq = max; + *freq = DEVFREQ_MAX_FREQ; return 0; } /* Set MAX if we do not know the initial frequency */ if (stat->current_frequency == 0) { - *freq = max; + *freq = DEVFREQ_MAX_FREQ; return 0; } @@ -85,11 +84,6 @@ static int devfreq_simple_ondemand_func(struct devfreq *df, b = div_u64(b, (dfso_upthreshold - dfso_downdifferential / 2)); *freq = (unsigned long) b; - if (df->min_freq && *freq < df->min_freq) - *freq = df->min_freq; - if (df->max_freq && *freq > df->max_freq) - *freq = df->max_freq; - return 0; } diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c index 080607c3f34d..378d84c011df 100644 --- a/drivers/devfreq/governor_userspace.c +++ b/drivers/devfreq/governor_userspace.c @@ -26,19 +26,11 @@ static int devfreq_userspace_func(struct devfreq *df, unsigned long *freq) { struct userspace_data *data = df->data; - if (data->valid) { - unsigned long adjusted_freq = data->user_frequency; - - if (df->max_freq && adjusted_freq > df->max_freq) - adjusted_freq = df->max_freq; - - if (df->min_freq && adjusted_freq < df->min_freq) - adjusted_freq = df->min_freq; - - *freq = adjusted_freq; - } else { + if (data->valid) + *freq = data->user_frequency; + else *freq = df->previous_freq; /* No user freq specified yet */ - } + return 0; } From b596d895fa2957e136a6b398b97b06bd42b51291 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 3 Aug 2018 13:05:11 -0700 Subject: [PATCH 27/73] PM / devfreq: Make update_devfreq() public Currently update_devfreq() is only visible to devfreq governors outside of devfreq.c. Make it public to allow drivers that adjust devfreq policies to cause a re-evaluation of the frequency after a policy change. Signed-off-by: Matthias Kaehlcke Reviewed-by: Brian Norris Reviewed-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/governor.h | 3 --- include/linux/devfreq.h | 8 ++++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h index b81700244ce3..f53339ca610f 100644 --- a/drivers/devfreq/governor.h +++ b/drivers/devfreq/governor.h @@ -57,9 +57,6 @@ struct devfreq_governor { unsigned int event, void *data); }; -/* Caution: devfreq->lock must be locked before calling update_devfreq */ -extern int update_devfreq(struct devfreq *devfreq); - extern void devfreq_monitor_start(struct devfreq *devfreq); extern void devfreq_monitor_stop(struct devfreq *devfreq); extern void devfreq_monitor_suspend(struct devfreq *devfreq); diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 3aae5b3af87c..e4963b0f45da 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -198,6 +198,14 @@ extern void devm_devfreq_remove_device(struct device *dev, extern int devfreq_suspend_device(struct devfreq *devfreq); extern int devfreq_resume_device(struct devfreq *devfreq); +/** + * update_devfreq() - Reevaluate the device and configure frequency + * @devfreq: the devfreq device + * + * Note: devfreq->lock must be held + */ +extern int update_devfreq(struct devfreq *devfreq); + /* Helper functions for devfreq user device driver with OPP. */ extern struct dev_pm_opp *devfreq_recommended_opp(struct device *dev, unsigned long *freq, u32 flags); From f037eb8c1f476bc903d99695c1eb9f99ccb46b27 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 27 Aug 2018 20:52:16 -0500 Subject: [PATCH 28/73] PM / devfreq: Convert to using %pOFn instead of device_node.name In preparation to remove the node name pointer from struct device_node, convert printf users to use the %pOFn format specifier. Signed-off-by: Rob Herring Acked-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/event/exynos-ppmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c index a9c64f0d3284..c61de0bdf053 100644 --- a/drivers/devfreq/event/exynos-ppmu.c +++ b/drivers/devfreq/event/exynos-ppmu.c @@ -535,8 +535,8 @@ static int of_get_devfreq_events(struct device_node *np, if (i == ARRAY_SIZE(ppmu_events)) { dev_warn(dev, - "don't know how to configure events : %s\n", - node->name); + "don't know how to configure events : %pOFn\n", + node); continue; } From 2f061fd0c2d852e32e03a903fccd810663c5c31e Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 3 Sep 2018 09:02:07 +0900 Subject: [PATCH 29/73] PM / devfreq: stopping the governor before device_unregister() device_release() is freeing the resources before calling the device specific release callback which is, in the case of devfreq, stopping the governor. It is a problem as some governors are using the device resources. e.g. simpleondemand which is using the devfreq deferrable monitoring work. If it is not stopped before the resources are freed, it might lead to a use after free. Signed-off-by: Vincent Donnefort Reviewed-by: John Einar Reitan [cw00.choi: Fix merge conflict] Reviewed-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/devfreq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index b5e7af60723c..1868423355d9 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -575,10 +575,6 @@ static void devfreq_dev_release(struct device *dev) list_del(&devfreq->node); mutex_unlock(&devfreq_list_lock); - if (devfreq->governor) - devfreq->governor->event_handler(devfreq, - DEVFREQ_GOV_STOP, NULL); - if (devfreq->profile->exit) devfreq->profile->exit(devfreq->dev.parent); @@ -714,7 +710,7 @@ struct devfreq *devfreq_add_device(struct device *dev, err_init: mutex_unlock(&devfreq_list_lock); - device_unregister(&devfreq->dev); + devfreq_remove_device(devfreq); devfreq = NULL; err_dev: if (devfreq) @@ -735,6 +731,9 @@ int devfreq_remove_device(struct devfreq *devfreq) if (!devfreq) return -EINVAL; + if (devfreq->governor) + devfreq->governor->event_handler(devfreq, + DEVFREQ_GOV_STOP, NULL); device_unregister(&devfreq->dev); return 0; From 8188b154f95014dae4d19892fefb202c8df3f885 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 21 Sep 2018 21:18:43 +0800 Subject: [PATCH 30/73] PM / devfreq: remove redundant null pointer check before kfree kfree has taken the null pointer into account. hence it is safe to remove the redundant null pointer check before kfree. Signed-off-by: zhong jiang Signed-off-by: MyungJoo Ham --- drivers/devfreq/devfreq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 1868423355d9..141413067b5c 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -713,8 +713,7 @@ struct devfreq *devfreq_add_device(struct device *dev, devfreq_remove_device(devfreq); devfreq = NULL; err_dev: - if (devfreq) - kfree(devfreq); + kfree(devfreq); err_out: return ERR_PTR(err); } From cc55f7537db6af371e9c1c6a71161ee40f918824 Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:26:24 +0800 Subject: [PATCH 31/73] x86, hibernate: Fix nosave_regions setup for hibernation On 32bit systems, nosave_regions(non RAM areas) located between max_low_pfn and max_pfn are not excluded from hibernation snapshot currently, which may result in a machine check exception when trying to access these unsafe regions during hibernation: [ 612.800453] Disabling lock debugging due to kernel taint [ 612.805786] mce: [Hardware Error]: CPU 0: Machine Check Exception: 5 Bank 6: fe00000000801136 [ 612.814344] mce: [Hardware Error]: RIP !INEXACT! 60:<00000000d90be566> {swsusp_save+0x436/0x560} [ 612.823167] mce: [Hardware Error]: TSC 1f5939fe276 ADDR dd000000 MISC 30e0000086 [ 612.830677] mce: [Hardware Error]: PROCESSOR 0:306c3 TIME 1529487426 SOCKET 0 APIC 0 microcode 24 [ 612.839581] mce: [Hardware Error]: Run the above through 'mcelog --ascii' [ 612.846394] mce: [Hardware Error]: Machine check: Processor context corrupt [ 612.853380] Kernel panic - not syncing: Fatal machine check [ 612.858978] Kernel Offset: 0x18000000 from 0xc1000000 (relocation range: 0xc0000000-0xf7ffdfff) This is because on 32bit systems, pages above max_low_pfn are regarded as high memeory, and accessing unsafe pages might cause expected MCE. On the problematic 32bit system, there are reserved memory above low memory, which triggered the MCE: e820 memory mapping: [ 0.000000] BIOS-e820: [mem 0x0000000000000000-0x000000000009d7ff] usable [ 0.000000] BIOS-e820: [mem 0x000000000009d800-0x000000000009ffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000000e0000-0x00000000000fffff] reserved [ 0.000000] BIOS-e820: [mem 0x0000000000100000-0x00000000d160cfff] usable [ 0.000000] BIOS-e820: [mem 0x00000000d160d000-0x00000000d1613fff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000d1614000-0x00000000d1a44fff] usable [ 0.000000] BIOS-e820: [mem 0x00000000d1a45000-0x00000000d1ecffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000d1ed0000-0x00000000d7eeafff] usable [ 0.000000] BIOS-e820: [mem 0x00000000d7eeb000-0x00000000d7ffffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000d8000000-0x00000000d875ffff] usable [ 0.000000] BIOS-e820: [mem 0x00000000d8760000-0x00000000d87fffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000d8800000-0x00000000d8fadfff] usable [ 0.000000] BIOS-e820: [mem 0x00000000d8fae000-0x00000000d8ffffff] ACPI data [ 0.000000] BIOS-e820: [mem 0x00000000d9000000-0x00000000da71bfff] usable [ 0.000000] BIOS-e820: [mem 0x00000000da71c000-0x00000000da7fffff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000da800000-0x00000000dbb8bfff] usable [ 0.000000] BIOS-e820: [mem 0x00000000dbb8c000-0x00000000dbffffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000dd000000-0x00000000df1fffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000f8000000-0x00000000fbffffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fec00000-0x00000000fec00fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fed00000-0x00000000fed03fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fed1c000-0x00000000fed1ffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fee00000-0x00000000fee00fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000ff000000-0x00000000ffffffff] reserved [ 0.000000] BIOS-e820: [mem 0x0000000100000000-0x000000041edfffff] usable Fix this problem by changing pfn limit from max_low_pfn to max_pfn. This fix does not impact 64bit system because on 64bit max_low_pfn is the same as max_pfn. Signed-off-by: Zhimin Gu Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Cc: All applicable Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4866badb235..90ecc108bc8a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1251,7 +1251,7 @@ void __init setup_arch(char **cmdline_p) x86_init.hyper.guest_late_init(); e820__reserve_resources(); - e820__register_nosave_regions(max_low_pfn); + e820__register_nosave_regions(max_pfn); x86_init.resources.reserve_resources(); From 749fa17093ff67b31dea864531a3698b6a95c26c Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 21 Sep 2018 14:26:38 +0800 Subject: [PATCH 32/73] PM / hibernate: Check the success of generating md5 digest before hibernation Currently if get_e820_md5() fails, then it will hibernate nevertheless. Actually the error code should be propagated to upper caller so that the hibernation could be aware of the result and terminates the process if md5 digest fails. Suggested-by: Thomas Gleixner Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate_64.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f8e3b668d20b..e0de959ad02d 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -265,9 +265,9 @@ static int get_e820_md5(struct e820_table *table, void *buf) return ret; } -static void hibernation_e820_save(void *buf) +static int hibernation_e820_save(void *buf) { - get_e820_md5(e820_table_firmware, buf); + return get_e820_md5(e820_table_firmware, buf); } static bool hibernation_e820_mismatch(void *buf) @@ -287,8 +287,9 @@ static bool hibernation_e820_mismatch(void *buf) return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; } #else -static void hibernation_e820_save(void *buf) +static int hibernation_e820_save(void *buf) { + return 0; } static bool hibernation_e820_mismatch(void *buf) @@ -333,9 +334,7 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) rdr->magic = RESTORE_MAGIC; - hibernation_e820_save(rdr->e820_digest); - - return 0; + return hibernation_e820_save(rdr->e820_digest); } /** From 8e5b2a3c5a773f161b57eee7156a63089edd2c5c Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:26:47 +0800 Subject: [PATCH 33/73] x86-32/asm/power: Create stack frames in hibernate_asm_32.S swsusp_arch_suspend() is callable non-leaf function which doesn't honor CONFIG_FRAME_POINTER, which can result in bad stack traces. Also it's not annotated as ELF callable function which can confuse tooling. Create a stack frame for it when CONFIG_FRAME_POINTER is enabled and give it proper ELF function annotation. Also in this patch introduces the restore_registers() symbol and gives it ELF function annotation, thus to prepare for later register restore. Analogous changes were made for 64bit before in commit ef0f3ed5a4ac (x86/asm/power: Create stack frames in hibernate_asm_64.S) and commit 4ce827b4cc58 (x86/power/64: Fix hibernation return address corruption). Signed-off-by: Zhimin Gu Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/suspend_32.h | 4 ++++ arch/x86/power/hibernate_asm_32.S | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 8be6afb58471..fdbd9d7b7bca 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -32,4 +32,8 @@ struct saved_context { unsigned long return_address; } __attribute__((packed)); +/* routines for saving/restoring kernel state */ +extern char core_restore_code[]; +extern char restore_registers[]; + #endif /* _ASM_X86_SUSPEND_32_H */ diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 6e56815e13a0..671d38d0d931 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -12,6 +12,7 @@ #include #include #include +#include .text @@ -24,8 +25,11 @@ ENTRY(swsusp_arch_suspend) pushfl popl saved_context_eflags + FRAME_BEGIN call swsusp_save + FRAME_END ret +ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) movl mmu_cr4_features, %ecx @@ -58,6 +62,10 @@ copy_loop: .p2align 4,,7 done: + + /* code below belongs to the image kernel */ + .align PAGE_SIZE +ENTRY(restore_registers) /* go back to the original page tables */ movl $swapper_pg_dir, %eax subl $__PAGE_OFFSET, %eax @@ -83,3 +91,4 @@ done: xorl %eax, %eax ret +ENDPROC(restore_registers) From 25862a049e6f04cc982f4bed25ed3e6f0a0a5a61 Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:26:58 +0800 Subject: [PATCH 34/73] x86, hibernate: Extract the common code of 64/32 bit system Reduce the hibernation code duplication between x86-32 and x86-64 by extracting the common code into hibernate.c. Currently only pfn_is_nosave() is the activated common function in hibernate.c No functional change. Acked-by: Pavel Machek Signed-off-by: Zhimin Gu Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/suspend.h | 8 ++ arch/x86/power/Makefile | 2 +- arch/x86/power/hibernate.c | 246 +++++++++++++++++++++++++++++++++ arch/x86/power/hibernate_32.c | 15 +- arch/x86/power/hibernate_64.c | 221 ----------------------------- 5 files changed, 256 insertions(+), 236 deletions(-) create mode 100644 arch/x86/power/hibernate.c diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h index ecffe81ff65c..40b02558749f 100644 --- a/arch/x86/include/asm/suspend.h +++ b/arch/x86/include/asm/suspend.h @@ -4,3 +4,11 @@ #else # include #endif +extern unsigned long restore_jump_address __visible; +extern unsigned long jump_address_phys; +extern unsigned long restore_cr3 __visible; +extern unsigned long temp_level4_pgt __visible; +extern unsigned long relocated_restore_code __visible; +extern int relocate_restore_code(void); +/* Defined in hibernate_asm_32/64.S */ +extern asmlinkage __visible int restore_image(void); diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index a4701389562c..37923d715741 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -7,4 +7,4 @@ nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_cpu.o := $(nostackp) obj-$(CONFIG_PM_SLEEP) += cpu.o -obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o +obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o hibernate.o diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c new file mode 100644 index 000000000000..f63793b8de3f --- /dev/null +++ b/arch/x86/power/hibernate.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hibernation support for x86 + * + * Copyright (c) 2007 Rafael J. Wysocki + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address __visible; +unsigned long jump_address_phys; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3 __visible; +unsigned long temp_level4_pgt __visible; +unsigned long relocated_restore_code __visible; + +/** + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn; + unsigned long nosave_end_pfn; + + nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + + return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn; +} + +#ifdef CONFIG_X86_64 + +#define MD5_DIGEST_SIZE 16 + +struct restore_data_record { + unsigned long jump_address; + unsigned long jump_address_phys; + unsigned long cr3; + unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; +}; + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 table + * + * @table: the e820 table to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820_table *table, void *buf) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int size; + int ret = 0; + + tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(tfm)) + return -ENOMEM; + + desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto free_tfm; + } + + desc->tfm = tfm; + desc->flags = 0; + + size = offsetof(struct e820_table, entries) + + sizeof(struct e820_entry) * table->nr_entries; + + if (crypto_shash_digest(desc, (u8 *)table, size, buf)) + ret = -EINVAL; + + kzfree(desc); + +free_tfm: + crypto_free_shash(tfm); + return ret; +} + +static int hibernation_e820_save(void *buf) +{ + return get_e820_md5(e820_table_firmware, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_table_firmware, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static int hibernation_e820_save(void *buf) +{ + return 0; +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif + +#define RESTORE_MAGIC 0x23456789ABCDEF01UL + +/** + * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->magic = RESTORE_MAGIC; + rdr->jump_address = (unsigned long)restore_registers; + rdr->jump_address_phys = __pa_symbol(restore_registers); + + /* + * The restore code fixes up CR3 and CR4 in the following sequence: + * + * [in hibernation asm] + * 1. CR3 <= temporary page tables + * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) + * 3. CR3 <= rdr->cr3 + * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) + * [in restore_processor_state()] + * 5. CR4 <= saved CR4 + * 6. CR3 <= saved CR3 + * + * Our mmu_cr4_features has CR4.PCIDE=0, and toggling + * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so + * rdr->cr3 needs to point to valid page tables but must not + * have any of the PCID bits set. + */ + rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; + + return hibernation_e820_save(rdr->e820_digest); +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + restore_jump_address = rdr->jump_address; + jump_address_phys = rdr->jump_address_phys; + restore_cr3 = rdr->cr3; + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; +} + +int relocate_restore_code(void) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + relocated_restore_code = get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + + memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); + + /* Make the page containing the relocated code executable */ + pgd = (pgd_t *)__va(read_cr3_pa()) + + pgd_index(relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); + if (pud_large(*pud)) { + set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; + } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: + __flush_tlb_all(); + return 0; +} +#endif diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index afc4ed7b1578..f82fbd279d08 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -14,9 +14,7 @@ #include #include #include - -/* Defined in hibernate_asm_32.S */ -extern int restore_image(void); +#include /* Pointer to the temporary resume page tables */ pgd_t *resume_pg_dir; @@ -162,14 +160,3 @@ asmlinkage int swsusp_arch_resume(void) restore_image(); return 0; } - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index e0de959ad02d..8bc2eb0dc464 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -26,26 +26,6 @@ #include #include -/* Defined in hibernate_asm_64.S */ -extern asmlinkage __visible int restore_image(void); - -/* - * Address to jump to in the last phase of restore in order to get to the image - * kernel's text (this value is passed in the image header). - */ -unsigned long restore_jump_address __visible; -unsigned long jump_address_phys; - -/* - * Value of the cr3 register from before the hibernation (this value is passed - * in the image header). - */ -unsigned long restore_cr3 __visible; - -unsigned long temp_level4_pgt __visible; - -unsigned long relocated_restore_code __visible; - static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; @@ -145,45 +125,6 @@ static int set_up_temporary_mappings(void) return 0; } -static int relocate_restore_code(void) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - relocated_restore_code = get_safe_page(GFP_ATOMIC); - if (!relocated_restore_code) - return -ENOMEM; - - memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); - - /* Make the page containing the relocated code executable */ - pgd = (pgd_t *)__va(read_cr3_pa()) + - pgd_index(relocated_restore_code); - p4d = p4d_offset(pgd, relocated_restore_code); - if (p4d_large(*p4d)) { - set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); - goto out; - } - pud = pud_offset(p4d, relocated_restore_code); - if (pud_large(*pud)) { - set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - goto out; - } - pmd = pmd_offset(pud, relocated_restore_code); - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - goto out; - } - pte = pte_offset_kernel(pmd, relocated_restore_code); - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); -out: - __flush_tlb_all(); - return 0; -} - asmlinkage int swsusp_arch_resume(void) { int error; @@ -200,165 +141,3 @@ asmlinkage int swsusp_arch_resume(void) restore_image(); return 0; } - -/* - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -#define MD5_DIGEST_SIZE 16 - -struct restore_data_record { - unsigned long jump_address; - unsigned long jump_address_phys; - unsigned long cr3; - unsigned long magic; - u8 e820_digest[MD5_DIGEST_SIZE]; -}; - -#define RESTORE_MAGIC 0x23456789ABCDEF01UL - -#if IS_BUILTIN(CONFIG_CRYPTO_MD5) -/** - * get_e820_md5 - calculate md5 according to given e820 table - * - * @table: the e820 table to be calculated - * @buf: the md5 result to be stored to - */ -static int get_e820_md5(struct e820_table *table, void *buf) -{ - struct crypto_shash *tfm; - struct shash_desc *desc; - int size; - int ret = 0; - - tfm = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(tfm)) - return -ENOMEM; - - desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), - GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto free_tfm; - } - - desc->tfm = tfm; - desc->flags = 0; - - size = offsetof(struct e820_table, entries) + - sizeof(struct e820_entry) * table->nr_entries; - - if (crypto_shash_digest(desc, (u8 *)table, size, buf)) - ret = -EINVAL; - - kzfree(desc); - -free_tfm: - crypto_free_shash(tfm); - return ret; -} - -static int hibernation_e820_save(void *buf) -{ - return get_e820_md5(e820_table_firmware, buf); -} - -static bool hibernation_e820_mismatch(void *buf) -{ - int ret; - u8 result[MD5_DIGEST_SIZE]; - - memset(result, 0, MD5_DIGEST_SIZE); - /* If there is no digest in suspend kernel, let it go. */ - if (!memcmp(result, buf, MD5_DIGEST_SIZE)) - return false; - - ret = get_e820_md5(e820_table_firmware, result); - if (ret) - return true; - - return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; -} -#else -static int hibernation_e820_save(void *buf) -{ - return 0; -} - -static bool hibernation_e820_mismatch(void *buf) -{ - /* If md5 is not builtin for restore kernel, let it go. */ - return false; -} -#endif - -/** - * arch_hibernation_header_save - populate the architecture specific part - * of a hibernation image header - * @addr: address to save the data at - */ -int arch_hibernation_header_save(void *addr, unsigned int max_size) -{ - struct restore_data_record *rdr = addr; - - if (max_size < sizeof(struct restore_data_record)) - return -EOVERFLOW; - rdr->jump_address = (unsigned long)restore_registers; - rdr->jump_address_phys = __pa_symbol(restore_registers); - - /* - * The restore code fixes up CR3 and CR4 in the following sequence: - * - * [in hibernation asm] - * 1. CR3 <= temporary page tables - * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) - * 3. CR3 <= rdr->cr3 - * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) - * [in restore_processor_state()] - * 5. CR4 <= saved CR4 - * 6. CR3 <= saved CR3 - * - * Our mmu_cr4_features has CR4.PCIDE=0, and toggling - * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so - * rdr->cr3 needs to point to valid page tables but must not - * have any of the PCID bits set. - */ - rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; - - rdr->magic = RESTORE_MAGIC; - - return hibernation_e820_save(rdr->e820_digest); -} - -/** - * arch_hibernation_header_restore - read the architecture specific data - * from the hibernation image header - * @addr: address to read the data from - */ -int arch_hibernation_header_restore(void *addr) -{ - struct restore_data_record *rdr = addr; - - restore_jump_address = rdr->jump_address; - jump_address_phys = rdr->jump_address_phys; - restore_cr3 = rdr->cr3; - - if (rdr->magic != RESTORE_MAGIC) { - pr_crit("Unrecognized hibernate image header format!\n"); - return -EINVAL; - } - - if (hibernation_e820_mismatch(rdr->e820_digest)) { - pr_crit("Hibernate inconsistent memory map detected!\n"); - return -ENODEV; - } - - return 0; -} From 445565303d19c75858004c129835fa418b740829 Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:27:29 +0800 Subject: [PATCH 35/73] x86-32, hibernate: Enable CONFIG_ARCH_HIBERNATION_HEADER on 32bit system Enable CONFIG_ARCH_HIBERNATION_HEADER for 32bit system so that 1. arch_hibernation_header_save/restore() are invoked across hibernation on 32bit system. 2. The checksum handling as well as 'magic' number checking for 32bit system are enabled. Controlled by CONFIG_X86_64 in hibernate.c Signed-off-by: Zhimin Gu Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/Kconfig | 2 +- arch/x86/power/hibernate.c | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1a0be022f91d..e8de5de1057f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2422,7 +2422,7 @@ menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER def_bool y - depends on X86_64 && HIBERNATION + depends on HIBERNATION source "kernel/power/Kconfig" diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index f63793b8de3f..a04ca13bf592 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -53,7 +53,6 @@ int pfn_is_nosave(unsigned long pfn) return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn; } -#ifdef CONFIG_X86_64 #define MD5_DIGEST_SIZE 16 @@ -140,7 +139,11 @@ static bool hibernation_e820_mismatch(void *buf) } #endif +#ifdef CONFIG_X86_64 #define RESTORE_MAGIC 0x23456789ABCDEF01UL +#else +#define RESTORE_MAGIC 0x12345678UL +#endif /** * arch_hibernation_header_save - populate the architecture specific part @@ -154,6 +157,7 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; rdr->magic = RESTORE_MAGIC; +#ifdef CONFIG_X86_64 rdr->jump_address = (unsigned long)restore_registers; rdr->jump_address_phys = __pa_symbol(restore_registers); @@ -175,6 +179,7 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) * have any of the PCID bits set. */ rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; +#endif return hibernation_e820_save(rdr->e820_digest); } @@ -193,9 +198,11 @@ int arch_hibernation_header_restore(void *addr) return -EINVAL; } +#ifdef CONFIG_X86_64 restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; +#endif if (hibernation_e820_mismatch(rdr->e820_digest)) { pr_crit("Hibernate inconsistent memory map detected!\n"); @@ -205,6 +212,7 @@ int arch_hibernation_header_restore(void *addr) return 0; } +#ifdef CONFIG_X86_64 int relocate_restore_code(void) { pgd_t *pgd; From 72adf47764a0be66ea0c712c7d5a6b8a81e28449 Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:27:40 +0800 Subject: [PATCH 36/73] x86, hibernate: Rename temp_level4_pgt to temp_pgt As 32bit system is not using 4-level page, rename it to temp_pgt so that it can be reused for both 32bit and 64bit hibernation. No functional change. Signed-off-by: Zhimin Gu Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/suspend.h | 2 +- arch/x86/power/hibernate.c | 2 +- arch/x86/power/hibernate_64.c | 2 +- arch/x86/power/hibernate_asm_64.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h index 40b02558749f..a892494ca5e4 100644 --- a/arch/x86/include/asm/suspend.h +++ b/arch/x86/include/asm/suspend.h @@ -7,7 +7,7 @@ extern unsigned long restore_jump_address __visible; extern unsigned long jump_address_phys; extern unsigned long restore_cr3 __visible; -extern unsigned long temp_level4_pgt __visible; +extern unsigned long temp_pgt __visible; extern unsigned long relocated_restore_code __visible; extern int relocate_restore_code(void); /* Defined in hibernate_asm_32/64.S */ diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index a04ca13bf592..e3409e4a9b6a 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -36,7 +36,7 @@ unsigned long jump_address_phys; * in the image header). */ unsigned long restore_cr3 __visible; -unsigned long temp_level4_pgt __visible; +unsigned long temp_pgt __visible; unsigned long relocated_restore_code __visible; /** diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 8bc2eb0dc464..239f424ccb29 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -121,7 +121,7 @@ static int set_up_temporary_mappings(void) return result; } - temp_level4_pgt = __pa(pgd); + temp_pgt = __pa(pgd); return 0; } diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index fd369a6e9ff8..3008baa2fa95 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -59,7 +59,7 @@ ENTRY(restore_image) movq restore_cr3(%rip), %r9 /* prepare to switch to temporary page tables */ - movq temp_level4_pgt(%rip), %rax + movq temp_pgt(%rip), %rax movq mmu_cr4_features(%rip), %rbx /* prepare to copy image data to their original locations */ From 7c0a982750b3f8bc6636e8086ac4d8b7775e90f6 Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:27:51 +0800 Subject: [PATCH 37/73] x86-32, hibernate: Use temp_pgt as the temporary page table This is to reuse the temp_pgt for both 32bit and 64bit system. No intentional behavior change. Signed-off-by: Zhimin Gu Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate_32.c | 2 ++ arch/x86/power/hibernate_asm_32.S | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index f82fbd279d08..a44bdada4e4e 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -156,6 +156,8 @@ asmlinkage int swsusp_arch_resume(void) if (error) return error; + temp_pgt = __pa(resume_pg_dir); + /* We have got enough memory and from now on we cannot recover */ restore_image(); return 0; diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 671d38d0d931..f0627cf30a36 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -33,8 +33,7 @@ ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) movl mmu_cr4_features, %ecx - movl resume_pg_dir, %eax - subl $__PAGE_OFFSET, %eax + movl temp_pgt, %eax movl %eax, %cr3 jecxz 1f # cr4 Pentium and higher, skip if zero From 0b0a6b1f76835cbaf746e7c72edd374ec0fe818b Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:28:01 +0800 Subject: [PATCH 38/73] x86-32, hibernate: Use the page size macro instead of constant value Convert the hard code into PAGE_SIZE for better scalability. No functional change. Signed-off-by: Zhimin Gu Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate_asm_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index f0627cf30a36..f5103ae96582 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -52,7 +52,7 @@ copy_loop: movl pbe_address(%edx), %esi movl pbe_orig_address(%edx), %edi - movl $1024, %ecx + movl $(PAGE_SIZE >> 2), %ecx rep movsl From 32aa276437f6128df63111af13e57fe8f0272af3 Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:28:11 +0800 Subject: [PATCH 39/73] x86-32, hibernate: Switch to original page table after resumed After all the pages are restored to previous address, the page table switches back to current swapper_pg_dir. However the swapper_pg_dir currently in used might not be consistent with previous page table, which might cause issue after resume. Fix this issue by switching to original page table after resume, and the address of the original page table is saved in the hibernation image header. Move the manipulation of restore_cr3 into common code blocks. Signed-off-by: Zhimin Gu Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate.c | 4 ++-- arch/x86/power/hibernate_asm_32.S | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index e3409e4a9b6a..4935b8139229 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -160,6 +160,7 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) #ifdef CONFIG_X86_64 rdr->jump_address = (unsigned long)restore_registers; rdr->jump_address_phys = __pa_symbol(restore_registers); +#endif /* * The restore code fixes up CR3 and CR4 in the following sequence: @@ -179,7 +180,6 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) * have any of the PCID bits set. */ rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; -#endif return hibernation_e820_save(rdr->e820_digest); } @@ -201,8 +201,8 @@ int arch_hibernation_header_restore(void *addr) #ifdef CONFIG_X86_64 restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; - restore_cr3 = rdr->cr3; #endif + restore_cr3 = rdr->cr3; if (hibernation_e820_mismatch(rdr->e820_digest)) { pr_crit("Hibernate inconsistent memory map detected!\n"); diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index f5103ae96582..6b2b94937113 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -25,6 +25,10 @@ ENTRY(swsusp_arch_suspend) pushfl popl saved_context_eflags + /* save cr3 */ + movl %cr3, %eax + movl %eax, restore_cr3 + FRAME_BEGIN call swsusp_save FRAME_END @@ -32,6 +36,8 @@ ENTRY(swsusp_arch_suspend) ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) + movl restore_cr3, %ebp + movl mmu_cr4_features, %ecx movl temp_pgt, %eax movl %eax, %cr3 @@ -66,9 +72,7 @@ done: .align PAGE_SIZE ENTRY(restore_registers) /* go back to the original page tables */ - movl $swapper_pg_dir, %eax - subl $__PAGE_OFFSET, %eax - movl %eax, %cr3 + movl %ebp, %cr3 movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %cr4; # turn PGE back on From 6bae499a0ad437efb67b7c378e6fb4abef1885a1 Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:28:22 +0800 Subject: [PATCH 40/73] x86-32, hibernate: Switch to relocated restore code during resume on 32bit system On 64bit system, code should be executed in a safe page during page restoring, as the page where instruction is running during resume might be scribbled and causes issues. Although on 32 bit, we only suspend resuming by same kernel that did the suspend, we'd like to remove that restriction in the future. Porting corresponding code from 64bit system: Allocate a safe page, and copy the restore code to it, then jump to the safe page to run the code. Signed-off-by: Zhimin Gu Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate.c | 2 -- arch/x86/power/hibernate_32.c | 4 ++++ arch/x86/power/hibernate_asm_32.S | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index 4935b8139229..7383cb67ffd7 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -212,7 +212,6 @@ int arch_hibernation_header_restore(void *addr) return 0; } -#ifdef CONFIG_X86_64 int relocate_restore_code(void) { pgd_t *pgd; @@ -251,4 +250,3 @@ int relocate_restore_code(void) __flush_tlb_all(); return 0; } -#endif diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index a44bdada4e4e..a9861095fbb8 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -158,6 +158,10 @@ asmlinkage int swsusp_arch_resume(void) temp_pgt = __pa(resume_pg_dir); + error = relocate_restore_code(); + if (error) + return error; + /* We have got enough memory and from now on we cannot recover */ restore_image(); return 0; diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 6b2b94937113..e9adda6b6b02 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -39,6 +39,13 @@ ENTRY(restore_image) movl restore_cr3, %ebp movl mmu_cr4_features, %ecx + + /* jump to relocated restore code */ + movl relocated_restore_code, %eax + jmpl *%eax + +/* code below has been relocated to a safe page */ +ENTRY(core_restore_code) movl temp_pgt, %eax movl %eax, %cr3 From 5331d2c7efbccab436aa11639d7fa00a1d58abe2 Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:28:32 +0800 Subject: [PATCH 41/73] x86-32, hibernate: Set up temporary text mapping for 32bit system Set up the temporary text mapping for the final jump address so that the system could jump to the right address after all the pages have been copied back to their original address - otherwise the final mapping for the jump address is invalid. Analogous changes were made for 64-bit in commit 65c0554b73c9 (x86/power/64: Fix kernel text mapping corruption during image restoration). Signed-off-by: Zhimin Gu Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate.c | 4 ---- arch/x86/power/hibernate_32.c | 31 +++++++++++++++++++++++++++++++ arch/x86/power/hibernate_asm_32.S | 3 +++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index 7383cb67ffd7..bcddf09b5aa3 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -157,10 +157,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; rdr->magic = RESTORE_MAGIC; -#ifdef CONFIG_X86_64 rdr->jump_address = (unsigned long)restore_registers; rdr->jump_address_phys = __pa_symbol(restore_registers); -#endif /* * The restore code fixes up CR3 and CR4 in the following sequence: @@ -198,10 +196,8 @@ int arch_hibernation_header_restore(void *addr) return -EINVAL; } -#ifdef CONFIG_X86_64 restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; -#endif restore_cr3 = rdr->cr3; if (hibernation_e820_mismatch(rdr->e820_digest)) { diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index a9861095fbb8..15695e30f982 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -143,6 +143,32 @@ static inline void resume_init_first_level_page_table(pgd_t *pg_dir) #endif } +static int set_up_temporary_text_mapping(pgd_t *pgd_base) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_base + pgd_index(restore_jump_address); + + pmd = resume_one_md_table_init(pgd); + if (!pmd) + return -ENOMEM; + + if (boot_cpu_has(X86_FEATURE_PSE)) { + set_pmd(pmd + pmd_index(restore_jump_address), + __pmd((jump_address_phys & PMD_MASK) | pgprot_val(PAGE_KERNEL_LARGE_EXEC))); + } else { + pte = resume_one_page_table_init(pmd); + if (!pte) + return -ENOMEM; + set_pte(pte + pte_index(restore_jump_address), + __pte((jump_address_phys & PAGE_MASK) | pgprot_val(PAGE_KERNEL_EXEC))); + } + + return 0; +} + asmlinkage int swsusp_arch_resume(void) { int error; @@ -152,6 +178,11 @@ asmlinkage int swsusp_arch_resume(void) return -ENOMEM; resume_init_first_level_page_table(resume_pg_dir); + + error = set_up_temporary_text_mapping(resume_pg_dir); + if (error) + return error; + error = resume_physical_mapping_init(resume_pg_dir); if (error) return error; diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index e9adda6b6b02..01f653fae7bd 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -36,6 +36,8 @@ ENTRY(swsusp_arch_suspend) ENDPROC(swsusp_arch_suspend) ENTRY(restore_image) + /* prepare to jump to the image kernel */ + movl restore_jump_address, %ebx movl restore_cr3, %ebp movl mmu_cr4_features, %ecx @@ -74,6 +76,7 @@ copy_loop: .p2align 4,,7 done: + jmpl *%ebx /* code below belongs to the image kernel */ .align PAGE_SIZE From 1fca4ba0b14d79e8a43822f7b0c7288efba4c9fa Mon Sep 17 00:00:00 2001 From: Zhimin Gu Date: Fri, 21 Sep 2018 14:28:41 +0800 Subject: [PATCH 42/73] x86-32, hibernate: Adjust in_suspend after resumed on 32bit system Update the in_suspend variable to reflect the actual hibernation status. Back-port from 64bit system. Signed-off-by: Zhimin Gu Acked-by: Pavel Machek Signed-off-by: Chen Yu Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate_asm_32.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 01f653fae7bd..6fe383002125 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -103,5 +103,8 @@ ENTRY(restore_registers) xorl %eax, %eax + /* tell the hibernation core that we've just restored the memory */ + movl %eax, in_suspend + ret ENDPROC(restore_registers) From 03dba278043358bbbf4f029be169e1e73d2fbe2b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 1 Oct 2018 11:56:21 +0200 Subject: [PATCH 43/73] cpuidle: menu: Replace data->predicted_us with local variable The predicted_us field in struct menu_device is only accessed in menu_select(), so replace it with a local variable in that function. With that, stop using expected_interval instead of predicted_us to store the new predicted idle duration value if it is set to the selected state's target residency which is quite confusing. Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano --- drivers/cpuidle/governors/menu.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 021b08dd139c..b6684fd89085 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -124,7 +124,6 @@ struct menu_device { int tick_wakeup; unsigned int next_timer_us; - unsigned int predicted_us; unsigned int bucket; unsigned int correction_factor[BUCKETS]; unsigned int intervals[INTERVALS]; @@ -290,6 +289,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, int idx; unsigned int interactivity_req; unsigned int expected_interval; + unsigned int predicted_us; unsigned long nr_iowaiters, cpu_load; ktime_t delta_next; @@ -315,7 +315,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * operands are 32 bits. * Make sure to round up for half microseconds. */ - data->predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us * + predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us * data->correction_factor[data->bucket], RESOLUTION * DECAY); @@ -341,7 +341,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* * Use the lowest expected idle interval to pick the idle state. */ - data->predicted_us = min(data->predicted_us, expected_interval); + predicted_us = min(predicted_us, expected_interval); if (tick_nohz_tick_stopped()) { /* @@ -352,19 +352,18 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * the known time till the closest timer event for the idle * state selection. */ - if (data->predicted_us < TICK_USEC) - data->predicted_us = ktime_to_us(delta_next); + if (predicted_us < TICK_USEC) + predicted_us = ktime_to_us(delta_next); } else { /* * Use the performance multiplier and the user-configurable * latency_req to determine the maximum exit latency. */ - interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); + interactivity_req = predicted_us / performance_multiplier(nr_iowaiters, cpu_load); if (latency_req > interactivity_req) latency_req = interactivity_req; } - expected_interval = data->predicted_us; /* * Find the idle state with the lowest power while satisfying * our constraints. @@ -378,8 +377,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; if (idx == -1) idx = i; /* first enabled state */ - if (s->target_residency > data->predicted_us) { - if (data->predicted_us < TICK_USEC) + if (s->target_residency > predicted_us) { + if (predicted_us < TICK_USEC) break; if (!tick_nohz_tick_stopped()) { @@ -389,7 +388,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * tick in that case and let the governor run * again in the next iteration of the loop. */ - expected_interval = drv->states[idx].target_residency; + predicted_us = drv->states[idx].target_residency; break; } @@ -412,7 +411,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration so that the tick is retained * as long as that target residency is low enough. */ - expected_interval = drv->states[idx].target_residency; + predicted_us = drv->states[idx].target_residency; break; } idx = i; @@ -426,7 +425,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * expected idle duration is shorter than the tick period length. */ if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - expected_interval < TICK_USEC) && !tick_nohz_tick_stopped()) { + predicted_us < TICK_USEC) && !tick_nohz_tick_stopped()) { unsigned int delta_next_us = ktime_to_us(delta_next); *stop_tick = false; From 8ff3c22688ff3616ee53c5c54b3ee66a73326175 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 2 Oct 2018 15:34:57 -0700 Subject: [PATCH 44/73] cpufreq / CPPC: Mark acpi_ids as used Clang warns: drivers/cpufreq/cppc_cpufreq.c:431:36: warning: variable 'cppc_acpi_ids' is not needed and will not be emitted [-Wunneeded-internal-declaration] static const struct acpi_device_id cppc_acpi_ids[] = { ^ 1 warning generated. Mark the definition as used so that Clang understands we don't want this warning while not inhibiting Clang's dead code elimination from removing the unreferenced internal symbol when moving the data it contains to the globally available symbol via MODULE_DEVICE_TABLE. $ nm -S drivers/cpufreq/cppc_cpufreq.o | grep acpi | tail -1 0000000000000000 0000000000000040 R __mod_acpi__cppc_acpi_ids_device_table Suggested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cppc_cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 30f302149730..fd25c21cee72 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -428,7 +428,7 @@ MODULE_LICENSE("GPL"); late_initcall(cppc_cpufreq_init); -static const struct acpi_device_id cppc_acpi_ids[] = { +static const struct acpi_device_id cppc_acpi_ids[] __used = { {ACPI_PROCESSOR_DEVICE_HID, }, {} }; From 50b6b87c8484da8bb5dcce00f84ec80aee8fc2bd Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 3 Oct 2018 15:12:06 +0530 Subject: [PATCH 45/73] OPP: Improve error handling in dev_pm_opp_of_cpumask_add_table() The error handling wasn't appropriate in dev_pm_opp_of_cpumask_add_table(). For example it returns 0 on success and also for the case where cpumask is empty or cpu_device wasn't found for any of the CPUs. It should really return error on such cases, so that the callers can be aware of the outcome. Fix it. Signed-off-by: Viresh Kumar --- drivers/opp/of.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index a71ff3acca0f..67a384c8ead2 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -614,16 +614,18 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table); int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask) { struct device *cpu_dev; - int cpu, ret = 0; + int cpu, ret; - WARN_ON(cpumask_empty(cpumask)); + if (WARN_ON(cpumask_empty(cpumask))) + return -ENODEV; for_each_cpu(cpu, cpumask) { cpu_dev = get_cpu_device(cpu); if (!cpu_dev) { pr_err("%s: failed to get cpu%d device\n", __func__, cpu); - continue; + ret = -ENODEV; + goto remove_table; } ret = dev_pm_opp_of_add_table(cpu_dev); @@ -635,12 +637,16 @@ int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask) pr_debug("%s: couldn't find opp table for cpu:%d, %d\n", __func__, cpu, ret); - /* Free all other OPPs */ - _dev_pm_opp_cpumask_remove_table(cpumask, cpu); - break; + goto remove_table; } } + return 0; + +remove_table: + /* Free all other OPPs */ + _dev_pm_opp_cpumask_remove_table(cpumask, cpu); + return ret; } EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table); From 09f662f95306f3e3d47ab6842bc4b0bb868a80ad Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 3 Oct 2018 15:22:03 +0530 Subject: [PATCH 46/73] OPP: Return error on error from dev_pm_opp_get_opp_count() Return error number instead of 0 on failures. Fixes: a1e8c13600bf ("PM / OPP: "opp-hz" is optional for power domains") Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index cdf918aaac34..2c2df4e4fc14 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -318,7 +318,7 @@ int dev_pm_opp_get_opp_count(struct device *dev) count = PTR_ERR(opp_table); dev_dbg(dev, "%s: OPP table not found (%d)\n", __func__, count); - return 0; + return count; } count = _get_opp_count(opp_table); From 51c99dd2c06b234575661fa1e0a1dea6c3ef566f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 3 Oct 2018 15:35:21 +0530 Subject: [PATCH 47/73] cpufreq: dt: Try freeing static OPPs only if we have added them We can not call dev_pm_opp_of_cpumask_remove_table() freely anymore since the latest OPP core updates as that uses reference counting to free resources. There are cases where no static OPPs are added (using DT) for a platform and trying to remove the OPP table may end up decrementing refcount which is already zero and hence generating warnings. Lets track if we were able to add static OPPs or not and then only remove the table based on that. Some reshuffling of code is also done to do that. Reported-by: Niklas Cassel Tested-by: Niklas Cassel Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 0a9ebf00be46..e58bfcb1169e 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -32,6 +32,7 @@ struct private_data { struct device *cpu_dev; struct thermal_cooling_device *cdev; const char *reg_name; + bool have_static_opps; }; static struct freq_attr *cpufreq_dt_attr[] = { @@ -204,6 +205,15 @@ static int cpufreq_init(struct cpufreq_policy *policy) } } + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto out_put_regulator; + } + + priv->reg_name = name; + priv->opp_table = opp_table; + /* * Initialize OPP tables for all policy->cpus. They will be shared by * all CPUs which have marked their CPUs shared with OPP bindings. @@ -214,7 +224,8 @@ static int cpufreq_init(struct cpufreq_policy *policy) * * OPPs might be populated at runtime, don't check for error here */ - dev_pm_opp_of_cpumask_add_table(policy->cpus); + if (!dev_pm_opp_of_cpumask_add_table(policy->cpus)) + priv->have_static_opps = true; /* * But we need OPP table to function so if it is not there let's @@ -240,19 +251,10 @@ static int cpufreq_init(struct cpufreq_policy *policy) __func__, ret); } - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (!priv) { - ret = -ENOMEM; - goto out_free_opp; - } - - priv->reg_name = name; - priv->opp_table = opp_table; - ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); if (ret) { dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); - goto out_free_priv; + goto out_free_opp; } priv->cpu_dev = cpu_dev; @@ -282,10 +284,11 @@ static int cpufreq_init(struct cpufreq_policy *policy) out_free_cpufreq_table: dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table); -out_free_priv: - kfree(priv); out_free_opp: - dev_pm_opp_of_cpumask_remove_table(policy->cpus); + if (priv->have_static_opps) + dev_pm_opp_of_cpumask_remove_table(policy->cpus); + kfree(priv); +out_put_regulator: if (name) dev_pm_opp_put_regulators(opp_table); out_put_clk: @@ -300,7 +303,8 @@ static int cpufreq_exit(struct cpufreq_policy *policy) cpufreq_cooling_unregister(priv->cdev); dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table); - dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); + if (priv->have_static_opps) + dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); if (priv->reg_name) dev_pm_opp_put_regulators(priv->opp_table); From deac8703da5faafce96be3f91c668dcd5da931a5 Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Wed, 3 Oct 2018 16:13:15 +0530 Subject: [PATCH 48/73] PM / OPP: _of_add_opp_table_v2(): increment count only if OPP is added Currently the _of_add_opp_table_v2 call loops through the OPP nodes in the operating-points-v2 table in the device tree and calls _opp_add_static_v2 for each to add them to the table. It counts each iteration through this loop as an added OPP, however there are cases where _opp_add_static_v2() returns 0 but no new OPP is added to the list. This can happen while adding duplicate OPP or if the OPP isn't supported by hardware. Because of this the count variable will contain the number of OPP nodes in the table in device tree but not necessarily the ones that are actually added. As this count value is what is checked to determine if there are any valid OPPs, if a platform has an operating-points-v2 table with all OPP nodes containing opp-supported-hw values that are not currently supported, then _of_add_opp_table_v2 will fail to abort as it should due to an empty table. Additionally, since commit 3ba98324e81a ("PM / OPP: Get performance state using genpd helper"), the same count variable is compared against the number of OPPs containing performance states and requires that either all or none have pstates set, however in the case of any opp table that has any entries that do not get added by _opp_add_static_v2 due to incompatible opp-supported-hw fields, these numbers will not match and _of_add_opp_table_v2 will incorrectly fail. We need to clearly identify all the three cases (success, failure, unsupported/duplicate OPPs) and then increment count only on success case. Change return type of _opp_add_static_v2() to return the pointer to the newly added OPP instead of an integer. This routine now returns a valid pointer if the OPP is really added, NULL for unsupported or duplicate OPPs, and error value cased as a pointer on errors. Ideally the fixes tag in this commit should point back to the commit that introduced OPP v2 initially, as that's where we started incorrectly accounting for duplicate OPPs: commit 274659029c9d ("PM / OPP: Add support to parse "operating-points-v2" bindings") But it wasn't a real problem until recently as the count was only used to check if any OPPs are added or not. And so this commit points to a rather recent commit where we added more code that depends on the value of "count". Fixes: 3ba98324e81a ("PM / OPP: Get performance state using genpd helper") Reported-by: Dave Gerlach Reported-by: Niklas Cassel Tested-by: Niklas Cassel Signed-off-by: Dave Gerlach Signed-off-by: Viresh Kumar --- drivers/opp/of.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 67a384c8ead2..5a4b47958073 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -297,15 +297,21 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); * removed by dev_pm_opp_remove. * * Return: - * 0 On success OR + * Valid OPP pointer: + * On success + * NULL: * Duplicate OPPs (both freq and volt are same) and opp->available - * -EEXIST Freq are same and volt are different OR + * OR if the OPP is not supported by hardware. + * ERR_PTR(-EEXIST): + * Freq are same and volt are different OR * Duplicate OPPs (both freq and volt are same) and !opp->available - * -ENOMEM Memory allocation failure - * -EINVAL Failed parsing the OPP node + * ERR_PTR(-ENOMEM): + * Memory allocation failure + * ERR_PTR(-EINVAL): + * Failed parsing the OPP node */ -static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev, - struct device_node *np) +static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table, + struct device *dev, struct device_node *np) { struct dev_pm_opp *new_opp; u64 rate = 0; @@ -315,7 +321,7 @@ static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev, new_opp = _opp_allocate(opp_table); if (!new_opp) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = of_property_read_u64(np, "opp-hz", &rate); if (ret < 0) { @@ -390,12 +396,12 @@ static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev, * frequency/voltage list. */ blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp); - return 0; + return new_opp; free_opp: _opp_free(new_opp); - return ret; + return ERR_PTR(ret); } /* Initializes OPP tables based on new bindings */ @@ -415,14 +421,15 @@ static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table) /* We have opp-table node now, iterate over it and add OPPs */ for_each_available_child_of_node(opp_table->np, np) { - count++; - - ret = _opp_add_static_v2(opp_table, dev, np); - if (ret) { + opp = _opp_add_static_v2(opp_table, dev, np); + if (IS_ERR(opp)) { + ret = PTR_ERR(opp); dev_err(dev, "%s: Failed to add OPP, %d\n", __func__, ret); of_node_put(np); goto put_list_kref; + } else if (opp) { + count++; } } From 1cdda9486f5103fb133f88e662e48c504adbb779 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Fri, 28 Sep 2018 14:24:02 +0530 Subject: [PATCH 49/73] ACPI / PM: LPIT: Register sysfs attributes based on FADT ACPI Low Power S0 Idle capabilities are announced via FADT table and can be used to inform the kernel about the presence of one or more Low Power Idle (LPI) entries as descried in LPIT table. LPIT table can exist independently even if the FADT S0 Idle flag is not set and thus it could confuse user since the following cpuidle attributes are created. /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us /sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us Presence or absence of above attributes could mean that the given platform supports S0ix state or not. This change allows to create the above cpuidle attributes only if FADT table supports Low Power S0 Idle. Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_lpit.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/acpi_lpit.c b/drivers/acpi/acpi_lpit.c index cf4fc0161164..e43cb71b6972 100644 --- a/drivers/acpi/acpi_lpit.c +++ b/drivers/acpi/acpi_lpit.c @@ -117,11 +117,17 @@ static void lpit_update_residency(struct lpit_residency_info *info, if (!info->iomem_addr) return; + if (!(acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)) + return; + /* Silently fail, if cpuidle attribute group is not present */ sysfs_add_file_to_group(&cpu_subsys.dev_root->kobj, &dev_attr_low_power_idle_system_residency_us.attr, "cpuidle"); } else if (info->gaddr.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { + if (!(acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)) + return; + /* Silently fail, if cpuidle attribute group is not present */ sysfs_add_file_to_group(&cpu_subsys.dev_root->kobj, &dev_attr_low_power_idle_cpu_residency_us.attr, From 5f26bdceb9c0a5e6c696aa2899d077cd3ae93413 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Oct 2018 23:42:02 +0200 Subject: [PATCH 50/73] cpuidle: menu: Fix wakeup statistics updates for polling state If the CPU exits the "polling" state due to the time limit in the loop in poll_idle(), this is not a real wakeup and it just means that the "polling" state selection was not adequate. The governor mispredicted short idle duration, but had a more suitable state been selected, the CPU might have spent more time in it. In fact, there is no reason to expect that there would have been a wakeup event earlier than the next timer in that case. Handling such cases as regular wakeups in menu_update() may cause the menu governor to make suboptimal decisions going forward, but ignoring them altogether would not be correct either, because every time menu_select() is invoked, it makes a separate new attempt to predict the idle duration taking distinct time to the closest timer event as input and the outcomes of all those attempts should be recorded. For this reason, make menu_update() always assume that if the "polling" state was exited due to the time limit, the next proper wakeup event for the CPU would be the next timer event (not including the tick). Fixes: a37b969a61c1 "cpuidle: poll_state: Add time limit to poll_idle()" Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Lezcano --- drivers/cpuidle/governors/menu.c | 10 ++++++++++ drivers/cpuidle/poll_state.c | 6 +++++- include/linux/cpuidle.h | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index b6684fd89085..8b3f9c7bf221 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -511,6 +511,16 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * duration predictor do a better job next time. */ measured_us = 9 * MAX_INTERESTING / 10; + } else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) && + dev->poll_time_limit) { + /* + * The CPU exited the "polling" state due to a time limit, so + * the idle duration prediction leading to the selection of that + * state was inaccurate. If a better prediction had been made, + * the CPU might have been woken up from idle by the next timer. + * Assume that to be the case. + */ + measured_us = data->next_timer_us; } else { /* measured value */ measured_us = dev->last_residency; diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 3f86d23c592e..36ff5a1d9422 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -17,6 +17,8 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, { u64 time_start = local_clock(); + dev->poll_time_limit = false; + local_irq_enable(); if (!current_set_polling_and_test()) { unsigned int loop_count = 0; @@ -27,8 +29,10 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, continue; loop_count = 0; - if (local_clock() - time_start > POLL_IDLE_TIME_LIMIT) + if (local_clock() - time_start > POLL_IDLE_TIME_LIMIT) { + dev->poll_time_limit = true; break; + } } } current_clr_polling(); diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index d262f620890e..faed7a8977e8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -81,6 +81,7 @@ struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; unsigned int use_deepest_state:1; + unsigned int poll_time_limit:1; unsigned int cpu; int last_residency; From 23e8ceb9ce766c81d62434053aef6e7efea6fcc3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Oct 2018 23:42:56 +0200 Subject: [PATCH 51/73] cpuidle: menu: Compute first_idx when latency_req is known Since menu_select() can only set first_idx to 1 if the exit latency of the second state is not greater than the latency limit, it should first determine that limit. Thus first_idx should be computed after the "interactivity" factor has been taken into account. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Reviewedy-by: Daniel Lezcano --- drivers/cpuidle/governors/menu.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 8b3f9c7bf221..98a39c57acec 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -322,22 +322,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, expected_interval = get_typical_interval(data); expected_interval = min(expected_interval, data->next_timer_us); - first_idx = 0; - if (drv->states[0].flags & CPUIDLE_FLAG_POLLING) { - struct cpuidle_state *s = &drv->states[1]; - unsigned int polling_threshold; - - /* - * Default to a physical idle state, not to busy polling, unless - * a timer is going to trigger really really soon. - */ - polling_threshold = max_t(unsigned int, 20, s->target_residency); - if (data->next_timer_us > polling_threshold && - latency_req > s->exit_latency && !s->disabled && - !dev->states_usage[1].disable) - first_idx = 1; - } - /* * Use the lowest expected idle interval to pick the idle state. */ @@ -364,6 +348,22 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, latency_req = interactivity_req; } + first_idx = 0; + if (drv->states[0].flags & CPUIDLE_FLAG_POLLING) { + struct cpuidle_state *s = &drv->states[1]; + unsigned int polling_threshold; + + /* + * Default to a physical idle state, not to busy polling, unless + * a timer is going to trigger really really soon. + */ + polling_threshold = max_t(unsigned int, 20, s->target_residency); + if (data->next_timer_us > polling_threshold && + latency_req > s->exit_latency && !s->disabled && + !dev->states_usage[1].disable) + first_idx = 1; + } + /* * Find the idle state with the lowest power while satisfying * our constraints. From 96c3d11df15323e7f6e55242f7bda610c4bef402 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Oct 2018 23:44:06 +0200 Subject: [PATCH 52/73] cpuidle: menu: Get rid of first_idx from menu_select() Rearrange the code in menu_select() so that the loop over idle states always starts from 0 and get rid of the first_idx variable. While at it, add two empty lines to separate conditional statements from one another. No intentional behavior changes. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Lezcano --- drivers/cpuidle/governors/menu.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 98a39c57acec..5c7fafcb1135 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -285,7 +285,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct menu_device *data = this_cpu_ptr(&menu_devices); int latency_req = cpuidle_governor_latency_req(dev->cpu); int i; - int first_idx; int idx; unsigned int interactivity_req; unsigned int expected_interval; @@ -348,36 +347,33 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, latency_req = interactivity_req; } - first_idx = 0; - if (drv->states[0].flags & CPUIDLE_FLAG_POLLING) { - struct cpuidle_state *s = &drv->states[1]; - unsigned int polling_threshold; - - /* - * Default to a physical idle state, not to busy polling, unless - * a timer is going to trigger really really soon. - */ - polling_threshold = max_t(unsigned int, 20, s->target_residency); - if (data->next_timer_us > polling_threshold && - latency_req > s->exit_latency && !s->disabled && - !dev->states_usage[1].disable) - first_idx = 1; - } - /* * Find the idle state with the lowest power while satisfying * our constraints. */ idx = -1; - for (i = first_idx; i < drv->state_count; i++) { + for (i = 0; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; struct cpuidle_state_usage *su = &dev->states_usage[i]; if (s->disabled || su->disable) continue; + if (idx == -1) idx = i; /* first enabled state */ + if (s->target_residency > predicted_us) { + /* + * Use a physical idle state, not busy polling, unless + * a timer is going to trigger really really soon. + */ + if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && + i == idx + 1 && latency_req > s->exit_latency && + data->next_timer_us > max_t(unsigned int, 20, + s->target_residency)) { + idx = i; + break; + } if (predicted_us < TICK_USEC) break; From eb40a380bff28f84b6583bba6786b46ef26ef548 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Oct 2018 23:45:07 +0200 Subject: [PATCH 53/73] cpuidle: menu: Do not update last_state_idx in menu_select() It is not necessary to update data->last_state_idx in menu_select() as it only is used in menu_update() which only runs when data->needs_update is set and that is set only when updating data->last_state_idx in menu_reflect(). Accordingly, drop the update of data->last_state_idx from menu_select() and get rid of the (now redundant) "out" label from it. No intentional behavior changes. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Lezcano --- drivers/cpuidle/governors/menu.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 5c7fafcb1135..968379d4f207 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -398,7 +398,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, s->target_residency <= ktime_to_us(delta_next)) idx = i; - goto out; + return idx; } if (s->exit_latency > latency_req) { /* @@ -445,10 +445,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } } -out: - data->last_state_idx = idx; - - return data->last_state_idx; + return idx; } /** From 8b007ebec9a5d120d25c00d9b771aadf45ac5177 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Oct 2018 23:46:28 +0200 Subject: [PATCH 54/73] cpuidle: menu: Avoid computations for very close timers If the next timer event (not including the tick) is closer than the target residency of the second state or the PM QoS latency constraint is below its exit latency, state[0] will be used regardless of any other factors, so skip the computations in menu_select() then and return 0 straight away from it. Still, do that after the bucket has been determined to avoid updating the correction factor for a stale bucket. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/governors/menu.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 968379d4f207..667606c6e284 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -309,6 +309,18 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, get_iowait_load(&nr_iowaiters, &cpu_load); data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); + if (unlikely(drv->state_count <= 1) || + ((data->next_timer_us < drv->states[1].target_residency || + latency_req < drv->states[1].exit_latency) && + !drv->states[0].disabled && !dev->states_usage[0].disable)) { + /* + * In this case state[0] will be used no matter what, so return + * it right away and keep the tick running. + */ + *stop_tick = false; + return 0; + } + /* * Force the result of multiplication to be 64 bits even if both * operands are 32 bits. From 53812cdc9100e19f2e782851964355f2db5583de Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Oct 2018 23:47:43 +0200 Subject: [PATCH 55/73] cpuidle: menu: Move the latency_req == 0 special case check It is better to always update data->bucket before returning from menu_select() to avoid updating the correction factor for a stale bucket, so combine the latency_req == 0 special check with the more general check below. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/governors/menu.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 667606c6e284..b754296eb0c5 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -297,19 +297,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->needs_update = 0; } - /* Special case when user has set very strict latency requirement */ - if (unlikely(latency_req == 0)) { - *stop_tick = false; - return 0; - } - /* determine the expected residency time, round up */ data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next)); get_iowait_load(&nr_iowaiters, &cpu_load); data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); - if (unlikely(drv->state_count <= 1) || + if (unlikely(drv->state_count <= 1 || latency_req == 0) || ((data->next_timer_us < drv->states[1].target_residency || latency_req < drv->states[1].exit_latency) && !drv->states[0].disabled && !dev->states_usage[0].disable)) { From 01bad1c6896db021db82042e71c2bf1f97cc026b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 2 Oct 2018 23:50:30 +0200 Subject: [PATCH 56/73] cpuidle: poll_state: Revise loop termination condition If need_resched() returns "false", breaking out of the loop in poll_idle() will cause a new idle state to be selected, so in fact it usually doesn't make sense to spin in it longer than the target residency of the second state. [Note that the "polling" state is used only if there is at least one "real" state defined in addition to it, so the second state is always there.] On the other hand, breaking out of it early (say in case the next state is disabled) shouldn't hurt as it is polling anyway. For this reason, make the loop in poll_idle() break if the CPU has been spinning longer than the target residency of the second state (the "polling" state can only be state[0]). Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/poll_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 36ff5a1d9422..85792d371add 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -9,7 +9,6 @@ #include #include -#define POLL_IDLE_TIME_LIMIT (TICK_NSEC / 16) #define POLL_IDLE_RELAX_COUNT 200 static int __cpuidle poll_idle(struct cpuidle_device *dev, @@ -21,6 +20,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, local_irq_enable(); if (!current_set_polling_and_test()) { + u64 limit = (u64)drv->states[1].target_residency * NSEC_PER_USEC; unsigned int loop_count = 0; while (!need_resched()) { @@ -29,7 +29,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, continue; loop_count = 0; - if (local_clock() - time_start > POLL_IDLE_TIME_LIMIT) { + if (local_clock() - time_start > limit) { dev->poll_time_limit = true; break; } From 9d21d33cab2dd5aadd23ad51baf1dc49d544fcc0 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 5 Oct 2018 12:00:58 -0700 Subject: [PATCH 57/73] cpufreq: dt-platdev: allow RK3399 to have separate tunables per cluster RK3899 has one cluster with 4 small cores, and another one with 2 big cores, with cores in different clusters having different OPPs and thus needing separate set of tunables. Let's enable this via "have_governor_per_policy" platform data. Signed-off-by: Dmitry Torokhov Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt-platdev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 805f8a0c3470..b1c5468dca16 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -79,7 +79,10 @@ static const struct of_device_id whitelist[] __initconst = { { .compatible = "rockchip,rk3328", }, { .compatible = "rockchip,rk3366", }, { .compatible = "rockchip,rk3368", }, - { .compatible = "rockchip,rk3399", }, + { .compatible = "rockchip,rk3399", + .data = &(struct cpufreq_dt_platform_data) + { .have_governor_per_policy = true, }, + }, { .compatible = "st-ericsson,u8500", }, { .compatible = "st-ericsson,u8540", }, From 2733fb0d0699246711cf622e0e2faf02a05b69dc Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Mon, 8 Oct 2018 14:07:34 +0800 Subject: [PATCH 58/73] cpufreq: imx6q: read OCOTP through nvmem for imx6ul/imx6ull On i.MX6UL/i.MX6ULL, accessing OCOTP directly is wrong because the ocotp clock needs to be enabled first. Add support for reading OCOTP through the nvmem API, and keep the old method there to support old dtb. Signed-off-by: Anson Huang Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/imx6q-cpufreq.c | 52 ++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index b2ff423ad7f8..8cfee0ab804b 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -290,20 +291,32 @@ static void imx6q_opp_check_speed_grading(struct device *dev) #define OCOTP_CFG3_6ULL_SPEED_792MHZ 0x2 #define OCOTP_CFG3_6ULL_SPEED_900MHZ 0x3 -static void imx6ul_opp_check_speed_grading(struct device *dev) +static int imx6ul_opp_check_speed_grading(struct device *dev) { - struct device_node *np; - void __iomem *base; u32 val; + int ret = 0; - np = of_find_compatible_node(NULL, NULL, "fsl,imx6ul-ocotp"); - if (!np) - return; + if (of_find_property(dev->of_node, "nvmem-cells", NULL)) { + ret = nvmem_cell_read_u32(dev, "speed_grade", &val); + if (ret) + return ret; + } else { + struct device_node *np; + void __iomem *base; - base = of_iomap(np, 0); - if (!base) { - dev_err(dev, "failed to map ocotp\n"); - goto put_node; + np = of_find_compatible_node(NULL, NULL, "fsl,imx6ul-ocotp"); + if (!np) + return -ENOENT; + + base = of_iomap(np, 0); + of_node_put(np); + if (!base) { + dev_err(dev, "failed to map ocotp\n"); + return -EFAULT; + } + + val = readl_relaxed(base + OCOTP_CFG3); + iounmap(base); } /* @@ -314,7 +327,6 @@ static void imx6ul_opp_check_speed_grading(struct device *dev) * 2b'11: 900000000Hz on i.MX6ULL only; * We need to set the max speed of ARM according to fuse map. */ - val = readl_relaxed(base + OCOTP_CFG3); val >>= OCOTP_CFG3_SPEED_SHIFT; val &= 0x3; @@ -334,9 +346,7 @@ static void imx6ul_opp_check_speed_grading(struct device *dev) dev_warn(dev, "failed to disable 900MHz OPP\n"); } - iounmap(base); -put_node: - of_node_put(np); + return ret; } static int imx6q_cpufreq_probe(struct platform_device *pdev) @@ -394,10 +404,18 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev) } if (of_machine_is_compatible("fsl,imx6ul") || - of_machine_is_compatible("fsl,imx6ull")) - imx6ul_opp_check_speed_grading(cpu_dev); - else + of_machine_is_compatible("fsl,imx6ull")) { + ret = imx6ul_opp_check_speed_grading(cpu_dev); + if (ret == -EPROBE_DEFER) + return ret; + if (ret) { + dev_err(cpu_dev, "failed to read ocotp: %d\n", + ret); + return ret; + } + } else { imx6q_opp_check_speed_grading(cpu_dev); + } /* Because we have added the OPPs here, we must free them */ free_opp = true; From 8c22e2f695920ebd94f9a53bcf2a65eb36d4dba1 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 8 Oct 2018 11:06:18 -0400 Subject: [PATCH 59/73] cpupower: Fix AMD Family 0x17 msr_pstate size The msr_pstate data is only 63 bits long and should be 64 bits. Add in the missing bit from res1 for AMD Family 0x17. Reference: https://www.amd.com/system/files/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf, page 138. Signed-off-by: Prarit Bhargava Cc: Shuah Khan Cc: Stafford Horne Signed-off-by: Shuah Khan (Samsung OSG) --- tools/power/cpupower/utils/helpers/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index bb41cdd0df6b..eed31b27909b 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -33,7 +33,7 @@ union msr_pstate { unsigned vid:8; unsigned iddval:8; unsigned idddiv:2; - unsigned res1:30; + unsigned res1:31; unsigned en:1; } fam17h_bits; unsigned long long val; From f69ffc5d3db8f1f03fd6d1df5930f9a1fbd787b6 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 8 Oct 2018 11:06:19 -0400 Subject: [PATCH 60/73] cpupower: Fix coredump on VMWare cpupower crashes on VMWare guests. The guests have the AMD PStateDef MSR (0xC0010064 + state number) set to zero. As a result fid and did are zero and the crash occurs because of a divide by zero (cof = fid/did). This can be prevented by checking the enable bit in the PStateDef MSR before calculating cof. By doing this the value of pstate[i] remains zero and the value can be tested before displaying the active Pstates. Check the enable bit in the PstateDef register for all supported families and only print out enabled Pstates. Signed-off-by: Prarit Bhargava Cc: Shuah Khan Cc: Stafford Horne Signed-off-by: Shuah Khan (Samsung OSG) --- tools/power/cpupower/utils/cpufreq-info.c | 2 ++ tools/power/cpupower/utils/helpers/amd.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index df43cd45d810..ccd08dd00996 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -200,6 +200,8 @@ static int get_boost_mode(unsigned int cpu) printf(_(" Boost States: %d\n"), b_states); printf(_(" Total States: %d\n"), pstate_no); for (i = 0; i < pstate_no; i++) { + if (!pstates[i]) + continue; if (i < b_states) printf(_(" Pstate-Pb%d: %luMHz (boost state)" "\n"), i, pstates[i]); diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index eed31b27909b..9607ada5b29a 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -119,6 +119,11 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, } if (read_msr(cpu, MSR_AMD_PSTATE + i, &pstate.val)) return -1; + if ((cpu_family == 0x17) && (!pstate.fam17h_bits.en)) + continue; + else if (!pstate.bits.en) + continue; + pstates[i] = get_cof(cpu_family, pstate); } *no = i; From 5484f0334439701900121a107709c461215cadb6 Mon Sep 17 00:00:00 2001 From: Todd Brandt Date: Mon, 8 Oct 2018 15:56:31 -0700 Subject: [PATCH 61/73] PM / tools: sleepgraph: first batch of v5.2 changes general: - add battery charge data before and after test - remove special s0i3 handling - remove melding of dmesg & ftrace data in old kernels, use one only - updates to various kprobes in trace (ksys_sync, etc) - enable pm_debug_messages during the test - instrument more subsystems with dev functions (phy0) error handling: - return codes for tool show the status of the test run - 0: success, 1: general error (no timeline), 2: fail (suspend aborted) - monitor output of /sys/power/state, mark as failure if exception occurs - add signal handler when using -result to catch tool exceptions display control - add -x commands for testing xset with mode settings and status - allow display setting to on, off, suspend, standby - add display mode change info to the log, along with a warning on fail s2idle (freeze) - remove fixed 10-phase dependency, allow any phase order & any count - multiple phase occurences show as phase_nameN e.g. suspend_noirq3 - if multiple freezes occur, print multiple time values in header summary: - add new columns to summary output: issues, worst suspend/resume devices - worst device: includes summation of all phases of suspend or resume - issues: includes WARNING/ERROR/BUG from dmesg log, and other issues - s2idle: multiple freezes show as FREEZExN in the issues column Signed-off-by: Todd Brandt Signed-off-by: Rafael J. Wysocki --- tools/power/pm-graph/Makefile | 4 +- tools/power/pm-graph/sleepgraph.8 | 13 +- tools/power/pm-graph/sleepgraph.py | 1384 +++++++++++++++------------- 3 files changed, 763 insertions(+), 638 deletions(-) diff --git a/tools/power/pm-graph/Makefile b/tools/power/pm-graph/Makefile index c1899cd72c80..845541544570 100644 --- a/tools/power/pm-graph/Makefile +++ b/tools/power/pm-graph/Makefile @@ -23,8 +23,8 @@ install : uninstall install -m 644 config/suspend-x2-proc.cfg $(DESTDIR)$(PREFIX)/lib/pm-graph/config install -d $(DESTDIR)$(PREFIX)/bin - ln -s $(DESTDIR)$(PREFIX)/lib/pm-graph/bootgraph.py $(DESTDIR)$(PREFIX)/bin/bootgraph - ln -s $(DESTDIR)$(PREFIX)/lib/pm-graph/sleepgraph.py $(DESTDIR)$(PREFIX)/bin/sleepgraph + ln -s ../lib/pm-graph/bootgraph.py $(DESTDIR)$(PREFIX)/bin/bootgraph + ln -s ../lib/pm-graph/sleepgraph.py $(DESTDIR)$(PREFIX)/bin/sleepgraph install -d $(DESTDIR)$(PREFIX)/share/man/man8 install bootgraph.8 $(DESTDIR)$(PREFIX)/share/man/man8 diff --git a/tools/power/pm-graph/sleepgraph.8 b/tools/power/pm-graph/sleepgraph.8 index 070be2cf7f74..24a2e7d0ae63 100644 --- a/tools/power/pm-graph/sleepgraph.8 +++ b/tools/power/pm-graph/sleepgraph.8 @@ -65,9 +65,9 @@ During test, enable/disable runtime suspend for all devices. The test is delayed by 5 seconds to allow runtime suspend changes to occur. The settings are restored after the test is complete. .TP -\fB-display \fIon/off\fR -Turn the display on or off for the test using the xset command. This helps -maintain the consistecy of test data for better comparison. +\fB-display \fIon/off/standby/suspend\fR +Switch the display to the requested mode for the test using the xset command. +This helps maintain the consistency of test data for better comparison. .TP \fB-skiphtml\fR Run the test and capture the trace logs, but skip the timeline generation. @@ -183,6 +183,13 @@ Print out the contents of the ACPI Firmware Performance Data Table. \fB-battery\fR Print out battery status and current charge. .TP +\fB-xon/-xoff/-xstandby/-xsuspend\fR +Test xset by attempting to switch the display to the given mode. This +is the same command which will be issued by \fB-display \fImode\fR. +.TP +\fB-xstat\fR +Get the current DPMS display mode. +.TP \fB-sysinfo\fR Print out system info extracted from BIOS. Reads /dev/mem directly instead of going through dmidecode. .TP diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index 0c760478f7d7..343e8000d9ca 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -54,6 +54,7 @@ import os import string import re import platform +import signal from datetime import datetime import struct import ConfigParser @@ -72,7 +73,7 @@ class SystemValues: version = '5.1' ansi = False rs = 0 - display = 0 + display = '' gzip = False sync = False verbose = False @@ -99,6 +100,7 @@ class SystemValues: tpath = '/sys/kernel/debug/tracing/' fpdtpath = '/sys/firmware/acpi/tables/FPDT' epath = '/sys/kernel/debug/tracing/events/power/' + pmdpath = '/sys/power/pm_debug_messages' traceevents = [ 'suspend_resume', 'device_pm_callback_end', @@ -141,12 +143,10 @@ class SystemValues: devprops = dict() predelay = 0 postdelay = 0 - procexecfmt = 'ps - (?P.*)$' - devpropfmt = '# Device Properties: .*' - tracertypefmt = '# tracer: (?P.*)' - firmwarefmt = '# fwsuspend (?P[0-9]*) fwresume (?P[0-9]*)$' + pmdebug = '' tracefuncs = { 'sys_sync': {}, + 'ksys_sync': {}, '__pm_notifier_call_chain': {}, 'pm_prepare_console': {}, 'pm_notifier_call_chain': {}, @@ -187,7 +187,6 @@ class SystemValues: dev_tracefuncs = { # general wait/delay/sleep 'msleep': { 'args_x86_64': {'time':'%di:s32'}, 'ub': 1 }, - 'schedule_timeout_uninterruptible': { 'args_x86_64': {'timeout':'%di:s32'}, 'ub': 1 }, 'schedule_timeout': { 'args_x86_64': {'timeout':'%di:s32'}, 'ub': 1 }, 'udelay': { 'func':'__const_udelay', 'args_x86_64': {'loops':'%di:s32'}, 'ub': 1 }, 'usleep_range': { 'args_x86_64': {'min':'%di:s32', 'max':'%si:s32'}, 'ub': 1 }, @@ -199,6 +198,9 @@ class SystemValues: # filesystem 'ext4_sync_fs': {}, # 80211 + 'ath10k_bmi_read_memory': { 'args_x86_64': {'length':'%cx:s32'} }, + 'ath10k_bmi_write_memory': { 'args_x86_64': {'length':'%cx:s32'} }, + 'ath10k_bmi_fast_download': { 'args_x86_64': {'length':'%cx:s32'} }, 'iwlagn_mac_start': {}, 'iwlagn_alloc_bcast_station': {}, 'iwl_trans_pcie_start_hw': {}, @@ -241,6 +243,7 @@ class SystemValues: timeformat = '%.3f' cmdline = '%s %s' % \ (os.path.basename(sys.argv[0]), ' '.join(sys.argv[1:])) + sudouser = '' def __init__(self): self.archargs = 'args_'+platform.machine() self.hostname = platform.node() @@ -256,10 +259,32 @@ class SystemValues: if (hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()): self.ansi = True self.testdir = datetime.now().strftime('suspend-%y%m%d-%H%M%S') + if os.getuid() == 0 and 'SUDO_USER' in os.environ and \ + os.environ['SUDO_USER']: + self.sudouser = os.environ['SUDO_USER'] def vprint(self, msg): self.logmsg += msg+'\n' - if(self.verbose): + if self.verbose or msg.startswith('WARNING:'): print(msg) + def signalHandler(self, signum, frame): + if not self.result: + return + signame = self.signames[signum] if signum in self.signames else 'UNKNOWN' + msg = 'Signal %s caused a tool exit, line %d' % (signame, frame.f_lineno) + sysvals.outputResult({'error':msg}) + sys.exit(3) + def signalHandlerInit(self): + capture = ['BUS', 'SYS', 'XCPU', 'XFSZ', 'PWR', 'HUP', 'INT', 'QUIT', + 'ILL', 'ABRT', 'FPE', 'SEGV', 'TERM', 'TSTP'] + self.signames = dict() + for i in capture: + s = 'SIG'+i + try: + signum = getattr(signal, s) + signal.signal(signum, self.signalHandler) + except: + continue + self.signames[signum] = s def rootCheck(self, fatal=True): if(os.access(self.powerfile, os.W_OK)): return True @@ -267,7 +292,7 @@ class SystemValues: msg = 'This command requires sysfs mount and root access' print('ERROR: %s\n') % msg self.outputResult({'error':msg}) - sys.exit() + sys.exit(1) return False def rootUser(self, fatal=False): if 'USER' in os.environ and os.environ['USER'] == 'root': @@ -276,7 +301,7 @@ class SystemValues: msg = 'This command must be run as root' print('ERROR: %s\n') % msg self.outputResult({'error':msg}) - sys.exit() + sys.exit(1) return False def getExec(self, cmd): dirlist = ['/sbin', '/bin', '/usr/sbin', '/usr/bin', @@ -406,8 +431,8 @@ class SystemValues: ktime = m.group('ktime') fp.close() self.dmesgstart = float(ktime) - def getdmesg(self, fwdata=[]): - op = self.writeDatafileHeader(sysvals.dmesgfile, fwdata) + def getdmesg(self, testdata): + op = self.writeDatafileHeader(sysvals.dmesgfile, testdata) # store all new dmesg lines since initdmesg was called fp = Popen('dmesg', stdout=PIPE).stdout for line in fp: @@ -619,6 +644,8 @@ class SystemValues: self.fsetVal('0', 'events/kprobes/enable') self.fsetVal('', 'kprobe_events') self.fsetVal('1024', 'buffer_size_kb') + if self.pmdebug: + self.setVal(self.pmdebug, self.pmdpath) def setupAllKprobes(self): for name in self.tracefuncs: self.defaultKprobe(name, self.tracefuncs[name]) @@ -641,6 +668,11 @@ class SystemValues: # turn trace off self.fsetVal('0', 'tracing_on') self.cleanupFtrace() + # pm debug messages + pv = self.getVal(self.pmdpath) + if pv != '1': + self.setVal('1', self.pmdpath) + self.pmdebug = pv # set the trace clock to global self.fsetVal('global', 'trace_clock') self.fsetVal('nop', 'current_tracer') @@ -728,19 +760,24 @@ class SystemValues: if not self.ansi: return str return '\x1B[%d;40m%s\x1B[m' % (color, str) - def writeDatafileHeader(self, filename, fwdata=[]): + def writeDatafileHeader(self, filename, testdata): fp = self.openlog(filename, 'w') fp.write('%s\n%s\n# command | %s\n' % (self.teststamp, self.sysstamp, self.cmdline)) - if(self.suspendmode == 'mem' or self.suspendmode == 'command'): - for fw in fwdata: + for test in testdata: + if 'fw' in test: + fw = test['fw'] if(fw): fp.write('# fwsuspend %u fwresume %u\n' % (fw[0], fw[1])) + if 'bat' in test: + (a1, c1), (a2, c2) = test['bat'] + fp.write('# battery %s %d %s %d\n' % (a1, c1, a2, c2)) + if test['error'] or len(testdata) > 1: + fp.write('# enter_sleep_error %s\n' % test['error']) return fp - def sudouser(self, dir): - if os.path.exists(dir) and os.getuid() == 0 and \ - 'SUDO_USER' in os.environ: + def sudoUserchown(self, dir): + if os.path.exists(dir) and self.sudouser: cmd = 'chown -R {0}:{0} {1} > /dev/null 2>&1' - call(cmd.format(os.environ['SUDO_USER'], dir), shell=True) + call(cmd.format(self.sudouser, dir), shell=True) def outputResult(self, testdata, num=0): if not self.result: return @@ -762,7 +799,7 @@ class SystemValues: if 'bugurl' in testdata: fp.write('url%s: %s\n' % (n, testdata['bugurl'])) fp.close() - self.sudouser(self.result) + self.sudoUserchown(self.result) def configFile(self, file): dir = os.path.dirname(os.path.realpath(__file__)) if os.path.exists(file): @@ -800,11 +837,12 @@ suspendmodename = { # Simple class which holds property values collected # for all the devices used in the timeline. class DevProps: - syspath = '' - altname = '' - async = True - xtraclass = '' - xtrainfo = '' + def __init__(self): + self.syspath = '' + self.altname = '' + self.async = True + self.xtraclass = '' + self.xtrainfo = '' def out(self, dev): return '%s,%s,%d;' % (dev, self.altname, self.async) def debug(self, dev): @@ -831,9 +869,6 @@ class DevProps: # A container used to create a device hierachy, with a single root node # and a tree of child nodes. Used by Data.deviceTopology() class DeviceNode: - name = '' - children = 0 - depth = 0 def __init__(self, nodename, nodedepth): self.name = nodename self.children = [] @@ -861,71 +896,78 @@ class DeviceNode: # } # class Data: - dmesg = {} # root data structure - phases = [] # ordered list of phases - start = 0.0 # test start - end = 0.0 # test end - tSuspended = 0.0 # low-level suspend start - tResumed = 0.0 # low-level resume start - tKernSus = 0.0 # kernel level suspend start - tKernRes = 0.0 # kernel level resume end - tLow = 0.0 # time spent in low-level suspend (standby/freeze) - fwValid = False # is firmware data available - fwSuspend = 0 # time spent in firmware suspend - fwResume = 0 # time spent in firmware resume - dmesgtext = [] # dmesg text file in memory - pstl = 0 # process timeline - testnumber = 0 - idstr = '' - html_device_id = 0 - stamp = 0 - outfile = '' - devpids = [] - kerror = False + phasedef = { + 'suspend_prepare': {'order': 0, 'color': '#CCFFCC'}, + 'suspend': {'order': 1, 'color': '#88FF88'}, + 'suspend_late': {'order': 2, 'color': '#00AA00'}, + 'suspend_noirq': {'order': 3, 'color': '#008888'}, + 'suspend_machine': {'order': 4, 'color': '#0000FF'}, + 'resume_machine': {'order': 5, 'color': '#FF0000'}, + 'resume_noirq': {'order': 6, 'color': '#FF9900'}, + 'resume_early': {'order': 7, 'color': '#FFCC00'}, + 'resume': {'order': 8, 'color': '#FFFF88'}, + 'resume_complete': {'order': 9, 'color': '#FFFFCC'}, + } + errlist = { + 'HWERROR' : '.*\[ *Hardware Error *\].*', + 'FWBUG' : '.*\[ *Firmware Bug *\].*', + 'BUG' : '.*BUG.*', + 'ERROR' : '.*ERROR.*', + 'WARNING' : '.*WARNING.*', + 'IRQ' : '.*genirq: .*', + 'TASKFAIL': '.*Freezing of tasks failed.*', + } def __init__(self, num): idchar = 'abcdefghij' - self.pstl = dict() + self.start = 0.0 # test start + self.end = 0.0 # test end + self.tSuspended = 0.0 # low-level suspend start + self.tResumed = 0.0 # low-level resume start + self.tKernSus = 0.0 # kernel level suspend start + self.tKernRes = 0.0 # kernel level resume end + self.fwValid = False # is firmware data available + self.fwSuspend = 0 # time spent in firmware suspend + self.fwResume = 0 # time spent in firmware resume + self.html_device_id = 0 + self.stamp = 0 + self.outfile = '' + self.kerror = False + self.battery = 0 + self.enterfail = '' + self.currphase = '' + self.pstl = dict() # process timeline self.testnumber = num self.idstr = idchar[num] - self.dmesgtext = [] - self.phases = [] - self.dmesg = { # fixed list of 10 phases - 'suspend_prepare': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#CCFFCC', 'order': 0}, - 'suspend': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#88FF88', 'order': 1}, - 'suspend_late': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#00AA00', 'order': 2}, - 'suspend_noirq': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#008888', 'order': 3}, - 'suspend_machine': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#0000FF', 'order': 4}, - 'resume_machine': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FF0000', 'order': 5}, - 'resume_noirq': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FF9900', 'order': 6}, - 'resume_early': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFCC00', 'order': 7}, - 'resume': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFFF88', 'order': 8}, - 'resume_complete': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFFFCC', 'order': 9} - } - self.phases = self.sortedPhases() - self.devicegroups = [] - for phase in self.phases: - self.devicegroups.append([phase]) + self.dmesgtext = [] # dmesg text file in memory + self.dmesg = dict() # root data structure self.errorinfo = {'suspend':[],'resume':[]} + self.tLow = [] # time spent in low-level suspends (standby/freeze) + self.devpids = [] + self.devicegroups = 0 + def sortedPhases(self): + return sorted(self.dmesg, key=lambda k:self.dmesg[k]['order']) + def initDevicegroups(self): + # called when phases are all finished being added + for phase in self.dmesg.keys(): + if '*' in phase: + p = phase.split('*') + pnew = '%s%d' % (p[0], len(p)) + self.dmesg[pnew] = self.dmesg.pop(phase) + self.devicegroups = [] + for phase in self.sortedPhases(): + self.devicegroups.append([phase]) + def nextPhase(self, phase, offset): + order = self.dmesg[phase]['order'] + offset + for p in self.dmesg: + if self.dmesg[p]['order'] == order: + return p + return '' + def lastPhase(self): + plist = self.sortedPhases() + if len(plist) < 1: + return '' + return plist[-1] def extractErrorInfo(self): - elist = { - 'HWERROR' : '.*\[ *Hardware Error *\].*', - 'FWBUG' : '.*\[ *Firmware Bug *\].*', - 'BUG' : '.*BUG.*', - 'ERROR' : '.*ERROR.*', - 'WARNING' : '.*WARNING.*', - 'IRQ' : '.*genirq: .*', - 'TASKFAIL': '.*Freezing of tasks failed.*', - } lf = sysvals.openlog(sysvals.dmesgfile, 'r') i = 0 list = [] @@ -939,8 +981,8 @@ class Data: continue dir = 'suspend' if t < self.tSuspended else 'resume' msg = m.group('msg') - for err in elist: - if re.match(elist[err], msg): + for err in self.errlist: + if re.match(self.errlist[err], msg): list.append((err, dir, t, i, i)) self.kerror = True break @@ -956,7 +998,7 @@ class Data: def setEnd(self, time): self.end = time def isTraceEventOutsideDeviceCalls(self, pid, time): - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] for dev in list: d = list[dev] @@ -964,16 +1006,10 @@ class Data: time < d['end']): return False return True - def phaseCollision(self, phase, isbegin, line): - key = 'end' - if isbegin: - key = 'start' - if self.dmesg[phase][key] >= 0: - sysvals.vprint('IGNORE: %s' % line.strip()) - return True - return False def sourcePhase(self, start): - for phase in self.phases: + for phase in self.sortedPhases(): + if 'machine' in phase: + continue pend = self.dmesg[phase]['end'] if start <= pend: return phase @@ -1004,14 +1040,15 @@ class Data: return tgtdev def addDeviceFunctionCall(self, displayname, kprobename, proc, pid, start, end, cdata, rdata): # try to place the call in a device - tgtdev = self.sourceDevice(self.phases, start, end, pid, 'device') + phases = self.sortedPhases() + tgtdev = self.sourceDevice(phases, start, end, pid, 'device') # calls with device pids that occur outside device bounds are dropped # TODO: include these somehow if not tgtdev and pid in self.devpids: return False # try to place the call in a thread if not tgtdev: - tgtdev = self.sourceDevice(self.phases, start, end, pid, 'thread') + tgtdev = self.sourceDevice(phases, start, end, pid, 'thread') # create new thread blocks, expand as new calls are found if not tgtdev: if proc == '<...>': @@ -1053,7 +1090,7 @@ class Data: def overflowDevices(self): # get a list of devices that extend beyond the end of this test run devlist = [] - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] for devname in list: dev = list[devname] @@ -1064,7 +1101,7 @@ class Data: # merge any devices that overlap devlist for dev in devlist: devname = dev['name'] - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] if devname not in list: continue @@ -1079,7 +1116,7 @@ class Data: del list[devname] def usurpTouchingThread(self, name, dev): # the caller test has priority of this thread, give it to him - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] if name in list: tdev = list[name] @@ -1093,7 +1130,7 @@ class Data: break def stitchTouchingThreads(self, testlist): # merge any threads between tests that touch - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] for devname in list: dev = list[devname] @@ -1103,7 +1140,7 @@ class Data: data.usurpTouchingThread(devname, dev) def optimizeDevSrc(self): # merge any src call loops to reduce timeline size - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] for dev in list: if 'src' not in list[dev]: @@ -1141,7 +1178,7 @@ class Data: self.tKernSus = self.trimTimeVal(self.tKernSus, t0, dT, left) self.tKernRes = self.trimTimeVal(self.tKernRes, t0, dT, left) self.end = self.trimTimeVal(self.end, t0, dT, left) - for phase in self.phases: + for phase in self.sortedPhases(): p = self.dmesg[phase] p['start'] = self.trimTimeVal(p['start'], t0, dT, left) p['end'] = self.trimTimeVal(p['end'], t0, dT, left) @@ -1150,6 +1187,7 @@ class Data: d = list[name] d['start'] = self.trimTimeVal(d['start'], t0, dT, left) d['end'] = self.trimTimeVal(d['end'], t0, dT, left) + d['length'] = d['end'] - d['start'] if('ftrace' in d): cg = d['ftrace'] cg.start = self.trimTimeVal(cg.start, t0, dT, left) @@ -1166,30 +1204,59 @@ class Data: tm = self.trimTimeVal(tm, t0, dT, left) list.append((type, tm, idx1, idx2)) self.errorinfo[dir] = list - def normalizeTime(self, tZero): + def trimFreezeTime(self, tZero): # trim out any standby or freeze clock time - if(self.tSuspended != self.tResumed): - if(self.tResumed > tZero): - self.trimTime(self.tSuspended, \ - self.tResumed-self.tSuspended, True) - else: - self.trimTime(self.tSuspended, \ - self.tResumed-self.tSuspended, False) + lp = '' + for phase in self.sortedPhases(): + if 'resume_machine' in phase and 'suspend_machine' in lp: + tS, tR = self.dmesg[lp]['end'], self.dmesg[phase]['start'] + tL = tR - tS + if tL > 0: + left = True if tR > tZero else False + self.trimTime(tS, tL, left) + self.tLow.append('%.0f'%(tL*1000)) + lp = phase def getTimeValues(self): - sktime = (self.dmesg['suspend_machine']['end'] - \ - self.tKernSus) * 1000 - rktime = (self.dmesg['resume_complete']['end'] - \ - self.dmesg['resume_machine']['start']) * 1000 - return (sktime, rktime) - def setPhase(self, phase, ktime, isbegin): - if(isbegin): - self.dmesg[phase]['start'] = ktime + if 'suspend_machine' in self.dmesg: + sktime = (self.dmesg['suspend_machine']['end'] - \ + self.tKernSus) * 1000 else: + sktime = (self.tSuspended - self.tKernSus) * 1000 + if 'resume_machine' in self.dmesg: + rktime = (self.tKernRes - \ + self.dmesg['resume_machine']['start']) * 1000 + else: + rktime = (self.tKernRes - self.tResumed) * 1000 + return (sktime, rktime) + def setPhase(self, phase, ktime, isbegin, order=-1): + if(isbegin): + # phase start over current phase + if self.currphase: + if 'resume_machine' not in self.currphase: + sysvals.vprint('WARNING: phase %s failed to end' % self.currphase) + self.dmesg[self.currphase]['end'] = ktime + phases = self.dmesg.keys() + color = self.phasedef[phase]['color'] + count = len(phases) if order < 0 else order + # create unique name for every new phase + while phase in phases: + phase += '*' + self.dmesg[phase] = {'list': dict(), 'start': -1.0, 'end': -1.0, + 'row': 0, 'color': color, 'order': count} + self.dmesg[phase]['start'] = ktime + self.currphase = phase + else: + # phase end without a start + if phase not in self.currphase: + if self.currphase: + sysvals.vprint('WARNING: %s ended instead of %s, ftrace corruption?' % (phase, self.currphase)) + else: + sysvals.vprint('WARNING: %s ended without a start, ftrace corruption?' % phase) + return phase + phase = self.currphase self.dmesg[phase]['end'] = ktime - def dmesgSortVal(self, phase): - return self.dmesg[phase]['order'] - def sortedPhases(self): - return sorted(self.dmesg, key=self.dmesgSortVal) + self.currphase = '' + return phase def sortedDevices(self, phase): list = self.dmesg[phase]['list'] slist = [] @@ -1208,13 +1275,13 @@ class Data: for devname in phaselist: dev = phaselist[devname] if(dev['end'] < 0): - for p in self.phases: + for p in self.sortedPhases(): if self.dmesg[p]['end'] > dev['start']: dev['end'] = self.dmesg[p]['end'] break sysvals.vprint('%s (%s): callback didnt return' % (devname, phase)) def deviceFilter(self, devicefilter): - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] rmlist = [] for name in list: @@ -1229,7 +1296,7 @@ class Data: del list[name] def fixupInitcallsThatDidntReturn(self): # if any calls never returned, clip them at system resume end - for phase in self.phases: + for phase in self.sortedPhases(): self.fixupInitcalls(phase) def phaseOverlap(self, phases): rmgroups = [] @@ -1248,17 +1315,18 @@ class Data: self.devicegroups.append(newgroup) def newActionGlobal(self, name, start, end, pid=-1, color=''): # which phase is this device callback or action in + phases = self.sortedPhases() targetphase = 'none' htmlclass = '' overlap = 0.0 - phases = [] - for phase in self.phases: + myphases = [] + for phase in phases: pstart = self.dmesg[phase]['start'] pend = self.dmesg[phase]['end'] # see if the action overlaps this phase o = max(0, min(end, pend) - max(start, pstart)) if o > 0: - phases.append(phase) + myphases.append(phase) # set the target phase to the one that overlaps most if o > overlap: if overlap > 0 and phase == 'post_resume': @@ -1267,19 +1335,19 @@ class Data: overlap = o # if no target phase was found, pin it to the edge if targetphase == 'none': - p0start = self.dmesg[self.phases[0]]['start'] + p0start = self.dmesg[phases[0]]['start'] if start <= p0start: - targetphase = self.phases[0] + targetphase = phases[0] else: - targetphase = self.phases[-1] + targetphase = phases[-1] if pid == -2: htmlclass = ' bg' elif pid == -3: htmlclass = ' ps' - if len(phases) > 1: + if len(myphases) > 1: htmlclass = ' bg' - self.phaseOverlap(phases) - if targetphase in self.phases: + self.phaseOverlap(myphases) + if targetphase in phases: newname = self.newAction(targetphase, name, pid, '', start, end, '', htmlclass, color) return (targetphase, newname) return False @@ -1315,7 +1383,7 @@ class Data: sysvals.vprint('Timeline Details:') sysvals.vprint(' test start: %f' % self.start) sysvals.vprint('kernel suspend start: %f' % self.tKernSus) - for phase in self.phases: + for phase in self.sortedPhases(): dc = len(self.dmesg[phase]['list']) sysvals.vprint(' %16s: %f - %f (%d devices)' % (phase, \ self.dmesg[phase]['start'], self.dmesg[phase]['end'], dc)) @@ -1323,7 +1391,7 @@ class Data: sysvals.vprint(' test end: %f' % self.end) def deviceChildrenAllPhases(self, devname): devlist = [] - for phase in self.phases: + for phase in self.sortedPhases(): list = self.deviceChildren(devname, phase) for dev in list: if dev not in devlist: @@ -1344,7 +1412,7 @@ class Data: if node.name: info = '' drv = '' - for phase in self.phases: + for phase in self.sortedPhases(): list = self.dmesg[phase]['list'] if node.name in list: s = list[node.name]['start'] @@ -1478,8 +1546,29 @@ class Data: c = self.addProcessUsageEvent(ps, tres) if c > 0: sysvals.vprint('%25s (res): %d' % (ps, c)) + def handleEndMarker(self, time): + dm = self.dmesg + self.setEnd(time) + self.initDevicegroups() + # give suspend_prepare an end if needed + if 'suspend_prepare' in dm and dm['suspend_prepare']['end'] < 0: + dm['suspend_prepare']['end'] = time + # assume resume machine ends at next phase start + if 'resume_machine' in dm and dm['resume_machine']['end'] < 0: + np = self.nextPhase('resume_machine', 1) + if np: + dm['resume_machine']['end'] = dm[np]['start'] + # if kernel resume end not found, assume its the end marker + if self.tKernRes == 0.0: + self.tKernRes = time + # if kernel suspend start not found, assume its the end marker + if self.tKernSus == 0.0: + self.tKernSus = time + # set resume complete to end at end marker + if 'resume_complete' in dm: + dm['resume_complete']['end'] = time def debugPrint(self): - for p in self.phases: + for p in self.sortedPhases(): list = self.dmesg[p]['list'] for devname in list: dev = list[devname] @@ -1490,9 +1579,9 @@ class Data: # Description: # A container for kprobe function data we want in the dev timeline class DevFunction: - row = 0 - count = 1 def __init__(self, name, args, caller, ret, start, end, u, proc, pid, color): + self.row = 0 + self.count = 1 self.name = name self.args = args self.caller = caller @@ -1546,16 +1635,15 @@ class DevFunction: # suspend_resume: phase or custom exec block data # device_pm_callback: device callback info class FTraceLine: - time = 0.0 - length = 0.0 - fcall = False - freturn = False - fevent = False - fkprobe = False - depth = 0 - name = '' - type = '' def __init__(self, t, m='', d=''): + self.length = 0.0 + self.fcall = False + self.freturn = False + self.fevent = False + self.fkprobe = False + self.depth = 0 + self.name = '' + self.type = '' self.time = float(t) if not m and not d: return @@ -1675,19 +1763,13 @@ class FTraceLine: # Each instance is tied to a single device in a single phase, and is # comprised of an ordered list of FTraceLine objects class FTraceCallGraph: - id = '' - start = -1.0 - end = -1.0 - list = [] - invalid = False - depth = 0 - pid = 0 - name = '' - partial = False vfname = 'missing_function_name' - ignore = False - sv = 0 def __init__(self, pid, sv): + self.id = '' + self.invalid = False + self.name = '' + self.partial = False + self.ignore = False self.start = -1.0 self.end = -1.0 self.list = [] @@ -1943,7 +2025,7 @@ class FTraceCallGraph: dev['ftrace'] = cg found = devname return found - for p in data.phases: + for p in data.sortedPhases(): if(data.dmesg[p]['start'] <= self.start and self.start <= data.dmesg[p]['end']): list = data.dmesg[p]['list'] @@ -1966,7 +2048,7 @@ class FTraceCallGraph: if fs < data.start or fe > data.end: return phase = '' - for p in data.phases: + for p in data.sortedPhases(): if(data.dmesg[p]['start'] <= self.start and self.start < data.dmesg[p]['end']): phase = p @@ -2008,23 +2090,20 @@ class DevItem: # A container for a device timeline which calculates # all the html properties to display it correctly class Timeline: - html = '' - height = 0 # total timeline height - scaleH = 20 # timescale (top) row height - rowH = 30 # device row height - bodyH = 0 # body height - rows = 0 # total timeline rows - rowlines = dict() - rowheight = dict() html_tblock = '
\n' html_device = '
{6}
\n' html_phase = '
{5}
\n' html_phaselet = '
\n' html_legend = '
 {2}
\n' def __init__(self, rowheight, scaleheight): - self.rowH = rowheight - self.scaleH = scaleheight self.html = '' + self.height = 0 # total timeline height + self.scaleH = scaleheight # timescale (top) row height + self.rowH = rowheight # device row height + self.bodyH = 0 # body height + self.rows = 0 # total timeline rows + self.rowlines = dict() + self.rowheight = dict() def createHeader(self, sv, stamp): if(not stamp['time']): return @@ -2251,18 +2330,18 @@ class Timeline: # Description: # A list of values describing the properties of these test runs class TestProps: - stamp = '' - sysinfo = '' - cmdline = '' - kparams = '' - S0i3 = False - fwdata = [] stampfmt = '# [a-z]*-(?P[0-9]{2})(?P[0-9]{2})(?P[0-9]{2})-'+\ '(?P[0-9]{2})(?P[0-9]{2})(?P[0-9]{2})'+\ ' (?P.*) (?P.*) (?P.*)$' + batteryfmt = '^# battery (?P\w*) (?P\d*) (?P\w*) (?P\d*)' + testerrfmt = '^# enter_sleep_error (?P.*)' sysinfofmt = '^# sysinfo .*' cmdlinefmt = '^# command \| (?P.*)' kparamsfmt = '^# kparams \| (?P.*)' + devpropfmt = '# Device Properties: .*' + tracertypefmt = '# tracer: (?P.*)' + firmwarefmt = '# fwsuspend (?P[0-9]*) fwresume (?P[0-9]*)$' + procexecfmt = 'ps - (?P.*)$' ftrace_line_fmt_fg = \ '^ *(?P